In [1]:
from unml.modules.summarize import Summarizer

In [9]:
summarizer = Summarizer()
tokenizer = summarizer.summarizer.model.tokenizer

In [16]:

# Detect the end of a sentence
def is_eos(token: str) -> bool:
    """
    Detect if a token is the end of a sentence

    Parameters
    ----------
    `token` : `str`
        The token to be checked

    Returns
    -------
    `bool`
        True if the token is the end of a sentence, False otherwise
    """
    return token in [".", "!", "?"]



tokens = summarizer.summarizer.model.tokenizer.tokenize("Hello world! Je suis Clément et j'aime bien le foot." * 1000)

Token indices sequence length is longer than the specified maximum sequence length for this model (19000 > 1024). Running this sequence through the model will result in indexing errors


In [46]:

from typing import List

def chunk_tokens(tokens: List[str], max_chunk_size: int = 1024) -> List[str]:
    chunks = []
    current_chunk = []
    current_sentence = []
    
    for token in tokens:
        print("=" * 20)
        print(f'Token: {token}')
        print(f'Current sentence: {current_sentence}')
        print("=" * 20)
        current_sentence += [token]
        is_last = token == tokens[-1]
        
        if is_eos(token) or is_last:
            print(f'EOS: {token}')
            print(f'Condition: {len(current_chunk)} + {len(current_sentence)} <= {max_chunk_size} : {len(current_chunk) + len(current_sentence) <= max_chunk_size}')
            if len(current_chunk) + len(current_sentence) <= max_chunk_size:
                current_chunk.extend(current_sentence)
                current_sentence = []
            else:
                chunks.append(current_chunk)
                current_chunk = current_sentence.copy()
                current_sentence = []
        print(f'Chunks: {chunks}')
        print(f'Current chunk: {current_chunk}')
        print()

    chunks.append(current_chunk)
    return chunks

In [47]:
tokens = tokenizer.tokenize("Hello world! I am Clément and I like football. I love eating ice-cream, and sometimes pizza")
chunk_tokens(tokens, max_chunk_size=14)

Token: Hello
Current sentence: []
Chunks: []
Current chunk: []

Token: Ġworld
Current sentence: ['Hello']
Chunks: []
Current chunk: []

Token: !
Current sentence: ['Hello', 'Ġworld']
EOS: !
Condition: 0 + 3 <= 14 : True
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: ĠI
Current sentence: []
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: Ġam
Current sentence: ['ĠI']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: ĠCl
Current sentence: ['ĠI', 'Ġam']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: Ã©
Current sentence: ['ĠI', 'Ġam', 'ĠCl']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: ment
Current sentence: ['ĠI', 'Ġam', 'ĠCl', 'Ã©']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: Ġand
Current sentence: ['ĠI', 'Ġam', 'ĠCl', 'Ã©', 'ment']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: ĠI
Current sentence: ['ĠI', 'Ġam', 'ĠCl', 'Ã©', 'ment', 'Ġand']
Chunks: []
Current chunk: ['Hello', 'Ġworld', '!']

Token: Ġlike

[['Hello',
  'Ġworld',
  '!',
  'ĠI',
  'Ġam',
  'ĠCl',
  'Ã©',
  'ment',
  'Ġand',
  'ĠI',
  'Ġlike',
  'Ġfootball',
  '.'],
 ['ĠI',
  'Ġlove',
  'Ġeating',
  'Ġice',
  '-',
  'cream',
  ',',
  'Ġand',
  'Ġsometimes',
  'Ġpizza']]

In [48]:
tokenizer.tokenize('0\n1\n2')

['0', 'Ċ', '1', 'Ċ', '2']