In [1]:
from lm_from_scratch.corpus.decision_corpus import DecisionCorpus
from artifacts import DECISION_CORPUS_RAW
from transformers import T5Tokenizer
from tokenizers import AddedToken

import sentencepiece as spm

# Tokenizer vocabulary size (passed to the SentencePiece trainer below).
VOCAB_SIZE = 900

N_SEGMENTS = 2

# Maximum context length for predictions (512 was the alternative value).
MAX_LEN = 128

# Number of independent sequences processed in parallel.
BATCH_SIZE = 32

# Sentence length bounds; the upper bound is half of the context length.
MIN_SENTENCE_LEN = 10
MAX_SENTENCE_LEN = MAX_LEN // 2

In [2]:
corpus = DecisionCorpus()

# Shuffle every row with a fixed seed, then renumber the index from 0.
corpus_df = corpus.df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split point: rows before `n` (the first 90%) are train, the rest are val.
# NOTE(review): `corpus_df` and `n` are not used by the cells shown here;
# the raw dump below comes from `corpus.get_text()` -- confirm this is intended.
n = int(0.9 * len(corpus_df))

data = corpus.get_text()

# Write every text to the raw corpus file, each followed by a newline.
with open(DECISION_CORPUS_RAW, "w", encoding="utf-8") as raw_file:
    raw_file.writelines(text + "\n" for text in data)


In [3]:
# Ids reserved for the special pieces of the trained vocabulary.
PAD_TOKEN_ID, EOS_TOKEN_ID, UNK_TOKEN_ID, BOS_TOKEN_ID = 0, 1, 2, 3

# Trainer options: where to read the corpus, where to write the model
# (files are prefixed with `model_prefix`), and how special pieces are laid out.
trainer_options = dict(
    input=DECISION_CORPUS_RAW,
    model_prefix='sentencepiece_tokenizer',
    model_type='unigram',
    vocab_size=VOCAB_SIZE,
    # special-piece ids
    pad_id=PAD_TOKEN_ID,
    unk_id=UNK_TOKEN_ID,
    eos_id=EOS_TOKEN_ID,
    bos_id=BOS_TOKEN_ID,
    # special-piece surface forms
    pad_piece='<pad>',
    unk_piece='<unk>',
    eos_piece='</s>',
    bos_piece='<s>',
)

spm.SentencePieceTrainer.Train(**trainer_options)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /home/clem/Source/sandbox/lm-from-scratch/artifacts/decision-raw.txt
  input_format: 
  model_prefix: sentencepiece_tokenizer
  model_type: UNIGRAM
  vocab_size: 900
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 2
  bos_id: 3
  eos_id: 1
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 


In [4]:
# The trained model could also be loaded directly with
# spm.SentencePieceProcessor(model_file="sentencepiece_tokenizer.model");
# here it is wrapped in a T5Tokenizer instead.
# `extra_ids=0` is passed to the constructor, and the 100 `<extra_id_N>`
# tokens are registered explicitly below.
tokenizer = T5Tokenizer("sentencepiece_tokenizer.model", extra_ids=0)

sentinel_tokens = [
    AddedToken(content=f"<extra_id_{i}>", single_word=False, normalized=False, special=True)
    for i in range(100)
]

# Last expression of the cell: its return value is displayed as the cell output.
tokenizer.add_special_tokens({"additional_special_tokens": sentinel_tokens})

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


100

In [6]:
# Quick inspection of the custom tokenizer's special tokens and vocabulary.
# Only the value of the last expression (`special_tokens_map`) is displayed;
# the results of the three calls above it are computed and then discarded.
tokenizer.encode('<pad></s><unk>')
tokenizer.decode([0,1,2,3,4,5])
list(tokenizer.get_vocab().items())[:10]
tokenizer.special_tokens_map

# NOTE(review): the comment that used to sit here ("Returns the number of added
# tokens when encoding a sequence with special tokens") did not describe any
# call in this cell -- presumably left over from a removed line.

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_39>',
  '<extra_id_95>',
  '<extra_id_93>',
  '<extra_id_79>',
  '<extra_id_32>',
  '<extra_id_15>',
  '<extra_id_73>',
  '<extra_id_36>',
  '<extra_id_1>',
  '<extra_id_19>',
  '<extra_id_26>',
  '<extra_id_60>',
  '<extra_id_51>',
  '<extra_id_5>',
  '<extra_id_3>',
  '<extra_id_31>',
  '<extra_id_58>',
  '<extra_id_23>',
  '<extra_id_50>',
  '<extra_id_78>',
  '<extra_id_96>',
  '<extra_id_72>',
  '<extra_id_74>',
  '<extra_id_4>',
  '<extra_id_48>',
  '<extra_id_42>',
  '<extra_id_90>',
  '<extra_id_82>',
  '<extra_id_57>',
  '<extra_id_34>',
  '<extra_id_16>',
  '<extra_id_33>',
  '<extra_id_35>',
  '<extra_id_69>',
  '<extra_id_14>',
  '<extra_id_65>',
  '<extra_id_88>',
  '<extra_id_97>',
  '<extra_id_94>',
  '<extra_id_27>',
  '<extra_id_80>',
  '<extra_id_41>',
  '<extra_id_55>',
  '<extra_id_68>',
  '<extra_id_98>',
  '<extra_id_24>',
  '<extra_id_22>',
  '<extra_id_4

In [5]:
# Short excerpt of the first document (characters 280..314), with every
# newline and underscore character removed; used as the sample text below.
text_test = data[0][280:315].translate(str.maketrans("", "", "\n_"))
text_test

'COUR DE CASSATION, CHAMBRE CRIMINEL'

In [6]:
# Encode the sample text with the custom tokenizer. `add_special_tokens=False`
# is passed, and the saved output has no trailing EOS id (1).
text_ids = tokenizer.encode(text_test, add_special_tokens=False)
text_ids

[215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174]

In [7]:
# Number of token ids produced for the sample text.
len(text_ids)

12

In [36]:
# Round trip: decode the ids back to text to compare against `text_test`.
tokenizer.decode(text_ids)

'COUR DE CASSATION, CHAMBRE CRIMINEL'

In [23]:
# Encode the sample text as both the source and the target of a seq2seq batch.
# `prepare_seq2seq_batch` is deprecated; calling the tokenizer with
# `text_target` is the supported replacement and yields the same keys
# (`input_ids`, `attention_mask`, `labels`).
# `truncation=True` is passed explicitly to keep the truncation that
# `prepare_seq2seq_batch` applied by default, and the single `max_length`
# covers both source and target, which shared the same limit (15) before.
tokenizer(
    text=[text_test],
    text_target=[text_test],
    max_length=15,
    padding="max_length",
    truncation=True,
)

{'input_ids': [[215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174, 1, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], 'labels': [[215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174, 1, 0, 0]]}

In [24]:
# Encode a single text, padded/truncated to exactly 15 ids.
# NOTE(review): `stride` is presumably only relevant when overflowing tokens
# are requested -- confirm it is needed here.
encode_options = {
    "padding": "max_length",
    "stride": 2,
    "max_length": 15,
    "truncation": True,
}
tokenizer(text=text_test, text_target=None, **encode_options)

{'input_ids': [215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174, 1, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}

In [43]:
# Build model inputs from already-encoded ids. In the saved output the EOS id
# (1) is appended and an attention mask is returned; the sequence is not padded
# up to `max_length`.
tokenizer.prepare_for_model(
    text_ids, max_length=20, truncation=True, padding=True)

{'input_ids': [215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Compare with the pretrained `t5-small` tokenizer

In [12]:
# Load the tokenizer published under the "t5-small" checkpoint name, as a
# reference to compare the custom tokenizer against.
pretrained_tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Same inspection as for the custom tokenizer, on the pretrained one.
# Only the last expression (the decode of ids 0..5) is displayed; the values
# of the four expressions above it are discarded.
pretrained_tokenizer.special_tokens_map
list(pretrained_tokenizer.get_vocab().items())[:10]
pretrained_tokenizer.encode('<pad></s><unk>')
pretrained_tokenizer.encode('<s>')
pretrained_tokenizer.decode([0,1,2,3,4,5])

'<pad></s><unk>X.'

In [13]:
# Encode the sample text with the pretrained tokenizer (special tokens are
# left at their default here; the saved output ends with id 1).
pretrained_tokenizer.encode(text_test)

[3,
 6727,
 5693,
 329,
 3,
 7874,
 276,
 12062,
 27872,
 377,
 16375,
 2,
 25018,
 71,
 12224,
 2,
 382,
 3396,
 5292,
 1]

In [14]:
# Round trip through the pretrained tokenizer to see what survives encoding.
# NOTE(review): the saved output of this cell shows a different sentence than
# the `text_test` displayed earlier in the notebook -- looks like stale output
# from a previous run; re-run the notebook top to bottom to confirm.
pretrained_tokenizer.decode(pretrained_tokenizer.encode(text_test))

'AU NOM DU PEUPLE FRAN<unk> AIS ARR<unk> T DE LA</s>'

In [38]:
# Build model inputs with the pretrained tokenizer from already-encoded ids.
# NOTE(review): `text_ids` were produced by the custom tokenizer above, so the
# ids presumably do not refer to the same pieces in the pretrained vocabulary;
# confirm whether the intent is only to compare the appended EOS / mask.
pretrained_tokenizer.prepare_for_model(
    text_ids, max_length=20, truncation=True, padding=True)

{'input_ids': [215, 123, 205, 4, 391, 226, 766, 139, 88, 198, 132, 174, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}