In [1]:
import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertModel,
    DataCollatorForLanguageModeling,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)
from naat.data import list_files, OUTPUT_PATH, get_file_languages
from naat.data_files import ROOT
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from pathlib import Path
import os

2023-02-27 16:31:12.806466: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-27 16:31:13.508808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64${LD_LIBRARY_PATH:+:}
2023-02-27 16:31:13.508903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64${LD_LIBRARY_PATH:+:}


In [2]:
# --- Tokenizer settings ---
SEQUENCE_LEN = 512  # maximum sequence length in tokens (used for truncation and for the model's position embeddings)
TOKENIZER_FILES = None  # files of raw text (not referenced by the visible cells)
VOCAB_SIZE = 32000  # vocabulary size for tokenizer
MIN_FREQ = 2  # minimum term frequency for the vocabulary

# --- Output location ---
MODEL_PATH = None  # path to save the pretrained model; passed to trainer.save_model in the last cell

# --- BERT architecture ---
HIDDEN_LAYERS = 12  # number of hidden layers of BERT model
HIDDEN_SIZE = 768  # hidden size of BERT model
ATTENTION_HEADS = 12  # number of attention heads of BERT model

In [3]:
class LegalDataset(Dataset):
    """Map-style dataset over pre-tokenized samples.

    Every sample is served as ``{"input_ids": tensor}``; the tensor is built
    on access from the entry stored at the requested position.
    """

    def __init__(self, text):
        # Stored as-is; the container must support ``len()`` and ``.iloc``.
        self.encodings = text

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.encodings)

    def __getitem__(self, index):
        """Return the sample at position ``index`` as a dict holding one tensor."""
        token_ids = self.encodings.iloc[index]
        return {"input_ids": torch.tensor(token_ids)}

In [4]:
def create_tokenizer(files, vocab_size, min_freq, max_len, save_path: Path):
    """Train a byte-level BPE tokenizer on ``files`` and write it to ``save_path``.

    Three artifacts end up in ``save_path``: ``vocab.json`` and ``merges.txt``
    (written by ``save_model``) and ``tokenizer.json`` (the reloaded tokenizer
    with its post-processor and truncation configured).

    Args:
        files: paths of the raw-text training files.
        vocab_size: target vocabulary size.
        min_freq: minimum frequency passed to the trainer as ``min_frequency``.
        max_len: maximum sequence length used for truncation.
        save_path: output directory (``str`` or ``Path``); created if missing.
    """
    # Accept plain strings too, and make sure the target directory exists
    # before anything is written into it.
    save_path = Path(save_path)
    save_path.mkdir(parents=True, exist_ok=True)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )
    tokenizer.save_model(str(save_path))

    # Reload from the files just written, then attach the post-processor that
    # wraps sequences in <s> ... </s> and enable truncation.
    tokenizer = ByteLevelBPETokenizer(
        str(save_path / "vocab.json"),
        str(save_path / "merges.txt"),
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=max_len)

    tokenizer.save(str(save_path / "tokenizer.json"))

In [5]:
def process_text(filenames, map_tokenize, encoding, output_path="dataset.pt"):
    """Split files into sentences, tokenize them, and save the resulting dataset.

    Args:
        filenames: iterable of text file paths (``str`` or ``Path``).
        map_tokenize: callable applied to every sentence. Each sentence is the
            list of word strings yielded by NLTK's ``sents()``.
        encoding: text encoding used by the corpus reader to decode the files.
        output_path: where the ``LegalDataset`` is written with ``torch.save``.

    Returns:
        ``output_path``, so the caller can reload the dataset from it.
    """
    sentences = []
    for filename in filenames:
        filename = Path(filename)
        # The reader root must be a string (a pathlib.Path is rejected), and
        # the file id is given as a list so that the file name is matched
        # literally instead of being interpreted as a regular expression.
        corpus = PlaintextCorpusReader(
            str(filename.parent), [filename.name], encoding=encoding
        )
        sentences.extend(corpus.sents())

    tqdm.pandas(desc="Tokenizing")
    # dtype=object keeps each sentence as one element, including when empty.
    encodings = pd.Series(sentences, dtype=object).progress_map(map_tokenize)

    torch.save(LegalDataset(encodings), output_path)
    return output_path

In [10]:
# List every file under OUTPUT_PATH, then keep those whose detected language is "fr".
text_files = list(list_files(str(OUTPUT_PATH), verbose=False, extension="*"))
fr_text_files = [
    candidate for candidate in text_files if get_file_languages(candidate) == "fr"
]

In [16]:
# Name of the first French file; indexed directly so this cell does not rely
# on a variable that is only assigned in a later cell.
fr_text_files[0].name

'Calendrier des newsletters - Affaires Climatiques_.txt'

In [17]:
# Sanity check: read the first French file with NLTK and preview its sentences.
toto = fr_text_files[0]
# The reader takes the parent directory (as a string) and the file name.
coco = PlaintextCorpusReader(str(toto.parent), toto.name)
# Last expression of the cell: displays the sentences as lists of word tokens.
coco.sents()

[['Newsletter', '15', '/', '01', '/', '2019', 'CALENDRIER', 'DES', 'NEWSLETTERS', 'ET', 'THEMATIQUES', 'ABORDEES', 'AVANT', 'TOUT', ':', 'à', 'qui', 'on', 'a', 'envie', 'd', '’', 'envoyer', 'cette', 'newsletter', '?'], ['Etablir', 'une', 'liste', 'de', 'contacts', 'qui', 'pourraient', 'être', 'intéressé', '-', 'es', ':', 'groupe', 'juristes', '+', 'académiques', '+', 'qui', 'd', '’', 'autre', '?'], ...]

In [None]:
# Train the tokenizer on the French files; artifacts go to ROOT's parent directory.
create_tokenizer(
    files=list(map(str, fr_text_files)),
    vocab_size=VOCAB_SIZE,
    min_freq=MIN_FREQ,
    max_len=SEQUENCE_LEN,
    save_path=ROOT.parent,
)

In [None]:
# Load the tokenizer files stored in ROOT's parent directory.
# `model_max_length` is the documented argument name (`max_len` is a legacy alias).
tokenizer = RobertaTokenizer.from_pretrained(
    str(ROOT.parent), model_max_length=SEQUENCE_LEN
)

In [None]:
def map_tokenize(text):
    """Encode one sentence into token ids, truncated to ``SEQUENCE_LEN``.

    ``text`` may be a raw string or a sequence of word strings (the form
    yielded by NLTK's ``sents()``). Word sequences are joined with single
    spaces first so the BPE tokenizer tokenizes real text; a list handed
    directly to ``encode`` is treated as ready-made vocabulary tokens.
    """
    if not isinstance(text, str):
        text = " ".join(text)
    return tokenizer.encode(text, max_length=SEQUENCE_LEN, truncation=True)


# Tokenize the French files, then reload the dataset object from the returned path.
dataset_path = process_text(
    filenames=fr_text_files,
    map_tokenize=map_tokenize,
    encoding="utf8",
)
dataset = torch.load(dataset_path)

In [None]:
# Collator for masked-language-model training; `mlm_prob` is the masking probability.
mlm_prob = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_prob,
)

In [None]:
# Architecture of the BERT model, built from the configuration constants.
bert_config = BertConfig(
    vocab_size=VOCAB_SIZE,
    max_position_embeddings=SEQUENCE_LEN,
    num_hidden_layers=HIDDEN_LAYERS,  # L
    hidden_size=HIDDEN_SIZE,  # H
    num_attention_heads=ATTENTION_HEADS,  # A
    type_vocab_size=1,
    # BertConfig defaults pad_token_id to 0, but the tokenizer was trained with
    # "<s>" as its first special token and "<pad>" as its second. Use the
    # tokenizer's own pad id so the padding embedding is the right one.
    pad_token_id=tokenizer.pad_token_id,
)
model = BertForMaskedLM(config=bert_config)

In [None]:
# Training hyper-parameters; checkpoints go to "<ROOT's parent>/checkpoint".
training_args = TrainingArguments(
    output_dir=str(ROOT.parent / "checkpoint"),
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=5,
    prediction_loss_only=True,
    # -1 is the documented "disabled" value, so num_train_epochs decides the
    # length of training; only a positive value overrides the epoch count.
    max_steps=-1,
    learning_rate=1e-4,
    adam_beta1=0.9,  # adam beta1 parameter
    adam_beta2=0.99,  # adam beta2 parameter
    weight_decay=0.01,  # weight decay
    lr_scheduler_type="linear",  # learning rate scheduler type
    warmup_steps=10_000,  # warmup steps
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Train: run the training loop of the `trainer` built in the previous cell.
trainer.train()

# Save model to MODEL_PATH.
# NOTE(review): MODEL_PATH is None in the configuration cell — confirm the
# intended save location before running.
trainer.save_model(MODEL_PATH)