# But du notebook

Fine-tuner un modèle de résumé automatique (summarization) sur le corpus que nous avons constitué, `imdb_wiki_corpus`. C'est un fichier CSV dans lequel chaque ligne correspond à une œuvre : la colonne "document" contient le long résumé d'intrigue (plot) Wikipédia et la colonne "summary" contient le court synopsis IMDB.

# Importation des modules

In [1]:
# Pré-traitement des données
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer

# Fine-tuning
import torch
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer

# Évaluation
from rouge_score import rouge_scorer
import nltk
import numpy as np

from huggingface_hub import notebook_login

# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from it, and it is also used to build the training output directory name.
checkpoint = "t5-small"

# Pre-processing des données d'entraînement

In [20]:
# Load the corpus from its CSV file into a DatasetDict
raw_dataset = load_dataset(
    "csv",
    data_files="../Data/imdb_wiki_corpus.csv",
    sep=",",
)
raw_dataset

Using custom data configuration default-270a1b2fd45c54ed
Found cached dataset csv (C:/Users/aengp/.cache/huggingface/datasets/csv/default-270a1b2fd45c54ed/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 9807
    })
})

In [21]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

max_input_length = 1024  # max number of tokens kept from a long Wikipedia plot
max_target_length = 128  # max number of tokens kept from a short IMDB synopsis
# NOTE(review): the tokenizer warns that t5-small's default limit is 512 tokens;
# 1024 is applied because max_length is passed explicitly — confirm this is intended.

def preprocess_function(datapoint):
    """Tokenize a batch of (document, summary) pairs for seq2seq training.

    Args:
        datapoint: batch from ``Dataset.map(..., batched=True)``; its
            "document" and "summary" entries hold the source texts and the
            reference summaries.

    Returns:
        The tokenizer output for the documents, with an extra "labels" entry
        holding the token ids of the summaries.
    """
    model_inputs = tokenizer(
        datapoint["document"],
        max_length=max_input_length,
        # max_length alone does not shorten anything: truncation must be
        # requested explicitly, otherwise over-long plots are kept whole.
        truncation=True,
    )
    labels = tokenizer(
        datapoint["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    # No padding here: the DataCollatorForSeq2Seq used for training pads each
    # batch dynamically, so pad tokens are not baked into the stored labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the whole corpus
tokenized_dataset = raw_dataset.map(preprocess_function, batched=True)

tokenized_dataset

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/10 [00:00<?, ?ba/s]



In [23]:
# Drop the raw-text columns so that only the tokenized fields remain
tokenized_dataset = tokenized_dataset.remove_columns(raw_dataset["train"].column_names)

# Keep the first two tokenized examples at hand
features = [tokenized_dataset["train"][0], tokenized_dataset["train"][1]]

tokenized_dataset

In [24]:
# Séparation en train/test
# train_dataset, val_dataset = train_test_split(tokenized_dataset["train"], test_size=0.2, random_state=42)

# Fine-tuning de T5 (`t5-small`)

In [25]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [27]:
# ROUGE scorer used by compute_metrics
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def compute_metrics(eval_pred):
    """Compute the average ROUGE F-measures of generated summaries.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict mapping each ROUGE type of ``scorer`` to the mean F-measure over
        all (reference, prediction) pairs, scaled to 0-100 and rounded to
        4 decimals. Empty dict when there is nothing to score.
    """
    predictions, labels = eval_pred

    # -100 cannot be decoded: swap it for the pad token id. Done for the
    # predictions as well, defensively, in case they carry the same padding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode generated and reference summaries back to plain text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line
    # NOTE(review): nltk.sent_tokenize needs the punkt data to be installed.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # RougeScorer.score handles a single (target, prediction) pair of strings,
    # so score each pair separately instead of passing the two lists.
    pair_scores = [
        scorer.score(label, pred)
        for pred, label in zip(decoded_preds, decoded_labels)
    ]
    if not pair_scores:
        return {}

    # Average the per-pair F-measures and express them as percentages
    return {
        key: round(float(np.mean([scores[key].fmeasure for scores in pair_scores])) * 100, 4)
        for key in pair_scores[0]
    }

In [33]:
batch_size = 128
num_train_epochs = 3
model_name = checkpoint.split("/")[-1]

# Hold out 20% of the corpus for evaluation: evaluation_strategy="epoch"
# needs an eval_dataset, otherwise compute_metrics has nothing to run on.
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)

# Log the training loss once per epoch (at least every step on tiny datasets)
logging_steps = max(1, len(split_dataset["train"]) // batch_size)

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb-wiki",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,  # generate summaries during evaluation so ROUGE can be computed
    logging_steps=logging_steps,
    # NOTE(review): pushing to the Hub needs credentials; notebook_login is
    # imported but not called in the visible cells — confirm a token is cached.
    push_to_hub=True,
)

# Pads inputs and labels per batch at training time
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
