# Finetuning de summarizer

Fine-tuner un modèle de summarization sur le corpus qu'on a constitué `imdb_wiki_corpus`. C'est un fichier CSV dans lequel chaque ligne correspond à une oeuvre. Dans la première colonne "document", on a le long plot wikipédia et dans la colonne "summary", on a le court synopsis IMDB. 

## Importation des modules

In [1]:
# Pré-traitement des données
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer

# Fine-tuning
import torch
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer

# Evaluation
from rouge_score import rouge_scorer
import nltk
import numpy as np

from huggingface_hub import notebook_login
import os

checkpoint = "t5-small"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=32'

## Pre-processing des données d'entraînement

In [2]:
# Chargement du corpus depuis le CSV
raw_dataset = load_dataset("csv", data_files="../Data/imdb_wiki_corpus.csv", sep=",")
raw_dataset

Using custom data configuration default-a78e702e1f65638d


Downloading and preparing dataset csv/default to C:/Users/aengp/.cache/huggingface/datasets/csv/default-a78e702e1f65638d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to C:/Users/aengp/.cache/huggingface/datasets/csv/default-a78e702e1f65638d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 9807
    })
})

In [3]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

max_input_length = 1024 # longueur max pour les plots wikipédia long
max_target_length = 128 # longueur max pour les synopsis IMDB court

def preprocess_function(datapoint):
    """Fonction pour le pré-traitement du corpus"""
    
    # Tokenization des longs plots wikipédia
    model_inputs = tokenizer(
        datapoint["document"],
        max_length=max_input_length,
        padding=True, # Ajout de tokens pour que tout ait la même longueur
    )
    
    # Tokenization des courts synopsis IMDB
    labels = tokenizer(
        datapoint["summary"], 
        max_length=max_target_length, 
        padding=True
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Pré-traitement du corpus
tokenized_dataset = raw_dataset.map(preprocess_function, batched=True)

tokenized_dataset

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/10 [00:00<?, ?ba/s]



DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9807
    })
})

In [4]:
# Suppression des colonnes avec du texte brut
tokenized_dataset = tokenized_dataset.remove_columns(
    raw_dataset["train"].column_names
)

features = [tokenized_dataset["train"][i] for i in range(2)]

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9807
    })
})

In [6]:
# Séparation en train/test
# train_dataset, val_dataset = train_test_split(tokenized_dataset["train"], test_size=0.2, random_state=42)

## Fine-tuning de mT5

In [6]:
# Chargement du modèle depuis le checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [9]:
# Pour le calcul du score ROUGE

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Décodage des synopsis générés en texte lisible
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Décodage des synopsis de référence en texte
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE a besoin d'une phrase par ligne
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Calcul du score ROUGE
    result = scorer.score(decoded_preds, decoded_labels)
    
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [10]:
batch_size = 16
num_train_epochs = 3

logging_steps = len(tokenized_dataset["train"]) // batch_size
model_name = checkpoint.split("/")[-1] # Récupération du nom du modèle

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb-wiki",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Entraînement du modèle
trainer.train()
# trainer.evaluate()

Cloning https://huggingface.co/delphine-nguyen/t5-small-finetuned-imdb-wiki into local empty directory.
***** Running training *****
  Num examples = 9807
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1839
  Number of trainable parameters = 60506624


  0%|          | 0/1839 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 110.94 GiB (GPU 0; 6.00 GiB total capacity; 5.00 GiB already allocated; 0 bytes free; 5.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF