### 7.2. Approche générative (30%)

Entraînez un modèle séquence à séquence de type T5 qui part de la __question_raw__ et génère la __question_tagged__ correspondante.

__Exemple:__

Jeu de validation:

|   question_id   |                  question_raw                          |
|    :----:       | :----------------------------------------------:        |
|      1          | What is the country for head of state of Justin Trudeau |


__Entrée du modèle:__ What is the country for head of state of Justin Trudeau

__Prédiction du modèle__:  what is the <\<wd:Q6256\>> for <\<wdt:P35\>> of <\<wd:Q3099714\>>



#### 7.2.1. Modèle génératif (25%)

Le notebook suivant est le fruit de nombreux essais pour atteindre la baseline de 25% de f1 donnée sur le moodle. Il va beaucoup plus loin que l'utilisation de base d'un T5, car celle-ci nous faisait plafonner à 7% de f1.

Voici la liste des solutions testées mais non présentes dans ce notebook :
- Modèle Seq2Seq pré-entraîné sur le raw pour prédire la question tagged, avec différents batch sizes, différents ajouts de tokens au modèle (juste '<', '<', 'wd', 'wdt', 'ps' et 'pq'), ajout de toutes les classes des jetons (wd:Q362736 par exemple) en nouveaux tokens. Ces méthodes n'ont malheureusement pas porté leurs fruits.
- Modèle T5ForConditionalGeneration avec les options citées précédemment

Le dernier modèle testé est un T5ForConditionalGeneration avec une loss custom qui pénalise plus les labels Wikidata mal générés que le reste du texte (ce qui ressemble plus ou moins à une partie 3, mais ce n'est pas faute d'avoir essayé d'obtenir un score normal autrement).

In [None]:
import torch
import numpy as np
import pandas as pd
import nltk
import spacy
import re
import matplotlib.pyplot as plt
from collections import Counter
# Report whether PyTorch can see a CUDA device in this runtime
cuda_available = torch.cuda.is_available()
print("CUDA Available? ", cuda_available)

In [None]:

# Download the NLTK 'punkt' resource
nltk.download('punkt')


In [None]:
# Directory on the mounted Google Drive from which the CSV splits are read and
# under which the fine-tuned tokenizer and model are saved.
root = '/content/drive/MyDrive/traitement du langage/TP4/data/'
from google.colab import drive
# Mount Google Drive under /content/drive so that `root` is reachable.
drive.mount('/content/drive')

# Alternative data directory (presumably for a run outside Colab):
#root = './data/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the three pipe-separated splits found under `root`.
data_train = pd.read_csv(root+'train.csv',sep = '|')
data_test = pd.read_csv(root+'test.csv', sep='|')
data_validation = pd.read_csv(root+'validation.csv', sep ='|')
# Display the first rows of the training split.
data_train.head()

In [None]:
# Count how often each tag of the form <<prefix:ID>> occurs in the tagged
# training questions.
regex = "<<[wdtpsq]{2,3}:[^>]*>>"
# str.join builds the same concatenated text as Series.sum() but in a single
# linear pass instead of repeated string additions.
all_text = ''.join(data_train['question_tagged'])
list_balise = re.findall(regex, all_text)
balises_train = Counter(list_balise)

In [None]:
# Distinct tags seen in the training split (keys view of the Counter) and
# how many of them there are.
l_balises_train = balises_train.keys()
print(len(l_balises_train))

20034


In [None]:
# Turn each raw question into its list of whitespace-separated tokens.
def preprocess_test(df : pd.DataFrame) :
    """Tokenize the ``question_raw`` column on whitespace.

    Args:
        df: Frame holding a ``question_raw`` column of strings.

    Returns:
        A list with one token list per row, in row order.
    """
    return [raw_question.split() for raw_question in df['question_raw']]

In [None]:
def retrieve_all_tags(df : pd.DataFrame) :
    """Collect the distinct ``<<prefix:ID>>`` tags of the tagged questions.

    Args:
        df: Frame holding a ``question_tagged`` column of strings.

    Returns:
        A list of the unique tags (full ``<<...>>`` text); the order is the
        iteration order of a set and therefore not meaningful.
    """
    tag_pattern = "<<[wdtpsq]{2,3}:[^>]*>>"
    found = [
        tag
        for tagged_question in df['question_tagged']
        for tag in re.findall(tag_pattern, tagged_question)
    ]
    # Deduplicate through a set, exactly once, after gathering every match.
    return list(set(found))

Process the train and val dataset

In [None]:
# Upgrade transformers and accelerate in the notebook runtime (shell magics).
!pip install transformers -U
!pip install accelerate -U

In [None]:
import torch
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, AutoTokenizer
from torch.nn import CrossEntropyLoss



In [None]:
def get_number_token(df) :
    """Collect the distinct identifier suffixes of the tags.

    For every ``<<prefix:Pxxx>>`` or ``<<prefix:Qxxx>>`` tag found in the
    ``question_tagged`` column, the text following the ``P``/``Q`` letter is
    kept (e.g. ``'6256'`` for ``<<wd:Q6256>>``).

    Args:
        df: Frame holding a ``question_tagged`` column of strings.

    Returns:
        A list of the unique suffixes; the order is the iteration order of a
        set and therefore not meaningful.
    """
    suffix_pattern = "<<[wdtpsq]{2,3}:[PQ]([^>]*)>>"
    suffixes = [
        suffix
        for tagged_question in df['question_tagged']
        for suffix in re.findall(suffix_pattern, tagged_question)
    ]
    return list(set(suffixes))


### Implémentation d'un dataset custom
Ce dataset produit un masque des endroits du texte où la loss sera plus importante en cas d'erreur (là où sont les jetons Wikidata).

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing each raw question with its tagged version.

    Every item holds the tokenized source (prefixed raw question), the
    tokenized target (tagged question) and a boolean mask flagging the target
    positions located strictly between a ``<<`` token and the next ``>>`` token.
    """

    def __init__(self, dataframe, tokenizer, max_length=210):
        """Store the data and look up the ids of the two delimiter tokens.

        Args:
            dataframe: Frame with ``question_raw`` and ``question_tagged`` columns.
            tokenizer: Tokenizer used both as a callable and through ``encode``.
            max_length: Length to which sources and targets are padded/truncated.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Only the first id returned for each delimiter is used.
        self.start_token_id = tokenizer.encode('<<', add_special_tokens=False)[0]
        self.end_token_id = tokenizer.encode('>>', add_special_tokens=False)[0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask``, ``labels`` and ``inside_brackets_mask``."""
        row = self.data.iloc[idx]
        # The "convert to Wikidata query: " prefix guides the generation model.
        source_text = "convert to Wikidata query: " + row['question_raw']
        target_text = row['question_tagged']

        # Tokenizing inputs and labels
        inputs = self.tokenizer(source_text, return_tensors="pt", max_length=self.max_length, truncation=True, padding='max_length')
        labels = self.tokenizer(target_text, return_tensors="pt", max_length=self.max_length, truncation=True, padding='max_length')

        # Drop only the leading batch dimension added by return_tensors="pt".
        labels_ids = labels['input_ids'].squeeze(0)
        starts = labels_ids == self.start_token_id
        ends = labels_ids == self.end_token_id

        # Running count of delimiters met so far: an odd count means the
        # position follows an opening '<<' that has not been closed yet.
        delimiters_seen = torch.cumsum(starts ^ ends, dim=-1)
        # The delimiters themselves are excluded from the mask. The mask is
        # boolean so that indexing a tensor with it selects the flagged
        # positions (an integer 0/1 tensor would be read as indices instead).
        inside_brackets_mask = (delimiters_seen % 2 == 1) & ~starts & ~ends

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels_ids,
            'inside_brackets_mask': inside_brackets_mask
        }

class CustomGenDataset(torch.utils.data.Dataset):
    """Dataset used for generation: it only yields the tokenized, prefixed raw question."""

    def __init__(self, dataframe, tokenizer, max_length=210):
        self.data, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` of row ``idx``."""
        prompt = "convert to Wikidata query: " + self.data.iloc[idx]['question_raw']
        encoded = self.tokenizer(
            prompt,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        # squeeze(0) removes the batch dimension added by return_tensors="pt".
        return {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}


#### Implémentation de ce modèle custom

In [None]:
class CustomT5Model(T5ForConditionalGeneration):
    """T5 model whose training loss weighs the tokens located between ``<<`` and ``>>`` more heavily."""

    def forward(self, input_ids=None, attention_mask=None, labels=None, inside_brackets_mask=None, pad_token_id=None,*args, **kwargs):
        """Run the regular forward pass and, when possible, swap in the weighted loss.

        Args:
            input_ids, attention_mask, labels: Forwarded to the parent ``forward``.
            inside_brackets_mask: Mask (same shape as ``labels``) of the label
                positions that must be weighted more; when ``None`` the
                parent's loss is kept.
            pad_token_id: Label id ignored by the loss. Falls back to
                ``self.config.pad_token_id`` when omitted, so that the weighted
                loss is used even if the caller does not pass it.

        Returns:
            The parent's output object, with ``loss`` replaced by the weighted
            loss whenever labels, mask and a padding id are available.
        """
        # Standard forward pass
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels,*args, **kwargs)

        if labels is not None and inside_brackets_mask is not None:
            if pad_token_id is None:
                pad_token_id = self.config.pad_token_id
            if pad_token_id is not None:
                outputs.loss = self.compute_custom_loss(outputs, labels, inside_brackets_mask, pad_token_id)

        return outputs

    @staticmethod
    def compute_custom_loss(model_output, labels, inside_brackets_mask, pad_token_id, inside_brackets_weight=5.0):
        """Weighted token-level cross-entropy.

        Args:
            model_output: Object exposing ``logits``; the code flattens them
                against ``labels``, so they are expected to have the label
                shape plus a trailing vocabulary dimension.
            labels: Target token ids.
            inside_brackets_mask: Mask of the positions given the weight
                ``inside_brackets_weight``; any non-zero value counts as set.
            pad_token_id: Label id that contributes nothing to the loss.
            inside_brackets_weight: Weight of the masked positions (others get 1).

        Returns:
            Scalar tensor: weighted average of the per-token losses over the
            non-padding positions (0 when there is none).
        """
        logits = model_output.logits
        loss_fct = CrossEntropyLoss(ignore_index=pad_token_id, reduction='none')

        # Index with a boolean mask: an integer 0/1 tensor would be interpreted
        # as indices and would weight whole rows instead of the flagged tokens.
        inside = inside_brackets_mask.to(device=labels.device, dtype=torch.bool)
        loss_weights = torch.ones_like(labels, dtype=torch.float)
        loss_weights[inside] = inside_brackets_weight
        # Padding positions carry no weight so that they do not dilute the average.
        loss_weights[labels == pad_token_id] = 0.0

        per_token_loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        flat_weights = loss_weights.reshape(-1)
        # Non-padding weights are >= 1, so the clamp only matters when every
        # label is padding (it avoids a division by zero).
        return (per_token_loss * flat_weights).sum() / flat_weights.sum().clamp(min=1.0)

class CustomTrainer(Trainer):
    """Trainer that hands the bracket mask and the padding id to the model so its weighted loss is used."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss of one batch.

        Args:
            model: Model to call; it must accept ``labels``,
                ``inside_brackets_mask`` and ``pad_token_id`` keyword arguments.
            inputs: Batch dictionary; ``labels`` and ``inside_brackets_mask``
                are popped from it before the remaining entries are forwarded.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass it to ``compute_loss``; not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Extract 'labels' and 'inside_brackets_mask' from inputs
        labels = inputs.pop("labels", None)
        inside_brackets_mask = inputs.pop("inside_brackets_mask", None)
        if inside_brackets_mask is not None:
            # The model indexes its weight tensor with this mask, which only
            # selects the flagged positions when the mask is boolean.
            inside_brackets_mask = inside_brackets_mask.bool()

        # The model only computes its weighted loss when it knows which label
        # id is padding, so pass it explicitly.
        outputs = model(
            **inputs,
            labels=labels,
            inside_brackets_mask=inside_brackets_mask,
            pad_token_id=self.model.config.pad_token_id,
        )

        loss = outputs.loss if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss


### Début de l'entraînement, définition des paramètres

In [None]:
# Checkpoint name passed to `from_pretrained` for both the tokenizer and the model.
model_name = 't5-base'

In [None]:

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Identifier suffixes of the tags found in the training split
# (not used in the statements below).
list_number = get_number_token(data_train)

new_tokens = ['<<', '>>', 'wd:', 'wdt:', 'ps:', 'pq:']
# Add the tag delimiters and prefixes to the tokenizer vocabulary
tokenizer.add_tokens(new_tokens)
model = CustomT5Model.from_pretrained(model_name)
# Resize the model's token embeddings to the enlarged vocabulary size.
model.resize_token_embeddings(len(tokenizer))

### Entrainement

In [None]:

# Create two CustomDataset instances: one for training and one for validation
dataset = CustomDataset(data_train, tokenizer)
eval_dataset=CustomDataset(data_validation, tokenizer)

# Checkpoints are written to ./results and logs to ./logs. Evaluation and
# logging happen every 1000 steps; a checkpoint is saved every 5000 steps and
# at most 3 checkpoints are kept.
# NOTE(review): recent transformers releases appear to name `evaluation_strategy`
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=2,
    logging_dir='./logs',
    evaluation_strategy="steps",
    eval_steps=1000,
    save_total_limit=3,
    save_steps=5000,
    logging_steps=1000,
    save_strategy="steps",
)

# Create Trainer instance
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,  # training split
    eval_dataset=eval_dataset,  # validation split
)

# Start the training
trainer.train()

In [None]:

# Save the tokenizer (with its added tokens) under `root`
tokenizer.save_pretrained(root+'t5_tokenizer_2_base_custom_retrained')
# Save the fine-tuned model under `root`
model.save_pretrained(root+'t5_model_2_base_custom_retrained')

In [None]:
# Reload the tokenizer from the directory it was saved to; the added tokens
# were registered before saving, so they are not added again here.
tokenizer = AutoTokenizer.from_pretrained(root+'t5_tokenizer_2_base_custom_retrained')

# Reload the fine-tuned model from the directory it was saved to.
model = CustomT5Model.from_pretrained(root+'t5_model_2_base_custom_retrained')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Scores sur l'ensemble de validation

Définition des fonctions de métrique

In [None]:

from transformers import AutoTokenizer, GenerationConfig, StoppingCriteriaList, LogitsProcessorList
# Use the GPU when one is available, otherwise the CPU, and move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
def retrieve_back_tokenized(tokenizer : AutoTokenizer, tokenized):
    """Decode a batch of token-id sequences with ``tokenizer.batch_decode``.

    Args:
        tokenizer: Object exposing ``batch_decode``.
        tokenized: Batch of token-id sequences.

    Returns:
        Whatever ``batch_decode`` returns for the batch; no extra decoding
        option is passed.
    """
    decoded_texts = tokenizer.batch_decode(tokenized)
    return decoded_texts

from torch.utils.data import DataLoader
def translate(model, tokenizer, dataset, max_new_tokens=210, batch_size=64):
    """Generate the output sentence for every item of ``dataset``.

    Args:
        model: Model exposing ``eval``, ``parameters`` and ``generate``.
        tokenizer: Tokenizer used to decode the generated ids.
        dataset: Dataset whose items provide ``input_ids`` and ``attention_mask``.
        max_new_tokens: Maximum number of tokens generated per item.
        batch_size: Number of items generated together.

    Returns:
        The decoded generations as a flat list, in dataset order.
    """
    model.eval()  # Evaluation mode
    # Send the batches to the device the model already lives on rather than
    # relying on a module-level `device` variable.
    model_device = next(model.parameters()).device
    all_translated = []

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():  # Disable gradient calculation
        for batch in loader:
            # The limit is given as `max_new_tokens`, matching the parameter
            # name, and passed straight to `generate` so that the model's own
            # generation settings are not replaced by a bare GenerationConfig.
            generated = model.generate(
                input_ids=batch['input_ids'].to(model_device),
                attention_mask=batch['attention_mask'].to(model_device),
                max_new_tokens=max_new_tokens,
            )

            # Decode the generated ids back to text
            all_translated += retrieve_back_tokenized(tokenizer, generated)

    return all_translated



def extract_tags(prediction : pd.DataFrame, column_name : str):
    """Extract, row by row, the content of every ``<<...>>`` tag of a text column.

    Tags containing one of the prefixes ``wd:``, ``wdt:``, ``ps:`` or ``pq:``
    are normalised by gluing their first two whitespace-separated pieces, so
    that ``'wd: Q12140'`` becomes ``'wd:Q12140'``. Other tags are kept as found.

    Args:
        prediction: Frame holding the column to scan.
        column_name: Name of a column of strings.

    Returns:
        One list of tag strings (without the ``<<``/``>>`` delimiters) per row.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    tag_pattern = re.compile(r"<<\s?(.[^>]*)>>")
    known_prefixes = ('wd:', 'wdt:', 'ps:', 'pq:')

    tags = []
    for question in prediction[column_name]:
        row_tags = []
        for tag in tag_pattern.findall(question):
            if any(prefix in tag for prefix in known_prefixes):
                # Slicing keeps the single piece when there is nothing after
                # the prefix, so no exception handling is needed.
                tag = "".join(tag.split()[:2])
            row_tags.append(tag)
        tags.append(row_tags)
    return tags


def pad_missing_tags(df : pd.DataFrame):
    """Align the predicted tags with the expected tags, row by row.

    The predicted tags come from the ``question_tagged_new`` column and the
    expected ones from ``question_tagged``. A row with too few predicted tags
    is completed with empty strings; a row with too many is cut to the
    expected length.

    Returns:
        ``(aligned_predicted_tags, expected_tags)``, two lists of lists whose
        rows have matching lengths.
    """
    predicted_rows = extract_tags(df, 'question_tagged_new')
    wanted_tags = extract_tags(df, 'question_tagged')

    aligned_rows = []
    for predicted, expected in zip(predicted_rows, wanted_tags):
        expected_count = len(expected)
        missing_count = max(expected_count - len(predicted), 0)
        aligned_rows.append(predicted[:expected_count] + [''] * missing_count)

    return aligned_rows, wanted_tags


from sklearn.metrics import accuracy_score, f1_score
def compute_accuracy(predict,true):
    """Return ``accuracy_score`` computed over the flattened tag lists.

    Args:
        predict: List of lists of predicted tags.
        true: List of lists of expected tags.
    """
    flat_predicted = []
    for row in predict:
        flat_predicted.extend(row)
    flat_expected = []
    for row in true:
        flat_expected.extend(row)
    return accuracy_score(flat_expected, flat_predicted)

def compute_f1(predict,true):
    """Return the macro-averaged ``f1_score`` computed over the flattened tag lists.

    Args:
        predict: List of lists of predicted tags.
        true: List of lists of expected tags.
    """
    flat_predicted = []
    for row in predict:
        flat_predicted.extend(row)
    flat_expected = []
    for row in true:
        flat_expected.extend(row)
    return f1_score(flat_expected, flat_predicted, average='macro')


In [None]:
# Reload the splits, then generate a tagged question for every validation row.
data_train = pd.read_csv(root+'train.csv',sep = '|')
data_test = pd.read_csv(root+'test.csv', sep='|')
data_validation = pd.read_csv(root+'validation.csv', sep ='|')
dataset = CustomGenDataset(data_validation, tokenizer)
data_validation['question_tagged_new'] = translate(model, tokenizer, dataset,max_new_tokens=200)
# Remove the padding and end-of-sequence markers left in the decoded text.
data_validation['question_tagged_new'] = data_validation['question_tagged_new'].apply(
    lambda text: text.replace('<pad>','').replace('</s>','')
)


In [None]:
# Display the first generated questions.
data_validation['question_tagged_new'].head()

0     what is the<<wd: Q12140>> for<<wdt: P2175>> o...
1     what is the<<pq: P2077>> for<<wd: Q633>> has<...
2     what is<<wdt: P175>> of<<wdt: P156>> of<<wd: ...
3               what is<<wdt: P2136>> of<<wd: Q1516>>?
4     did<<wd: Q9194>><<wdt: P1269>><<wd: Q43229>> ...
Name: question_tagged_new, dtype: object

> Il y a un problème d'espaces que nous corrigeons ultérieurement.

In [None]:
# Compute the scores: align predicted and expected tags, then print the
# accuracy followed by the macro F1.
predict_tags, wanted_tags = pad_missing_tags(data_validation)
print(compute_accuracy(predict_tags, wanted_tags))
print(compute_f1(predict_tags, wanted_tags))

# Print the predicted and expected tags of the first 10 rows, one pair per row.
for predict, true in zip(predict_tags[:10], wanted_tags[:10]) :
    print(predict)
    print(true)
    print()


> Le score f1 de 15% reste malgré tout en dessous de la Baseline attendue de 25% donnée sur le moodle