## Execice 2 : Utilisation de transformers pour les deux datasets

### 2.1 :  Utilisation d'un transformer prêt à l'emploi

In [75]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch
import numpy as np

In [90]:
# Préparer les datasets
class FakeNewsData(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

from sklearn.model_selection import train_test_split

def split_and_prepare_datasets(dataframe, text_field, target_field, tokenizer, max_seq_length):
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        dataframe[text_field].values,
        dataframe[target_field].values, 
        test_size=0.2,
        random_state=42 
    )
    train_data = FakeNewsData(texts_train, labels_train, tokenizer, max_seq_length)
    test_data = FakeNewsData(texts_test, labels_test, tokenizer, max_seq_length)

    return train_data, test_data

In [None]:
df_en = pd.read_csv("data_fake_news_en.csv")

df_fr = pd.read_csv("fake_news_fr.csv")

# Afficher les premières lignes pour vérifier la structure
print("Dataset en anglais :")
print(df_en.head())
print(df_en.columns)

print("\nDataset en français :")
print(df_fr.head())
print(df_fr.columns)

Dataset en anglais :
                                               title  \
0  Three more states refuse Trump commission's vo...   
1   Trump Crosses The Line, Attacks Civil Rights ...   
2  IS TRUMP A RACIST? Famous Italian-American “Ge...   
3   Ted Cruz Gets His Unethical A** Handed To Him...   
4  Putin says Trump hampered from delivering elec...   

                                                text  label  
0  WASHINGTON (Reuters) - Maryland, Delaware and ...      1  
1  Georgia Congressman John Lewis is one of Ameri...      0  
2  Robert Davi gives a great answer to Neil Cavut...      0  
3  Seth Meyers destroyed Republican presidential ...      0  
4  SOCHI, Russia (Reuters) - Russian President Vl...      1  
Index(['title', 'text', 'label'], dtype='object')

Dataset en français :
                                               title  \
0  Les chefs de service hospitaliers en appellent...   
1  L'origine des comportements alimentaires ident...   
2  Microsoft alerte sur de no

In [68]:
from sklearn.model_selection import train_test_split

# Diviser le dataset en anglais
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(
    df_en['text'], df_en['label'], test_size=0.2, random_state=42
)
print(X_train_en)
# Diviser le dataset en français
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    df_fr['text'], df_fr['fake'], test_size=0.2, random_state=42
)
print(X_train_fr)

X_test_sample_en = X_test_en.sample(n=100, random_state=42)
y_test_sample_en = y_test_en.loc[X_test_sample_en.index].tolist()

X_test_sample_fr = X_test_fr.sample(n=100, random_state=42)
y_test_sample_fr = y_test_fr.loc[X_test_sample_fr.index].tolist()


X_test_sample_en = X_test_sample_en.tolist()
X_test_sample_fr = X_test_sample_fr.tolist()

36335    Democrats want to spend a whopping $2 billion ...
12384    TUNIS (Reuters) - Tunisia will continue with a...
24419    Wouldn t it be great if we had someone in gove...
24740     Last week, President Trump made a public anno...
27039    WASHINGTON (Reuters) - The United States, Cana...
                               ...                        
11284    Michelle Obama fed her husband s feud with Don...
44732    WASHINGTON (Reuters) - Republican Mitt Romney ...
38158    Climate grifter Al Gore is confronted about gl...
860      It s gotten to the point that if you re still ...
15795    WASHINGTON (Reuters) - The U.S. Supreme Court ...
Name: text, Length: 35918, dtype: object
2365    « L’Ukraine est à genoux et tout le monde s’en...
1100    Les 5 infos dans le rétro du week-end : Tensio...
1526    Un gagnant de l’Euromillion flambe et dilapide...
298     Dossier : balade dans le curieux monde des fra...
927     Municipales 2020 au Mans : Stéphane Le Foll dé...
                    

In [None]:
from transformers import pipeline
from sklearn.metrics import classification_report

classifier_en = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
X_test_sample_en = [text[:512] for text in X_test_sample_en]
y_pred_en = [classifier_en(text)[0]['label'] for text in X_test_sample_en]
y_pred_en = [1 if label == "POSITIVE" else 0 for label in y_pred_en]
print("Rapport de classification (anglais) :")
print(classification_report(y_test_sample_en, y_pred_en))

Device set to use mps:0


Rapport de classification (anglais) :
              precision    recall  f1-score   support

           0       0.47      0.82      0.60        45
           1       0.62      0.24      0.34        55

    accuracy                           0.50       100
   macro avg       0.54      0.53      0.47       100
weighted avg       0.55      0.50      0.46       100



In [None]:
from transformers import pipeline
from sklearn.metrics import classification_report

classifier_fr = pipeline("text-classification", model="camembert-base")
X_test_sample_fr = [text[:512] for text in X_test_sample_fr]
y_pred_fr = [classifier_fr(text)[0]['label'] for text in X_test_sample_fr]
y_pred_fr = [1 if label == "POSITIVE" else 0 for label in y_pred_fr]
print("Rapport de classification (français) :")
print(classification_report(y_test_sample_fr, y_pred_fr))

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


Rapport de classification (français) :
              precision    recall  f1-score   support

           0       0.66      1.00      0.80        66
           1       0.00      0.00      0.00        34

    accuracy                           0.66       100
   macro avg       0.33      0.50      0.40       100
weighted avg       0.44      0.66      0.52       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.2 Finetuning d'un modèle de transformers

In [94]:
model_englais = "distilbert-base-uncased"
model_français = "camembert-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_length = 100

In [None]:
tokenizer_englais = AutoTokenizer.from_pretrained(model_englais)
train_englais, test_englais = split_and_prepare_datasets(df_en, "text", "label", tokenizer_englais, max_length)
tokenizer_français = AutoTokenizer.from_pretrained(model_français)
train_français, test_français = split_and_prepare_datasets(df_fr, "text", "fake", tokenizer_français, max_length)

In [97]:
from transformers import AdamW
import torch

def fine_tune_transformer(model, training_loader, validation_loader, num_epochs, device):
    model_optimizer = AdamW(model.parameters(), lr=5e-5)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        model.train() 
        epoch_loss, correct_predictions = 0, 0
        for data_batch in training_loader:
            input_ids = data_batch["input_ids"].to(device)
            attention_masks = data_batch["attention_mask"].to(device)
            target_labels = data_batch["label"].to(device)
            model_optimizer.zero_grad()
            model_outputs = model(input_ids, attention_mask=attention_masks)
            batch_loss = criterion(model_outputs.logits, target_labels)
            batch_loss.backward()
            model_optimizer.step()
            epoch_loss += batch_loss.item()
            correct_predictions += (model_outputs.logits.argmax(1) == target_labels).sum().item()
        avg_loss = epoch_loss / len(training_loader)
        accuracy = correct_predictions / len(training_loader.dataset)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

    return model

In [98]:
def Finetuning_model(train_dataset, test_dataset, model_name, epochs, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8)
    
    model = fine_tune_transformer(model, train_loader, test_loader, epochs, device)
    return model

In [None]:
Finetuning_model_français = Finetuning_model(train_français, test_français, model_français, epochs=3, device=device)

In [None]:
Finetuning_model_englais = Finetuning_model(train_englais, test_englais, model_englais, epochs=3, device=device)