# Instructions

Ce notebook ne doit contenir que votre script servant à l'entrainement de votre modèle. Nous devons pouvoir l'exécuter en cliquant sur *Exécution -> Tout exécuter*.

Veuillez également ajouter des commentaires dans votre code pour expliquer ce que vous faites. N'hésitez pas à ajouter des blocs de textes (cliquez sur le bouton *+ Texte* en dessous du menu) pour ajouter plus d'explications.

Vous devrez déposer sur Moodle une archive au format .zip contenant un dossier avec vos noms.

Dans ce dossier, nous devons retrouver les deux notebooks (training et testing) ainsi qu'un nouveau dossier *models* contenant les poids de vos modèles entrainés, et si nécessaire un dossier *datasets* contenant d'autres données utilisée pour effectuer l'apprentissage de vos modèles (données obtenues par récupération sur le web "web scraping"  ou bien augmentation de données "data augmentation"). Si vous effectuez de l'augmentation de données, fournissez aussi le code pour la réaliser dans le notebook.

# 1.1. Importation 

In [2]:
import numpy as np
import pandas as pd
import os
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding, pipeline, logging
import evaluate
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

import spacy
from gensim.models import KeyedVectors

#np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan', precision=3, suppress=False, threshold=1000, formatter=None)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
#logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


True
1
NVIDIA GeForce RTX 3060 Ti


# 2.1. Augmentation de données par vecteur le plus proche

https://embeddings.net/embeddings/frWac_non_lem_no_postag_no_phrase_500_skip_cut100.bin

In [15]:
spacy_model_fr = spacy.load("fr_core_news_sm")
model = KeyedVectors.load_word2vec_format("frWac_non_lem_no_postag_no_phrase_500_skip_cut100.bin", binary=True, unicode_errors="ignore")

In [16]:
def find_nearest_vector(word):
    try:
        similar_words = model.most_similar(word)
        return [w[0] for w in similar_words]
    except KeyError:
        return [word]

# Function to augment dataset
def augmented_vector_data(dataset):
    augmented_sentences = []
    for d in dataset:
        augmented_sentences.append(d['data'])

    for d in dataset:
        sentence = d['data']
        spacy_analysed = spacy_model_fr(sentence)
        augmented_sentence = []

        for token in spacy_analysed:
            if token.pos_ == "NOUN":
                nearest = find_nearest_vector(f"{token.text}")
                augmented_sentence.append(nearest[0])
            else:
                augmented_sentence.append(token.text)

        augmented_sentences.append(' '.join(augmented_sentence))

    ids = list(range(0, len(augmented_sentences)))

    # Prepare DataFrame to save
    data = {
        "": ids,
        "data": augmented_sentences,
        "label": [d['label'] for d in dataset] * 2,
        "target_name": [d['target_name'] for d in dataset] * 2
    }

    # Create a DataFrame
    df = pd.DataFrame(data)
    df.to_csv("augmented_vector_train.csv", index=False)
    print("CSV file created: augmented_vector_train.csv")


# 2.2. Augmentation de données avec model de language

In [4]:
def augmented_mask_data(dataset):
    device = 0 if torch.cuda.is_available() else -1
    lm_unmasker = pipeline('fill-mask', model='allenai/longformer-base-4096', device=device)
    augmented_sentences = []
    for d in dataset:
        augmented_sentences.append(d['data'])

    for j,d in enumerate(dataset):
        sentence = d['data']
        spacy_analysed = spacy_model_fr(sentence)
        augmented_sentence = []

        for i, token in enumerate(spacy_analysed):
            if token.pos_ == "NOUN":
                temp = ' '.join([t.text for t in spacy_analysed[:i]]) + " <mask> " + ' '.join([t.text for t in spacy_analysed[i+1:]])
                output = lm_unmasker(temp)
                augmented_sentence.append(output[0]['token_str'])
            else:
                augmented_sentence.append(token.text)


        print(f"{j+1}/{len(dataset)+1}",' '.join(augmented_sentence))
        augmented_sentences.append(' '.join(augmented_sentence))

    
    ids = list(range(0, len(augmented_sentences)))

    # Prepare DataFrame to save
    data = {
        "": ids,
        "data": augmented_sentences,
        "label": [d['label'] for d in dataset] * 2,
        "target_name": [d['target_name'] for d in dataset] * 2
    }

    # Create a DataFrame
    df = pd.DataFrame(data)
    df.to_csv("augmented_mask_train.csv", index=False)
    print("CSV file created: augmented_mask_train.csv")


# 2.3. Load Dataset

In [7]:
data_augment_methods = "vector_data" #@param ["basic_data","vector_data","mask_data"]
basic_data_path = 'fake_train.csv'
basic_dataset = Dataset.from_pandas(pd.read_csv(basic_data_path))

match(data_augment_methods):
    case 'basic_data':
        dataset = basic_dataset
    case 'vector_data':
        augmented_file_vector_path = 'augmented_vector_train.csv'
        if not os.path.exists(augmented_file_vector_path):
            augmented_vector_data(basic_dataset)
        dataset = Dataset.from_pandas(pd.read_csv(augmented_file_vector_path))
    case 'mask_data': # à éviter prend bcp de temps :sob: (genre +5h avec ma rtx3060ti)
        augmented_file_mask_path = 'augmented_mask_train.csv'
        if not os.path.exists(augmented_file_mask_path):
            augmented_mask_data(basic_dataset)
        dataset = Dataset.from_pandas(pd.read_csv(augmented_file_mask_path))

ds_train, ds_test = dataset.train_test_split(test_size=0.2).values()

# 3. Transformer model Roberta_small

# 3.1. Définition du model

In [8]:
id2label = {0: "News", 1: "Fake News"}
label2id = {"News": 0, "Fake News": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    "klue/roberta-small",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
).cuda()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")

def preprocess_function(examples):
    return tokenizer(examples['data'], truncation=True, padding=True, max_length=512)

tokenized_train = ds_train.map(preprocess_function, batched=True)
tokenized_test = ds_test.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Map: 100%|██████████| 2332/2332 [00:00<00:00, 3001.57 examples/s]
Map: 100%|██████████| 584/584 [00:00<00:00, 2969.47 examples/s]


# 3.2. Lancement du train pour Roberta

In [10]:
training_args = TrainingArguments(
    output_dir="defi_3_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    push_to_hub=False,
)

# pour save un backup mais bug sur mon pc
#    save_strategy="epoch",
#    load_best_model_at_end=True,

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

                                                 
 20%|██        | 146/730 [01:32<03:42,  2.63it/s]

{'eval_loss': 0.36216431856155396, 'eval_accuracy': 0.8424657534246576, 'eval_runtime': 34.7971, 'eval_samples_per_second': 16.783, 'eval_steps_per_second': 0.287, 'epoch': 1.0}


                                                   
 40%|████      | 292/730 [03:16<03:07,  2.33it/s]

{'eval_loss': 0.21409215033054352, 'eval_accuracy': 0.9041095890410958, 'eval_runtime': 30.8541, 'eval_samples_per_second': 18.928, 'eval_steps_per_second': 0.324, 'epoch': 2.0}


                                                   
 60%|██████    | 438/730 [05:49<03:44,  1.30it/s]

{'eval_loss': 0.25225746631622314, 'eval_accuracy': 0.910958904109589, 'eval_runtime': 56.0042, 'eval_samples_per_second': 10.428, 'eval_steps_per_second': 0.179, 'epoch': 3.0}


 68%|██████▊   | 500/730 [06:39<02:49,  1.36it/s]  

{'loss': 0.3332, 'learning_rate': 6.301369863013699e-06, 'epoch': 3.42}


PermissionError: [Errno 13] Permission denied: 'defi_3_model\\checkpoint-500'

# 3.3. Save Model

In [None]:
trainer.save_model("saved_model_rob_small")
tokenizer.save_pretrained("saved_model_rob_small")

# 4. CNN

# 4.1. creation du model

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

dataset = Dataset.from_pandas(pd.read_csv("fake_train.csv"))

tokenizer = Tokenizer(num_words=10000)  # Use top 10,000 words

tokenizer.fit_on_texts(dataset['data'])
sequences = tokenizer.texts_to_sequences(dataset['data'])

# Pad sequences to ensure equal length
max_length = 1000  # Maximum sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_length)

X_train, X_val, y_train, y_val = train_test_split(padded_sequences, np.array(dataset['label']), test_size=0.2)


In [12]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.2),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(10, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [13]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 76ms/step - accuracy: 0.5065 - loss: 0.6863 - val_accuracy: 0.6267 - val_loss: 0.6516
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.7418 - loss: 0.6087 - val_accuracy: 0.9075 - val_loss: 0.5695
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.8045 - loss: 0.5082 - val_accuracy: 0.8151 - val_loss: 0.4592
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.8522 - loss: 0.3767 - val_accuracy: 0.8836 - val_loss: 0.3665
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.8990 - loss: 0.2861 - val_accuracy: 0.9315 - val_loss: 0.2964
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.9450 - loss: 0.1730 - val_accuracy: 0.9418 - val_loss: 0.2325
Epoch 7/10
[1m19/19[0m [32m━━━━

In [14]:
test_dataset = Dataset.from_pandas(pd.read_csv("fake_test.csv"))

sequences_test = tokenizer.texts_to_sequences(test_dataset['data'])
tokenizer.fit_on_texts(test_dataset['data'])
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

loss, accuracy = model.evaluate(padded_sequences_test, np.array(test_dataset['label']))
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9665 - loss: 0.0968
Validation Accuracy: 93.21%
