# Library

In [None]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel, TFDistilBertModel

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model

import os
from google.colab import files

# Preprocessing

## Processing_text

In [2]:
# URL du dataset
url = 'https://fnd-jedha-project.s3.eu-west-3.amazonaws.com/0_WELFake_workbase.csv'

# Chargement du dataset
df = pd.read_csv(url)

# Affichage des premières lignes du dataset
print(df.head())


   label                                            message
0      0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1      0  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2      1  Bobby Jindal, raised Hindu, uses story of Chri...
3      0  SATAN 2: Russia unvelis an image of its terrif...
4      0  About Time! Christian Group Sues Amazon and SP...


In [3]:
# Fonction de suppression des URLs
def remove_urls(text):
    # Supprimer les URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    return text

# Appliquer la fonction de suppression des URLs aux textes
df['message'] = df['message'].apply(remove_urls)

# Affichage des premières lignes du dataset nettoyé
print(df.head())


   label                                            message
0      0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1      0  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2      1  Bobby Jindal, raised Hindu, uses story of Chri...
3      0  SATAN 2: Russia unvelis an image of its terrif...
4      0  About Time! Christian Group Sues Amazon and SP...


In [4]:
# Séparation des données
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42)

print(f'Training set size: {len(train_texts)}')
print(f'Validation set size: {len(val_texts)}')


Training set size: 50496
Validation set size: 12625


In [5]:
# Chargement du tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fonction de tokenisation
def encode_data(texts, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_masks)

# Tokenisation des textes
max_len = 128
train_input_ids, train_attention_masks = encode_data(train_texts, tokenizer, max_len)
val_input_ids, val_attention_masks = encode_data(val_texts, tokenizer, max_len)

# Affichage des dimensions des tenseurs
print(f'Train input_ids shape: {train_input_ids.shape}')
print(f'Train attention_masks shape: {train_attention_masks.shape}')
print(f'Val input_ids shape: {val_input_ids.shape}')
print(f'Val attention_masks shape: {val_attention_masks.shape}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train input_ids shape: (50496, 1, 128)
Train attention_masks shape: (50496, 1, 128)
Val input_ids shape: (12625, 1, 128)
Val attention_masks shape: (12625, 1, 128)


In [6]:
# Chargement du tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fonction de tokenisation
def encode_data(texts, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf',
            truncation=True  # Ajout de l'argument truncation
        )
        input_ids.append(encoded_dict['input_ids'][0])  # Suppression de la dimension supplémentaire
        attention_masks.append(encoded_dict['attention_mask'][0])  # Suppression de la dimension supplémentaire

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_masks)

# Tokenisation des textes
max_len = 128
train_input_ids, train_attention_masks = encode_data(train_texts, tokenizer, max_len)
val_input_ids, val_attention_masks = encode_data(val_texts, tokenizer, max_len)

# Affichage des dimensions des tenseurs
print(f'Train input_ids shape: {train_input_ids.shape}')
print(f'Train attention_masks shape: {train_attention_masks.shape}')
print(f'Val input_ids shape: {val_input_ids.shape}')
print(f'Val attention_masks shape: {val_attention_masks.shape}')


Train input_ids shape: (50496, 128)
Train attention_masks shape: (50496, 128)
Val input_ids shape: (12625, 128)
Val attention_masks shape: (12625, 128)


# Model

In [7]:
class NewsClassifier(tf.keras.Model):
    def __init__(self, n_classes):
        super(NewsClassifier, self).__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-uncased')
        self.dropout = Dropout(0.3)
        self.classifier = Dense(n_classes, activation='softmax')

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        return self.classifier(pooled_output)

# Création du modèle
model = NewsClassifier(n_classes=2)

# Affichage de la structure du modèle
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [8]:
# Affichage de la structure complète du modèle BERT
model.bert.summary()


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109482240 (417.64 MB)
Trainable params: 109482240 (417.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
class NewsClassifier(tf.keras.Model):
    def __init__(self, n_classes):
        super(NewsClassifier, self).__init__()
        self.bert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = Dropout(0.3)
        self.classifier = Dense(n_classes)  # Suppression de l'activation softmax

    def call(self, inputs, attention_mask=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask)
        pooled_output = outputs[0][:, 0, :]  # Utiliser la sortie du premier token [CLS]
        pooled_output = self.dropout(pooled_output, training=training)
        return self.classifier(pooled_output)

# Création du modèle
model = NewsClassifier(n_classes=2)

# Compilation du modèle avec un taux d'apprentissage plus élevé
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Création des datasets TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': train_input_ids, 'attention_mask': train_attention_masks}, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': val_input_ids, 'attention_mask': val_attention_masks}, val_labels))

# Batching et shuffling des datasets avec une taille de batch plus petite
train_dataset = train_dataset.shuffle(1000).batch(16)
val_dataset = val_dataset.batch(16)

# Entraînement du modèle avec plus d'époques
model.fit(train_dataset, epochs=10, validation_data=val_dataset)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 79ms/step - accuracy: 0.6610 - loss: 0.6416 - val_accuracy: 0.8265 - val_loss: 0.4063
Epoch 2/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 76ms/step - accuracy: 0.8183 - loss: 0.4063 - val_accuracy: 0.8516 - val_loss: 0.3538
Epoch 3/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 76ms/step - accuracy: 0.8467 - loss: 0.3601 - val_accuracy: 0.8661 - val_loss: 0.3277
Epoch 4/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 87ms/step - accuracy: 0.8594 - loss: 0.3374 - val_accuracy: 0.8734 - val_loss: 0.3105
Epoch 5/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 76ms/step - accuracy: 0.8626 - loss: 0.3234 - val_accuracy: 0.8790 - val_loss: 0.2993
Epoch 6/10
[1m3156/3156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 76ms/step - accuracy: 0.8697 - loss: 0.3130 - val_accuracy: 0.8805 - val_loss: 0.290

<keras.src.callbacks.history.History at 0x7fc961b758d0>

In [11]:
# Évaluation du modèle
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')


[1m790/790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - accuracy: 0.8877 - loss: 0.2746
Validation Loss: 0.2692555785179138
Validation Accuracy: 0.8910890817642212


# Saving

In [17]:
# Création du répertoire s'il n'existe pas
os.makedirs('fake_news_detector', exist_ok=True)

# Sauvegarde du modèle en format .keras
model.save('fake_news_detector/model.keras')


In [18]:
# Sauvegarde des poids du modèle
model.save_weights('fake_news_detector/model_weights.weights.h5')


## Téléchargement des Fichiers Sauvegardés sur pc

In [20]:
# Téléchargement du fichier du modèle
files.download('fake_news_detector/model.keras')

# Téléchargement du fichier des poids du modèle
files.download('fake_news_detector/model_weights.weights.h5')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
# Lister les fichiers dans le répertoire fake_news_detector
files = os.listdir('fake_news_detector')
print(files)


['model.pkl', 'model.keras', 'model_weights.weights.h5']


In [28]:
# Lister les fichiers dans le répertoire courant
files = os.listdir('.')
print(files)


['.config', 'fake_news_detector', 'sample_data']
