In [16]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import numpy as np
import time

In [2]:
print("GPU disponible :", torch.cuda.is_available())
print("Nom du GPU :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Aucun")

GPU disponible : True
Nom du GPU : NVIDIA A2


In [3]:
# Mise en forme dataset ISOT https://www.kaggle.com/datasets/csmalarkodi/isot-fake-news-dataset/
Isot_true_df = pd.read_csv("data/True.csv")
Isot_fake_df = pd.read_csv("data/Fake.csv")

#Création d'un dataset unique

Isot_true_df["label"] = 0  # Vraie news
Isot_fake_df["label"] = 1  # Fake news

Isot_data = pd.concat([Isot_true_df, Isot_fake_df], ignore_index=True)

Isot = Isot_data[['text', 'label']]

In [4]:
# Mise en forme dataset Fake_News https://www.kaggle.com/competitions/fake-news/data?select=train.csv
fake_news_data = pd.read_csv("data/train.csv")

fake_news_data = fake_news_data.dropna(subset=['text'])

fake_news = fake_news_data[['text', 'label']]

In [5]:
# Mise en forme dataset Fake_real https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news
fake_real_data = pd.read_csv("data/fake_or_real_news.csv")
fake_real_data['label'] = fake_real_data['label'].map({'FAKE': 1, 'REAL': 0})

# On ne garde que le label et le text de l'article et on lemmatize
fake_real = fake_real_data[['text', 'label']]

# Tokenisation avec Bert pré-entrainé puis modèle logistique

On met en place une tokenisation basique qui fonctionne avec la version pré-entrainée de Bert

In [13]:
# Charger le tokenizer et le modèle BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Pas d'entraînement

# Détection de l'appareil (GPU si dispo)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 3. Fonction pour encoder en batchs
def get_embeddings_batch(texts, batch_size=16):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Encodage en batchs"):
        batch_texts = texts[i:i+batch_size]

        # Tokenisation
        inputs = tokenizer(batch_texts,
                           return_tensors='pt',
                           truncation=True,
                           padding=True,
                           max_length=512)
        
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        
        # Moyenne des embeddings par séquence
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(batch_embeddings)

    return np.vstack(all_embeddings)

# Génération des embeddings
texts = Isot['text'].tolist()
labels = Isot['label'].values

start_time = time.time()
embeddings = get_embeddings_batch(texts, batch_size=64)
print(f"Temps pour embedding : {time.time() - start_time} secondes")

Encodage en batchs: 100%|██████████| 702/702 [39:27<00:00,  3.37s/it]

Temps pour embedding : 2367.1586334705353 secondes





In [17]:
# Split
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

# Entrainement d'une régression logistique
classifier = LogisticRegression(max_iter=1000)
start_time = time.time()
classifier.fit(X_train, y_train)
print(f"Temps pour fit : {time.time() - start_time} secondes")

y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Temps pour fit : 13.896960496902466 secondes
Accuracy: 0.9928730512249443
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4330
           1       0.99      0.99      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4306   24]
 [  40 4610]]
