In [1]:
import pandas as pd
import numpy as np
import time
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm 

In [3]:
# ISOT fake-news dataset: https://www.kaggle.com/datasets/csmalarkodi/isot-fake-news-dataset/
# Each source file holds a single class, so the label is attached right after loading.
Isot_true_df = pd.read_csv("data/True.csv").assign(label=0)  # 0 = real news
Isot_fake_df = pd.read_csv("data/Fake.csv").assign(label=1)  # 1 = fake news

# Merge both classes into a single frame with a fresh 0..n-1 index.
Isot_data = pd.concat([Isot_true_df, Isot_fake_df], ignore_index=True)

# Keep only the article text and its label.
Isot = Isot_data.loc[:, ['text', 'label']]

In [4]:
# Kaggle "Fake News" competition training set:
# https://www.kaggle.com/competitions/fake-news/data?select=train.csv
# Rows with a missing article text are dropped right after loading
# (the remaining rows keep their original index values).
fake_news_data = pd.read_csv("data/train.csv").dropna(subset=['text'])

# Keep only the article text and its label.
fake_news = fake_news_data.loc[:, ['text', 'label']]

In [7]:
# "Fake or Real News" dataset: https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news
fake_real_data = pd.read_csv("data/fake_or_real_news.csv")

# Labels are stored as strings; encode them as integers (1 = fake, 0 = real).
# Any value other than 'FAKE' / 'REAL' is mapped to NaN.
fake_real_data = fake_real_data.assign(
    label=fake_real_data['label'].map({'FAKE': 1, 'REAL': 0})
)

# Keep only the article text and its label (no further text preprocessing happens here).
fake_real = fake_real_data.loc[:, ['text', 'label']]

# Tokenisation avec BERT

In [14]:
# Load the pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the matching encoder and switch it to evaluation mode right away;
# eval() returns the module itself, so the call can be chained.
bert_model = BertModel.from_pretrained('bert-base-uncased').eval()

def get_bert_embedding(text):
    """Return the BERT [CLS] embedding of ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and run through the module-level ``bert_model`` with
    gradient tracking disabled.

    Args:
        text: Input handed straight to the tokenizer.

    Returns:
        numpy.ndarray: The last-layer hidden state at sequence position 0
        (the [CLS] token), with all size-1 dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        last_hidden = bert_model(**encoded).last_hidden_state
    # Index 0 along the sequence axis is the [CLS] token.
    return last_hidden[:, 0, :].squeeze().numpy()


In [None]:
# Extract one BERT [CLS] embedding per ISOT article (one forward pass per text).
start_time = time.time()
# get_bert_embedding already returns a NumPy array, so it is collected as-is:
# calling .numpy() on it raised AttributeError on the very first iteration.
embeddings = [
    get_bert_embedding(text)
    for text in tqdm(Isot['text'], desc="Extraction des embeddings")
]
emb_duration = time.time() - start_time

print(f"embedding terminé en {emb_duration:.2f} secondes.")

# Stack the per-article vectors into one 2-D array (one row per article) before
# building the tensor; torch.tensor() on a Python list of ndarrays is very slow.
X = torch.tensor(np.stack(embeddings))
y = torch.tensor(Isot['label'].values)

Extraction des embeddings:   0%|          | 0/44898 [00:00<?, ?it/s]

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the training embeddings
# (fit() returns the estimator itself, so it can be chained).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))