#### Packages

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

Ce notebook sert à tester si l'utilisation d'un perceptron multicouche (MLP) à la place de la régression logistique améliore la généralisabilité du modèle.

Pour le choix du modèle, on s'appuie sur le résultat du GridSearch du script GreadSearch_TF_IDF_MLP.py qui donne 
2025-04-15 10:57:43,376 - Best parameters found:
2025-04-15 10:57:43,376 - {'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate_init': 0.01, 'svd__n_components': 200}

## Chargement des bases

#### Chargement et mise en forme de ISOT

In [2]:
# ISOT dataset: https://www.kaggle.com/datasets/csmalarkodi/isot-fake-news-dataset/
# Real and fake articles come in two separate CSV files; each one is tagged
# with its class as soon as it is loaded (0 = real news, 1 = fake news).
Isot_true_df = pd.read_csv("data/True.csv").assign(label=0)
Isot_fake_df = pd.read_csv("data/Fake.csv").assign(label=1)

# Stack both parts into one dataset with a fresh 0..n-1 index
Isot_data = pd.concat((Isot_true_df, Isot_fake_df), ignore_index=True)

Isot_data.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [3]:
# Count missing values per column of the ISOT dataset
Isot_data.isna().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [4]:
# Keep only the article body and its label
Isot = Isot_data.loc[:, ['text', 'label']]

#### Chargement et mise en forme de fake_news

In [5]:
# Kaggle "Fake News" competition training set:
# https://www.kaggle.com/competitions/fake-news/data?select=train.csv
fake_news_data = pd.read_csv(filepath_or_buffer="data/train.csv")
fake_news_data.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# Count missing values per column; rows without text are dropped in the next step
fake_news_data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Keep only the rows whose article text is present
fake_news_data = fake_news_data.loc[fake_news_data['text'].notna()]

In [8]:
# Keep only the article body and its label (no other text preprocessing is applied here)
fake_news = fake_news_data.loc[:, ['text', 'label']]
fake_news.head(n=3)

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1


#### Chargement et mise en forme de fake_real

In [9]:
# "Fake or Real News" dataset: https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news
# The string labels are recoded to the convention used by the other datasets
# (FAKE -> 1, REAL -> 0) right after loading.
fake_real_data = (
    pd.read_csv("data/fake_or_real_news.csv")
    .assign(label=lambda frame: frame['label'].map({'FAKE': 1, 'REAL': 0}))
)
fake_real_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [10]:
# Count missing values per column of the Fake_Real dataset
fake_real_data.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [11]:
# Keep only the article body and its label (no other text preprocessing is applied here)
fake_real = fake_real_data.loc[:, ['text', 'label']]
fake_real.head(n=3)

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,U.S. Secretary of State John F. Kerry said Mon...,0


# Construction d'un modèle TF-IDF-MLP sur ISOT

Pour le choix des hyperparamètres, voir le résultat du GridSearch (script GreadSearch_TF_IDF_MLP.py) rappelé en introduction.

### Réalisation du modèle

In [12]:
# Train/test split of ISOT: features are the raw article texts,
# targets are the labels (1 = Fake).
X, y = Isot['text'], Isot['label']

# 80 % train / 20 % test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Pipeline definition: TF-IDF vectorisation -> dimensionality reduction (TruncatedSVD) -> MLP classifier.
# The SVD size and the MLP hyperparameters are the best parameters reported by the grid search.

model_tfidf_mlp = Pipeline([
    # English stop words are removed, as are terms present in more than 70 % of the documents
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    # TruncatedSVD uses a randomized solver by default: seed it so that the
    # projection (and therefore the whole pipeline) is reproducible across runs
    ('svd', TruncatedSVD(n_components=200, random_state=42)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,),
                          activation='relu',
                          alpha=0.0001,
                          learning_rate_init=0.01,
                          max_iter=500,
                          early_stopping=True,
                          random_state=42))
])

In [None]:
# Fit the whole pipeline on the ISOT training split and report the
# wall-clock time spent in fit().
start_time = time.time()
model_tfidf_mlp.fit(X_train, y_train)
fit_duration = time.time() - start_time
print(f"fit terminé en {fit_duration:.2f} secondes.")

In [None]:
# Evaluate the fitted model on the held-out ISOT test split and time the prediction.
start_time = time.time()
# The pipeline runs the TF-IDF and SVD steps itself, so it is given the raw test texts
y_pred = model_tfidf_mlp.predict(X_test)
pred_duration = time.time() - start_time
print(f"Prédiction terminé en {pred_duration:.2f} secondes.")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



### Test sur les autres datasets

In [None]:
# Prediction helper used to score the other datasets with the ISOT-trained pipeline

def apply_model_tfidf_mlp(new_data, text_column='text'):
    """
    Apply the fake-news pipeline trained on ISOT (TF-IDF, TruncatedSVD, MLP) to a new DataFrame.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Data to score; must contain the column named by ``text_column``.
    text_column : str, default 'text'
        Name of the column holding the raw article text.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_data`` with an added 'prediction' column
        (0 = Real, 1 = Fake). The input DataFrame is not modified.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``new_data``.
    """
    # The pipeline vectorises the text itself, so it is fed the raw text column
    predictions = model_tfidf_mlp.predict(new_data[text_column])

    # Work on a copy so the caller's DataFrame is left untouched
    scored = new_data.copy()
    scored['prediction'] = predictions

    return scored
    

In [None]:
# Cross-dataset check: score the Fake_News dataset with the model and time the prediction
start_time = time.time()
fake_news_pred = apply_model_tfidf_mlp(fake_news)
pred_duration = time.time() - start_time

# Ground truth and predictions, extracted once and shared by the three metrics
fake_news_true = fake_news_pred['label']
fake_news_hat = fake_news_pred['prediction']

print(f"Prédiction terminé en {pred_duration:.2f} secondes.")
print("\nÉvaluation sur Fake News :\n")
print("Accuracy:", accuracy_score(fake_news_true, fake_news_hat))
print("\nClassification Report:\n", classification_report(fake_news_true, fake_news_hat))
print("\nConfusion Matrix:\n", confusion_matrix(fake_news_true, fake_news_hat))

In [None]:
# Apply the model to the Fake_Real dataset and time the prediction
start_time = time.time()
# The result is stored under fake_real_pred, the name the metrics below read
# (this also avoids overwriting the Fake_News predictions)
fake_real_pred = apply_model_tfidf_mlp(fake_real)
pred_duration = time.time() - start_time

print(f"Prédiction terminé en {pred_duration:.2f} secondes.")
print("\nÉvaluation sur Fake Real :\n")
print("Accuracy:", accuracy_score(fake_real_pred['label'], fake_real_pred['prediction']))
print("\nClassification Report:\n", classification_report(fake_real_pred['label'], fake_real_pred['prediction']))
print("\nConfusion Matrix:\n", confusion_matrix(fake_real_pred['label'], fake_real_pred['prediction']))