In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load each source file and tag every row with its class:
# 0 = real news, 1 = fake news
true_df = pd.read_csv("data/True.csv").assign(label=0)
fake_df = pd.read_csv("data/Fake.csv").assign(label=1)

# Stack both sources into a single dataset, renumbering the row index
data_all = pd.concat((true_df, fake_df), ignore_index=True)

In [4]:
# Preview the first ten rows of the combined dataset
data_all.head(10)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",0
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",0
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",0
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",0
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",0


À partir de la lecture de l'article de référence, il apparaît que sur ce jeu de données il est possible d'atteindre de très bonnes performances avec plusieurs méthodes d'extraction de features et plusieurs modèles de machine learning. On s'appuie donc pour commencer sur une approche semblable à celle du V.E, qui repose sur l'utilisation d'un extracteur de features TF-IDF et d'une régression logistique (dans le but de relever ensuite les mots utilisés, à l'image de ce qui est fait dans l'article).

À noter que l'on pourra ensuite imaginer d'autres approches, ou voir si l'on arrive à obtenir des résultats corrects avec le titre seul, etc.

# Construction d'un modèle "baseline" TF-IDF et régression logistique

In [6]:
# Narrow the dataset down to the article body and its label
data = data_all.loc[:, ['text', 'label']]
data.head(3)

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0


In [14]:
# Separate the inputs from the targets:
# X holds the article bodies, y the labels (1 = fake)
X, y = data['text'], data['label']

# Hold out 20% of the rows for evaluation; the seed is fixed so the same
# partition is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# TF-IDF features over the article text. These settings are a first guess and
# probably deserve tuning, max_df in particular.
# NOTE(review): the label-0 rows in the dataset preview open with a
# "... (Reuters) -" dateline; if fake articles lack it, the model may be keying
# on the source rather than the content -- confirm and consider stripping it.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# Fit on the training split only, then reuse the fitted vectorizer on the
# held-out split
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
# Train a logistic regression (library defaults) on the TF-IDF training matrix.
# fit() returns the estimator itself, so it can be bound directly.
model = LogisticRegression().fit(X_train_tfidf, y_train)
model

In [17]:
# Score the fitted model on the held-out split
y_pred = model.predict(X_test_tfidf)

# Compute the three summaries first, then report them in order
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)

Accuracy: 0.9863028953229399

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4330
           1       0.99      0.98      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4284   46]
 [  77 4573]]
