# Comparaison de modèles IA pour la détection de fake news, selon des métriques de performance standards.

## Objectif du notebook
Comparer plusieurs modèles (TF-IDF + régression logistique, BERT, Azure GPT, etc.) sur un même jeu de données en évaluant :

- Accuracy
- Precision
- Recall
- F1-score
- Temps d’inférence
- Robustesse aux textes ambigus



## Import des packages

In [2]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import requests


## Chargement du dataset
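Tu peux utiliser un dataset public, par exemple :

- FakeNewsNet
- LIAR Dataset
- Kaggle : Fake News

La cellule suivante suppose que les articles réels et les articles faux sont stockés dans `data/true.csv` et `data/fake.csv`, chacun avec des colonnes `title` et `text`.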

In [3]:
import pandas as pd

# Load the real and the fake articles from their separate CSV files
true_df = pd.read_csv("data/true.csv")
fake_df = pd.read_csv("data/fake.csv")

# Tag each frame with its class (0 = REAL, 1 = FAKE), then prepend the
# headline to the article body so that "text" holds "<title> <text>"
for frame, class_id in ((true_df, 0), (fake_df, 1)):
    frame["label"] = class_id
    frame["text"] = frame["title"] + " " + frame["text"]

# Stack both frames under a fresh index, keep only the two columns used for
# the benchmark, and drop incomplete rows (a missing title or text makes the
# concatenated string NaN, so such rows are removed here)
df = pd.concat([true_df, fake_df], ignore_index=True)[["text", "label"]].dropna()

df.head()




Unnamed: 0,text,label
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0


## Séparation des données

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from one run to the next
all_texts = df["text"]
all_labels = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)


## Appel des modèles FastAPI

In [5]:
def predict_via_fastapi(texts, model_name="tfidf", *, base_url="http://localhost:8001", timeout=10):
    """Classify each text by calling the prediction service's ``/predict`` endpoint.

    One POST request is sent per text. The text travels in the JSON body as
    ``content`` and the model is selected with the ``method`` query parameter.
    The ``is_fake`` field of the JSON response is converted to ``int``; a
    response without that field counts as 0.

    Args:
        texts: Iterable of strings to classify.
        model_name: Value sent as the ``method`` query parameter.
        base_url: Root URL of the service (keyword-only).
        timeout: Per-request timeout in seconds (keyword-only).

    Returns:
        A list of ints, one per input text, in input order. A text whose
        request or response handling fails is reported on stdout and
        predicted as 0, so a high failure count biases the metrics computed
        from the result.
    """
    items = list(texts)
    if not items:
        return []

    endpoint = f"{base_url.rstrip('/')}/predict"
    predictions = []
    failures = 0

    # A single session reuses the underlying connection instead of opening a
    # new one for every text
    with requests.Session() as session:
        for text in items:
            try:
                response = session.post(
                    endpoint,
                    # Passed as params so the value is URL-encoded by requests
                    params={"method": model_name},
                    json={"content": text},
                    timeout=timeout,
                )
                response.raise_for_status()
                predictions.append(int(response.json().get("is_fake", 0)))
            except Exception as e:
                # Deliberately broad: one bad request or malformed response
                # must not abort a long evaluation run
                print(f"Erreur pour {model_name}: {e}")
                predictions.append(0)
                failures += 1

    if failures:
        # Make the fallback visible: these items were all counted as class 0
        print(
            f"{failures}/{len(items)} requêtes en échec pour {model_name} "
            "(prédiction 0 par défaut)"
        )
    return predictions


## Définition des modèles à tester

In [6]:
# Maps each display name to a callable that takes a list of texts and returns
# the predictions obtained through predict_via_fastapi for that model.
# The `_method=method` default freezes the model name for each entry, which
# avoids the late-binding behavior of closures created in a loop.
models = {
    display_name: (
        lambda texts, _method=method: predict_via_fastapi(texts, model_name=_method)
    )
    for display_name, method in (
        ("TF-IDF + Logistic Regression", "tfidf"),
        ("BERT", "bert"),
        ("Azure GPT", "azure"),
    )
}


## Boucle de benchmark

In [None]:
# Evaluate every entry of `models` on the held-out texts and collect one row
# of metrics per model in `results`.
results = []

# Convert the test texts once; the same list is handed to every model
test_texts = X_test.tolist()

for name, model_fn in models.items():
    print(f"Évaluation de {name}...")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump when the system clock is adjusted.
    # The measured span covers the whole model_fn call.
    start = time.perf_counter()
    y_pred = model_fn(test_texts)
    elapsed = time.perf_counter() - start

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred),
        "Inference Time (s)": elapsed,
    })


## Résultats tabulaires

In [None]:
# Build a table from the collected `results`
results_df = pd.DataFrame(data=results)
# Displayed view only, best F1-score first; the result is not assigned, so
# results_df itself keeps its original row order
results_df.sort_values(by="F1-score", ascending=False)


## Visualisation

In [None]:
# Grouped bar chart: one group per model, one bar per classification metric
metric_columns = ["Accuracy", "Precision", "Recall", "F1-score"]
scores_by_model = results_df.set_index("Model")[metric_columns]

ax = scores_by_model.plot.bar(
    figsize=(10, 6),
    title="Performance des modèles de détection de fake news",
)
ax.set_ylabel("Score")
# Fixed 0-1 y-range for the plotted scores
ax.set_ylim(0, 1)
ax.grid(True)

plt.tight_layout()
plt.show()
