<a href="https://colab.research.google.com/github/FraSab98/Fake_news/blob/main/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import string
import joblib
import numpy as np
import pandas as pd
import torch
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from textblob import TextBlob
from transformers import BertTokenizer, BertModel
import transformers
print(transformers.__version__)


In [None]:
# Report whether CUDA is available and which torch device would be selected.
print("GPU disponibile?", torch.cuda.is_available())
print("Dispositivo selezionato:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


In [None]:
# Load the WELFake CSV, discard rows with any missing value, and remove the
# columns that are not used in this notebook.
df = (
    pd.read_csv("./dataset/WELFake_Dataset.csv")
    .dropna()
    .drop(['Unnamed: 0', 'title'], axis=1)
    # drop=True renumbers the rows 0..n-1 without keeping the old index
    # around as an extra 'index' column.
    .reset_index(drop=True)
)

def preprocess_text(text):
    """Clean a raw article and return its lemmatised tokens joined by spaces.

    Processing order:
      1. remove URLs, HTML-like tags and ``[bracketed]`` spans;
      2. replace every remaining non-word character with a space;
      3. remove underscores (the only punctuation ``\\W`` does not cover);
      4. remove every token that contains a digit;
      5. tokenise, drop English stop words, lemmatise.

    URL and tag removal must run *before* step 2: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces those patterns can no longer
    match anything.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Space-separated cleaned tokens (may be an empty string).
    """
    # The stop-word set and the lemmatizer do not depend on the input, so they
    # are built on first use and cached on the function object instead of
    # being rebuilt for every row passed through DataFrame.apply.
    cache = preprocess_text.__dict__
    if 'stop_words' not in cache:
        cache['stop_words'] = set(stopwords.words('english'))
        cache['lemmatizer'] = WordNetLemmatizer()
    stop_words = cache['stop_words']
    lm = cache['lemmatizer']

    # Markup-aware removals first, while the delimiters are still present.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\[.*?\]', '', text)

    # Everything that is not a letter, digit or underscore becomes a space.
    text = re.sub(r'\W', ' ', text)
    # After the previous step the only punctuation left is '_'.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop whole tokens containing at least one digit. No digit or special
    # character survives this line, so no further numeric/symbol clean-up
    # pass is needed.
    text = re.sub(r'\w*\d\w*', '', text)

    tokens = word_tokenize(text)
    # NOTE(review): the membership test is case-sensitive and the text is
    # never lower-cased, so capitalised stop words (e.g. "The") are kept --
    # confirm this is intended.
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lm.lemmatize(x) for x in tokens]
    return ' '.join(tokens)


df['clean_text'] = df['text'].apply(preprocess_text)


def count_uppercase_by_label(text, label):
    """Return how many characters of ``text`` are uppercase.

    ``label`` is accepted for call-site compatibility but does not influence
    the result. Any non-string ``text`` (for example a missing value) counts
    as zero.
    """
    if not isinstance(text, str):
        return 0
    return len([ch for ch in text if ch.isupper()])


def _uppercase_for_row(row):
    """Row-wise adapter: forward the row's text and label to the counter."""
    return count_uppercase_by_label(row['text'], row['label'])


# One uppercase-letter count per article.
df['uppercase_count'] = df.apply(_uppercase_for_row, axis=1)

# Corpus-wide total of uppercase letters.
total_uppercase = df['uppercase_count'].sum()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the class balance. The figure is written to disk and closed,
# so nothing is rendered inline.
os.makedirs('./grafici', exist_ok=True)  # savefig raises if the folder is missing
plt.figure(figsize = (8,5))
sns.countplot(x = df['label'], palette = 'Set1', alpha = 0.8)
plt.title('Distribution of Fake - 0 /Real - 1 News')
plt.savefig('./grafici/Distribution_of_Fake_Real.png')
plt.close()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Whitespace-separated word count of the raw text, one value per article.
df['num_words'] = df['text'].apply(lambda x: len(x.split()))

# Histogram of the word counts in 50-word bins from 1 up to 3000 words.
os.makedirs('./grafici', exist_ok=True)  # savefig raises if the folder is missing
plt.figure(figsize = (14,5))
# No `palette` here: without a `hue` variable seaborn ignores it and only
# emits a warning.
sns.histplot(df['num_words'], bins = range(1, 3000, 50), alpha = 0.8)
plt.title('Distribution of the News Words count')
plt.savefig('./grafici/Distribution_of_num_words.png')
plt.close()

In [None]:
# TF-IDF representation of the cleaned text, capped at 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000)
word_tfidf = vectorizer.fit_transform(df['clean_text']).toarray()

# Dense TF-IDF matrix as a DataFrame, one column per vocabulary term.
df_tfidf = pd.DataFrame(word_tfidf, columns=vectorizer.get_feature_names_out())

# Total TF-IDF weight of each term summed over all documents ...
word_tfidf_sum = df_tfidf.sum(axis=0)

# ... and the same totals as a {term: total weight} dictionary.
word_tfidf_dict = word_tfidf_sum.to_dict()

# Text length measured in characters, not counting spaces.
df['text_length'] = df['text'].apply(lambda x: len(x.replace(" ", "")))

# Corpus-wide mean length, also stored as a constant column.
mean_length = df['text_length'].mean()
df['avg_text'] = mean_length

# Run the TextBlob sentiment analysis once per document and read both scores
# from the same result instead of analysing every text twice.
sentiments = [TextBlob(doc).sentiment for doc in df['clean_text']]
df['polarity'] = [s.polarity for s in sentiments]
df['subjectivity'] = [s.subjectivity for s in sentiments]

# Structural (non-lexical) feature matrix, columns in this fixed order.
struct_features = df[['polarity', 'subjectivity', 'text_length', 'uppercase_count']].values

In [None]:
import joblib

# Save the fitted TF-IDF vectorizer to disk.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

In [None]:
# Standardise the structural features column by column.
# NOTE(review): the scaler is fitted on every row before any train/test split
# is made, so held-out rows contribute to the scaling statistics -- confirm
# this is acceptable for the reported results.
scaler = StandardScaler()
normalized_structural_features = scaler.fit_transform(struct_features)

# Structural-only feature matrix. It must use the *standardised* values: the
# raw columns have very different ranges (polarity vs. character counts),
# which distorts a linear SVM, and the combined matrix below is already built
# from the standardised block.
X_struct = normalized_structural_features

# Standardised structural features followed by the TF-IDF columns.
X_tfidf = np.hstack([normalized_structural_features, df_tfidf])

In [None]:
# Save the fitted StandardScaler for the structural features to disk.
joblib.dump(scaler, "scaler_struct_features.pkl")

In [None]:
class Evaluation:
    """Print accuracy and a classification report for an already fitted model.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``.
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        True labels matching ``x_train`` and ``x_test``.
    """

    def __init__(self, model, x_train, x_test, y_train, y_test):
        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

    def _report(self, features, targets, split_name):
        """Predict on ``features`` and print the metrics for one split."""
        # Imported locally: the notebook's top import cell does not bring in
        # these two names, so relying on globals fails on a fresh kernel.
        from sklearn.metrics import accuracy_score, classification_report

        predictions = self.model.predict(features)

        print(f"Accuracy Score On {split_name} Data Set :", accuracy_score(targets, predictions))
        print()

        print(f"Classification Report On {split_name} Data Set :\n",
              classification_report(targets, predictions))

    def train_evaluation(self):
        """Print the metrics obtained on the training split."""
        self._report(self.x_train, self.y_train, "Training")

    def test_evaluation(self):
        """Print the metrics obtained on the test split."""
        self._report(self.x_test, self.y_test, "Testing")


FINE-TUNING DISTILBERT FOR CLASSIFICATION


In [None]:
!pip install evaluate

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from transformers import DataCollatorWithPadding
import pandas as pd
import evaluate
from sklearn.model_selection import train_test_split


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# An earlier BERT base (cased) tokenizer/model setup was replaced by the
# DistilBERT classes imported below.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification


# Mapping between class ids and human-readable names, stored in the model config.
id2label = {0: "Fake", 1: "Real"}
label2id = {"Fake": 0, "Real": 1}

# Tokenizer of the pretrained uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Hold out 5000 rows (stratified by label) as the final test set; the
# remaining rows are used for training and validation.
train_val_df, test_df = train_test_split(df, test_size=5000, random_state=42, stratify=df['label'])

# Sanity check of the split sizes.
print(len(train_val_df))  # all remaining rows (about 70k according to the original note)
print(len(test_df))       # 5000

# Split the remaining rows 80/20 into training and validation, again stratified.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42, stratify=train_val_df['label'])

# Sanity check of the split sizes.
print(len(train_df))  # 80% of the remaining rows
print(len(val_df))    # 20% of the remaining rows

# Wrap the text/label columns of each split in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df[['clean_text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['clean_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['clean_text', 'label']])

# dataset = Dataset.from_pandas(df[['clean_text', 'label']])

def tokenize(batch):
    """Tokenise the ``clean_text`` entries of ``batch``.

    Sequences are truncated to at most 128 tokens and padded to the longest
    sequence of the batch. Returns whatever the module-level ``tokenizer``
    returns for that call.
    """
    encode_options = {"padding": True, "truncation": True, "max_length": 128}
    return tokenizer(batch["clean_text"], **encode_options)

# Tokenise the training and validation sets in batches.
# The test set is deliberately left untokenised here (the corresponding call is disabled):
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
# test_dataset = test_dataset.map(tokenize, batched=True)

# Load pretrained DistilBERT with a 2-class sequence-classification head and
# move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

# Freeze the encoder (`model.base_model`) in a single pass: every parameter is
# made non-trainable except those whose name contains "pooler". Parameters
# that live outside `base_model` are not touched by this loop.
# NOTE(review): DistilBERT's encoder does not appear to expose a `pooler`
# sub-module, so the condition probably matches nothing for this model --
# verify with the inspection snippet below.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

# To inspect which parameters remain trainable:
# for name, param in model.named_parameters():
#    print(name, param.requires_grad)



In [None]:
# Data collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import json
import os
# Turn off Weights & Biases logging through its environment switch.
os.environ["WANDB_DISABLED"] = "true"

# Load the metric objects used by compute_metrics2.
# NOTE(review): a later cell imports `f1_score` from sklearn.metrics, which
# rebinds the name below; re-running training after that cell would call
# `.compute` on the sklearn function -- confirm the cell order when re-running.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1_score = evaluate.load("f1")

def compute_metrics2(eval_pred):
    """Compute validation metrics from raw model outputs.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)`` where ``predictions`` holds the raw class
        scores (one row per sample, one column per class) and ``labels`` the
        true class ids.

    Returns
    -------
    dict
        ``Accuracy``, ``AUC``, ``Precision``, ``Recall`` and ``F1``, each
        rounded to three decimals. AUC uses the probability of class 1; the
        other metrics use the arg-max class.
    """
    predictions, labels = eval_pred

    # Numerically stable softmax: subtracting the per-row maximum leaves the
    # result unchanged but keeps exp() from overflowing to inf (and the
    # division from producing NaN) when a logit is large.
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

    # ROC AUC is computed on the positive-class (column 1) probability.
    positive_class_probs = probabilities[:, 1]
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)

    # Hard predictions: the most probable class of each row.
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'], 3)
    prec = np.round(precision.compute(predictions=predicted_classes, references=labels)['precision'], 3)
    rec = np.round(recall.compute(predictions=predicted_classes, references=labels)['recall'], 3)
    f1 = np.round(f1_score.compute(predictions=predicted_classes, references=labels)['f1'], 3)

    return {
        "Accuracy": acc,
        "AUC": auc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    }

# Disabled alternative: reload a previously saved log history instead of training.
# with open("./risultati/log_history.json", "r") as f:
#     log_history = json.load(f)

# Hyperparameters.
lr = 2e-4
batch_size = 64
num_epochs = 50

# Training configuration: logging, evaluation and checkpointing all happen
# once per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./risultati",
    learning_rate=lr,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="tensorboard",
    load_best_model_at_end=True,
)

# Trainer wiring: model, datasets, padding collator and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics2,
)

# Run the fine-tuning loop.
trainer.train()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import json
import os
from collections import defaultdict

# Requires `trainer` from the training cell (trainer.train() must have run).

# All figures below are written to ./grafici; create it if it is missing so
# savefig does not fail on a fresh environment.
os.makedirs("./grafici", exist_ok=True)

# Metrics logged by the Trainer during training.
log_history = trainer.state.log_history

# --- Aggregate the logged values per epoch ---

# Validation metric keys as they appear in the log entries.
EVAL_KEYS = ["eval_Accuracy", "eval_AUC", "eval_Precision", "eval_Recall", "eval_F1"]

# epoch -> metric name -> list of logged values
epoch_data = defaultdict(lambda: defaultdict(list))

for log in log_history:
    if "epoch" not in log:
        continue
    epoch = round(log["epoch"])  # nearest integer epoch

    # Evaluation entries carry all validation metrics together; the presence
    # of eval_Accuracy is used as the marker for such an entry.
    if "eval_Accuracy" in log:
        for key in EVAL_KEYS:
            epoch_data[epoch][key].append(log[key])

    # Training-loss entries. Several values for the same epoch (if any) are
    # averaged below.
    if "loss" in log:
        epoch_data[epoch]["train_loss"].append(log["loss"])


epochs_list = sorted(epoch_data.keys())


def _epoch_means(key):
    """Mean of ``key`` for each epoch in ``epochs_list`` (None when nothing was logged)."""
    return [np.mean(epoch_data[e][key]) if epoch_data[e][key] else None for e in epochs_list]


eval_accuracy = _epoch_means("eval_Accuracy")
eval_auc = _epoch_means("eval_AUC")
eval_precision = _epoch_means("eval_Precision")
eval_recall = _epoch_means("eval_Recall")
eval_f1 = _epoch_means("eval_F1")
train_loss = _epoch_means("train_loss")

# Keep only the epochs for which accuracy, AUC and training loss are all
# available, so that no None reaches the plotting calls (e.g. after an
# interrupted run).
valid_epochs_indices = [i for i, val in enumerate(eval_accuracy) if val is not None and eval_auc[i] is not None and train_loss[i] is not None]


def _keep_valid(values):
    """Select the entries of ``values`` at the valid epoch positions."""
    return [values[i] for i in valid_epochs_indices]


epochs_filtered = _keep_valid(epochs_list)
eval_accuracy_filtered = _keep_valid(eval_accuracy)
eval_auc_filtered = _keep_valid(eval_auc)
eval_precision_filtered = _keep_valid(eval_precision)
eval_recall_filtered = _keep_valid(eval_recall)
eval_f1_filtered = _keep_valid(eval_f1)
train_loss_filtered = _keep_valid(train_loss)


# --- PLOTTING ---

# Plot 1: all validation metrics over the epochs.
plt.figure(figsize=(10,6))
for series, series_label, series_marker in [
    (eval_accuracy_filtered, "Accuracy", 'o'),
    (eval_auc_filtered, "AUC", 'x'),
    (eval_precision_filtered, "Precision", '^'),
    (eval_recall_filtered, "Recall", 's'),
    (eval_f1_filtered, "F1 Score", 'd'),
]:
    plt.plot(epochs_filtered, series, label=series_label, marker=series_marker)
plt.xlabel("Epoch")
plt.ylabel("Metric Value")
plt.title("Validation Metrics over Time")
plt.legend()
plt.grid()
plt.tight_layout()
plt.savefig("./grafici/val_all_metrics.png")
plt.close()


# Plot 2: validation accuracy and AUC only.
plt.figure(figsize=(10,6))
plt.plot(epochs_filtered, eval_accuracy_filtered, label="Validation Accuracy", marker='o')
plt.plot(epochs_filtered, eval_auc_filtered, label="Validation AUC", marker='x')
plt.xlabel("Epoch")
plt.ylabel("Metric Value")
plt.title("Validation Accuracy & AUC over Time")
plt.legend()
plt.grid()
plt.savefig("./grafici/val_accuracy_auc.png")
plt.close()

# Plot 3: training loss (per-epoch average).
plt.figure(figsize=(10,6))
plt.plot(epochs_filtered, train_loss_filtered, label="Training Loss", color="red")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss over Time (Average per Epoch)")
plt.grid()
plt.savefig("./grafici/train_loss.png")
plt.close()

print("Grafici generati e salvati nella cartella ./grafici/")

Grafici generati e salvati nella cartella ./grafici/


In [None]:
import shutil


# Zip the output folders so they can be downloaded from Colab.
# NOTE(review): no visible cell creates a 'modelli' folder (the model is saved
# to ./modello_finale in a later cell) -- confirm the directory exists.
# NOTE(review): the checkpoint number is hard-coded -- confirm it matches a
# checkpoint actually present under ./risultati.
shutil.make_archive('model', 'zip', 'modelli')
shutil.make_archive('graf', 'zip', 'grafici')
# shutil.make_archive('ris', 'zip', 'risultati')
shutil.make_archive('checkpoint-41600', 'zip', './risultati/checkpoint-41600')

from google.colab import files

# Trigger the browser downloads of the archives created above.
files.download('graf.zip')
files.download('model.zip')
# files.download('ris.zip')
files.download('checkpoint-41600.zip')



In [None]:
# Save the trainer's model and the tokenizer side by side in ./modello_finale.
trainer.save_model("./modello_finale")
tokenizer.save_pretrained("./modello_finale")


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.metrics import classification_report, roc_auc_score

# === Load model and tokenizer from a saved checkpoint ===
# NOTE(review): the checkpoint path is hard-coded; confirm that the folder
# exists and that it also contains the tokenizer files.
tokenizer = DistilBertTokenizer.from_pretrained("./risultati/checkpoint-41600")
model = DistilBertForSequenceClassification.from_pretrained("./risultati/checkpoint-41600")
# Switch to inference mode.
model.eval()

# === Custom dataset ===
class TestDataset(Dataset):
    """Map-style dataset that tokenises one text per requested item.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output for that text, padded/truncated to ``max_len`` and
    squeezed) and ``labels`` (the label as a ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # squeeze() removes it so the DataLoader can stack items itself.
        item = {key: encoded[key].squeeze() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# === Prepare data ===
texts = test_dataset['clean_text']    # texts of the held-out test split
labels = test_dataset['label']        # matching true labels
# Use a separate name for the torch dataset. Rebinding `test_dataset` would
# replace the Hugging Face dataset, and re-running this cell would then fail
# on test_dataset['clean_text'].
# NOTE(review): TestDataset defaults to max_len=512 while the training-time
# tokenize() truncates to 128 tokens -- confirm the mismatch is intended.
test_torch_dataset = TestDataset(texts, labels, tokenizer)
test_loader = DataLoader(test_torch_dataset, batch_size=64)

# === Evaluation ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_preds = []   # predicted class per sample
all_probs = []   # probability of class 1 per sample
all_labels = []  # true class per sample

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_batch = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)

        preds = torch.argmax(probs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs[:, 1].cpu().numpy())  # positive class
        all_labels.extend(labels_batch.cpu().numpy())

# === Final metrics ===
print(classification_report(all_labels, all_preds))
print("ROC-AUC:", roc_auc_score(all_labels, all_probs))

# === Print predicted and true label for every sample ===
for pred, true_label in zip(all_preds, all_labels):
    print(f"Predetto: Fake " if pred == 0 else f"Predetto: Real")
    print(f"Reale: {true_label}")

import pandas as pd

# Persist the per-sample predictions next to the true labels.
results_df = pd.DataFrame({
    'predicted': all_preds,
    'true': all_labels
})

results_df.to_csv('predizioni_test.csv', index=False)


ValueError: Column 'clean_text' doesn't exist.

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_curve, auc

# The figures go to grafici/ and the text report to risultati/. Create both
# folders up front so savefig()/open() cannot fail on a fresh environment.
os.makedirs("grafici", exist_ok=True)
os.makedirs("risultati", exist_ok=True)

# Convert the accumulated lists to numpy arrays for the sklearn metric functions.
all_preds = np.array(all_preds)
all_probs_positive_class = np.array(all_probs)
all_labels = np.array(all_labels)

# --- 5. Final metrics ---
print("\n" + "="*30)
print("REPORT DI CLASSIFICAZIONE:")
print(classification_report(all_labels, all_preds))
print("="*30 + "\n")

acc = accuracy_score(all_labels, all_preds)
roc_auc = roc_auc_score(all_labels, all_probs_positive_class)

print(f"Accuracy: {acc:.3f}")
print(f"ROC-AUC: {roc_auc:.3f}")

# Per-class precision / recall / F1 (order fixed to class 0, class 1).
prec_per_class, rec_per_class, f1_per_class, _ = precision_recall_fscore_support(all_labels, all_preds, labels=[0, 1])

# Aggregate metrics (support-weighted average over the classes).
prec_weighted, rec_weighted, f1_weighted, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')


# 6.1. Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
labels_cm = ["Classe 0", "Classe 1"]  # tick labels shared with the per-class bar chart

plt.figure(figsize=(7, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels_cm, yticklabels=labels_cm, cbar=False)
plt.ylabel('Etichetta Vera', fontsize=12)
plt.xlabel('Etichetta Predetta', fontsize=12)
plt.title('Matrice di Confusione', fontsize=14)
plt.tight_layout()
plt.savefig("grafici/confusion_matrix.png")
plt.close()

# 6.2. Precision, recall and F1 per class (grouped bars)
plt.figure(figsize=(9, 6))
bar_width = 0.25
x = np.arange(2)  # one group per class

plt.bar(x - bar_width, prec_per_class, width=bar_width, label='Precision', color='skyblue')
plt.bar(x, rec_per_class, width=bar_width, label='Recall', color='lightcoral')
plt.bar(x + bar_width, f1_per_class, width=bar_width, label='F1-Score', color='lightgreen')

plt.xticks(x, labels_cm, fontsize=10)
plt.ylabel('Valore Metrica', fontsize=12)
plt.ylim(0, 1.05)
plt.title('Precision, Recall, F1-Score per Classe', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("grafici/metrics_per_class.png")
plt.close()

# 6.3. ROC curve
fpr, tpr, _ = roc_curve(all_labels, all_probs_positive_class)
roc_auc_final = auc(fpr, tpr)  # area under the curve just computed

plt.figure(figsize=(8, 7))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Curva ROC (AUC = {roc_auc_final:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Classificatore Casuale')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasso di Falsi Positivi (FPR)', fontsize=12)
plt.ylabel('Tasso di Veri Positivi (TPR)', fontsize=12)
plt.title('Curva ROC', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid()
plt.tight_layout()
plt.savefig("grafici/roc_curve_final.png")
plt.close()

# 6.4. Overall performance metrics (single bar chart)
metrics_names_overall = ['Accuratezza', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
metrics_values_overall = [acc, prec_weighted, rec_weighted, f1_weighted, roc_auc_final]

plt.figure(figsize=(10, 6))
bars = plt.bar(metrics_names_overall, metrics_values_overall, color=sns.color_palette("viridis", len(metrics_names_overall)))

# Write each value just above its bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{yval:.3f}', ha='center', va='bottom', fontsize=10)

plt.ylabel('Valore Metrica', fontsize=12)
plt.xlabel('Metrica', fontsize=12)
plt.title('Metriche di Performance Complessive', fontsize=14)
plt.ylim(0.0, 1.05)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("grafici/overall_performance_metrics.png")
plt.close()

# 6.5. Distribution of the predicted positive-class probabilities
plt.figure(figsize=(8, 6))
plt.hist(all_probs_positive_class, bins=30, color='skyblue', edgecolor='black')
plt.xlabel("Probabilità della classe '1' (Positiva)", fontsize=12)
plt.ylabel("Frequenza", fontsize=12)
plt.title("Distribuzione delle Probabilità Predette", fontsize=14)
plt.grid()
plt.tight_layout()
plt.savefig("grafici/prob_distribution.png")
plt.close()

# 7. Save the per-sample results (now including the positive-class probability)
results_df = pd.DataFrame({
    'predicted': all_preds,
    'true': all_labels,
    'prob_positive': all_probs_positive_class
})
results_df.to_csv('predizioni_test.csv', index=False)

# 8. Write the final classification report to a text file
with open("risultati/final_metrics_report.txt", "w") as f:
    f.write("--- REPORT DI CLASSIFICAZIONE DETTAGLIATO ---\n\n")
    f.write(classification_report(all_labels, all_preds, digits=3))  # three decimals
    f.write(f"\nAccuracy (globale): {acc:.3f}")
    f.write(f"\nROC-AUC (globale): {roc_auc:.3f}")
    f.write(f"\nPrecisione (Weighted): {prec_weighted:.3f}")
    f.write(f"\nRichiamo (Weighted): {rec_weighted:.3f}")
    f.write(f"\nF1-Score (Weighted): {f1_weighted:.3f}")

print("\nReport di classificazione finale salvato in 'risultati/final_metrics_report.txt'")
print("Tutti i grafici di performance sono stati generati nella cartella 'grafici/'.")


REPORT DI CLASSIFICAZIONE:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87      2448
           1       0.86      0.90      0.88      2552

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000


Accuracy: 0.873
ROC-AUC: 0.952

Report di classificazione finale salvato in 'risultati/final_metrics_report.txt'
Tutti i grafici di performance sono stati generati nella cartella 'grafici/'.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confusion-matrix cells entered by hand (raw counts, not percentages).
# NOTE(review): these counts give an accuracy of about 0.876, while the
# evaluation cell printed 0.873 -- confirm both come from the same run.
tn = 2022  # true negatives  (true 0, predicted 0)
fp = 426   # false positives (true 0, predicted 1)
fn = 193   # false negatives (true 1, predicted 0)
tp = 2359  # true positives  (true 1, predicted 1)

# Named so that it does not shadow sklearn's `confusion_matrix` function,
# which other cells import and call.
conf_matrix_counts = np.array([[tn, fp],
                               [fn, tp]])
# Class names used as tick labels.
class_names = ['0', '1']

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_counts, annot=True, fmt='.0f', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, cbar=False, linewidths=.5)


plt.xlabel('Predict label', fontsize=14)
plt.ylabel('True label', fontsize=14)
plt.title('Confusion Matrix', fontsize=16)


# Avoid clipped labels in the saved image.
plt.tight_layout()

os.makedirs("./grafici", exist_ok=True)  # savefig raises if the folder is missing
plt.savefig("./grafici/conf_matrix_test.png")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Inputs built in earlier cells:
# - X_struct: structural feature matrix
# - X_tfidf : structural features followed by the TF-IDF columns
y = df['label'].values

# Feature sets to compare.
feature_sets = {
    'Struttural only': X_struct,
    'Struttural + TF-IDF': X_tfidf
}


from sklearn.model_selection import train_test_split

generalization_test_size = 5000
random_state = 42

# First split: hold out 5000 stratified samples as the generalisation test
# set. Both feature matrices and the labels go through ONE call, so their
# rows stay aligned by construction rather than by repeating the same seed
# in separate calls.
(X_struct_rest, X_struct_generalization,
 X_tfidf_rest, X_tfidf_generalization,
 y_rest, y_generalization) = train_test_split(
    X_struct, X_tfidf, y,
    test_size=generalization_test_size,
    random_state=random_state,
    stratify=y
)

# Second split: internal train/test (70/30) on the remaining rows, again in
# a single call so y_train / y_test are produced exactly once.
test_size_internal = 0.3

(X_struct_train, X_struct_test,
 X_tfidf_train, X_tfidf_test,
 y_train, y_test) = train_test_split(
    X_struct_rest, X_tfidf_rest, y_rest,
    test_size=test_size_internal,
    random_state=random_state,
    stratify=y_rest
)

# Per-feature-set lookup tables, keyed like `feature_sets`.
X_train_dict = {
    'Struttural only': X_struct_train,
    'Struttural + TF-IDF': X_tfidf_train
}
X_test_dict = {
    'Struttural only': X_struct_test,
    'Struttural + TF-IDF': X_tfidf_test
}

# Available from here on:
# - X_train_dict, X_test_dict, y_train, y_test for the internal train/test
# - X_struct_generalization, X_tfidf_generalization, y_generalization for the
#   final generalisation test

# === MODELS AND HYPER-PARAMETER GRIDS ===
# `models` maps a display name to the estimator instance ('model') and the
# parameter grid ('params') handed to GridSearchCV. The display name is also
# embedded in the names of the files written for each model.
models = {}

# Disabled Random Forest candidate, kept for reference:
# models['Random Forest'] = {
#     'model': RandomForestClassifier(),
#     'params': {
#         'n_estimators': [100, 200],
#         'max_depth': [None, 10, 20]
#     }
# }

models['SVM'] = {
    'model': LinearSVC(dual=False, max_iter=5000),
    'params': {'C': [0.1, 0.5, 1]},
}

# === EVALUATION LOOP ===
# For every (model, feature set) pair: grid-search the hyper-parameters, score
# the best estimator on the internal train/test split, persist the model and
# its parameters, and save a metrics bar chart and a ROC curve.
import json

# Output folders used below; create them so a fresh checkout does not fail.
os.makedirs("modelli", exist_ok=True)
os.makedirs("./grafici", exist_ok=True)

results = []

for model_name, config in models.items():
    for feature_name in feature_sets.keys():
        print(f"\n {model_name} con {feature_name}")
        X_train = X_train_dict[feature_name]
        X_test = X_test_dict[feature_name]

        print(X_train.shape)

        # Model selection is driven by accuracy (5-fold CV on the training split).
        grid = GridSearchCV(
            config['model'],
            config['params'],
            scoring='accuracy',
            cv=5,
            n_jobs=1
        )
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_

        # Training-set evaluation
        y_pred_train = best_model.predict(X_train)
        acc_train = accuracy_score(y_train, y_pred_train)
        report_train = classification_report(y_train, y_pred_train, output_dict=True, zero_division=0)
        f1_train = report_train['weighted avg']['f1-score']

        # Test-set evaluation
        y_pred_test = best_model.predict(X_test)
        acc_test = accuracy_score(y_test, y_pred_test)
        prec = precision_score(y_test, y_pred_test, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred_test, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)
        report_test = classification_report(y_test, y_pred_test, output_dict=True, zero_division=0)
        f1_test = report_test['weighted avg']['f1-score']

        # `Evaluation` is defined elsewhere in the notebook.
        # NOTE(review): confirm whether these calls refit `best_model`; the order
        # relative to the steps below is kept exactly as in the original.
        Evaluation(best_model, X_train, X_test, y_train, y_test).train_evaluation()
        Evaluation(best_model, X_train, X_test, y_train, y_test).test_evaluation()

        # Cross-validation accuracy (5-fold on the training set)
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
        cv_mean = np.mean(cv_scores)
        cv_std = np.std(cv_scores)

        # Persist the fitted model and the selected hyper-parameters
        model_filename = f"modelli/{model_name}_{feature_name}_best_model.pkl"
        joblib.dump(best_model, model_filename)

        params_filename = f"modelli/{model_name}_{feature_name}_params.json"
        with open(params_filename, "w") as f:
            json.dump(grid.best_params_, f)

        # ROC AUC needs continuous scores: prefer class-1 probabilities and fall
        # back to the decision function. This runs outside the `with` block above
        # so the parameters file is closed before scoring starts.
        if hasattr(best_model, "predict_proba"):
            y_train_score = best_model.predict_proba(X_train)[:, 1]
            y_test_score = best_model.predict_proba(X_test)[:, 1]
        elif hasattr(best_model, "decision_function"):
            y_train_score = best_model.decision_function(X_train)
            y_test_score = best_model.decision_function(X_test)
        else:
            y_train_score = None
            y_test_score = None

        train_auc = roc_auc_score(y_train, y_train_score) if y_train_score is not None else None
        test_auc = roc_auc_score(y_test, y_test_score) if y_test_score is not None else None

        # Summary bar chart of the test metrics. ROC AUC is only included when
        # it could be computed; a None value cannot be plotted or formatted.
        metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
        metrics_values = [acc_test, prec, rec, f1]
        if test_auc is not None:
            metrics_names.append('ROC AUC')
            metrics_values.append(test_auc)

        plt.figure(figsize=(8, 5))
        # `hue` + `legend=False` is the non-deprecated way to colour each bar
        # from a palette (passing `palette` alone warns in seaborn >= 0.13).
        sns.barplot(x=metrics_names, y=metrics_values, hue=metrics_names,
                    palette="viridis", legend=False)
        plt.ylim(0, 1)
        plt.title(f"Performance Metrics - {model_name} with {feature_name}")
        for i, v in enumerate(metrics_values):
            plt.text(i, v + 0.02, f"{v:.2f}", ha='center', fontweight='bold')
        plt.tight_layout()
        plt.savefig(f"./grafici/metrics_summary_{model_name}_{feature_name}.png")
        plt.close()

        results.append({
            'Model': model_name,
            'Features': feature_name,
            'Best Params': grid.best_params_,
            'Train Accuracy': acc_train,
            'Train F1': f1_train,
            'Test Accuracy': acc_test,
            'Test F1': f1_test,
            'CV Accuracy Mean': cv_mean,
            'CV Accuracy Std': cv_std,
            'CV Scores': cv_scores,
            'Train AUC': train_auc,
            'Test AUC': test_auc
        })

        print(f"Best Params: {grid.best_params_}")
        # The cross-validation scores are accuracies, so label them as such.
        print(f"CV Accuracy Mean: {cv_mean:.4f} - CV Accuracy Std: {cv_std:.4f}")

        # ROC curve for the test set
        if y_test_score is not None:
            fpr, tpr, _ = roc_curve(y_test, y_test_score)
            plt.figure()
            plt.plot(fpr, tpr, label=f"{model_name} ({feature_name}) AUC = {test_auc:.2f}")
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title(f"ROC Curve - {model_name} ({feature_name})")
            plt.legend(loc="lower right")
            plt.tight_layout()
            plt.savefig(f"./grafici/ROC Curve - {model_name} ({feature_name}).png")
            plt.close()



In [None]:
# === VISUALIZATION ===
# Turns the `results` list filled by the model-selection loop into a table,
# exports it to CSV and saves the comparison charts.
df_results = pd.DataFrame(results)

if df_results.empty:
    # With no results the frame has no columns, and every plot below would
    # fail with KeyError: 'Test Accuracy'. Report the situation instead.
    print("\n Nessun risultato disponibile: eseguire prima la cella di model selection.")
else:
    # Make sure the output folders exist before writing to them.
    os.makedirs("./risultati", exist_ok=True)
    os.makedirs("./grafici", exist_ok=True)

    # Export to CSV
    df_results.to_csv("./risultati/risultati_model_selection.csv", index=False)

    print("\n📊 Ecco il riepilogo dei risultati:\n", df_results)

    # --- Test accuracy per feature set, one bar colour per model ---
    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=df_results.sort_values(by="Test Accuracy", ascending=False),
        x="Features",
        y="Test Accuracy",
        hue="Model",
        palette="Set2"
    )
    plt.title("Confronto Accuracy tra Modelli e Feature (ordinato)")
    plt.ylim(0.5, 1.0)
    plt.ylabel("Accuracy")
    plt.xlabel("Tipo di Feature")
    plt.legend(title="Modello")
    plt.tight_layout()
    plt.savefig("./grafici/confronto_accuracy.png")
    plt.close()

    # --- Test F1 per feature set, one bar colour per model ---
    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=df_results.sort_values(by="Test F1", ascending=False),
        x="Features",
        y="Test F1",
        hue="Model",
        palette="Set1"
    )
    plt.title("Confronto F1-score tra Modelli e Feature (ordinato)")
    plt.ylim(0.5, 1.0)
    plt.ylabel("F1-score")
    plt.xlabel("Tipo di Feature")
    plt.legend(title="Modello")
    plt.tight_layout()
    plt.savefig("./grafici/confronto_f1_score.png")
    plt.close()

    # --- Bar chart of cross-validation accuracy: mean with std error bars ---
    plt.figure(figsize=(12, 6))
    labels = df_results['Model'] + " con " + df_results['Features']
    means = df_results['CV Accuracy Mean']
    stds = df_results['CV Accuracy Std']

    plt.bar(labels, means, yerr=stds, capsize=5, color='skyblue')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel("Accuracy media (cross-validation)")
    plt.title("Confronto modelli: media Accuracy cross-validation con deviazione standard")
    plt.tight_layout()

    # Save the chart as PNG
    plt.savefig("./grafici/grafico_accuracy_cv.png")
    plt.close()

    print("\n Grafico salvato come 'grafico_accuracy_cv.png'")

    # TODO: box plot of the per-fold CV scores (noted in the original cell, never implemented).

    # --- Heatmap of TEST ACCURACY (model x feature set) ---
    # The plotted values and the output file name are accuracy, so the title
    # says accuracy as well (it previously read "F1 Score").
    heatmap_df = df_results.pivot(index='Model', columns='Features', values='Test Accuracy')
    plt.figure(figsize=(8, 5))
    sns.heatmap(heatmap_df, annot=True, fmt=".3f", cmap="YlGnBu")
    plt.title("Accuracy sul Test Set (modelli ottimizzati)")
    plt.ylabel("Modello")
    plt.xlabel("Feature Set")
    plt.tight_layout()
    plt.savefig("./grafici/grafico_Accuracy_test_set.png")
    plt.close()

    # Combined label column, added only to the final export below.
    df_results['Model+Feature'] = df_results['Model'] + " + " + df_results['Features']
    # Save the final table to CSV
    df_results.to_csv("./risultati/risultati_finali_pipeline.csv", index=False)



📊 Ecco il riepilogo dei risultati:
 Empty DataFrame
Columns: []
Index: []


KeyError: 'Test Accuracy'

<Figure size 1200x600 with 0 Axes>

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


generalization_test_size = 5000
random_state = 42

# Inputs expected from earlier cells: X_struct, X_tfidf and df['label'].
y = df['label'].values

# Rebuild the 5k stratified hold-out. The size and seed are the same as in the
# model-selection cell, which is what makes this reproduce the samples that
# were kept out of training there. All arrays are split in ONE call so they
# share the same row indices by construction.
(X_struct_rest, X_struct_generalization,
 X_tfidf_rest, X_tfidf_generalization,
 y_rest, y_generalization) = train_test_split(
    X_struct, X_tfidf, y,
    test_size=generalization_test_size,
    random_state=random_state,
    stratify=y
)

# The final evaluation uses the feature matrix of the 'Struttural + TF-IDF' configuration.
X_test_final = X_tfidf_generalization
# NOTE(review): this rebinds the global `y_test` that the model-selection cell
# also defines; re-run that cell's split before re-running its evaluation loop.
y_test = y_generalization


# Load the persisted models.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files produced by this notebook / a trusted source.
# rf_model = joblib.load("./modelli/Random Forest_Struttural + TF-IDF_best_model.pkl")
svm_model = joblib.load("./modelli/SVM_Struttural + TF-IDF_best_model.pkl")
# rf_model2 = joblib.load("./modelli/Random Forest_Struttural only_best_model.pkl")
# svm_model2 = joblib.load("./modelli/SVM_Struttural only_best_model.pkl")

# ✅ Valutazione modelli classici
def evaluate_model(model, X, y, name):
    """Print the evaluation metrics of a fitted classifier and return its outputs.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X: Feature matrix to evaluate on.
        y: True labels for ``X``.
        name: Title printed above the metrics.

    Returns:
        Tuple ``(y_pred, y_prob)``. ``y_pred`` holds the predicted labels.
        ``y_prob`` holds continuous scores for the positive class: the second
        column of ``predict_proba`` when available, otherwise the output of
        ``decision_function``, otherwise ``None``.
    """
    y_pred = model.predict(X)

    # ROC AUC only needs a ranking score, so margin-based models without
    # predict_proba (such as the loaded SVM) fall back to decision_function
    # instead of silently skipping the ROC metric.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X)
    else:
        y_prob = None

    print(f"\n===== {name} =====")
    print("Accuracy:", accuracy_score(y, y_pred))
    print("F1-score:", f1_score(y, y_pred, average="weighted"))
    print("Precision: ", precision_score(y, y_pred, average="weighted"))
    print("Recall", recall_score(y, y_pred, average="weighted"))
    if y_prob is not None:
        print("ROC AUC:", roc_auc_score(y, y_prob))
    print("Classification Report:\n", classification_report(y, y_pred))
    return y_pred, y_prob

# Random Forest evaluation is disabled together with its model load:
# y_pred_rf, y_prob_rf = evaluate_model(rf_model, X_test_final, y_test, "Random Forest")
y_pred_svm, y_prob_svm = evaluate_model(svm_model, X_test_final, y_test, "SVM")


# Side-by-side table of true vs. predicted labels, with the numeric classes
# mapped to readable strings.
label_map = {0: "fake", 1: "real"}
result_df = pd.DataFrame({
    "Label esatta": pd.Series(y_test).map(label_map),
    "Label predetta": pd.Series(y_pred_svm).map(label_map),
})

print(result_df.head())

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, roc_auc_score, accuracy_score,
    f1_score, classification_report, precision_score, recall_score
)
import numpy as np

def plot_confusion(y_true, y_pred, name):
    """Save the confusion matrix of a set of predictions as a PNG.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        name: Label used in the plot title and in the output file name
            (``./grafici/conf_matrix_{name}.png``).
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)

    # Create the axes ourselves and hand them to `disp.plot`. Without `ax=`,
    # the display opens its own figure, so a separately created 5x4 figure
    # stays empty, is never closed, and the saved image ignores the size.
    fig, ax = plt.subplots(figsize=(5, 4))
    disp.plot(cmap="Blues", values_format='d', ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.grid(False)
    fig.tight_layout()
    fig.savefig(f"./grafici/conf_matrix_{name}.png")
    plt.close(fig)

def plot_roc(y_true, y_score, name):
    """Save the ROC curve for a set of scores as a PNG.

    Args:
        y_true: True labels.
        y_score: Continuous scores for the positive class.
        name: Label used in the plot title and in the output file name
            (``./grafici/roc_curve_{name}.png``).
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {area:.2f}")
    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC Curve - {name}")
    ax.legend(loc="lower right")
    fig.tight_layout()
    fig.savefig(f"./grafici/roc_curve_{name}.png")
    plt.close(fig)

def plot_metrics(y_true, y_pred, y_score, name):
    """Save a bar chart summarising the classification metrics as a PNG.

    Accuracy, weighted F1, weighted recall and weighted precision are always
    plotted. ROC AUC is plotted only when ``y_score`` is provided.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_score: Continuous scores for the positive class, or ``None`` when
            the model provides none.
        name: Label used in the plot title and in the output file name
            (``./grafici/metrics_{name}.png``).
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    pre = precision_score(y_true, y_pred, average="weighted")
    rec = recall_score(y_true, y_pred, average="weighted")

    metrics_names = ['Accuracy', 'F1 Score']
    values = [acc, f1]
    if y_score is not None:
        # Leave the bar out when no score exists: drawing it as 0 would read
        # as a real (and terrible) ROC AUC.
        metrics_names.append('ROC AUC')
        values.append(roc_auc_score(y_true, y_score))
    metrics_names += ['Recall', 'Precision']
    values += [rec, pre]

    fig, ax = plt.subplots(figsize=(7, 4))
    # `hue` + `legend=False` is the non-deprecated way to colour each bar from
    # a palette (passing `palette` alone warns in seaborn >= 0.13).
    sns.barplot(x=metrics_names, y=values, hue=metrics_names,
                palette="viridis", legend=False, ax=ax)
    ax.set_ylim(0, 1)
    for i, v in enumerate(values):
        ax.text(i, v + 0.02, f"{v:.2f}", ha='center', fontweight='bold')
    ax.set_title(f"Metrics - {name}")
    fig.tight_layout()
    fig.savefig(f"./grafici/metrics_{name}.png")
    plt.close(fig)

# === RF === (disabled together with the Random Forest model load)
# plot_confusion(y_test, y_pred_rf, "Random_Forest")
# if y_prob_rf is not None:
#     plot_roc(y_test, y_prob_rf, "Random_Forest")
# plot_metrics(y_test, y_pred_rf, y_prob_rf, "Random_Forest")

# === SVM ===
# One (name, predictions, scores) entry per evaluated model; the ROC curve is
# drawn only when continuous scores are available.
for plot_name, predictions, scores in [("SVM", y_pred_svm, y_prob_svm)]:
    plot_confusion(y_test, predictions, plot_name)
    if scores is not None:
        plot_roc(y_test, scores, plot_name)
    plot_metrics(y_test, predictions, scores, plot_name)



===== SVM =====
Accuracy: 0.9396
F1-score: 0.9395842532112936
Precision:  0.9397142276995968
Recall 0.9396
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      2448
           1       0.93      0.95      0.94      2552

    accuracy                           0.94      5000
   macro avg       0.94      0.94      0.94      5000
weighted avg       0.94      0.94      0.94      5000

  Label esatta Label predetta
0         fake           fake
1         fake           fake
2         fake           real
3         real           real
4         real           real



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=metrics_names, y=values, palette="viridis")


<Figure size 500x400 with 0 Axes>