<a href="https://colab.research.google.com/github/Aqsaabbasi2690/Fake_News_Detection/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

GPU Available: True
GPU Name: Tesla T4


In [3]:
!pip install transformers datasets accelerate evaluate
!pip install scikit-learn pandas numpy matplotlib seaborn joblib

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [4]:
from google.colab import files
uploaded = files.upload()


Saving test.tsv to test.tsv
Saving train.tsv to train.tsv
Saving valid.tsv to valid.tsv


In [5]:
import pandas as pd

train_df = pd.read_csv("train.tsv", sep="\t")
valid_df = pd.read_csv("valid.tsv", sep="\t")
test_df  = pd.read_csv("test.tsv", sep="\t")

train_df.head()

Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece


In [6]:
import pandas as pd

# Load with NO header because your file has no header row
train_df = pd.read_csv("train.tsv", sep="\t", header=None)
valid_df = pd.read_csv("valid.tsv", sep="\t", header=None)
test_df = pd.read_csv("test.tsv", sep="\t", header=None)

# Correct LIAR dataset column names
cols = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "job_title",
    "state_info",
    "party_affiliation",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context"
]

train_df.columns = cols
valid_df.columns = cols
test_df.columns = cols

# Combine into one dataframe
df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Keep only needed columns
df = df[["statement", "context", "label"]]

# Encode labels
label_map = {
    "pants-fire": 0,
    "false": 1,
    "barely-true": 2,
    "half-true": 3,
    "mostly-true": 4,
    "true": 5
}

df["label_encoded"] = df["label"].map(label_map)

# Save processed file
df.to_csv("liar_processed.csv", index=False)

df.head()


Unnamed: 0,statement,context,label,label_encoded
0,Says the Annies List political group supports ...,a mailer,false,1
1,When did the decline of coal start? It started...,a floor speech.,half-true,3
2,"Hillary Clinton agrees with John McCain ""by vo...",Denver,mostly-true,4
3,Health care reform legislation is likely to ma...,a news release,false,1
4,The economic turnaround started at the end of ...,an interview on CNN,half-true,3


In [7]:
# SLM Experiments (TF-IDF + ML)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns
import os

os.makedirs("outputs_slm", exist_ok=True)

df = pd.read_csv("liar_processed.csv")

# compute consistency
from sklearn.metrics.pairwise import cosine_similarity
vec = TfidfVectorizer(max_features=20000)
vec.fit(pd.concat([df["statement"].fillna(""), df["context"].fillna("")]))

s_vec = vec.transform(df["statement"].fillna(""))
c_vec = vec.transform(df["context"].fillna(""))
num = (s_vec.multiply(c_vec)).sum(axis=1)
den = np.sqrt((s_vec.multiply(s_vec)).sum(axis=1).A1) * np.sqrt((c_vec.multiply(c_vec)).sum(axis=1).A1)
df["consistency"] = np.squeeze(np.array(num)) / (den + 1e-12)

# TF-IDF combined
tf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
combined = df["statement"].fillna("") + " " + df["context"].fillna("")
X_text = tf.fit_transform(combined)

X = hstack([X_text, df["consistency"].values.reshape(-1,1)])
y = df["label_encoded"].values

# split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

models = {
    "LogReg": LogisticRegression(max_iter=2000),
    "NaiveBayes": MultinomialNB(),
    "LinearSVC": LinearSVC(max_iter=20000),
    "RandomForest": RandomForestClassifier(n_estimators=200)
}

results = []

for name, model in models.items():
    print(f"Training {name}...")

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="macro")

    print(name, acc, f1)
    results.append((name, acc, f1))

    with open(f"outputs_slm/{name}_report.txt", "w") as f:
        f.write(classification_report(y_test, preds))

# summary
pd.DataFrame(results, columns=["Model", "Accuracy", "F1"]).to_csv(
    "outputs_slm/slm_summary.csv", index=False
)

print("SLM Experiments Complete")


Training LogReg...
LogReg 0.26159458051068263 0.24311573747762724
Training NaiveBayes...
NaiveBayes 0.24804585721730069 0.18428483355430156
Training LinearSVC...
LinearSVC 0.23866597186034394 0.23501342991038796
Training RandomForest...
RandomForest 0.2623762376237624 0.24005468494782523
SLM Experiments Complete


In [8]:
# llm experiment file
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import os

os.makedirs("outputs_llm", exist_ok=True)

# Check if liar_processed.csv exists, if not, generate it
if not os.path.exists("liar_processed.csv"):
    print("liar_processed.csv not found. Generating it now...")
    # Load with NO header because your file has no header row
    train_df_raw = pd.read_csv("train.tsv", sep="\t", header=None)
    valid_df_raw = pd.read_csv("valid.tsv", sep="\t", header=None)
    test_df_raw = pd.read_csv("test.tsv", sep="\t", header=None)

    # Correct LIAR dataset column names
    cols = [
        "id",
        "label",
        "statement",
        "subject",
        "speaker",
        "job_title",
        "state_info",
        "party_affiliation",
        "barely_true_counts",
        "false_counts",
        "half_true_counts",
        "mostly_true_counts",
        "pants_on_fire_counts",
        "context"
    ]

    train_df_raw.columns = cols
    valid_df_raw.columns = cols
    test_df_raw.columns = cols

    # Combine into one dataframe
    df_temp = pd.concat([train_df_raw, valid_df_raw, test_df_raw], ignore_index=True)

    # Keep only needed columns
    df_temp = df_temp[["statement", "context", "label"]]

    # Encode labels
    label_map = {
        "pants-fire": 0,
        "false": 1,
        "barely-true": 2,
        "half-true": 3,
        "mostly-true": 4,
        "true": 5
    }

    df_temp["label_encoded"] = df_temp["label"].map(label_map)

    # Save processed file
    df_temp.to_csv("liar_processed.csv", index=False)
    print("liar_processed.csv generated.")

df = pd.read_csv("liar_processed.csv")
df["text"] = df["statement"].fillna("") + " " + df["context"].fillna("")
df = df[["text", "label_encoded"]]

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["label_encoded"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label_encoded"])

class LiarDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze() for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.texts)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

models = {
    "bert-base-uncased": "BERT",
    "distilbert-base-uncased": "DistilBERT",
    "roberta-base": "RoBERTa"
}

results = []

for model_name, alias in models.items():
    print(f"\n===== Training {alias} =====")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

    train_dataset = LiarDataset(train_df.text.tolist(), train_df.label_encoded.tolist(), tokenizer)
    val_dataset = LiarDataset(val_df.text.tolist(), val_df.label_encoded.tolist(), tokenizer)
    test_dataset = LiarDataset(test_df.text.tolist(), test_df.label_encoded.tolist(), tokenizer)

    args = TrainingArguments(
        output_dir=f"outputs_llm/{alias}",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        load_best_model_at_end=True,
        report_to="none" # Disable wandb reporting
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate(test_dataset)

    # Access metrics using 'eval_' prefix
    results.append((alias, metrics["eval_accuracy"], metrics["eval_f1_macro"]))

pd.DataFrame(results, columns=["Model", "Accuracy", "F1"]).to_csv("outputs_llm/llm_summary.csv", index=False)

print("LLM Experiments Complete")


===== Training BERT =====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.7111,1.687178,0.266285,0.220942



===== Training DistilBERT =====


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.6941,1.680513,0.2642,0.232764



===== Training RoBERTa =====


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.7604,1.757587,0.205315,0.056781


LLM Experiments Complete
