In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
DATA_PATH = "/content/drive/MyDrive/FAKE_NEWS_DATA/"


In [5]:
import os
os.listdir(DATA_PATH)


['valid.tsv',
 'test.tsv',
 'gossipcop_real.csv',
 'gossipcop_fake.csv',
 'train.tsv',
 'True.csv',
 'Fake.csv']

In [6]:
!pip -q install transformers lime


In [7]:
import pandas as pd
import numpy as np
import re
import torch
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader


# DATA SET PREPARATION

In [8]:


fake_df = pd.read_csv(DATA_PATH + "Fake.csv")
true_df = pd.read_csv(DATA_PATH + "True.csv")

print(fake_df.columns)
print(true_df.columns)


Index(['title', 'text', 'subject', 'date'], dtype='object')
Index(['title', 'text', 'subject', 'date'], dtype='object')


In [9]:
fake_df['label'] = 0   # Fake news
true_df['label'] = 1   # Real news

isot_df = pd.concat([fake_df[['text', 'label']],
                     true_df[['text', 'label']]])

print("ISOT shape:", isot_df.shape)
isot_df.head()


ISOT shape: (44898, 2)


Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [10]:
gossip_fake = pd.read_csv(DATA_PATH + "gossipcop_fake.csv")
gossip_real = pd.read_csv(DATA_PATH + "gossipcop_real.csv")

print(gossip_fake.columns)
print(gossip_real.columns)


Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')
Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')


In [11]:
gossip_fake['label'] = 0
gossip_real['label'] = 1

gossip_df = pd.concat([
    gossip_fake[['title', 'label']],
    gossip_real[['title', 'label']]
])

gossip_df.rename(columns={'title': 'text'}, inplace=True)

print("GossipCop shape:", gossip_df.shape)
gossip_df.head()


GossipCop shape: (22140, 2)


Unnamed: 0,text,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,0
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,0
2,Celebrities Join Tax March in Protest of Donal...,0
3,Cindy Crawford's daughter Kaia Gerber wears a ...,0
4,Full List of 2018 Oscar Nominations – Variety,0


In [16]:
liar_train = pd.read_csv(DATA_PATH + "train.tsv", sep='\t', header=None)
liar_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [13]:
liar_train = pd.read_csv(DATA_PATH + "train.tsv", sep='\t', header=None)
liar_valid = pd.read_csv(DATA_PATH + "valid.tsv", sep='\t', header=None)
liar_test  = pd.read_csv(DATA_PATH + "test.tsv",  sep='\t', header=None)

liar_df = pd.concat([liar_train, liar_valid, liar_test])

# Keep only statement and label
liar_df = liar_df[[2, 1]]
liar_df.columns = ['text', 'label']

fake_labels = ['false', 'pants-fire', 'barely-true']
liar_df['label'] = liar_df['label'].apply(
    lambda x: 0 if x in fake_labels else 1
)

print("LIAR shape:", liar_df.shape)
liar_df.head()


LIAR shape: (12791, 2)


Unnamed: 0,text,label
0,Says the Annies List political group supports ...,0
1,When did the decline of coal start? It started...,1
2,"Hillary Clinton agrees with John McCain ""by vo...",1
3,Health care reform legislation is likely to ma...,0
4,The economic turnaround started at the end of ...,1


In [17]:
combined_df = pd.concat([isot_df, gossip_df, liar_df])
combined_df.dropna(inplace=True)

combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Total combined shape:", combined_df.shape)
combined_df.head()


Total combined shape: (79829, 2)


Unnamed: 0,text,label
0,Jane Fonda and Lily Tomlin Play ‘Never Have I ...,1
1,"In the fall of 2009, only 21 percent of Texas ...",1
2,"AUSTIN, Texas (Reuters) - Abortion providers i...",1
3,"Listening to Zeke Emanuel, awkward brother of ...",0
4,This isn t the first time Pelosi has lost it i...,0


In [18]:
combined_df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,45368
0,34461


In [19]:
combined_df.to_csv("/content/drive/MyDrive/FAKE_NEWS_DATA/combined_news.csv", index=False)


# **DistilBERT**

In [20]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=192):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item


In [21]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [22]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    combined_df,
    test_size=0.1,
    random_state=42,
    stratify=combined_df['label']
)

print("Train size:", len(train_df))
print("Val size:", len(val_df))


Train size: 71846
Val size: 7983


In [23]:
train_dataset = NewsDataset(
    train_df['text'],
    train_df['label'],
    tokenizer,
    max_len=192
)

val_dataset = NewsDataset(
    val_df['text'],
    val_df['label'],
    tokenizer,
    max_len=192
)

len(train_dataset), len(val_dataset)


(71846, 7983)

In [24]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False
)

len(train_loader), len(val_loader)


(2246, 250)

In [25]:
from transformers import DistilBertForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

model.to(device)
device


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda')

In [26]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 2
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

total_steps


4492

In [27]:
from tqdm import tqdm
import torch.nn.functional as F

def train_one_epoch(model, loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(loader)


In [28]:
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    avg_loss = train_one_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Average Loss: {avg_loss:.4f}")



Epoch 1/2


100%|██████████| 2246/2246 [20:05<00:00,  1.86it/s]


Average Loss: 0.2475

Epoch 2/2


100%|██████████| 2246/2246 [20:04<00:00,  1.87it/s]

Average Loss: 0.1721





TEST

In [29]:
@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    all_labels = []
    all_preds = []

    for batch in tqdm(loader):
        labels = batch["labels"].numpy()
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_labels.extend(labels)
        all_preds.extend(preds)

    return all_labels, all_preds


In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_true, y_pred = evaluate(model, val_loader, device)

print("Validation Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred))


100%|██████████| 250/250 [00:56<00:00,  4.45it/s]

Validation Accuracy: 0.9016660403357134

Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.84      0.88      3446
           1       0.89      0.94      0.92      4537

    accuracy                           0.90      7983
   macro avg       0.90      0.89      0.90      7983
weighted avg       0.90      0.90      0.90      7983






In [34]:
val_df = val_df.copy()
val_df["len"] = val_df["text"].apply(len)

val_df["len"].describe()


Unnamed: 0,len
count,7983.0
mean,1395.519354
std,1983.302619
min,1.0
25%,79.0
50%,487.0
75%,2300.0
max,35941.0


In [35]:
def detect_domain(length):
    if length < 150:
        return "GossipCop (Headlines)"
    elif length < 1000:
        return "LIAR (Political Statements)"
    else:
        return "ISOT (News Articles)"

val_df["domain"] = val_df["len"].apply(detect_domain)

val_df["domain"].value_counts()


Unnamed: 0_level_0,count
domain,Unnamed: 1_level_1
ISOT (News Articles),3487
GossipCop (Headlines),3467
LIAR (Political Statements),1029


In [36]:
from sklearn.metrics import accuracy_score, f1_score

def eval_domain(domain_name):
    subset = val_df[val_df["domain"] == domain_name]

    ds = NewsDataset(subset["text"], subset["label"], tokenizer)
    dl = DataLoader(ds, batch_size=32, shuffle=False)

    y_t, y_p = evaluate(model, dl, device)

    acc = accuracy_score(y_t, y_p)
    f1 = f1_score(y_t, y_p, average="macro")

    print(f"\n📌 {domain_name}")
    print("Samples:", len(subset))
    print("Accuracy:", acc)
    print("Macro F1:", f1)

for d in val_df["domain"].unique():
    eval_domain(d)


100%|██████████| 109/109 [00:27<00:00,  3.90it/s]



📌 ISOT (News Articles)
Samples: 3487
Accuracy: 0.9988528821336392
Macro F1: 0.998841004573495


100%|██████████| 109/109 [00:19<00:00,  5.48it/s]



📌 GossipCop (Headlines)
Samples: 3467
Accuracy: 0.7980963368906836
Macro F1: 0.7633181221332713


100%|██████████| 33/33 [00:05<00:00,  5.57it/s]


📌 LIAR (Political Statements)
Samples: 1029
Accuracy: 0.9212827988338192
Macro F1: 0.9072705851201714





In [37]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import os

def compute_ece(y_true, y_prob, n_bins=15):
    conf = y_prob.max(axis=1)
    preds = y_prob.argmax(axis=1)
    correct = (preds == y_true).astype(float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lo, hi = bins[i], bins[i + 1]
        idx = np.where((conf > lo) & (conf <= hi))[0]
        if len(idx) == 0:
            continue
        acc_bin = correct[idx].mean()
        conf_bin = conf[idx].mean()
        ece += (len(idx) / len(y_true)) * abs(acc_bin - conf_bin)
    return float(ece)

def compute_brier(y_true, y_prob):
    p_true = y_prob[np.arange(len(y_true)), y_true]
    return float(np.mean((1.0 - p_true) ** 2))

def compute_overconfidence(y_true, y_prob):
    conf = y_prob.max(axis=1)
    preds = y_prob.argmax(axis=1)
    acc = (preds == y_true).mean()
    return float(conf.mean() - acc)

def save_confusion(cm, out_path):
    plt.figure(figsize=(5,4))
    plt.imshow(cm, cmap="Blues")
    plt.colorbar()
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.savefig(out_path)
    plt.close()

def save_roc(y_true, y_prob, out_path):
    auc = roc_auc_score(y_true, y_prob[:,1])
    fpr, tpr, _ = roc_curve(y_true, y_prob[:,1])
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
    plt.plot([0,1],[0,1],'--')
    plt.legend()
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC Curve")
    plt.savefig(out_path)
    plt.close()
    return auc


In [38]:
@torch.no_grad()
def get_probs(model, loader, device):
    model.eval()
    y_true, y_prob = [], []

    for batch in loader:
        labels = batch["labels"].numpy()
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = model(**batch).logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()

        y_true.extend(labels)
        y_prob.extend(probs)

    return np.array(y_true), np.array(y_prob)

y_true_val, y_prob_val = get_probs(model, val_loader, device)


In [39]:
y_pred_val = y_prob_val.argmax(axis=1)

acc = accuracy_score(y_true_val, y_pred_val)
f1 = f1_score(y_true_val, y_pred_val, average="macro")
bal_acc = balanced_accuracy_score(y_true_val, y_pred_val)

ece = compute_ece(y_true_val, y_prob_val)
brier = compute_brier(y_true_val, y_prob_val)
overconf = compute_overconfidence(y_true_val, y_prob_val)

print("Accuracy:", acc)
print("Macro F1:", f1)
print("Balanced Acc:", bal_acc)
print("ECE:", ece)
print("Brier:", brier)
print("Overconfidence:", overconf)


Accuracy: 0.9016660403357134
Macro F1: 0.8986550688605319
Balanced Acc: 0.8948225213697245
ECE: 0.020666496124719123
Brier: 0.06736675649881363
Overconfidence: 0.017369155968607886


In [40]:
os.makedirs("artifacts", exist_ok=True)

cm = confusion_matrix(y_true_val, y_pred_val)
save_confusion(cm, "artifacts/confusion_matrix.png")

auc = save_roc(y_true_val, y_prob_val, "artifacts/roc_curve.png")

print("AUROC:", auc)
print("Artifacts saved in /artifacts")


AUROC: 0.9718770703409678
Artifacts saved in /artifacts


In [41]:
model.save_pretrained("distilbert_fake_news_model")
tokenizer.save_pretrained("distilbert_fake_news_model")


('distilbert_fake_news_model/tokenizer_config.json',
 'distilbert_fake_news_model/special_tokens_map.json',
 'distilbert_fake_news_model/vocab.txt',
 'distilbert_fake_news_model/added_tokens.json',
 'distilbert_fake_news_model/tokenizer.json')

In [42]:
summary = {
    "accuracy": acc,
    "macro_f1": f1,
    "balanced_accuracy": bal_acc,
    "auroc": auc,
    "ece": ece,
    "brier": brier,
    "overconfidence": overconf
}

import json
with open("artifacts/summary.json", "w") as f:
    json.dump(summary, f, indent=2)
