In [1]:
# === 1. SETUP ===
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import json
import random
import warnings

# Suppress warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# === 2. DATA LOADING & PREPROCESSING ===
def load_and_preprocess_data():
    true_df = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
    fake_df = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

    true_df['label'] = 0
    fake_df['label'] = 1
    df = pd.concat([true_df, fake_df], ignore_index=True)

    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
        return text

    df['cleaned_text'] = df['text'].apply(clean_text)

    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

df = load_and_preprocess_data()

# === 3. TRAIN-TEST SPLIT ===
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

# === 4. BERT TOKENIZATION ===
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_texts(texts, labels, max_len=128):
    input_ids, attention_masks = [], []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels)

train_inputs, train_masks, train_labels = tokenize_texts(X_train, y_train)
test_inputs, test_masks, test_labels = tokenize_texts(X_test, y_test)

# === 5. DATALOADERS ===
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_masks, test_labels)

train_dataloader = DataLoader(train_dataset, sampler=torch.utils.data.RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=torch.utils.data.SequentialSampler(test_dataset), batch_size=batch_size)

# === 6. MODEL INITIALIZATION ===
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# === 7. TRAINING SETUP ===
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# === 8. TRAINING LOOP ===
def train_model():
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    print(f"Average training loss: {total_loss / len(train_dataloader):.4f}")

# === 9. EVALUATION ===
def evaluate_model(dataloader):
    model.eval()
    predictions, true_labels = [], []

    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs.logits
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    pred_labels = np.argmax(predictions, axis=1)

    from sklearn.metrics import classification_report
    print(classification_report(true_labels, pred_labels))

# === 10. CUSTOM TEST EVALUATION ===
def load_custom_test_data():
    real_samples = [
        "The Prime Minister held a press conference to discuss new economic reforms.",
        ...  # 49 more realistic samples
    ]
    fake_samples = [
        "Aliens spotted participating in the Olympics disguised as athletes.",
        ...  # 49 more fake samples
    ]
    samples = [(text, 0) for text in real_samples] + [(text, 1) for text in fake_samples]
    random.shuffle(samples)

    with open("custom_test_samples.json", "w") as f:
        json.dump(samples, f)

    return samples

def evaluate_custom_samples(samples):
    model.eval()
    correct = 0

    print(f"{'Sample':<3} | {'Prediction':<9} | {'Expected':<8} | {'Correct?':<8} | {'Confidence (Fake%)':<20}")
    print("-" * 80)

    for idx, (text, expected_label) in enumerate(samples):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_id = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_id, attention_mask=attention_mask)

        probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()[0]
        pred_label = int(np.argmax(probs))
        confidence = probs[1] * 100

        is_correct = "Yes" if pred_label == expected_label else "No"
        label_str = lambda x: "Fake" if x == 1 else "Real"

        print(f"{idx+1:<3} | {label_str(pred_label):<9} | {label_str(expected_label):<8} | {is_correct:<8} | {confidence:.2f}%")

        if pred_label == expected_label:
            correct += 1

    print(f"\nAccuracy on custom test set: {correct} / {len(samples)} = {correct / len(samples) * 100:.2f}%")

# === 11. MAIN EXECUTION ===
if __name__ == '__main__':
    train_model()
    print("\nEvaluation on Test Set:")
    evaluate_model(test_dataloader)

    print("\nEvaluation on Custom Sample Set:")
    custom_samples = load_custom_test_data()
    evaluate_custom_samples(custom_samples)


2025-06-10 18:12:04.054133: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749579124.334184      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749579124.400099      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda

Class Distribution:
label
1    23481
0    21417
Name: count, dtype: int64


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average training loss: 0.0168

Evaluation on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284
           1       1.00      1.00      1.00      4696

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980


Evaluation on Custom Sample Set:


TypeError: Object of type ellipsis is not JSON serializable