In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the two ISOT CSV files and tag every row with its class:
# label 0 = fake article, label 1 = true article.
fake_df = pd.read_csv('isot/Fake.csv').assign(label=0)
true_df = pd.read_csv('isot/True.csv').assign(label=1)

# Stack both frames, then shuffle with a fixed seed so the row order is
# reproducible; the index is rebuilt so it runs 0..n-1 after shuffling.
isot_df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# The 'text' column is the news content used as model input further down.
# NOTE(review): in the head() preview the politicsNews rows open with a
# "(Reuters)" dateline while the label-0 rows do not -- possible source
# leakage into the label; confirm on the full dataset.
print(isot_df['label'].value_counts())
print(isot_df.head())

label
0    23481
1    21417
Name: count, dtype: int64
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4       June 24, 2016       

In [3]:
from sklearn.model_selection import train_test_split

# Features are the raw article text, targets are the 0/1 labels.
X, y = isot_df['text'], isot_df['label']

# First cut: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: halve the held-out part into validation and test splits,
# again stratified, so each ends up with 15% of the original rows.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print(
    "Train:", len(X_train),
    "Valid:", len(X_valid),
    "Test:", len(X_test),
)

Train: 31428 Valid: 6735 Test: 6735


In [4]:
# Use your preprocess function from LIAR-2
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources needed by the stop-word list and the WordNet
# lemmatizer (only needs to succeed once per environment).
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared helpers used by preprocess(): a lemmatizer instance and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise one document for bag-of-words features.

    Steps: lower-case and trim the text, delete every character that is not
    ``a-z`` or whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, lemmatise the remaining tokens with the
    module-level ``lemmatizer`` and re-join them with single spaces.

    Args:
        text: Raw document. Non-string values are converted with ``str``;
            a missing scalar (``None``, ``NaN``, ``pd.NA``) is treated as an
            empty document.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when the
        input is missing or no token survives the cleaning.
    """
    # str(NaN) is the word "nan", which would survive the regex below and
    # enter the vocabulary as a real token; map missing values to '' instead.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    cleaned = re.sub(r'[^a-z\s]', '', str(text).lower().strip())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    )

# Run the same text cleaning over every split, one document at a time.
X_train_prep, X_valid_prep, X_test_prep = (
    split.apply(preprocess) for split in (X_train, X_valid, X_test)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with the feature count capped at 5000. The
# vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the validation and test splits.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train_prep)
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(split) for split in (X_valid_prep, X_test_prep)
)

SVM

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

import joblib
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF features feeding a linear SVM with balanced class weights.
# NOTE(review): the pipeline is fitted on the raw X_train text with a default
# TfidfVectorizer; the preprocessed splits (X_*_prep) and the TF-IDF matrices
# (X_*_tfidf) are not used here -- confirm that this is intended.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(class_weight='balanced', random_state=42))
])

pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
joblib.dump(pipeline, 'svm_model.pkl')

# Evaluation
# Predict each split once and reuse the result for both the accuracy and the
# per-class report instead of running the pipeline twice per split.
svm_valid_preds = pipeline.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, svm_valid_preds))
print(classification_report(y_valid, svm_valid_preds, target_names=['fake', 'real']))

svm_test_preds = pipeline.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, svm_test_preds))
print(classification_report(y_test, svm_test_preds, target_names=['fake', 'real']))


Validation Accuracy: 0.9946547884187082
              precision    recall  f1-score   support

        fake       1.00      0.99      0.99      3522
        real       0.99      1.00      0.99      3213

    accuracy                           0.99      6735
   macro avg       0.99      0.99      0.99      6735
weighted avg       0.99      0.99      0.99      6735

Test Accuracy: 0.9951002227171493
              precision    recall  f1-score   support

        fake       0.99      1.00      1.00      3523
        real       1.00      0.99      0.99      3212

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735



BERT

In [14]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# ---- Step 1: Dataset Class ----
class NewsDataset(Dataset):
    """Dataset pairing tokenizer output with per-example labels.

    All texts are tokenized once in the constructor. Each item is a dict of
    tensors: one entry per field returned by the tokenizer, plus a ``labels``
    entry holding that example's label.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Tokenize ``texts`` up front and keep ``labels`` alongside.

        Args:
            texts: Collection of raw strings handed to ``tokenizer`` in one call.
            labels: Labels indexable in parallel with ``texts``.
            tokenizer: Callable invoked with ``truncation=True``,
                ``padding=True`` and ``max_length=max_len``; its result must
                offer ``.items()`` yielding per-field sequences that can be
                indexed by example position.
            max_len: Value forwarded as ``max_length`` to the tokenizer.
        """
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            max_length=max_len,
            padding=True,
            truncation=True,
        )

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {}
        for field, column in self.encodings.items():
            item[field] = torch.tensor(column[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# ---- Step 2: Metrics ----
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into a dict of evaluation scores.

    The predicted class of each example is the argmax of ``logits`` along
    axis 1. Accuracy is reported as-is; F1, precision and recall are computed
    with ``average='weighted'``.

    Args:
        eval_pred: Two-element iterable ``(logits, labels)``; ``logits`` must
            provide ``argmax(axis=1)``.

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    scores = {'accuracy': accuracy_score(labels, preds)}
    weighted_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        scores[metric_name] = scorer(labels, preds, average='weighted')
    return scores

# ---- Step 3: Tokenizer & Dataset ----
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert every split from pandas Series to plain Python lists; the raw
# (un-preprocessed) article text is what gets tokenized for BERT.
train_texts, train_labels = X_train.tolist(), y_train.tolist()
valid_texts, valid_labels = X_valid.tolist(), y_valid.tolist()
test_texts, test_labels = X_test.tolist(), y_test.tolist()

# One NewsDataset per split, all sharing the same tokenizer and the
# dataset's default max_len.
train_dataset, valid_dataset, test_dataset = (
    NewsDataset(texts, labels, tokenizer)
    for texts, labels in (
        (train_texts, train_labels),
        (valid_texts, valid_labels),
        (test_texts, test_labels),
    )
)

# ---- Step 4: Model ----
# Pretrained BERT encoder with a two-class classification head
# (labels 0 and 1).
model_bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# ---- Step 5: Training Arguments ----
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./bert_isot_results',
    logging_dir='./logs',
    # schedule: two passes over the training set, batches of 8 per device
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # report the training loss every 10 steps
    logging_steps=10,
    # checkpoint handling
    save_total_limit=1,
    load_best_model_at_end=False,
)

# ---- Step 6: Trainer ----
# The validation split is the evaluation set; compute_metrics supplies the
# accuracy / F1 / precision / recall values reported by evaluate().
trainer = Trainer(
    model=model_bert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# ---- Step 7: Train & Evaluate ----
trainer.train()

# Validation performance (evaluate() runs on the trainer's eval_dataset)
eval_results = trainer.evaluate()
print("📊 BERT Validation Accuracy:", eval_results['eval_accuracy'])

# ---- Step 8: Test Evaluation ----
# Predicted class = index of the largest logit for each test example.
test_results = trainer.predict(test_dataset)
test_preds = test_results.predictions.argmax(axis=1)

print("BERT Test Accuracy:", accuracy_score(test_labels, test_preds))
# target_names is matched to the label values in sorted order, so the first
# name must describe label 0 (fake) and the second label 1 (real). The
# previous ['Real', 'Fake'] order swapped the class names in the report.
print(classification_report(test_labels, test_preds, target_names=['Fake', 'Real']))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.5052
20,0.1476
30,0.0815
40,0.005
50,0.0018
60,0.0009
70,0.0778
80,0.0021
90,0.0007
100,0.0004


📊 BERT Validation Accuracy: 0.9998515219005196
BERT Test Accuracy: 0.9992576095025983
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00      3523
        Fake       1.00      1.00      1.00      3212

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735



In [15]:
# Write the fine-tuned ISOT BERT model first, then its tokenizer, into the
# same output directory.
for _artifact in (model_bert, tokenizer):
    _artifact.save_pretrained("bert_isot_model")
print("✅ BERT model and tokenizer saved to 'bert_isot_model'")

✅ BERT model and tokenizer saved to 'bert_isot_model'
