<a href="https://colab.research.google.com/github/Amit-sheikh/Amit-sheikh/blob/main/ml_dl_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================================
# 🧠 Fake News Detection – Comparative Pipeline (5 Models)
# Models: Logistic Regression | Naive Bayes | LSTM | RoBERTa | BERT
# ===============================================================

# Notebook-only line: the leading "!" is an IPython/Colab shell escape, so this
# is not valid syntax when the file is run as a plain Python script.
# NOTE(review): keras is imported further down but is not in this install list —
# presumably the runtime already provides it; confirm.
!pip install transformers datasets torch scikit-learn pandas numpy tqdm --quiet

import zipfile, os, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# === 1️⃣ Extract Dataset ===
zip_path = "News-_dataset.zip"
extract_dir = "news_dataset_extracted"

# Unpack the archive only when the target directory is absent; an existing
# directory is reused without touching the zip file.
already_extracted = os.path.exists(extract_dir)
if not already_extracted:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)
    print("✅ Dataset extracted!")

# === 2️⃣ Load CSVs ===
# Both files are parsed with the same options; rows that cannot be parsed are skipped.
csv_options = dict(on_bad_lines='skip', encoding='utf-8')
fake = pd.read_csv(os.path.join(extract_dir, "Fake.csv"), **csv_options)
true = pd.read_csv(os.path.join(extract_dir, "True.csv"), **csv_options)

def find_text_col(df):
    """Return the name of the column that most likely holds free-form text.

    Columns are scanned left to right; the first string-like column whose first
    non-missing value is longer than 20 characters wins.

    Args:
        df: DataFrame to inspect.

    Returns:
        The label of the first matching column, or the label of the first
        column when no column matches (including when ``df`` has no rows).

    Raises:
        IndexError: If ``df`` has no columns at all.
    """
    for col in df.columns:
        # Accept the legacy ``object`` dtype as well as pandas' dedicated
        # string dtypes.
        if not pd.api.types.is_string_dtype(df[col].dtype):
            continue
        # Probe the first non-missing value so that an empty frame does not
        # raise and a leading NaN does not hide an otherwise suitable column.
        values = df[col].dropna()
        if not values.empty and len(str(values.iloc[0])) > 20:
            return col
    return df.columns[0]

fake_col, true_col = find_text_col(fake), find_text_col(true)

# Reduce each source to two columns: the article text as str plus a constant
# class label (0 for the fake file, 1 for the true file).
fake_df = fake[fake_col].astype(str).to_frame(name='text').assign(label=0)
true_df = true[true_col].astype(str).to_frame(name='text').assign(label=1)

# Stack both classes with a fresh index, then shuffle the rows reproducibly.
combined = pd.concat([fake_df, true_df], ignore_index=True)
df = combined.sample(frac=1, random_state=42)

print(f"📊 Total samples: {len(df)}")

# === Split ===
# Hold out 20% for testing, stratified on the label column.
features, targets = df['text'], df['label']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    features, targets, test_size=0.2, stratify=targets, random_state=42
)

# ===============================================================
# 1️⃣ Logistic Regression (TF-IDF)
# ===============================================================
# TF-IDF features with English stop words removed and the vocabulary capped at
# 10,000 terms; the vocabulary is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# ``fit`` returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, train_labels)
pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_true=test_labels, y_pred=pred_lr)
print(f"🔹 Logistic Regression Accuracy: {acc_lr:.4f}")

# ===============================================================
# 2️⃣ Naive Bayes
# ===============================================================
# Reuses the TF-IDF matrices built for the logistic-regression model.
nb = MultinomialNB().fit(X_train, train_labels)
pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_true=test_labels, y_pred=pred_nb)
print(f"🔹 Naive Bayes Accuracy: {acc_nb:.4f}")

# ===============================================================
# 3️⃣ LSTM (Deep Learning)
# ===============================================================
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Vocabulary size for the word index and the fixed length of every padded sequence.
max_words = 10000
max_len = 200

# The word index is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)


def _to_padded(texts):
    """Encode texts as integer sequences and pad them to ``max_len``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)


X_train_seq = _to_padded(train_texts)
X_test_seq = _to_padded(test_texts)

# Embedding -> bidirectional LSTM -> dense head ending in one sigmoid unit.
model_lstm = Sequential()
model_lstm.add(Embedding(max_words, 128, input_length=max_len))
model_lstm.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Labels are handed to Keras as plain NumPy arrays.
y_train_arr = np.array(train_labels)
y_test_arr = np.array(test_labels)
model_lstm.fit(
    X_train_seq, y_train_arr,
    epochs=2, batch_size=64, validation_split=0.2, verbose=1,
)
loss, acc_lstm = model_lstm.evaluate(X_test_seq, y_test_arr, verbose=0)
print(f"🔹 LSTM Accuracy: {acc_lstm:.4f}")

# ===============================================================
# 4️⃣ RoBERTa-base (Transformers Fine-Tuning)
# ===============================================================
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')

# Both splits are encoded with the same settings: truncation enabled, padding
# enabled, and a maximum length of 128.
roberta_tok_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer_roberta(train_texts.tolist(), **roberta_tok_kwargs)
test_encodings = tokenizer_roberta(test_texts.tolist(), **roberta_tok_kwargs)

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-sample sequence of
            values; each value is indexed by sample position.
        labels: Per-sample labels. Either a pandas Series (read by position)
            or any plain sequence such as a list, tuple or NumPy array.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors with a ``labels`` entry."""
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # A Series may carry non-contiguous index labels (e.g. after a shuffle),
        # so it is read by position; other sequences are already positional.
        labels = self.labels
        label = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        item['labels'] = torch.tensor(int(label))
        return item

    def __len__(self):
        """Return the number of samples, taken from the labels container."""
        return len(self.labels)

import inspect

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# transformers renamed `evaluation_strategy` to `eval_strategy` and the
# Trainer's `tokenizer` argument to `processing_class`. Pick whichever spelling
# the installed version accepts so the cell runs on old and new releases alike.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args_roberta = TrainingArguments(
    output_dir='./roberta_results', num_train_epochs=1,
    per_device_train_batch_size=8, per_device_eval_batch_size=8, learning_rate=2e-5,
    **{_eval_kw: "epoch"},
)


def _roberta_accuracy(eval_pred):
    """Convert the Trainer's logits and gold labels into an accuracy metric."""
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


# Without `compute_metrics` the Trainer reports only the loss, so the
# "eval_accuracy" key read below would not exist.
trainer_roberta = Trainer(
    model=model_roberta, args=args_roberta,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=_roberta_accuracy,
    **{_tok_kw: tokenizer_roberta},
)
trainer_roberta.train()
metrics_roberta = trainer_roberta.evaluate()
# The Trainer prefixes every metric returned by `compute_metrics` with "eval_".
acc_roberta = metrics_roberta["eval_accuracy"]
print(f"🔹 RoBERTa-base Accuracy: {acc_roberta:.4f}")

# ===============================================================
# 5️⃣ BERT-base (Transformers Fine-Tuning)
# ===============================================================
import inspect

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer_bert(train_texts.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer_bert(test_texts.tolist(), truncation=True, padding=True, max_length=128)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# transformers renamed `evaluation_strategy` to `eval_strategy` and the
# Trainer's `tokenizer` argument to `processing_class`. Pick whichever spelling
# the installed version accepts so the cell runs on old and new releases alike.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args_bert = TrainingArguments(
    output_dir='./bert_results', num_train_epochs=1,
    per_device_train_batch_size=8, per_device_eval_batch_size=8, learning_rate=2e-5,
    **{_eval_kw: "epoch"},
)


def _bert_accuracy(eval_pred):
    """Convert the Trainer's logits and gold labels into an accuracy metric."""
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


# Without `compute_metrics` the Trainer reports only the loss, so the
# "eval_accuracy" key read below would not exist.
trainer_bert = Trainer(
    model=model_bert, args=args_bert,
    train_dataset=train_dataset, eval_dataset=test_dataset,
    compute_metrics=_bert_accuracy,
    **{_tok_kw: tokenizer_bert},
)
trainer_bert.train()
metrics_bert = trainer_bert.evaluate()
# The Trainer prefixes every metric returned by `compute_metrics` with "eval_".
acc_bert = metrics_bert["eval_accuracy"]
print(f"🔹 BERT-base Accuracy: {acc_bert:.4f}")

# ===============================================================
# 📊 Summary Table
# ===============================================================
# Collect each model's test accuracy under its display name, in run order.
results = dict(zip(
    ("Logistic Regression", "Naive Bayes", "LSTM", "RoBERTa-base", "BERT-base"),
    (acc_lr, acc_nb, acc_lstm, acc_roberta, acc_bert),
))

print("\n📈 Model Performance Summary:")
# Names are padded to 20 characters so the accuracy column lines up.
for model_name, accuracy in results.items():
    print(f"{model_name:20s}: {accuracy:.4f}")
