<a href="https://colab.research.google.com/github/AverageWeebo101/CS20L-12044-Machine-Learning-Project/blob/main/C20MLModelsStuff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Setup
# Installs the Python dependencies and, on first run only, downloads the
# Kaggle "fake-and-real-news-dataset" into ./fake-and-real-news-dataset.

# -q keeps pip's output quiet; no versions are pinned here.
!pip install -q numpy pandas scikit-learn tensorflow transformers matplotlib seaborn kaggle

import os
# Skip the upload/download when the dataset directory already exists.
# The guard uses an absolute /content path while unzip below writes to a
# relative path, so the two only agree when the working directory is /content.
if not os.path.exists('/content/fake-and-real-news-dataset'):
    from google.colab import files
    files.upload()  # Upload kaggle.json here

    # Put the uploaded API token where the kaggle CLI looks for it and
    # restrict it to owner read/write (mode 600).
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    # Fetch the archive, extract it into its own directory, then delete the zip.
    !kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
    !unzip fake-and-real-news-dataset.zip -d fake-and-real-news-dataset
    !rm fake-and-real-news-dataset.zip

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 1.29GB/s]
Archive:  fake-and-real-news-dataset.zip
  inflating: fake-and-real-news-dataset/Fake.csv  
  inflating: fake-and-real-news-dataset/True.csv  


In [None]:
# Preprocessing
# Note: switch the runtime to GPU if possible

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two halves of the dataset and tag every row with its class:
# label 1 = fake news, label 0 = real news.
fake_df = pd.read_csv('/content/fake-and-real-news-dataset/Fake.csv').assign(label=1)
true_df = pd.read_csv('/content/fake-and-real-news-dataset/True.csv').assign(label=0)

# Stack both frames, shuffle reproducibly (seed 42) and renumber rows 0..n-1.
df = (
    pd.concat([fake_df, true_df], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# The text to classify is the title and the body joined by a single space.
df['full_text'] = df['title'] + " " + df['text']

def preprocess(text):
    """Normalize one document for vectorization.

    The input is coerced with ``str()`` and lower-cased, then three regex
    substitutions run in order: every character that is neither a word
    character nor whitespace is removed, runs of digits are removed, and
    runs of whitespace collapse to one space. Leading and trailing
    whitespace is stripped from the result.

    Args:
        text: Any object; non-strings are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    cleaned = str(text).lower()
    # Order matters: whitespace is collapsed last so that gaps left behind by
    # the removed punctuation and digits are squeezed too.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalize every article with preprocess().
df['clean_text'] = df['full_text'].apply(preprocess)

# Keep only rows whose cleaned text has more than 20 words, then renumber
# the rows so the index is 0..n-1 again.
df = df[df['clean_text'].str.split().str.len() > 20].reset_index(drop=True)

# Stratified 80/20 split on the cleaned text. The resulting Series keep the
# row index of df, which is what allows per-row metadata lookups later.
X = df['clean_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Simple length features (characters / words of the cleaned text, characters
# of the raw title), computed with vectorized string accessors.
df['text_length'] = df['clean_text'].str.len()
df['word_count'] = df['clean_text'].str.split().str.len()
df['title_length'] = df['title'].str.len()

df.to_csv('preprocessed_news.csv', index=False)

# Report class counts from the filtered frame so they add up to the total;
# len(fake_df) / len(true_df) would still include the rows dropped above.
fake_count = int((df['label'] == 1).sum())
real_count = int((df['label'] == 0).sum())
print(f"Dataset size: {len(df)} (Fake: {fake_count}, Real: {real_count})")

In [None]:
# Feature Engineering

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, save_npz, load_npz

# Unigram + bigram TF-IDF capped at 5000 features with English stop words
# removed. The vectorizer is fitted on the training split only and then
# applied unchanged to the test split.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

def get_metadata_features(df, indices):
    """Build the scaled length-feature matrix for a set of rows.

    Args:
        df: DataFrame holding ``text_length``, ``word_count`` and
            ``title_length`` columns.
        indices: Index labels of the rows to select (looked up with ``.loc``).

    Returns:
        A 2-D array with one row per selected label and three columns:
        ``text_length / 1000``, ``word_count / 100``, ``title_length / 100``,
        in that order.
    """
    # (column, divisor) pairs; the divisors are fixed rescaling constants.
    scales = (
        ('text_length', 1000),
        ('word_count', 100),
        ('title_length', 100),
    )
    rows = df.loc[indices]
    return np.column_stack([rows[column] / divisor for column, divisor in scales])

# Length features for each split, selected by the row labels carried on the
# X_train / X_test Series.
X_train_meta = get_metadata_features(df, X_train.index)
X_test_meta = get_metadata_features(df, X_test.index)

# Append the three metadata columns to the right of the TF-IDF columns.
X_train_combined = hstack([X_train_tfidf, X_train_meta])
X_test_combined = hstack([X_test_tfidf, X_test_meta])

# Persist the feature matrices (.npz) and label arrays (.npy) to disk.
for npz_path, matrix in (
    ('X_train_combined.npz', X_train_combined),
    ('X_test_combined.npz', X_test_combined),
):
    save_npz(npz_path, matrix)

for npy_path, labels in (
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(npy_path, labels.values)

In [None]:
# OPTIONAL TO RUN
# Builds an alternative input column in which the raw title is repeated twice
# in front of the cleaned text.

df['augmented_text'] = [
    f"{title}. {title}. {clean_text}"
    for title, clean_text in zip(df['title'], df['clean_text'])
]

# To use augmented data:
# X = df['augmented_text']
# X_train, X_test, y_train, y_test = train_test_split(...)

In [None]:
# Logistic Regression

# Fixed: the first import read "rom sklearn.linear_model ...", a SyntaxError
# that stopped the whole cell (and the metric imports below) from running.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Reload the feature matrices and label arrays that were written to disk by
# the feature-engineering step. From here on y_train / y_test are plain
# NumPy arrays rather than pandas Series.
X_train_combined = load_npz('X_train_combined.npz')
X_test_combined = load_npz('X_test_combined.npz')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

# Class-weighted logistic regression on TF-IDF + metadata features.
lr = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42, class_weight='balanced')
lr.fit(X_train_combined, y_train)
lr_pred = lr.predict(X_test_combined)

# Label 0 is "Real" and label 1 is "Fake", matching target_names order.
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, lr_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, lr_pred):.4f}")
print(classification_report(y_test, lr_pred, target_names=['Real', 'Fake']))

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# 150-tree, class-weighted random forest trained on the same combined
# features as the logistic regression, using all available cores.
rf = RandomForestClassifier(
    n_estimators=150,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train_combined, y_train)
rf_pred = rf.predict(X_test_combined)

# Label 0 is "Real" and label 1 is "Fake", matching target_names order.
rf_report = classification_report(y_test, rf_pred, target_names=['Real', 'Fake'])

print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, rf_pred):.4f}")
print(rf_report)

In [None]:
# LSTM

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Enables GPU: if one is visible, turn on memory growth for the first device.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], True)

# Word-level tokenizer capped at 20000 words, fitted on training text only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

# Convert each document to integer ids, then pad/truncate to max_len tokens.
max_len = 256
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Embedding -> bidirectional LSTM -> dense head with a sigmoid output.
# NOTE(review): a non-zero recurrent_dropout may prevent the fast cuDNN LSTM
# kernel from being used on GPU; confirm against the Keras LSTM docs.
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Hold out 10% of the training data for validation; stop after 2 epochs
# without improvement and roll back to the best weights.
early_stopping = EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(
    X_train_pad, y_train,
    epochs=8,
    batch_size=128,
    validation_split=0.1,
    callbacks=[early_stopping]
)

# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions.
lstm_prob = model.predict(X_test_pad)
lstm_pred = (lstm_prob > 0.5).astype(int)

print("\nLSTM Performance:")
print(f"Accuracy: {accuracy_score(y_test, lstm_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, lstm_pred):.4f}")
print(classification_report(y_test, lstm_pred, target_names=['Real', 'Fake']))

In [None]:
# DistilBERT

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Pretrained uncased DistilBERT tokenizer. This rebinds the name `tokenizer`
# that the LSTM section also assigns.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_texts(texts):
    """Encode a collection of texts with the module-level DistilBERT tokenizer.

    Args:
        texts: Object exposing ``.tolist()`` (the callers pass pandas Series).

    Returns:
        The tokenizer output as TensorFlow tensors, truncated to at most
        256 tokens per text and padded.
    """
    encode_options = dict(
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="tf",
    )
    return tokenizer(texts.tolist(), **encode_options)

# Encode both splits up front.
train_encodings = tokenize_texts(X_train)
test_encodings = tokenize_texts(X_test)

# Batches of 16; only the training stream is shuffled (buffer of 1000).
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

# Two-class classification head on top of pretrained DistilBERT. This rebinds
# the name `model` that the LSTM section also assigns.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

optimizer = Adam(learning_rate=5e-5)
# Fixed: use an explicit Keras loss instead of `loss=model.compute_loss`.
# On current transformers/TensorFlow releases that attribute is Keras' own
# Model.compute_loss(x, y, y_pred, ...), not a (labels, logits) loss function,
# so passing it to compile() breaks training. The model emits raw logits for
# integer class labels, hence sparse categorical cross-entropy with
# from_logits=True.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3
)

# Predicted class = index of the larger logit.
bert_pred = np.argmax(model.predict(test_dataset).logits, axis=1)
print("\nDistilBERT Performance:")
print(f"Accuracy: {accuracy_score(y_test, bert_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, bert_pred):.4f}")
print(classification_report(y_test, bert_pred, target_names=['Real', 'Fake']))

In [None]:
# Results and Visualization Part

# Fixed: precision_score and recall_score were used below but never imported
# anywhere in the notebook, so this cell failed with a NameError.
from sklearn.metrics import precision_score, recall_score

# (display name, test-set predictions) for every trained model. The LSTM
# predictions are flattened so all four entries are 1-D label arrays.
models = [
    ('Logistic Regression', lr_pred),
    ('Random Forest', rf_pred),
    ('LSTM', lstm_pred.flatten()),
    ('DistilBERT', bert_pred)
]

# One row of metrics per model.
results = [
    {
        'Model': name,
        'Accuracy': accuracy_score(y_test, pred),
        'F1 Score': f1_score(y_test, pred),
        'Precision': precision_score(y_test, pred),
        'Recall': recall_score(y_test, pred)
    }
    for name, pred in models
]

# Comparison table
results_df = pd.DataFrame(results)
print("\n=== MODEL COMPARISON ===")
print(results_df)

# Plot results: side-by-side bar charts of F1 score and accuracy per model.
# Both panels share the same y-range (0.8 to 1.0) and tick rotation.
comparison_panels = [
    ('F1 Score', 'viridis', 'F1 Score Comparison'),
    ('Accuracy', 'mako', 'Accuracy Comparison'),
]

plt.figure(figsize=(12, 6))
for panel_no, (metric, palette, title) in enumerate(comparison_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.barplot(x='Model', y=metric, data=results_df, palette=palette)
    plt.title(title)
    plt.ylim(0.8, 1.0)
    plt.xticks(rotation=15)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300)
plt.show()

# Keep the metric table next to the figure on disk.
results_df.to_csv('model_results.csv', index=False)

# Confusion matrix: one annotated heatmap per model on a 2x2 grid.
# Rows are the actual class and columns the predicted class, both ordered
# Real (label 0) then Fake (label 1).
class_names = ['Real', 'Fake']

plt.figure(figsize=(15, 10))
for panel_no, (model_name, model_pred) in enumerate(models, start=1):
    plt.subplot(2, 2, panel_no)
    sns.heatmap(
        confusion_matrix(y_test, model_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300)
plt.show()