Imports below:

In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jekyt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Step 1 & 2: Load and Combine Data

In [2]:
# Set random seed for reproducibility
import random
random.seed(42)
np.random.seed(42)

In [3]:
# Files are stored in the 'content' folder.
fake_path = os.path.join("content", "Fake.csv")
true_path = os.path.join("content", "True.csv")

In [4]:
fake_df = pd.read_csv(fake_path)
true_df = pd.read_csv(true_path)

# Add a label column: 0 for fake, 1 for true
fake_df['label'] = 0
true_df['label'] = 1

In [5]:
# Concatenate the DataFrames
df = pd.concat([fake_df, true_df], ignore_index=True)

# Keep only the necessary columns: title, text, and label.
df = df[['title', 'text', 'label']]

3: Data Cleaning and Preprocessing

In [6]:
# Check and drop missing values
print("Missing values before:", df.isnull().sum())
df.dropna(inplace=True)
print("Missing values after:", df.isnull().sum())

Missing values before: title    0
text     0
label    0
dtype: int64
Missing values after: title    0
text     0
label    0
dtype: int64


In [7]:
# Create a new column 'combined_text' that concatenates the title and text.
df['combined_text'] = df['title'] + ' ' + df['text']

In [8]:
# --- Text Cleaning Function ---
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['combined_text'] = df['combined_text'].apply(clean_text)

Step 4: Tokenization

In [9]:
# We use a simple whitespace split. (Alternatively, nltk.word_tokenize can be used)
def tokenize(text):
    return text.split()

df['tokens'] = df['combined_text'].apply(tokenize)

Build Vocabulary

In [10]:
MAX_VOCAB_SIZE = 20000  # Limit vocabulary size
counter = Counter([word for tokens in df['tokens'] for word in tokens])
most_common = counter.most_common(MAX_VOCAB_SIZE - 2)  # Reserve indices for <PAD> and <UNK>

5: Create a word-to-index mapping. Reserve index 0 for padding, 1 for OOV.

In [11]:
word2idx = {"<PAD>": 0, "<UNK>": 1}
for idx, (word, _) in enumerate(most_common, start=2):
    word2idx[word] = idx

Convert Tokens to Sequences

In [12]:
def tokens_to_sequence(tokens, word2idx):
    return [word2idx.get(word, word2idx["<UNK>"]) for word in tokens]

df['sequence'] = df['tokens'].apply(lambda x: tokens_to_sequence(x, word2idx))


Padding Sequences

In [13]:
MAX_SEQUENCE_LENGTH = 200  # Adjust this value based on distribution or experimentation

def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))
    else:
        return seq[:max_len]

df['padded_seq'] = df['sequence'].apply(lambda x: pad_sequence(x, MAX_SEQUENCE_LENGTH))

In [14]:
# Prepare features and labels
X = np.array(df['padded_seq'].tolist())
y = df['label'].values

print("Feature shape:", X.shape)
print("Labels shape:", y.shape)

Feature shape: (44898, 200)
Labels shape: (44898,)


Step 6: Pre-trained Embeddings (GloVe)

In [20]:
# Prepare embedding matrix
EMBEDDING_DIM = 100  # Using 100d embeddings
GLOVE_DIR = os.path.join("content", "glove.6B")  # Ensure 'glove.6B.100d.txt' is in this folder

embeddings_index = {}

glove_file = os.path.join(GLOVE_DIR, "glove.6B.100d.txt")
try:
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    print("Found %d word vectors in GloVe." % len(embeddings_index))
except FileNotFoundError:
    print("GloVe file not found. Please download it and place it in:", GLOVE_DIR)
    exit()

# --- Create Embedding Matrix ---
vocab_size = len(word2idx)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, idx in word2idx.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector
    # else, the vector remains as zeros (or you can randomize)

print("Embedding matrix shape:", embedding_matrix.shape)

Found 400000 word vectors in GloVe.
Embedding matrix shape: (20000, 100)


In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

Create a Custom Dataset

In [22]:
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

Split into training and testing sets

In [23]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)

BATCH_SIZE = 128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

Define the LSTM Model

In [24]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix, trainable=False):
        super(FakeNewsClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Load pre-trained embeddings
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = trainable  # Freeze or fine-tune embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=0.2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: [batch_size, sequence_length]
        embedded = self.embedding(x)  # [batch_size, sequence_length, embedding_dim]
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use the last hidden state as representation
        hidden = self.dropout(hidden[-1])  # [batch_size, hidden_dim]
        out = self.fc(hidden)
        return self.sigmoid(out)

HIDDEN_DIM = 128
OUTPUT_DIM = 1

model = FakeNewsClassifier(vocab_size=vocab_size,
                           embedding_dim=EMBEDDING_DIM,
                           hidden_dim=HIDDEN_DIM,
                           output_dim=OUTPUT_DIM,
                           embedding_matrix=embedding_matrix,
                           trainable=False)

print(model)

FakeNewsClassifier(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


