Imports below:

In [5]:
# Imports
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')

ImportError: cannot import name 'clip_to_image_size' from 'keras.src.layers.preprocessing.image_preprocessing.bounding_boxes.converters' (C:\Users\jekyt\anaconda3\Lib\site-packages\keras\src\layers\preprocessing\image_preprocessing\bounding_boxes\converters.py)

Step 1 & 2: Load and Combine Data

In [4]:
# Pin both global random number generators (NumPy's and Python's) to the same
# fixed seed so that repeated runs of the notebook give the same results.
import random
np.random.seed(42)
random.seed(42)

In [6]:
# --- Locate the datasets ---
# Both CSV files are expected inside the 'content' folder.
fake_path, true_path = (
    os.path.join("content", csv_name) for csv_name in ("Fake.csv", "True.csv")
)

In [8]:
# Read both datasets, then tag every row with its class label.
fake_df, true_df = pd.read_csv(fake_path), pd.read_csv(true_path)

# Label convention: 0 = fake article, 1 = true article
for frame, label_value in ((fake_df, 0), (true_df, 1)):
    frame['label'] = label_value

In [10]:
# Stack the fake and true articles into one frame with a fresh 0..n-1 index,
# retaining only the columns used downstream: title, text and label.
df = pd.concat(
    [fake_df, true_df],
    ignore_index=True,
)[['title', 'text', 'label']]

Step 3: Data Cleaning and Preprocessing

In [12]:
# Create a new column 'combined_text' holding "<title> <text>" for each article.
# Missing titles/texts are replaced with empty strings first: with plain `+`,
# a NaN in either column would turn the whole combined value into NaN (a float
# rather than a string), which breaks string processing further down.
df['combined_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

Number of missing values before handling:
 title    0
text     0
label    0
dtype: int64
Number of missing values after handling:
 title    0
text     0
label    0
dtype: int64


In [14]:
# --- Text Cleaning Function ---
def clean_text(text):
    """Normalize raw article text before tokenization.

    The steps are applied in this order:
      1. lowercase everything;
      2. delete punctuation and special characters, underscores included
         (``\\w`` matches ``_``, so it has to be removed explicitly);
      3. delete digits;
      4. collapse every run of whitespace into one space and trim the ends.

    Args:
        text: The raw string to clean. Any non-string value (e.g. ``None`` or
            the float NaN that pandas uses for missing cells) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``''`` when the input is not a string or nothing
        survives the cleaning.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation, special characters and '_'
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

# Clean every article's combined title + text, element by element.
df['combined_text'] = df['combined_text'].map(clean_text)

Step 4: Tokenization

In [15]:
# Tokenization here is a plain whitespace split; nltk.word_tokenize would be
# a drop-in alternative.
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of words."""
    words = text.split()
    return words

# One list of word tokens per article.
df['tokens'] = df['combined_text'].map(tokenize)

Build Vocabulary

In [None]:
MAX_VOCAB_SIZE = 20000  # Limit vocabulary size (this total includes <PAD> and <UNK>)
# Feed the tokens to Counter through a generator so they are counted as they
# stream by; a list comprehension would first materialize one flat list with
# every token of the whole corpus just to throw it away again.
counter = Counter(word for tokens in df['tokens'] for word in tokens)
most_common = counter.most_common(MAX_VOCAB_SIZE - 2)  # Reserve indices for <PAD> and <UNK>

Step 5: Create a word-to-index mapping. Reserve index 0 for padding (<PAD>) and index 1 for out-of-vocabulary words (<UNK>).

In [17]:
# Indices 0 and 1 are reserved for the padding and unknown-word markers; the
# entries of `most_common` are numbered from 2 upward in the order they appear.
word2idx = {"<PAD>": 0, "<UNK>": 1}
word2idx.update(
    (word, position) for position, (word, _count) in enumerate(most_common, start=2)
)

Convert Tokens to Sequences

In [18]:
def tokens_to_sequence(tokens, word2idx):
    """Translate a list of word tokens into a list of integer indices.

    Each token is looked up in ``word2idx``; tokens that are absent from the
    mapping receive the index stored under the ``"<UNK>"`` key.
    """
    sequence = []
    for token in tokens:
        sequence.append(word2idx.get(token, word2idx["<UNK>"]))
    return sequence

# Encode each article's token list as a list of vocabulary indices.
df['sequence'] = df['tokens'].map(
    lambda token_list: tokens_to_sequence(token_list, word2idx)
)


Shape of x_train: (35918, 200)
Shape of y_train: (35918,)
Shape of x_test: (8980, 200)
Shape of y_test: (8980,)


Padding Sequences

In [None]:
MAX_SEQUENCE_LENGTH = 200  # Fixed sequence length; tune from the length distribution or by experiment

def pad_sequence(seq, max_len):
    """Return ``seq`` forced to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` get zeros appended at the end; all
    others are cut down to their first ``max_len`` items.
    """
    shortfall = max_len - len(seq)
    if shortfall <= 0:
        # Already long enough: keep only the leading max_len items.
        return seq[:max_len]
    return seq + [0] * shortfall

# Bring every index sequence to the common length MAX_SEQUENCE_LENGTH.
df['padded_seq'] = df['sequence'].map(
    lambda index_seq: pad_sequence(index_seq, MAX_SEQUENCE_LENGTH)
)

In [None]:
# Turn the per-row padded lists into one NumPy array and pull out the labels.
y = df['label'].values
X = np.asarray(df['padded_seq'].tolist())

# Report the dimensions of both arrays as a quick sanity check.
for caption, array in (("Feature shape:", X), ("Labels shape:", y)):
    print(caption, array.shape)

Step 6: Pre-trained Embeddings (GloVe)

In [1]:
EMBEDDING_DIM = 100  # Dimensionality of the GloVe embeddings
GLOVE_DIR = 'glove.6B'  # Folder inside the 'content' directory that holds the extracted GloVe files
# The file name is derived from EMBEDDING_DIM so the two cannot drift apart
# (EMBEDDING_DIM = 100 -> 'glove.6B.100d.txt').
glove_path = os.path.join('content', GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM)
embeddings_index = {}  # Maps each word in the file to its float32 coefficient vector

try:
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # Skip blank lines instead of failing on values[0]
            # First field is the word, the remaining fields are its coefficients.
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() targets the
    # kernel itself, whereas an exception stops this cell with a visible,
    # actionable error and leaves the session usable.
    raise FileNotFoundError(
        "GloVe file not found at '%s'. Please download 'glove.6B.zip' from "
        "https://nlp.stanford.edu/projects/glove/, extract it, and place '%s' "
        "in a folder named '%s' inside the 'content' directory."
        % (glove_path, os.path.basename(glove_path), GLOVE_DIR)
    ) from err

print('Found %s word vectors.' % len(embeddings_index))

NameError: name 'os' is not defined

In [20]:
# Prepare embedding matrix
# Row i must hold the vector of the word whose index is i in `word2idx`, the
# mapping built earlier in this notebook with <PAD> = 0 and <UNK> = 1.
# No `tokenizer` object is created anywhere in the notebook, so the word index
# has to come from `word2idx`.
word_index = word2idx
# word2idx indices run from 0 to len(word2idx) - 1, so the matrix needs exactly
# one row per entry (no "+ 1" as with 1-based tokenizer indices).
num_words = min(MAX_VOCAB_SIZE, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= num_words:
        continue  # Index falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros.
        embedding_matrix[i] = embedding_vector


print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (20000, 100)
