Imports below:

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Steps 1 & 2: Load and Combine Data

In [3]:
# Both CSV files sit in the same folder, given relative to the working directory
DATA_DIR = "content"
fake_news_path = os.path.join(DATA_DIR, "Fake.csv")
true_news_path = os.path.join(DATA_DIR, "True.csv")

In [5]:
# Read each CSV into its own DataFrame (fake-news file first, true-news file second)
fake_df, true_df = (
    pd.read_csv(csv_path) for csv_path in (fake_news_path, true_news_path)
)

In [None]:
# Tag every row with its class before the frames are merged (0 for fake, 1 for real)
for frame, class_id in ((fake_df, 0), (true_df, 1)):
    frame['label'] = class_id

In [None]:
# Stack the two labelled frames (fake rows first) and renumber the rows from zero
labelled_frames = [fake_df, true_df]
news_df = pd.concat(labelled_frames, ignore_index=True)

Step 3: Data Cleaning

In [None]:
# Remove unnecessary columns: keep only the three that the following cells read
columns_to_keep = ['title', 'text', 'label']
news_subset = news_df[columns_to_keep]

# Handle missing values (if any) - report the per-column counts, then drop incomplete rows
print("Number of missing values before handling:\n", news_subset.isnull().sum())
# dropna() returns a new frame, so nothing is mutated in place on a column-selected
# frame. reset_index(drop=True) restores the contiguous 0..n-1 index that
# concat(ignore_index=True) produced, which dropping rows would otherwise leave gapped.
news_df = news_subset.dropna().reset_index(drop=True)
print("Number of missing values after handling:\n", news_df.isnull().sum())

In [None]:
# Build one text field per article: the headline, a single space, then the body
headline = news_df['title']
body = news_df['text']
news_df['combined_text'] = headline + ' ' + body

# Clean the text data
def clean_text(text):
    """Normalize a string before tokenization.

    The input is lowercased, every character that is neither a word
    character nor whitespace is deleted, and finally all runs of digits are
    deleted. Underscores survive because ``\\w`` matches them, and whitespace
    is left untouched (it is not collapsed or stripped).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Deleting (not replacing with a space) means "well-known" becomes "wellknown".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\d+', '', without_punctuation)

# Normalize every combined article string element by element
news_df['combined_text'] = news_df['combined_text'].map(clean_text)

Step 4: Tokenization

In [None]:
# Upper bound handed to the tokenizer as num_words
MAX_VOCAB_SIZE = 20000
corpus = news_df['combined_text']

# "<unk>" is registered as the out-of-vocabulary (OOV) token
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<unk>")
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. also on rows that
# the later train/test split assigns to the test set.
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

Step 5: Padding

In [None]:
# Fixed length of every sequence after padding (adjust as needed after looking at
# the sequence length distribution)
MAX_SEQUENCE_LENGTH = 200

# 'post' for both options: padding and truncation are applied at the end of a sequence
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

Step 6: Create x and y, then split data

In [None]:
# Features are the padded token-id sequences; targets are the 0/1 labels
x = padded_sequences
y = news_df['label'].values

# 80% training, 20% testing, with a fixed seed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Report the shape of each split
for split_name, split_array in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print("Shape of %s:" % split_name, split_array.shape)


Step 7: Pre-trained Embeddings (GloVe)

In [None]:
EMBEDDING_DIM = 100  # Dimensionality of the GloVe embeddings
GLOVE_DIR = 'glove.6B'  # Folder inside the 'content' directory that holds the extracted GloVe .txt files
# Derive the file name from EMBEDDING_DIM (100 -> 'glove.6B.100d.txt') so the file
# that gets loaded cannot drift from the dimensionality declared above.
glove_filename = 'glove.6B.%dd.txt' % EMBEDDING_DIM
glove_path = os.path.join('content', GLOVE_DIR, glove_filename)
embeddings_index = {}  # token -> float32 coefficient vector

try:
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            # Each line is a token followed by whitespace-separated coefficients
            values = line.split()
            if not values:
                continue  # tolerate blank lines instead of failing on values[0]
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper that ends
    # the whole session, whereas an exception fails only this cell and keeps the
    # variables built by earlier cells.
    raise FileNotFoundError(
        "GloVe file not found at '%s'. Please download 'glove.6B.zip' from "
        "https://nlp.stanford.edu/projects/glove/, extract it, and place '%s' "
        "in a folder named '%s' inside the 'content' directory."
        % (glove_path, glove_filename, GLOVE_DIR)
    ) from err

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Prepare embedding matrix: row r holds the GloVe vector of the token whose
# tokenizer index is r
word_index = tokenizer.word_index
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, row in word_index.items():
    # Only indices below MAX_VOCAB_SIZE have a row in the matrix
    if row < MAX_VOCAB_SIZE:
        vector = embeddings_index.get(token)
        # Tokens missing from the embedding index keep their all-zero row
        if vector is not None:
            embedding_matrix[row] = vector


print("Embedding matrix shape:", embedding_matrix.shape)