# Loading the Datasets

We need to load both CSV files and label them appropriately (e.g., 1 for fake news and 0 for real news).

In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds Fake.csv and True.csv. It defaults to the original
# location; set the FAKE_NEWS_DATA_DIR environment variable to run the
# notebook on another machine without editing the code.
DATA_DIR = Path(os.environ.get('FAKE_NEWS_DATA_DIR', '/Users/sora/Downloads'))

# Load the two datasets
fake_df = pd.read_csv(DATA_DIR / 'Fake.csv')
true_df = pd.read_csv(DATA_DIR / 'True.csv')

# Preview the datasets
print(fake_df.head())
print(true_df.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

Both datasets contain a `text` column holding the article body, so we can now add a label to each of them.

# Labeling the Datasets

Add a label column to each dataset to differentiate fake and real news. For example, 1 for fake news and 0 for real news.

In [2]:
# Tag every row with its class before the two sources are merged:
# fake articles get 1, real articles get 0.
for frame, class_id in ((fake_df, 1), (true_df, 0)):
    frame['label'] = class_id

# Combining the Datasets

Once labeled, we can combine both datasets into a single DataFrame.

In [3]:
# Combine the datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the dataset to mix fake and real news.
# The seed is fixed so that re-running the notebook from a fresh kernel gives
# the same row order; without it every later step would see different data
# on each run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the structure of the combined dataset
print(df.head())
print(df['label'].value_counts())  # Check the distribution of fake and real news

                                               title  \
0   While We Were Sleeping, Trump Declared Civil War   
1  Iraqi forces complete Kirkuk province takeover...   
2   Mike Pence’s DC Neighbors Just Invited Him To...   
3  UK counter-terrorism police arrest 11 in far-r...   
4  TRUMP CHALLENGES FAKE MEDIA: “Are we going to ...   

                                                text    subject  \
0  I woke up this morning completely unfazed over...       News   
1  BAGHDAD/ERBIL, Iraq (Reuters) - Iraqi forces o...  worldnews   
2  A group of Vice President-elect Mike Pence s n...       News   
3  LONDON (Reuters) - British police said 11 peop...  worldnews   
4  You have to give it to President Trump who wen...  left-news   

                  date  label  
0     January 25, 2017      1  
1    October 20, 2017       0  
2    December 13, 2016      1  
3  September 27, 2017       0  
4         Aug 15, 2017      1  
label
1    23481
0    21417
Name: count, dtype: int64


# Cleaning the Dataset

Now that the data is combined, we can proceed with cleaning by removing missing values and any irrelevant columns (if present).

In [4]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values in the 'text' column
df = df.dropna(subset=['text'])

# Drop the columns that are not needed further on. errors='ignore' skips any
# column that is already absent, so the cell can be re-run safely and does not
# fail when only one of the two columns is present.
df = df.drop(columns=['title', 'date'], errors='ignore')

# Final structure check
print(df.head())

title      0
text       0
subject    0
date       0
label      0
dtype: int64
                                                text    subject  label
0  I woke up this morning completely unfazed over...       News      1
1  BAGHDAD/ERBIL, Iraq (Reuters) - Iraqi forces o...  worldnews      0
2  A group of Vice President-elect Mike Pence s n...       News      1
3  LONDON (Reuters) - British police said 11 peop...  worldnews      0
4  You have to give it to President Trump who wen...  left-news      1


# Preprocessing the Text Data

Next, we’ll preprocess the text data in three steps: normalization, tokenization, and padding.

## a. Preprocessing Text

Normalize the text by removing punctuation, stopwords, and converting everything to lowercase.

In [5]:
import string
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available, then keep the English
# entries in a set so membership checks during cleaning are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Translation table that deletes every character in string.punctuation.
# Built once so each call strips punctuation with a single str.translate pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Normalize one article: lowercase, strip punctuation, drop stopwords.

    Args:
        text: The raw article text. ``None`` and float NaN (a missing cell)
            are treated as an empty article.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text with the remaining words joined by single spaces.
    """
    # A missing cell has no text to clean; return an empty string instead of
    # failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Lowercase, then delete punctuation characters (deleted, not replaced by
    # spaces, so "a/b" becomes "ab").
    text = text.lower().translate(_PUNCT_TABLE)

    # NOTE(review): punctuation is removed before the stopword filter, so any
    # stopword entry that itself contains punctuation can never match here.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run the cleaner over every article and store the result back in 'text'
cleaned_text = df['text'].map(preprocess_text)
df['text'] = cleaned_text

# Show the first few cleaned articles as a sanity check
print(df['text'].head())

[nltk_data] Downloading package stopwords to /Users/sora/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    woke morning completely unfazed idea donald tr...
1    baghdaderbil iraq reuters iraqi forces friday ...
2    group vice presidentelect mike pence new washi...
3    london reuters british police said 11 people a...
4    give president trump went fake news media big ...
Name: text, dtype: object


## b. Tokenization and Padding

Tokenize the preprocessed text and pad the sequences to ensure all articles are of the same length for model training. 

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer settings: vocabulary cap and the fixed per-article sequence length
vocab_size = 10000
maxlen = 500

# Build the word index from the cleaned articles.
# NOTE(review): the tokenizer is fitted on every row of df, including rows
# that end up in the test split later on -- confirm this is acceptable.
corpus = df['text'].values
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(corpus)

# Turn each article into its sequence of integer word indices
sequences = tokenizer.texts_to_sequences(corpus)

# Bring every sequence to the same length so they stack into one 2-D array
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

print(f"Padded sequences shape: {padded_sequences.shape}")

Padded sequences shape: (44898, 500)


# Preparing the Labels

You can extract the labels into a numpy array for model training. 

In [7]:
import numpy as np

# Pull the target column out of the DataFrame as a plain array
labels = df['label'].to_numpy()
print(f"Labels shape: {labels.shape}")

Labels shape: (44898,)


# Splitting the Dataset for Training and Testing

Next, split the data into training and test sets to evaluate the model later. 

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffling
# done by the split itself.
split = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split

# Report the resulting array shapes
print(f"Training data shape: {x_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test data shape: {x_test.shape}, Test labels shape: {y_test.shape}")

Training data shape: (35918, 500), Training labels shape: (35918,)
Test data shape: (8980, 500), Test labels shape: (8980,)


# Saving the Preprocessed Dataset

Finally, save the cleaned and preprocessed dataset for later use in model training.

In [9]:
import pickle

# Save the preprocessed dataset as a binary file
with open('fake_news_preprocessed_dataset.pkl', 'wb') as f:
    pickle.dump((x_train, y_train, x_test, y_test), f)

# Also persist the fitted tokenizer in its own file. The arrays above only
# hold integer word indices; without the tokenizer's word index, new text
# cannot be encoded the same way later. The dataset pickle keeps its original
# 4-tuple layout, so existing loaders are unaffected.
with open('fake_news_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Preprocessed Fake News Detection dataset saved.")

Preprocessed Fake News Detection dataset saved.
