Imports below:

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Steps 1 & 2: Load and Combine Data

In [3]:
# Both CSV files sit in the same "content" directory (relative to the working directory)
data_dir = "content"
fake_news_path, true_news_path = (
    os.path.join(data_dir, csv_name) for csv_name in ("Fake.csv", "True.csv")
)

In [5]:
# Read each CSV into its own DataFrame (fake first, then true)
fake_df, true_df = map(pd.read_csv, (fake_news_path, true_news_path))

In [None]:
# Tag every row with its class: 0 = fake, 1 = real
fake_df['label'], true_df['label'] = 0, 1

In [None]:
# Stack the fake rows on top of the real rows and renumber the index from 0
news_df = pd.concat((fake_df, true_df), axis=0, ignore_index=True)

Step 3: Data Cleaning

In [None]:
# Keep only the columns used below. The explicit .copy() makes news_df an
# independent frame, so later column assignments act on its own data rather
# than on a selection taken from the concatenated frame.
news_df = news_df[['title', 'text', 'label']].copy()

# Handle missing values (if any): report the per-column null counts, drop every
# row that has a missing value in any of the three columns, then report again.
# Reassigning the result (instead of inplace=True) keeps the cell safe to
# re-run and avoids mutating a frame derived from a column selection.
print("Number of missing values before handling:\n", news_df.isnull().sum())
news_df = news_df.dropna()
print("Number of missing values after handling:\n", news_df.isnull().sum())

In [None]:
# Build 'combined_text' as "<title> <text>" for every row
news_df['combined_text'] = news_df['title'].add(' ').add(news_df['text'])

# Clean the text data
def clean_text(text):
    """Normalize one document before tokenization.

    Lowercases the input, then removes punctuation/special characters
    (including underscores) and digits. Whitespace is left untouched.

    Args:
        text: The raw string to clean. Non-string values (e.g. None or a
            float NaN coming from a missing cell) are treated as empty.

    Returns:
        str: The cleaned text, or '' when `text` is not a string.
    """
    if not isinstance(text, str):
        # A missing cell arrives as None/NaN; calling .lower() on it would raise.
        return ''
    text = text.lower()  # Convert to lowercase
    # \w also matches '_', so the underscore must be listed explicitly or it survives.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation and special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

# Run clean_text over every row of 'combined_text', replacing the column in place
news_df['combined_text'] = news_df['combined_text'].map(clean_text)