Preprocessing the Data

In [1]:
import boto3
import os

# Define S3 bucket and file names
s3_bucket = "fake-news-raw-data"
s3_files = ["Liar.csv", "Synthetic Financial Datasets.csv", "WELFake_Dataset.csv"]
local_folder = "/home/ec2-user/SageMaker/data/"

# Ensure local directory exists
os.makedirs(local_folder, exist_ok=True)

# Initialize S3 client
s3_client = boto3.client("s3")

# Download files from S3
for file in s3_files:
    local_path = os.path.join(local_folder, file)
    s3_client.download_file(s3_bucket, file, local_path)
    print(f"✅ Downloaded {file} to {local_path}")

✅ Downloaded Liar.csv to /home/ec2-user/SageMaker/data/Liar.csv
✅ Downloaded Synthetic Financial Datasets.csv to /home/ec2-user/SageMaker/data/Synthetic Financial Datasets.csv
✅ Downloaded WELFake_Dataset.csv to /home/ec2-user/SageMaker/data/WELFake_Dataset.csv


## Import Python package "stopwords" to overlook commonly used words and articles (English)

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Import required modules

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

## Ensure NLTK stopwords are available

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Liar.csv dataset and make pandas dataframe

In [8]:
liar_clean = pd.read_csv("Liar.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Liar.csv'

In [2]:


# Function to clean text using NLTK stopwords
def clean_text_nltk(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df_clean["clean_statement"] = df_clean["Statement"].apply(clean_text_nltk)

# Convert numerical columns to integers
count_cols = ["barely_true_counts", "false_counts", "half-true_counts", "mostly_true_counts", "pants_on_fire_counts"]
df_clean[count_cols] = df_clean[count_cols].fillna(0).astype(int)

# Create a total misinformation score feature
df_clean["total_misinfo_score"] = df_clean[count_cols].sum(axis=1)

# One-Hot Encoding for Speaker Party
encoder = OneHotEncoder(sparse=False, drop="first")
encoded_party = encoder.fit_transform(df_clean[["Speaker_party"]])
party_columns = encoder.get_feature_names_out(["Speaker_party"])
df_encoded_party = pd.DataFrame(encoded_party, columns=party_columns, index=df_clean.index)

# Merge and drop original categorical column
df_clean = pd.concat([df_clean, df_encoded_party], axis=1)
df_clean.drop(["Speaker_party"], axis=1, inplace=True)

# Balance dataset by oversampling minority classes
majority_class = df_clean[df_clean["Lie_label"] == "FALSE"]
minority_classes = df_clean[df_clean["Lie_label"] != "FALSE"]
minority_classes_upsampled = resample(minority_classes, replace=True, n_samples=len(majority_class), random_state=42)
df_balanced = pd.concat([majority_class, minority_classes_upsampled])

# Split dataset
train_data, test_data = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced["Lie_label"])

# Save the preprocessed data
train_data.to_csv("/mnt/data/Liar_train.csv", index=False)
test_data.to_csv("/mnt/data/Liar_test.csv", index=False)

print("Preprocessing complete. Training and test datasets saved.")


**References**

Python Tutorials. (2021, July 22). *NLTK stop words.* pythonspot. Accessed March 20, 2025 from https://pythonspot.com/nltk-stop-words/