## Katie Kimberling - Preprocessing of Liar Dataset

# Preprocessing the Data

In [None]:
import boto3
import os

# Copy the raw datasets from S3 into a local working folder.

# Define S3 bucket and file names
s3_bucket = "fake-news-raw-data"
s3_files = ["Liar.csv", "Synthetic Financial Datasets.csv", "WELFake_Dataset.csv"]
local_folder = "/home/ec2-user/SageMaker/data/"

# Ensure local directory exists (exist_ok=True: no error if it is already there)
os.makedirs(local_folder, exist_ok=True)

# Initialize S3 client
s3_client = boto3.client("s3")

# Download files from S3; each object key is reused as the local file name
for file in s3_files:
    local_path = os.path.join(local_folder, file)
    s3_client.download_file(s3_bucket, file, local_path)
    print(f"✅ Downloaded {file} to {local_path}")

## Download the NLTK "stopwords" corpus so commonly used English words and articles can be filtered out

In [None]:
import nltk
# Fetch the NLTK "stopwords" corpus; the text-cleaning step reads its English word list.
nltk.download('stopwords')

## Import required modules

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

## Ensure NLTK stopwords are available

In [None]:
# Make sure the stopwords corpus is present before it is read on the next line.
nltk.download('stopwords')
# Keep the English stopwords in a set; clean_text_nltk tests each word for membership in it.
stop_words = set(stopwords.words("english"))

## Load Liar.csv dataset and make pandas dataframe

In [None]:
# Load the downloaded Liar.csv into a DataFrame; later cells add columns to it in place.
liar_clean = pd.read_csv("/home/ec2-user/SageMaker/data/Liar.csv")

# Display dataset info

liar_clean.info()

## Function to clean text using NLTK stopwords

In [None]:
def clean_text_nltk(text, stopword_set=None):
    """Normalize a statement for text classification.

    The text is lowercased, every non-word character (anything other than a
    letter, digit or underscore) is replaced with a space, and stopwords are
    dropped. The surviving words are joined with single spaces.

    Args:
        text: The raw statement. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty statement instead of raising
            ``AttributeError``.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned statement, or ``""`` when the input is missing or no
        word survives the filtering.
    """
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        # Resolved at call time, so the cell that builds stop_words must have run.
        stopword_set = stop_words
    lowered = text.lower()
    # \W leaves underscores alone; only true non-word characters become spaces.
    spaced = re.sub(r'\W', ' ', lowered)
    return ' '.join(word for word in spaced.split() if word not in stopword_set)

# Build the cleaned-text column by mapping clean_text_nltk over every statement.
liar_clean["clean_statement"] = liar_clean["Statement"].map(clean_text_nltk)

## Convert numerical columns to integers

In [None]:
# The five per-category count columns.
count_cols = ["barely_true_counts", "false_counts", "half-true_counts", "mostly_true_counts", "pants_on_fire_counts"]
# Missing counts are replaced with 0 so the columns can be cast to int.
liar_clean[count_cols] = liar_clean[count_cols].fillna(0).astype(int)

## Create a total misinformation score feature

In [None]:
# Row-wise sum of all five count columns.
# NOTE(review): the sum includes the half-true and mostly-true counts, so the
# name "misinfo score" may overstate what it measures — confirm this is intended.
liar_clean["total_misinfo_score"] = liar_clean[count_cols].sum(axis=1)

## One-Hot Encoding for Speaker Party

In [None]:
# One-hot encode the speaker's party into a dense array, omitting the first
# category's column (drop="first").
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_party = encoder.fit_transform(liar_clean[["Speaker_party"]])

# Wrap the array in a DataFrame that shares liar_clean's index so the two can
# be concatenated column-wise.
party_columns = encoder.get_feature_names_out(["Speaker_party"])
liar_encoded_party = pd.DataFrame(
    data=encoded_party,
    index=liar_clean.index,
    columns=party_columns,
)

## Merge and drop original categorical column

In [None]:
# Append the one-hot party columns, then discard the original categorical column.
liar_clean = pd.concat([liar_clean, liar_encoded_party], axis=1).drop(columns=["Speaker_party"])

## Balance dataset by oversampling minority classes

In [None]:
# Rows labelled "FALSE" form the reference group; every other label is pooled into one group.
majority_class = liar_clean[liar_clean["Lie_label"] == "FALSE"]
minority_classes = liar_clean[liar_clean["Lie_label"] != "FALSE"]
# Draw len(majority_class) rows, with replacement, from the pooled non-"FALSE" group.
# NOTE(review): the pool is resampled as a whole, so individual labels inside it
# are not balanced against each other; if the pool is larger than the "FALSE"
# group this shrinks it rather than oversampling — confirm intent.
minority_classes_upsampled = resample(minority_classes, replace=True, n_samples=len(majority_class), random_state=42)
# NOTE(review): sampling with replacement happens before the train/test split in
# the next cell, so copies of the same row can land in both splits — consider
# splitting first and resampling only the training portion.
liar_balanced = pd.concat([majority_class, minority_classes_upsampled])

## Split dataset

In [None]:
# Hold out 20% of the balanced rows for testing. The split is stratified on
# Lie_label and seeded with random_state=42.
train_data, test_data = train_test_split(liar_balanced, test_size=0.2, random_state=42, stratify=liar_balanced["Lie_label"])

## Save the preprocessed data

In [None]:
# Write both splits to CSV without the DataFrame index column.
train_data.to_csv("/home/ec2-user/SageMaker/data/Liar_train.csv", index=False)
test_data.to_csv("/home/ec2-user/SageMaker/data/Liar_test.csv", index=False)

print("Preprocessing on Liar dataset complete. Training and test datasets saved.")

**References**

OpenAI. (2025). ChatGPT (March 20 version). [LLM]. https://chatgpt.com

Python Tutorials. (2021, July 22). *NLTK stop words.* pythonspot. Retrieved March 20, 2025, from https://pythonspot.com/nltk-stop-words/

## Katie Kimberling - Model Training on the Liar Dataset

## Import necessary packages

In [None]:
import pandas as pd
import boto3
import sagemaker
from sagemaker import get_execution_role

## Reload the train/test datasets

In [None]:
# Reload the train/test CSV files from local disk.
train_data = pd.read_csv("/home/ec2-user/SageMaker/data/Liar_train.csv")
test_data = pd.read_csv("/home/ec2-user/SageMaker/data/Liar_test.csv")

# read_csv parses an empty field as NaN, so a statement that was cleaned down to
# "" comes back as a float. Restore the empty string so later steps neither
# write the token "nan" into the training file nor put NaN into a JSON payload.
train_data["clean_statement"] = train_data["clean_statement"].fillna("")
test_data["clean_statement"] = test_data["clean_statement"].fillna("")

## Define BlazingText format function and output paths

In [None]:
def prepare_blazingtext_format(df, text_col, label_col, output_file):
    """Write a DataFrame as a BlazingText supervised-mode training file.

    Each row becomes one line of the form ``__label__<label> <text>``.

    Args:
        df: DataFrame holding the examples.
        text_col: Name of the column containing the (already cleaned) text.
        label_col: Name of the column containing the class label.
        output_file: Path of the text file to create or overwrite.

    Notes:
        * A missing text value (NaN/None) is written as an empty text rather
          than the literal token ``nan``; the line then holds only the label.
        * Runs of whitespace inside the text, including newlines, are collapsed
          to single spaces so every example stays on exactly one line.
        * The file is written as UTF-8 regardless of the platform default.
    """
    with open(output_file, 'w', encoding="utf-8") as f:
        # Iterating the two columns directly avoids iterrows(), which builds a
        # Series per row and can upcast values when the frame has mixed dtypes.
        for label, text in zip(df[label_col], df[text_col]):
            tokens = "" if pd.isna(text) else " ".join(str(text).split())
            line = f"__label__{label} {tokens}" if tokens else f"__label__{label}"
            f.write(line + "\n")

# Paths to save formatted training and test data (local files; a later cell uploads them to S3)
train_txt_path = "/home/ec2-user/SageMaker/data/liar_train_blazing.txt"
test_txt_path = "/home/ec2-user/SageMaker/data/liar_test_blazing.txt"

## Format data for BlazingText

In [None]:
# Write the BlazingText-formatted copy of each split (arguments: frame, text column, label column, output path).
prepare_blazingtext_format(train_data, "clean_statement", "Lie_label", train_txt_path)
prepare_blazingtext_format(test_data, "clean_statement", "Lie_label", test_txt_path)

print("Training and test data formatted for BlazingText.")

## Upload formatted data to S3

In [None]:
import sagemaker
# Session used to look up the default bucket
sess = sagemaker.Session()

# Destination bucket and object keys
s3_prefix = 'blazingtext-data'
s3_bucket = sess.default_bucket()
train_s3_path = s3_prefix + '/liar_train_blazing.txt'
test_s3_path = s3_prefix + '/liar_test_blazing.txt'

# Push both formatted files to S3
s3 = boto3.client('s3')
s3.upload_file(Filename=train_txt_path, Bucket=s3_bucket, Key=train_s3_path)
s3.upload_file(Filename=test_txt_path, Bucket=s3_bucket, Key=test_s3_path)

print("Files uploaded to S3.")

## Set up role, session and input paths

In [None]:
# Role and session
role = get_execution_role()
sess = sagemaker.Session()

# S3 input paths
s3_train_input = f's3://{s3_bucket}/{train_s3_path}'
s3_test_input = f's3://{s3_bucket}/{test_s3_path}'

## Define BlazingText estimator

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the BlazingText image URI for the session's region.
bt_image = sagemaker.image_uris.retrieve(
    framework="blazingtext",
    region=sess.boto_region_name,
)

# Estimator wrapping that image: one ml.m5.large instance, 5 GB volume,
# a 3600-second run limit, File input mode, artifacts written under
# blazingtext-output in the same bucket.
bt_estimator = sagemaker.estimator.Estimator(
    image_uri=bt_image,
    role=role,
    sagemaker_session=sess,
    instance_type='ml.m5.large',
    instance_count=1,
    volume_size=5,
    max_run=3600,
    input_mode='File',
    output_path='s3://{}/blazingtext-output'.format(s3_bucket),
)

## Set hyperparameters

In [None]:
# Hyperparameters for the supervised training run, passed as one mapping.
bt_estimator.set_hyperparameters(**{
    'mode': 'supervised',
    'epochs': 10,
    'learning_rate': 0.05,
    'vector_dim': 100,
    'min_count': 2,
    'early_stopping': True,
})

## Launch training and deploy model

In [None]:
import json
import boto3
import sagemaker
from sagemaker.inputs import TrainingInput

# Both channels point at plain-text files in S3.
train_input = TrainingInput(content_type='text/plain', s3_data=s3_train_input)
test_input = TrainingInput(content_type='text/plain', s3_data=s3_test_input)

# Train, passing the test file as the validation channel.
bt_estimator.fit(inputs={'train': train_input, 'validation': test_input})

# Deploy the trained model to a single ml.m5.large endpoint instance
predictor = bt_estimator.deploy(instance_type='ml.m5.large', initial_instance_count=1)

## Evaluate the model

In [None]:
from sklearn.metrics import classification_report
import json
import boto3
import sagemaker
import json

# Prepare test input
test_statements = test_data['clean_statement'].tolist()
payload = {"instances": test_statements}

# Get Response
runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="application/json",  # Ensure JSON content type
    Body=json.dumps(payload).encode("utf-8")  # Encode the JSON payload
)

# Actual true labels
true_labels = test_data['Lie_label'].tolist()

# Parse response
result = json.loads(response["Body"].read().decode())
predicted_labels = ["TRUE" if "TRUE" in p["label"][0] else "FALSE" for p in result]

# Evaluate
print(classification_report(true_labels, predicted_labels))

## Clean up the endpoint, or else goodbye, $$

In [None]:
# Cleanup: delete the deployed endpoint so it does not keep costing money
predictor.delete_endpoint()