## Copying Data from S3 to Local SageMaker

In [1]:
import boto3
import os

# Source bucket, the object keys to fetch, and the local landing directory.
s3_bucket = "fake-news-raw-data"
s3_files = [
    "Liar.csv",
    "Synthetic Financial Datasets.csv",
    "WELFake_Dataset.csv",
]
local_folder = "/home/ec2-user/SageMaker/data/"

# Create the landing directory; this is a no-op when it already exists.
os.makedirs(local_folder, exist_ok=True)

# A single S3 client is reused for every download below.
s3_client = boto3.client("s3")

# Fetch each object in turn; the S3 key doubles as the local file name.
for file in s3_files:
    local_path = os.path.join(local_folder, file)
    s3_client.download_file(Bucket=s3_bucket, Key=file, Filename=local_path)
    print(f"✅ Downloaded {file} to {local_path}")

✅ Downloaded Liar.csv to /home/ec2-user/SageMaker/data/Liar.csv
✅ Downloaded Synthetic Financial Datasets.csv to /home/ec2-user/SageMaker/data/Synthetic Financial Datasets.csv
✅ Downloaded WELFake_Dataset.csv to /home/ec2-user/SageMaker/data/WELFake_Dataset.csv


## Data Integrity

In [2]:
import pandas as pd

# Load each downloaded CSV and print its shape plus the first rows as a quick
# sanity check that the file arrived intact and parses.
# The path is built from `local_folder` (set in the download cell) instead of
# repeating the directory literal, so the read location cannot drift away from
# the download location.
for file in s3_files:
    df = pd.read_csv(os.path.join(local_folder, file))
    print(f"📌 {file} - Shape: {df.shape}")
    print(df.head(), "\n")

📌 Liar.csv - Shape: (10240, 14)
  Statement ID    Lie_label  \
0    2635.json        FALSE   
1   10540.json    half-true   
2     324.json  mostly-true   
3    1123.json        FALSE   
4    9028.json    half-true   

                                           Statement  \
0  Says the Annies List political group supports ...   
1  When did the decline of coal start? It started...   
2  Hillary Clinton agrees with John McCain "by vo...   
3  Health care reform legislation is likely to ma...   
4  The economic turnaround started at the end of ...   

                                Topic         Speaker     Speaker_Job_Title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-c

## Explore the Data

In [3]:
# Print a structural overview of every dataset: column dtypes and null counts,
# summary statistics, and the number of distinct values per column.
for file in s3_files:
    # Reuse `local_folder` from the download cell rather than a second copy of
    # the directory literal.
    df = pd.read_csv(os.path.join(local_folder, file))

    print(f"📊 Dataset: {file}")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # it must not be wrapped in print() -- doing so emitted a stray "None"
    # line after every report.
    df.info()  # Data types & missing values
    print(df.describe())  # Summary statistics
    print("\nUnique values per column:")
    print(df.nunique())  # Unique values per column
    print("-" * 50, "\n")

📊 Dataset: Liar.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Statement ID          10240 non-null  object 
 1   Lie_label             10240 non-null  object 
 2   Statement             10240 non-null  object 
 3   Topic                 10238 non-null  object 
 4   Speaker               10238 non-null  object 
 5   Speaker_Job_Title     7342 non-null   object 
 6   State                 8030 non-null   object 
 7   Speaker_party         10238 non-null  object 
 8   barely_true_counts    10238 non-null  float64
 9   false_counts          10238 non-null  float64
 10  half-true_counts      10238 non-null  float64
 11  mostly_true_counts    10238 non-null  float64
 12  pants_on_fire_counts  10238 non-null  float64
 13  statement_mode        10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB
Non