In [3]:
import pandas as pd
import numpy as np
import re

In [6]:
# 1) Load the raw dataset from disk.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact file exists — consider a configurable data directory.
raw_csv_path = r'/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 6/Group1DatasetRaw.csv'
df = pd.read_csv(raw_csv_path)

# Discard every column whose name ends in ".1".
# Presumably these are repeated headers that pandas suffixed while loading —
# TODO confirm against the raw file.
df = df.drop(columns=[name for name in df.columns if name.endswith('.1')])

# Clip numeric columns to mean ± n_std standard deviations (3.5 by default)
def winsorize_at_3_5_std(df, n_std=3.5):
    """Clip every numeric column of ``df`` to ``mean ± n_std * std``.

    The bounds are computed separately for each numeric column from that
    column's own mean and standard deviation (pandas defaults). Values
    outside the band are replaced by the nearest bound; columns that are not
    selected as numeric are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clip. Its numeric columns are reassigned, so the frame
        passed in is modified.
    n_std : float, default 3.5
        Half-width of the allowed band, in standard deviations. Must not be
        negative.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after clipping.

    Raises
    ------
    ValueError
        If ``n_std`` is negative.
    """
    if n_std < 0:
        raise ValueError(f"n_std must be non-negative, got {n_std!r}")

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # Bounds come from the column *before* clipping, so the outliers
        # themselves contribute to the mean and std used here.
        col_mean = df[col].mean()
        half_width = n_std * df[col].std()
        df[col] = df[col].clip(lower=col_mean - half_width,
                               upper=col_mean + half_width)
    return df

# Mean imputation for missing numeric values
def mean_impute_numeric(df):
    """Fill missing values in each numeric column with that column's mean.

    Only columns selected as numeric are touched; other columns keep their
    missing values. A column with no non-missing values has a NaN mean, so
    it is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its numeric columns are reassigned, so the frame
        passed in is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[col]: that chained form operates on a
        # temporary object and does not update df under pandas Copy-on-Write.
        df[col] = df[col].fillna(df[col].mean())
    return df

# One-hot encode categorical columns ('Region' and 'Income Group' by default)
def one_hot_encode_categorical(df, columns=None):
    """Replace categorical columns with one indicator column per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : str or iterable of str, optional
        Column name(s) to encode. Defaults to ``['Region', 'Income Group']``.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies``: the encoded columns are replaced by
        ``<column>_<category>`` indicator columns, and every category is
        kept (``drop_first=False``).

    Notes
    -----
    Every requested column must exist in ``df``; pandas raises ``KeyError``
    for a missing one.
    """
    if columns is None:
        columns = ['Region', 'Income Group']
    elif isinstance(columns, str):
        # A bare string would otherwise be split into single characters.
        columns = [columns]
    return pd.get_dummies(df, columns=list(columns), drop_first=False)

# Run the cleaning steps in order: clip outliers, fill gaps, then encode.
# Encoding goes last, so the two numeric steps only see the original columns.
df = (
    df.pipe(winsorize_at_3_5_std)
      .pipe(mean_impute_numeric)
      .pipe(one_hot_encode_categorical)
)

# Write the cleaned frame to CSV without the index column.
# NOTE(review): absolute, machine-specific output path — consider a
# configurable data directory.
cleaned_csv_path = r'/Users/josh/Desktop/Macbook Working Files/Git Repos/culminating-project-group-1/Week 6/Group1DatasetCleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("\nData cleaning complete. File saved as 'Group1DatasetCleaned.csv'.")


Data cleaning complete. File saved as 'Group1DatasetCleaned.csv'.
