In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import KNNImputer

In [5]:
# 1) Load the raw dataset
df = pd.read_csv('Group1DatasetRaw.csv')

# Remove every column whose name ends with ".1"
# NOTE(review): presumably these are repeated headers that pandas renamed
# on read -- confirm against the raw file.
cols_to_drop = [name for name in df.columns if name.endswith('.1')]
df = df.drop(columns=cols_to_drop)

# Winsorize numeric columns at ±3.5 standard deviations
def winsorize_at_3_5_std(df):
    """Clip each numeric column of ``df`` to mean ± 3.5 standard deviations.

    The mean and standard deviation are computed per column from the data
    before that column is clipped. Non-numeric columns are not touched.

    Args:
        df: DataFrame to winsorize; its numeric columns are overwritten
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in df.select_dtypes(include=[np.number]).columns:
        series = df[name]
        center = series.mean()
        spread = 3.5 * series.std()
        df[name] = series.clip(lower=center - spread, upper=center + spread)
    return df

# KNN imputation for missing numeric values
def knn_impute_numeric(df, n_neighbors=5):
    """Fill missing values in the numeric columns of ``df`` with KNNImputer.

    Only numeric columns that hold at least one observed value are passed
    to the imputer. By default KNNImputer drops features that are entirely
    missing, so including such a column would make the returned array
    narrower than the column selection and the assignment back into ``df``
    would raise. Entirely-missing columns are therefore left as they are.

    Args:
        df: DataFrame to impute; its numeric columns are overwritten
            in place.
        n_neighbors: Number of neighbors passed to KNNImputer.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    imputable_cols = [col for col in numeric_cols if df[col].notna().any()]

    # Nothing to impute (no numeric columns, no rows, or every numeric
    # column is entirely missing): fitting the imputer would raise.
    if not imputable_cols:
        return df

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df[imputable_cols] = imputer.fit_transform(df[imputable_cols])
    return df

# One-hot encode the 'Region' and 'Income Group' columns
def one_hot_encode_categorical(df):
    """Return ``df`` with 'Region' and 'Income Group' one-hot encoded.

    Every category gets its own indicator column (no level is dropped).

    Args:
        df: DataFrame containing 'Region' and 'Income Group' columns.

    Returns:
        A new DataFrame produced by ``pd.get_dummies``.
    """
    categorical_fields = ['Region', 'Income Group']
    return pd.get_dummies(df, columns=categorical_fields, drop_first=False)

# Run the cleaning steps in order: winsorize, impute, then one-hot encode
df = (
    df.pipe(winsorize_at_3_5_std)
    .pipe(knn_impute_numeric)
    .pipe(one_hot_encode_categorical)
)

# Write the cleaned dataset to disk without the index column
df.to_csv('Group1DatasetCleaned.csv', index=False)
print("\nData cleaning complete. File saved as 'Group1DatasetCleaned.csv'.")


Data cleaning complete. File saved as 'Group1DatasetCleaned.csv'.
