## Data Cleaning

This notebook attempts to accomplish the following data-cleaning steps:
- Handling missing data
- Scaling and Normalization (applies to continuous variables)
- Removing Duplicates
- Outlier detection
- Encoding categorical variables
- Feature engineering

In [4]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt 
from scipy import stats
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler



---
## Removing instances with missing features

This block of code preprocesses the 'adult' dataset. It removes rows with missing
features (marked with `?`), removes duplicate rows, and detects and removes outliers
using the IQR method. It then selects the numeric columns, applies standardization
(Z-score normalization) to them, and saves the processed data to new CSV files.


In [6]:
# Load the dataset.
# skipinitialspace=True strips the blank that follows each comma, so a missing
# value is read as '?' rather than ' ?' and the exact-match replace below can
# actually find it (it also strips the same blank from the column names).
# Assumes the file uses the ", " separator of the UCI adult.data layout; the
# option is a no-op if it does not.
# NOTE(review): the UCI adult.data file ships without a header row, in which
# case read_csv consumes the first record as column names - confirm this copy
# has a header, or pass header=None together with explicit names.
data = pd.read_csv(f'{os.getcwd()}/ProjectData/adult/adult.data', skipinitialspace=True)

# Replace '?' with NaN and remove rows with NaN values
cleaned_data = data.replace('?', pd.NA).dropna()

# Remove duplicate rows (reassigned rather than mutated in place, so the cell
# gives the same result every time it is re-run)
cleaned_data = cleaned_data.drop_duplicates()

# Function to detect outliers using IQR
def detect_outliers_iqr(df, factor=1.5):
    """Return the index labels of rows that hold an IQR outlier in any numeric column.

    For every float/int column the quartiles Q1 and Q3 are computed, and a value
    is flagged when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` (with ``IQR = Q3 - Q1``). Non-numeric columns are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is not modified.
    factor : float, default 1.5
        Width of the fences in multiples of the IQR.

    Returns
    -------
    list
        Unique index labels of the flagged rows, in order of first detection
        (column by column, then row order within a column).
    """
    flagged_labels = []
    for col in df.select_dtypes(include=[float, int]).columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # NaN compares False on both sides, so missing values are never flagged.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        flagged_labels.extend(df.index[is_outlier])
    # dict.fromkeys removes labels flagged by several columns while keeping a
    # reproducible order (a plain set would return them in arbitrary order).
    return list(dict.fromkeys(flagged_labels))

# Index labels of every row flagged by the IQR rule in at least one numeric column
outliers = detect_outliers_iqr(cleaned_data)

# Remove outliers from the dataset
cleaned_data_no_outliers = cleaned_data.drop(index=outliers)

# Select only the numeric columns for scaling
numeric_cols = cleaned_data_no_outliers.select_dtypes(include=['float64', 'int64']).columns
numeric_data = cleaned_data_no_outliers[numeric_cols]

# Standardization (Z-score normalization)
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)
# The scaled values come back without row labels; reattach the labels of
# numeric_data so standardized_df stays aligned with cleaned_data_no_outliers
# (a default RangeIndex would no longer match after rows were dropped).
standardized_df = pd.DataFrame(
    standardized_data,
    columns=numeric_cols,
    index=numeric_data.index,
)

# Both files go next to the raw data; index=False writes only the column values.
adult_dir = f'{os.getcwd()}/ProjectData/adult'
standardized_df.to_csv(f'{adult_dir}/standardized_adult.data', index=False)
cleaned_data_no_outliers.to_csv(f'{adult_dir}/clean_adult.data', index=False)
