## Data Cleaning

This notebook will attempt to accomplish the following
- Missing data
- Scaling and Normalization (applies to continuous variables)
- Removing Duplicates
- Outlier detection
- Encoding categorical variables
- Feature engineering

In [29]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt 
from scipy import stats
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [25]:
# Define column names
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
           'hours-per-week', 'native-country', 'income']

# Load the dataset
data = pd.read_csv(f'{os.getcwd()}/ProjectData/adult/adult.data', header=None, names=columns)
print(data.shape)
data.head()


(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


---
## Removing instances with missing features

 This block of code performs data preprocessing on the 'adult' dataset.
 Removing cells with missing features, removing duplicate rows, detecting and removing outliers using the IQR method.
 Selecting numeric columns for scaling, applying standardization (Z-score normalization),
 and saving the processed data to new CSV files.


In [26]:

# Strip leading and trailing whitespace from all columns
data = data.map(lambda x: x.strip() if isinstance(x, str) else x)

# Replace '?' with NaN and remove rows with NaN values
cleaned_data = data.replace('?', pd.NA).dropna()

# Remove duplicate rows
cleaned_data.drop_duplicates(inplace=True)

# Function to detect outliers using IQR
def detect_outliers_iqr(df):
    outliers_indices = []
    for col in df.select_dtypes(include=[float, int]).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers_col = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
        outliers_indices.extend(outliers_col)
    return list(set(outliers_indices))

# Detect outliers
outliers = detect_outliers_iqr(cleaned_data)

# Create a DataFrame for outliers
outliers_df = cleaned_data.loc[outliers]

# Create a DataFrame excluding outliers
cleaned_data_no_outliers = cleaned_data.drop(index=outliers)

# Print the shapes of the original, cleaned, and outliers datasets
print(f"Original data shape: {data.shape}")
print(f"Cleaned data shape: {cleaned_data.shape}")
print(f"Data shape after removing outliers IQR: {cleaned_data_no_outliers.shape}")
print(f"Outliers data shape: {outliers_df.shape}")

# Display the first few rows of the cleaned data
# print(cleaned_data_no_outliers.head())

# Display the first few rows of the outliers data
# print(outliers_df.head())

# Remove outliers from the dataset
# cleaned_data_no_outliers = cleaned_data.drop(index=outliers)

# Select only the numeric columns for scaling
numeric_cols = cleaned_data_no_outliers.select_dtypes(include=['float64', 'int64']).columns
numeric_data = cleaned_data_no_outliers[numeric_cols]

# Standardization (Z-score normalization)
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)
standardized_df = pd.DataFrame(standardized_data, columns=numeric_cols)

# Save the cleaned and standardized data
standardized_df.to_csv(f'{os.getcwd()}/ProjectData/adult/standardized_adult.data', index=False)
cleaned_data_no_outliers.to_csv(f'{os.getcwd()}/ProjectData/adult/clean_adult.data', index=False)
outliers_df.to_csv(f'{os.getcwd()}/ProjectData/adult/outliers_adult.data', index=False)


Original data shape: (32561, 15)
Cleaned data shape: (30139, 15)
Data shape after removing outliers IQR: (18442, 15)
Outliers data shape: (11697, 15)


In [33]:


# Load the cleaned data
data = pd.read_csv(f'{os.getcwd()}/ProjectData/adult/clean_adult.data')

# Preprocess the data

# Encode the categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split the data into features (X) and target (y)
X = data.drop('income', axis=1)
y = data['income']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the testing set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8145838980753591
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.96      0.89      2885
           1       0.67      0.29      0.40       804

    accuracy                           0.81      3689
   macro avg       0.75      0.62      0.65      3689
weighted avg       0.80      0.81      0.78      3689



In [28]:
# Fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print(rf_model.score)


Accuracy: 0.8359989156953104
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90      2885
           1       0.67      0.50      0.57       804

    accuracy                           0.84      3689
   macro avg       0.77      0.71      0.73      3689
weighted avg       0.82      0.84      0.83      3689

<bound method ClassifierMixin.score of RandomForestClassifier(random_state=42)>


In [32]:
# Load the cleaned data
data = pd.read_csv(f'{os.getcwd()}/ProjectData/adult/clean_adult.data')

# Preprocess the data

# Encode the categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split the data into features (X) and target (y)
X = data.drop('income', axis=1)
y = data['income']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

# Predict on the testing set
y_pred = gb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}\n\n")
print("_________________________________________")
print("Classification Report:\n", class_report)


Accuracy: 0.85


_________________________________________
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      2885
           1       0.73      0.47      0.57       804

    accuracy                           0.85      3689
   macro avg       0.80      0.71      0.74      3689
weighted avg       0.84      0.85      0.83      3689

