In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif

# Load the dataset from a CSV file resolved against the current working directory.
df = pd.read_csv('adult_with_headers.csv')

# 1. Data Exploration and Preprocessing
# Columns treated as numeric features throughout the rest of the script.
numerical_columns = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

# Standardised copy: the scaler is fitted on the raw frame and only the numeric
# columns of the copy are overwritten; every other column is carried over as-is.
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df[numerical_columns])
df_standard_scaled = df.copy()
df_standard_scaled[numerical_columns] = standardized_values

# Min-max scaled copy, built the same way with a separately fitted scaler.
min_max_scaler = MinMaxScaler()
min_max_values = min_max_scaler.fit_transform(df[numerical_columns])
df_min_max_scaled = df.copy()
df_min_max_scaled[numerical_columns] = min_max_values

# Encoding Techniques
categorical_columns = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'income',
]
# Number of distinct values per categorical column. Kept for inspection only:
# the column lists used for encoding are written out by hand.
category_counts = df[categorical_columns].nunique()

# One-Hot Encoding for columns with <= 5 categories
one_hot_columns = ['sex', 'income', 'race']
# drop='first' omits the first category of each feature; sparse_output=False
# makes fit_transform return a dense array that can be wrapped in a DataFrame.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_one_hot = one_hot_encoder.fit_transform(df[one_hot_columns])
one_hot_df = pd.DataFrame(
    encoded_one_hot,
    columns=one_hot_encoder.get_feature_names_out(one_hot_columns),
)
# DataFrame.drop returns a new frame, so `df` itself is left untouched; the
# indicator columns are then attached by index.
df_one_hot_encoded = df.drop(columns=one_hot_columns).join(one_hot_df)

# Label Encoding for columns with > 5 categories
label_columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country']
# Module-level encoder retained so any later code that refers to
# `label_encoder` keeps working.
label_encoder = LabelEncoder()
# One fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes_ on every iteration, leaving no way to map the integer
# codes of earlier columns back to their category names.
label_encoders = {}
df_label_encoded = df_one_hot_encoded.copy()
for col in label_columns:
    label_encoders[col] = LabelEncoder()
    # Codes are computed from the raw column in `df`; the resulting values are
    # the same as a freshly fitted shared encoder would produce.
    df_label_encoded[col] = label_encoders[col].fit_transform(df[col])

# Feature Engineering
# Bin age and weekly hours into ordered categories. pd.cut uses right-closed
# intervals by default, i.e. (0, 30], (30, 50], (50, 100] for age.
age_bins = pd.cut(df['age'], bins=[0, 30, 50, 100], labels=['young', 'middle-aged', 'senior'])
hours_bins = pd.cut(df['hours_per_week'], bins=[0, 35, 40, 100], labels=['part-time', 'full-time', 'overtime'])

# Encode the newly created categorical features (age_group, work_hours_category)
# by their position in the ordered label list (young=0, middle-aged=1, senior=2;
# part-time=0, full-time=1, overtime=2). A LabelEncoder would sort the label
# strings alphabetically and scramble that order, which matters because these
# columns are later fed to numeric analyses. Values falling outside the bins
# get the code -1.
df_label_encoded['age_group'] = age_bins.cat.codes
df_label_encoded['work_hours_category'] = hours_bins.cat.codes

# log(1 + x) of capital gain, computed in one vectorized call instead of a
# per-row Python lambda.
df_label_encoded['log_capital_gain'] = np.log1p(df['capital_gain'])

# 4. Outlier detection - Isolation Forest
# Fit on the numeric columns only. contamination=0.05 sets the expected share
# of outliers; random_state pins the forest's randomness.
iso_forest = IsolationForest(random_state=42, contamination=0.05)
anomaly_labels = iso_forest.fit_predict(df_label_encoded[numerical_columns])
# fit_predict labels inliers as 1 and outliers as -1.
df_label_encoded['anomaly'] = anomaly_labels
# Tally of rows per label.
outlier_count = df_label_encoded['anomaly'].value_counts()

# Feature Selection Alternative to PPS
# Correlation Matrix
# Pairwise correlation between all columns of the encoded frame.
correlation_matrix = df_label_encoded.corr()

# Mutual Information
# Name of the one-hot indicator column used as the target. The embedded space
# is part of the generated column name and must be kept exactly.
target_column = 'income_ >50K'
X = df_label_encoded.drop(columns=[target_column])
y = df_label_encoded[target_column]
# The estimator involves randomness, so seed it (same seed as the Isolation
# Forest) to make the scores reproducible across runs.
mutual_info = mutual_info_classif(X, y, random_state=42)
# Attach feature names and rank the scores so the printout is interpretable;
# `mutual_info` itself stays the raw array in X's column order.
mutual_info_scores = pd.Series(
    mutual_info, index=X.columns, name='mutual_info'
).sort_values(ascending=False)

# Display results
print("Outlier Count (Isolation Forest):")
print(outlier_count)
print("\nCorrelation Matrix:")
print(correlation_matrix)
print("\nMutual Information Scores:")
print(mutual_info_scores)


Outlier Count (Isolation Forest):
anomaly
 1    30933
-1     1628
Name: count, dtype: int64

Correlation Matrix:
                               age  workclass    fnlwgt  education  \
age                       1.000000   0.003787 -0.076646  -0.010508   
workclass                 0.003787   1.000000 -0.016656   0.023513   
fnlwgt                   -0.076646  -0.016656  1.000000  -0.028145   
education                -0.010508   0.023513 -0.028145   1.000000   
education_num             0.036527   0.052085 -0.043195   0.359153   
marital_status           -0.266288  -0.064731  0.028153  -0.038407   
occupation               -0.020947   0.254892  0.001597  -0.021260   
relationship             -0.263698  -0.090461  0.008931  -0.010876   
capital_gain              0.077674   0.033835  0.000432   0.030046   
capital_loss              0.057775   0.012216 -0.010252   0.016746   
hours_per_week            0.068756   0.138962 -0.018768   0.055510   
native_country           -0.001151  -0.007690 -