#### By: Anika Achary

In this project, I use a `ColumnTransformer` preprocessor and `make_pipeline` to automatically apply feature transformations and fit a logistic regression model that predicts income from this population dataset.

#### Import Statements

In [283]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector as col_selector
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from scipy.stats import kstest

#### First, we load the CSV file into a DataFrame

In [285]:
# Location of the population CSV on the author's machine.
# NOTE(review): this is an absolute local path; the notebook only runs where this file exists.
population_csv = "/Users/anikaachary/Desktop/Intro_to_ML_class/categorical_cols/population.csv"

# Read the raw data; the bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv(population_csv)
df

Unnamed: 0,Age,Job Type,Final Weight,Education,Education Number,Marital Status,Job Title,Relationship,Race,Gender,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### Using "Income" as our target variable, we drop it from the feature DataFrame but save it in `dfy` to use when fitting the pipeline

In [287]:
# Split the target column off the feature frame.
# The guard makes this cell safe to re-run: after the first run `df` no longer
# contains "Income", and an unguarded lookup/drop would raise KeyError.
if "Income" in df.columns:
    dfy = df["Income"]                # target labels, encoded later for the model
    df = df.drop(columns=["Income"])  # features only from here on
elif "dfy" not in globals():
    # Fresh kernel and the column is genuinely missing: fail loudly, as before.
    raise KeyError("Income")

#### Separating categorical and numerical features

In [289]:
# Build one dtype-based selector per feature family. Calling a selector on the
# frame returns the list of column names whose dtype matches.
categorical_cols_obj = col_selector(dtype_include=object)
int_cols_obj = col_selector(dtype_include=[np.number])

# Object-typed columns are treated as categorical; every numeric dtype as numerical.
categorical_features, numeric_features = categorical_cols_obj(df), int_cols_obj(df)

print("Categorical features:", categorical_features)
print("Numerical features:", numeric_features)

Categorical features: ['Job Type', 'Education', 'Marital Status', 'Job Title', 'Relationship', 'Race', 'Gender', 'Country']
Numerical features: ['Age', 'Final Weight', 'Education Number', 'Capital Gain', 'Capital Loss', 'Hours per week']


#### Identifying normally and non-normally distributed features using the Kolmogorov–Smirnov (K-S) test

When I tried the Shapiro–Wilk and D'Agostino K² tests, I got an error because the dataset has more than 5000 values; the message suggested using the Kolmogorov–Smirnov (K-S) test for larger datasets.

In [292]:
# Sort each numeric feature into one of two buckets according to a one-sample
# K-S test against a normal distribution with the feature's own mean and std.
normal_features, non_normal_features = [], []

for feature in numeric_features:
    column = df[feature]
    stat, p = kstest(column, 'norm', args=(column.mean(), column.std()))
    # p > 0.05: normality is not rejected, so the feature counts as normal.
    # NOTE(review): the reference mean/std are estimated from the same sample,
    # so the p-value is presumably only approximate - confirm if it matters.
    bucket = normal_features if p > 0.05 else non_normal_features
    bucket.append(feature)

# In the recorded run no feature ended up in the "normal" bucket.
print("Normally Distributed Features:", normal_features)

print("Non-Normally Distributed Features:", non_normal_features)

Normally Distributed Features: []
Non-Normally Distributed Features: ['Age', 'Final Weight', 'Education Number', 'Capital Gain', 'Capital Loss', 'Hours per week']


#### Applying OneHot encoding to the categorical features, making a make_pipeline, using MinMax Scaler

After constructing and fitting the pipeline, we print the confusion matrix, the accuracy, and the AUC on the held-out test set.

In [295]:
# Encode the string income labels as integers so they can be used as the target.
yb = preprocessing.LabelEncoder()
y = yb.fit_transform(dfy)

X = df

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Route each feature group to its own transformer:
#   - features that passed the normality test -> StandardScaler
#   - features that failed it                 -> MinMaxScaler
#   - categorical features                    -> one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num_standard', StandardScaler(), normal_features),
        ('num_minmax', MinMaxScaler(), non_normal_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)  # Use drop='first' to avoid multicollinearity
    ]
)

# Chain preprocessing and the classifier so both are fitted on the training split only.
pipeline = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000)
)

pipeline.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of the second encoded
# class (column 1 of predict_proba), which is what the AUC is computed from.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# conf_matrix was previously printed without ever being assigned, which raises
# NameError on a fresh kernel; compute it from the test-set predictions here.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy Score:", accuracy)
print("AUC:", auc)

Confusion Matrix:
 [[6948  507]
 [ 926 1388]]
Accuracy Score: 0.8533114955471389
AUC: 0.9030108626405509
