In [32]:
# ml imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [33]:
# SQLAlchemy imports

import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

from config import postgreSQL_username, postgreSQL_password

In [34]:
engine = create_engine(f"postgresql+psycopg2://{postgreSQL_username}:{postgreSQL_password}@localhost/final_project")
Base = automap_base()
Base.prepare(engine, reflect=True)

In [35]:
df = pd.read_sql_table('CensusDataEducation', con=engine)
df

Unnamed: 0,p_id,age,workclass,fnlwgt,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_lev,education_cat_lev,education_cat
0,0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,4,Bachelor's degree
1,1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,4,Bachelor's degree
2,2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,2,HS graduate
3,3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,1,Less than HS graduate
4,4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,4,Bachelor's degree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,257302,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0,3,Some college or associate's degree
32557,32557,40,Private,154374,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1,2,HS graduate
32558,32558,58,Private,151910,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,2,HS graduate
32559,32559,22,Private,201490,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0,2,HS graduate


In [36]:
# choose data and encode

le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])

In [37]:
# prepare training data

X = df_encoded.drop(columns='income_lev')
y = df_encoded['income_lev']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

## FROM AMANDA'S WORK
# running various samplers to see which is best

In [56]:
# RandomOverSampler

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1) 
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
randomOverSampler_score = balanced_accuracy_score(y_test, y_pred)
print(randomOverSampler)

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

0.7347190121801835
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.73      0.72      0.80      0.73      0.53      6181
          1       0.46      0.72      0.73      0.56      0.73      0.53      1960

avg / total       0.79      0.73      0.72      0.74      0.73      0.53      8141





In [57]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

SMOTE_score = balanced_accuracy_score(y_test, y_pred)
print(SMOTE_score)
print(classification_report_imbalanced(y_test, y_pred))

0.726592726558347
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.71      0.74      0.79      0.73      0.53      6181
          1       0.45      0.74      0.71      0.56      0.73      0.53      1960

avg / total       0.79      0.72      0.73      0.74      0.73      0.53      8141





In [63]:
# ClusterCentroids

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

CC_score = balanced_accuracy_score(y_test, y_pred)
print(CC_score)
print(classification_report_imbalanced(y_test, y_pred))



0.7038808445235398
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.68      0.73      0.77      0.70      0.49      6181
          1       0.42      0.73      0.68      0.53      0.70      0.50      1960

avg / total       0.77      0.69      0.72      0.71      0.70      0.49      8141





In [59]:
# SMOTEEN

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

SMOTEEN_score = balanced_accuracy_score(y_test, y_pred)
print(SMOTEEN_score)
print(classification_report_imbalanced(y_test, y_pred))



0.7143310721797212
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.76      0.67      0.81      0.71      0.51      6181
          1       0.47      0.67      0.76      0.55      0.71      0.50      1960

avg / total       0.78      0.74      0.69      0.75      0.71      0.51      8141





In [60]:
# Easy Ensemble

from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

y_pred = eec.predict(X_test)

easy_score = balanced_accuracy_score(y_test, y_pred)
print(easy_score)
print(classification_report_imbalanced(y_test, y_pred))



0.8023791226569903
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.77      0.84      0.84      0.80      0.64      6181
          1       0.53      0.84      0.77      0.65      0.80      0.65      1960

avg / total       0.84      0.78      0.82      0.80      0.80      0.64      8141



In [61]:
# BalancedRandomForest

from imblearn.ensemble import BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

forest_score = balanced_accuracy_score(y_test, y_pred)
print(forest_score)
print(classification_report_imbalanced(y_test, y_pred))



0.7803525203966071
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.75      0.81      0.83      0.78      0.60      6181
          1       0.51      0.81      0.75      0.62      0.78      0.61      1960

avg / total       0.82      0.77      0.80      0.78      0.78      0.61      8141





In [14]:
# List the features sorted in descending order by feature importance
feature_importance = sorted(zip(model.feature_importances_, X.columns), reverse=True)
for i in feature_importance:
    print('{} : ({})'.format(i[1], i[0]))

age : (0.32568251264444203)
relationship : (0.16304181856290068)
education_cat_lev : (0.14071961014974313)
marital_status : (0.1405115197257229)
occupation : (0.11971470012312195)
workclass : (0.06205055528898905)
race : (0.02618210083325469)
sex : (0.022097182671825706)


In [62]:
# balanced random forest w/ SMOTE

X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

SMOTE_forest_score = balanced_accuracy_score(y_test, y_pred)
print(SMOTE_forest_score)
print(classification_report_imbalanced(y_test, y_pred))



0.7486291515473686
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.81      0.69      0.85      0.75      0.56      6181
          1       0.54      0.69      0.81      0.60      0.75      0.55      1960

avg / total       0.81      0.78      0.72      0.79      0.75      0.56      8141





In [49]:
# Accuracy score table

accuracy_summary = {
    "Model": ["RandomOverSampler", "SMOTE", "ClusterCentroids", "SMOTEEN", "Easy Ensemble", "BalancedRandomForest", "balanced random forest w/ SMOTE"],
    "Balanced Accuracy" : [randomOverSampler_score, SMOTE_score, CC_score, SMOTEEN_score, easy_score, forest_score, SMOTE_forest_score],
}

accuracy_summary_df = pd.DataFrame(accuracy_summary).set_index("Model")
accuracy_summary_df

Unnamed: 0_level_0,Balanced Accuracy
Model,Unnamed: 1_level_1
RandomOverSampler,0.722568
SMOTE,0.727221
ClusterCentroids,0.704188
SMOTEEN,0.706026
Easy Ensemble,0.795361
BalancedRandomForest,0.775493
balanced random forest w/ SMOTE,0.741013


In [55]:
# prepare new set of training data

X = df_encoded.drop(columns=['income_lev', 'marital_status'])
y = df_encoded['income_lev']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [54]:
X.dtypes

age                  int64
workclass            int32
education_cat_lev    int64
marital_status       int32
occupation           int32
relationship         int32
dtype: object