# Models

In [1]:
import pandas as pd
import numpy as np
import math 
import pickle


from sklearn import preprocessing
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.inspection import permutation_importance

import statsmodels.api as sm

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

import matplotlib.pyplot as plt

In [2]:
# Load the pickled train/test feature frames (the file names suggest they were
# written by a separate preprocessing step).
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'preprocessed_X_train_df.pkl',
        'preprocessed_X_test_df.pkl',
    )
)

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so these files must
    come from a trusted source (presumably this project's own preprocessing
    output -- confirm).
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Target labels for the train and test splits.
y_train = _load_pickle('y_train.pkl')
y_test = _load_pickle('y_test.pkl')

In [4]:
# Convert the training features to a NumPy array for the estimators.
# train_df itself is kept because its column names label the model
# coefficients / importances further down.
X_train = train_df.to_numpy()

In [5]:
# X_train / X_test are plain positional arrays, so the test frame must carry
# exactly the same columns in the same order as the training frame. Without
# this check a reordered or differing column set of equal width would be
# scored against misaligned features without any error.
if list(test_df.columns) != list(train_df.columns):
    raise ValueError(
        'test_df columns do not match train_df columns; '
        'cannot build a positionally aligned X_test'
    )
X_test = test_df.to_numpy()

## Logistic regression

In [6]:
# Baseline model: standardize every feature, then fit a logistic regression.
# The seed is fixed for repeatable fits and max_iter is set explicitly to 1000.
logreg_pipe = make_pipeline(
    preprocessing.StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=1),
)

In [7]:
# Fit the scaler and the logistic regression on the training split only.
logreg_pipe.fit(X=X_train, y=y_train)

In [8]:
# Hard class predictions of the logistic-regression pipeline on the test split.
y_pred = logreg_pipe.predict(X=X_test)

In [9]:
# Confusion matrix for the logistic regression: rows are true labels,
# columns are predicted labels.
logreg_confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
)
logreg_confusion_matrix

array([[92679,   897],
       [ 3759,  2427]])

In [10]:
# Display names for the two classes, in label order; reused by the
# random-forest report further down.
target_names = ['< 50K', '> 50K']
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

       < 50K       0.96      0.99      0.98     93576
       > 50K       0.73      0.39      0.51      6186

    accuracy                           0.95     99762
   macro avg       0.85      0.69      0.74     99762
weighted avg       0.95      0.95      0.95     99762



In [11]:
# Class balance of the training labels: number of rows per label value.
pd.DataFrame(y_train).value_counts()

0    187141
1     12382
dtype: int64

In [12]:
# Class balance of the test labels: number of rows per label value.
pd.DataFrame(y_test).value_counts()

0    93576
1     6186
dtype: int64

In [13]:
# accuracy
# Computed from the predictions already stored in y_pred (the output of
# logreg_pipe.predict(X_test)). logreg_pipe.score(X_test, y_test) returns the
# same number but re-runs prediction over the whole test set to get it.
metrics.accuracy_score(y_test, y_pred)

0.9533289228363505

In [15]:
# Precision and recall of the logistic regression for the positive class,
# printed one per line (precision first).
precision, recall = (
    metrics.precision_score(y_test, y_pred),
    metrics.recall_score(y_test, y_pred),
)
print(precision, recall, sep='\n')

0.73014440433213
0.39233753637245394


In [16]:
# Coefficient vector of the fitted logistic regression. make_pipeline names
# each step after its lowercased class, so the estimator is looked up by name
# rather than by position; coef_[0] takes the first (only) row of coefficients.
importance = logreg_pipe.named_steps['logisticregression'].coef_[0]

In [17]:
# Number of feature columns in the training frame.
train_df.shape[1]

328

In [18]:
# Sanity check: the coefficient vector should have one entry per feature
# column of train_df.
importance.shape

(328,)

In [19]:
# Pair each feature name with its logistic-regression coefficient. The
# coefficients follow the column order of train_df because X_train was built
# directly from that frame.
logreg_feature_importance = pd.DataFrame(
    {
        'feature': train_df.columns.tolist(),
        'coef': logreg_pipe.named_steps['logisticregression'].coef_[0],
    }
)

In [20]:
# Importance is the coefficient magnitude with the sign ignored; the inputs
# were standardized inside the pipeline before fitting. Series.abs() is the
# vectorized form of applying the built-in abs to every element.
logreg_feature_importance['importance'] = logreg_feature_importance['coef'].abs()

In [21]:
# Order rows from largest to smallest coefficient magnitude. The name is
# rebound to the sorted copy instead of sorting in place.
logreg_feature_importance = logreg_feature_importance.sort_values(
    by='importance',
    ascending=False,
)

In [22]:
# First ten rows of the coefficient table.
logreg_feature_importance.head(n=10)

Unnamed: 0,feature,coef,importance
6,weeks worked in year,0.920933,0.920933
0,age,0.712776,0.712776
31,tax filer stat: Nonfiler,-0.556518,0.556518
2,capital gains,0.458455,0.458455
29,tax filer stat: Joint both under 65,0.426756,0.426756
22,family members under 18: Mother only present,-0.410083,0.410083
5,num persons worked for employer,0.387511,0.387511
4,dividends from stocks,0.354668,0.354668
10,education: Children,-0.342963,0.342963
227,veterans benefits: 0,-0.339873,0.339873


In [23]:
# Last ten rows of the coefficient table.
logreg_feature_importance.tail(n=10)

Unnamed: 0,feature,coef,importance
192,detailed industry recode: 27,-0.000691,0.000691
298,state of previous residence: Wyoming,-0.000502,0.000502
107,detailed occupation recode: 3,0.000246,0.000246
296,state of previous residence: West Virginia,0.000242,0.000242
201,detailed industry recode: 35,0.000208,0.000208
47,migration code-change in reg: Different divis...,0.000169,0.000169
136,region of previous residence: Not in universe,-0.000127,0.000127
284,state of previous residence: Not in universe,-0.000127,0.000127
224,live in this house 1 year ago: No,0.000127,0.000127
89,detailed occupation recode: 13,-0.000108,0.000108


In [24]:
# First ten rows among features with a positive coefficient.
logreg_feature_importance.loc[lambda frame: frame['coef'] > 0].head(n=10)

Unnamed: 0,feature,coef,importance
6,weeks worked in year,0.920933,0.920933
0,age,0.712776,0.712776
2,capital gains,0.458455,0.458455
29,tax filer stat: Joint both under 65,0.426756,0.426756
5,num persons worked for employer,0.387511,0.387511
4,dividends from stocks,0.354668,0.354668
9,education: Bachelors degree,0.328033,0.328033
229,veterans benefits: 2,0.308381,0.308381
15,education: Masters degree,0.296795,0.296795
24,family members under 18: Not in universe,0.296072,0.296072


In [25]:
# Last ten rows among features with a positive coefficient.
logreg_feature_importance.loc[lambda frame: frame['coef'] > 0].tail(n=10)

Unnamed: 0,feature,coef,importance
111,detailed occupation recode: 33,0.001438,0.001438
108,detailed occupation recode: 30,0.001327,0.001327
168,member of a labor union: Not in universe,0.00132,0.00132
85,detailed occupation recode: 1,0.001033,0.001033
189,detailed industry recode: 24,0.000871,0.000871
107,detailed occupation recode: 3,0.000246,0.000246
296,state of previous residence: West Virginia,0.000242,0.000242
201,detailed industry recode: 35,0.000208,0.000208
47,migration code-change in reg: Different divis...,0.000169,0.000169
224,live in this house 1 year ago: No,0.000127,0.000127


In [26]:
# First ten rows among features with a negative coefficient.
logreg_feature_importance.loc[lambda frame: frame['coef'] < 0].head(n=10)

Unnamed: 0,feature,coef,importance
31,tax filer stat: Nonfiler,-0.556518,0.556518
22,family members under 18: Mother only present,-0.410083,0.410083
10,education: Children,-0.342963,0.342963
227,veterans benefits: 0,-0.339873,0.339873
233,full or part time employment stat: Children o...,-0.274521,0.274521
131,sex: Female,-0.27438,0.27438
322,detailed household summary in household: Chil...,-0.25176,0.25176
23,family members under 18: Neither parent present,-0.181731,0.181731
320,detailed household summary in household: Chil...,-0.176492,0.176492
21,family members under 18: Father only present,-0.171975,0.171975


In [27]:
# Last ten rows among features with a negative coefficient.
logreg_feature_importance.loc[lambda frame: frame['coef'] < 0].tail(n=10)

Unnamed: 0,feature,coef,importance
255,state of previous residence: Colorado,-0.001418,0.001418
66,major industry code: Entertainment,-0.001205,0.001205
207,detailed industry recode: 40,-0.001205,0.001205
163,hispanic origin: Puerto Rican,-0.001083,0.001083
291,state of previous residence: Tennessee,-0.00075,0.00075
192,detailed industry recode: 27,-0.000691,0.000691
298,state of previous residence: Wyoming,-0.000502,0.000502
136,region of previous residence: Not in universe,-0.000127,0.000127
284,state of previous residence: Not in universe,-0.000127,0.000127
89,detailed occupation recode: 13,-0.000108,0.000108


## Random Forest Classifier

In [28]:
# Random-forest pipeline using the same scaling step as the logistic
# regression. The forest is seeded: without random_state the bootstrap samples
# and feature subsets change on every run, so the metrics and feature
# importances below could not be reproduced after a kernel restart. The seed
# value matches the one used for the logistic regression.
randforest_pipe = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestClassifier(random_state=1),
)

In [29]:
# Fit the scaler and the random forest on the training split only.
randforest_pipe.fit(X=X_train, y=y_train)

In [30]:
# Hard class predictions of the random-forest pipeline on the test split.
y_pred_rf = randforest_pipe.predict(X=X_test)

In [31]:
# Confusion matrix for the random forest: rows are true labels, columns are
# predicted labels.
randforest_confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_rf,
)
randforest_confusion_matrix

array([[92675,   901],
       [ 3690,  2496]])

In [32]:
# Per-class precision / recall / F1 for the random forest, using the class
# display names defined in the logistic-regression section.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred_rf,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

       < 50K       0.96      0.99      0.98     93576
       > 50K       0.73      0.40      0.52      6186

    accuracy                           0.95     99762
   macro avg       0.85      0.70      0.75     99762
weighted avg       0.95      0.95      0.95     99762



In [33]:
# Accuracy score
# Computed from the predictions already stored in y_pred_rf (the output of
# randforest_pipe.predict(X_test)). randforest_pipe.score(X_test, y_test)
# returns the same number but pushes the whole test set through the forest a
# second time to get it.
metrics.accuracy_score(y_test, y_pred_rf)

0.9539804735269942

In [34]:
# Precision and recall of the random forest for the positive class, printed
# one per line (precision first). This rebinds the `precision` / `recall`
# names used in the logistic-regression section.
precision, recall = (
    metrics.precision_score(y_test, y_pred_rf),
    metrics.recall_score(y_test, y_pred_rf),
)
print(precision, recall, sep='\n')

0.7347659699735061
0.4034917555771096


In [35]:
# Pair each feature name with the fitted forest's feature_importances_ value
# and order from most to least important. The estimator is looked up by the
# step name make_pipeline assigns (the lowercased class name).
feature_importances = (
    pd.DataFrame(
        {
            'feature': train_df.columns.tolist(),
            'importance': randforest_pipe.named_steps[
                'randomforestclassifier'
            ].feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)


In [36]:
# Twenty most important features according to the random forest.
feature_importances.head(n=20)

Unnamed: 0,feature,importance
0,age,0.094131
4,dividends from stocks,0.077113
2,capital gains,0.065694
5,num persons worked for employer,0.041695
6,weeks worked in year,0.030209
3,capital losses,0.021803
131,sex: Female,0.018483
301,major occupation code: Executive admin and ma...,0.016612
96,detailed occupation recode: 2,0.016205
9,education: Bachelors degree,0.016021


In [37]:
# Twenty least important features according to the random forest.
feature_importances.tail(n=20)

Unnamed: 0,feature,importance
295,state of previous residence: Virginia,4.765142e-05
252,state of previous residence: Arizona,4.633389e-05
257,state of previous residence: Delaware,4.444303e-05
267,state of previous residence: Louisiana,4.236495e-05
269,state of previous residence: Maryland,4.144537e-05
285,state of previous residence: Ohio,4.115744e-05
104,detailed occupation recode: 27,3.481361e-05
41,class of worker: Without pay,3.432382e-05
261,state of previous residence: Idaho,2.22731e-05
308,major occupation code: Private household serv...,2.21176e-05
