In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('Resources/census_data_education.csv')
df

Unnamed: 0,p_id,age,workclass,fnlwgt,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_lev,education_cat_lev,education_cat
0,0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,4,Bachelor's degree
1,1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,4,Bachelor's degree
2,2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,2,HS graduate
3,3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,1,Less than HS graduate
4,4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,4,Bachelor's degree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,257302,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0,3,Some college or associate's degree
32557,32557,40,Private,154374,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1,2,HS graduate
32558,32558,58,Private,151910,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,2,HS graduate
32559,32559,22,Private,201490,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0,2,HS graduate


In [3]:
df.education_cat_lev.unique()

array([4, 2, 1, 5, 3], dtype=int64)

In [4]:
df.groupby(df['education_cat']).count().head(40)

Unnamed: 0_level_0,p_id,age,workclass,fnlwgt,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_lev,education_cat_lev
education_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Bachelor's degree,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355,5355
Graduate or professional degree,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712,2712
HS graduate,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501,10501
Less than HS graduate,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253,4253
Some college or associate's degree,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740,9740


In [5]:
# choose data 

le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'income_lev']]

df_raw

Unnamed: 0,age,workclass,education_cat_lev,marital_status,occupation,relationship,race,sex,income_lev
0,39,State-gov,4,Never-married,Adm-clerical,Not-in-family,White,Male,0
1,50,Self-emp-not-inc,4,Married-civ-spouse,Exec-managerial,Husband,White,Male,0
2,38,Private,2,Divorced,Handlers-cleaners,Not-in-family,White,Male,0
3,53,Private,1,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0
4,28,Private,4,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0
...,...,...,...,...,...,...,...,...,...
32556,27,Private,3,Married-civ-spouse,Tech-support,Wife,White,Female,0
32557,40,Private,2,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,1
32558,58,Private,2,Widowed,Adm-clerical,Unmarried,White,Female,0
32559,22,Private,2,Never-married,Adm-clerical,Own-child,White,Male,0


In [6]:
# encode data

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])

In [7]:
df_encoded

Unnamed: 0,age,workclass,education_cat_lev,marital_status,occupation,relationship,race,sex,income_lev
0,22,7,3,4,1,1,4,1,0
1,33,6,3,2,4,0,4,1,0
2,21,4,1,0,6,1,4,1,0
3,36,4,0,2,6,0,2,1,0
4,11,4,3,2,10,5,2,0,0
...,...,...,...,...,...,...,...,...,...
32556,10,4,2,2,13,5,4,0,0
32557,23,4,1,2,7,0,4,1,1
32558,41,4,1,6,1,4,4,0,0
32559,5,4,1,4,1,3,4,1,0


In [8]:
# prepare ml data

X = df_encoded.drop(columns='income_lev')
y = df_encoded['income_lev']

In [9]:
# split data

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [10]:
classifier = LogisticRegression(solver='lbfgs', max_iter=500)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [11]:
y_pred = classifier.predict(X_test)

In [12]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8022355975924333
              precision    recall  f1-score   support

           0       0.83      0.94      0.88      6181
           1       0.65      0.38      0.48      1960

    accuracy                           0.80      8141
   macro avg       0.74      0.66      0.68      8141
weighted avg       0.78      0.80      0.78      8141



# Demo Using 40-hour basis (Anton)

In [25]:
#df_raw_w_hours = df[['age', 'workclass', 'education_num', 'marital_status', 
                     #'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]
#df_raw_40hours = df_raw_w_hours[df_raw_w_hours['hours_per_week'] == 40].drop(columns='hours_per_week')

In [14]:
#df_raw_40hours

In [15]:
# encode 40 hours dataset

#df_encoded = df_raw_40hours.copy()

#for column in df_raw_40hours.columns:
    #df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])

In [16]:
# split data

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

classifier = LogisticRegression(solver='lbfgs', max_iter=500)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7944969905417025
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      6181
           1       0.63      0.35      0.45      1960

    accuracy                           0.79      8141
   macro avg       0.73      0.64      0.66      8141
weighted avg       0.77      0.79      0.77      8141



In [17]:
#!pip install imbalanced-learn==0.9.0 

In [18]:
!pip install scikit-learn==1.0.2



In [19]:
import sklearn.externals as extjoblib
import joblib

## using balanced random forest

In [20]:
from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=100)

In [21]:
brfc.fit(X_train, y_train)
y_pred = brfc.predict(X_test)

In [22]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7701756540965483
              precision    recall  f1-score   support

           0       0.92      0.76      0.83      6181
           1       0.51      0.79      0.62      1960

    accuracy                           0.77      8141
   macro avg       0.72      0.78      0.73      8141
weighted avg       0.82      0.77      0.78      8141



In [23]:
sorted(zip(brfc.feature_importances_, X.columns), reverse=True)

[(0.3298515178292473, 'age'),
 (0.152028019739096, 'marital_status'),
 (0.14747530517094373, 'education_cat_lev'),
 (0.14098399245505389, 'relationship'),
 (0.11934960090284692, 'occupation'),
 (0.05979239891262409, 'workclass'),
 (0.025308866291656233, 'sex'),
 (0.025210298698531995, 'race')]

In [26]:
y.value_counts()

0    24720
1     7841
Name: income_lev, dtype: int64

# Amanda = below this line

# NAIVE RANDOM OVERSAMPLING

In [29]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_test)

Counter({0: 6303, 1: 1838})

In [31]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(24420, 8)
(8141, 8)
(24420,)
(8141,)


In [32]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 18417, 1: 18417})

In [33]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1) 
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4498, 1805],
       [ 435, 1403]], dtype=int64)

In [35]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.7384790685541558

In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.71      0.76      0.80      0.74      0.54      6303
          1       0.44      0.76      0.71      0.56      0.74      0.55      1838

avg / total       0.80      0.72      0.75      0.75      0.74      0.54      8141



# SMOTE OVERSAMPLING

In [39]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 18417, 1: 18417})

In [40]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [41]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[4498, 1805],
       [ 435, 1403]], dtype=int64)

In [42]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7406228479555395

In [43]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.70      0.78      0.79      0.74      0.54      6303
          1       0.43      0.78      0.70      0.56      0.74      0.55      1838

avg / total       0.81      0.72      0.76      0.74      0.74      0.54      8141



# UNDERSAMPLING - CLUSTERCENTROIDS

In [45]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({0: 6003, 1: 6003})

In [46]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [47]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4149, 2154],
       [ 339, 1499]], dtype=int64)

In [48]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7369091820621197

In [49]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.66      0.82      0.77      0.73      0.53      6303
          1       0.41      0.82      0.66      0.55      0.73      0.55      1838

avg / total       0.81      0.69      0.78      0.72      0.73      0.53      8141



# SMOTEEN

In [51]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 16909, 1: 11953})

In [52]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [53]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7369091820621197

In [54]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4809, 1494],
       [ 550, 1288]], dtype=int64)

In [55]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.76      0.70      0.82      0.73      0.54      6303
          1       0.46      0.70      0.76      0.56      0.73      0.53      1838

avg / total       0.80      0.75      0.71      0.76      0.73      0.54      8141



# EASY ADA BOOST ENSEMBLE

In [57]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train) 

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [58]:
# Calculate the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.8050432225910351

In [59]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4809, 1494],
       [ 281, 1557]], dtype=int64)

In [60]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.76      0.85      0.84      0.80      0.64      6303
          1       0.51      0.85      0.76      0.64      0.80      0.65      1838

avg / total       0.85      0.78      0.83      0.80      0.80      0.64      8141



# RANDOM FOREST REPEATED

In [62]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [63]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.7893344741273003

In [64]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4731, 1572],
       [ 316, 1522]], dtype=int64)

In [65]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.75      0.83      0.83      0.79      0.62      6303
          1       0.49      0.83      0.75      0.62      0.79      0.63      1838

avg / total       0.84      0.77      0.81      0.78      0.79      0.62      8141



In [66]:
# List the features sorted in descending order by feature importance
feature_importance = sorted(zip(model.feature_importances_, X.columns), reverse=True)
for i in feature_importance:
    print('{} : ({})'.format(i[1], i[0]))

age : (0.32174116862717117)
relationship : (0.16570006253972946)
education_cat_lev : (0.14109322088829115)
marital_status : (0.1403624407294519)
occupation : (0.12056006428916129)
workclass : (0.06196873771086143)
race : (0.026360721332177997)
sex : (0.022213583883155686)


In [68]:
# Confustion Matrix Summary Table

confusion_summary = {
    "Model": ["Cluster Centroids", "SMOTEENN", "Naive Random Oversampling", "SMOTE", "Balanced Random Forest", "Easy Ensemble"],
    "Balanced Accuracy" : [0.737, 0.737, 0.738, 0.741, 0.789, 0.805],
}

confusion_summary_df = pd.DataFrame(confusion_summary).set_index("Model")
confusion_summary_df

Unnamed: 0_level_0,Balanced Accuracy
Model,Unnamed: 1_level_1
Cluster Centroids,0.737
SMOTEENN,0.737
Naive Random Oversampling,0.738
SMOTE,0.741
Balanced Random Forest,0.789
Easy Ensemble,0.805
