In [6]:
# ml imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# SQLAlchemy imports

import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

from config import postgreSQL_username, postgreSQL_password

In [15]:
# output function

def summary(desc):
    print(f'Description: {desc}')
    print('Balanced accuracy score: ' + str(balanced_accuracy_score(y_test, y_pred)))
    print('Classification report:')
    print(classification_report_imbalanced(y_test, y_pred))

In [3]:
# prep SQLAlchemy

engine = create_engine(f"postgresql+psycopg2://{postgreSQL_username}:{postgreSQL_password}@localhost/final_project")
Base = automap_base()
Base.prepare(engine, reflect=True)

In [4]:
df = pd.read_sql_table('CensusDataEducation', con=engine)
df

Unnamed: 0,p_id,age,workclass,fnlwgt,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_lev,education_cat_lev,education_cat
0,0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,4,Bachelor's degree
1,1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,4,Bachelor's degree
2,2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,2,HS graduate
3,3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,1,Less than HS graduate
4,4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,4,Bachelor's degree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,27,Private,257302,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0,3,Some college or associate's degree
32557,32557,40,Private,154374,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1,2,HS graduate
32558,32558,58,Private,151910,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,2,HS graduate
32559,32559,22,Private,201490,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0,2,HS graduate


In [21]:
# choose all data and encode

le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])

In [17]:
X = df_encoded.drop(columns='income_lev')
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('all data')



Description: all data
Balanced accuracy score: 0.8050298148704556
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.77      0.84      0.85      0.80      0.64      6181
          1       0.54      0.84      0.77      0.65      0.80      0.65      1960

avg / total       0.84      0.79      0.82      0.80      0.80      0.64      8141



In [23]:
df_encoded = df_encoded[df_encoded['hours_per_week'] == 40].drop(columns='hours_per_week')

X = df_encoded.drop(columns='income_lev')
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('full time')



Description: full time
Balanced accuracy score: 0.5357142857142857
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.57      0.50      0.67      0.53      0.29         7
          1       0.25      0.50      0.57      0.33      0.53      0.28         2

avg / total       0.68      0.56      0.52      0.59      0.53      0.29         9



In [25]:
# choose data and encode

le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev', 'hours_per_week'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('all data - hours per week')



Description: all data - hours per week
Balanced accuracy score: 0.7994854623616151
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.76      0.84      0.84      0.80      0.63      6181
          1       0.52      0.84      0.76      0.65      0.80      0.64      1960

avg / total       0.84      0.78      0.82      0.79      0.80      0.63      8141



In [27]:
# choose data and encode

le = LabelEncoder()
df_raw = df[['age', 'race', 'sex', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('age/race/sex')



Description: age/race/sex
Balanced accuracy score: 0.7049856125915825
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.61      0.80      0.73      0.70      0.48      6181
          1       0.39      0.80      0.61      0.53      0.70      0.50      1960

avg / total       0.78      0.66      0.75      0.68      0.70      0.48      8141



In [28]:
le = LabelEncoder()
df_raw = df[['age', 'education_cat_lev', 'race', 'sex', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('age/race/sex/education')



Description: age/race/sex/education
Balanced accuracy score: 0.749408325051425
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.70      0.80      0.79      0.75      0.55      6181
          1       0.46      0.80      0.70      0.58      0.75      0.56      1960

avg / total       0.81      0.72      0.77      0.74      0.75      0.56      8141



In [29]:
le = LabelEncoder()
df_raw = df[['workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('all data - age')



Description: all data - age
Balanced accuracy score: 0.7938732174636559
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.76      0.83      0.84      0.79      0.62      6181
          1       0.52      0.83      0.76      0.64      0.79      0.63      1960

avg / total       0.83      0.78      0.81      0.79      0.79      0.63      8141



In [30]:
le = LabelEncoder()
df_raw = df[['age', 'marital_status', 'relationship', 'income_lev']]

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('family status')



Description: family status
Balanced accuracy score: 0.7635388980054083
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.71      0.82      0.80      0.76      0.57      6181
          1       0.47      0.82      0.71      0.60      0.76      0.59      1960

avg / total       0.82      0.74      0.79      0.75      0.76      0.58      8141



In [32]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Male'].drop(columns='sex')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('all data male')



Description: all data male
Balanced accuracy score: 0.807154696886687
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.79      0.82      0.85      0.81      0.65      3782
          1       0.63      0.82      0.79      0.72      0.81      0.65      1666

avg / total       0.83      0.80      0.81      0.81      0.81      0.65      5448



In [33]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Female'].drop(columns='sex')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('all data female')



Description: all data female
Balanced accuracy score: 0.8737471904553229
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.87      0.88      0.92      0.87      0.76      2398
          1       0.45      0.88      0.87      0.59      0.87      0.76       295

avg / total       0.92      0.87      0.88      0.89      0.87      0.76      2693



In [34]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['education_cat_lev'] <= 2].drop(columns='education_cat_lev')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('high school or less')



Description: high school or less
Balanced accuracy score: 0.7990375376545134
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.76      0.84      0.85      0.80      0.63      3209
          1       0.34      0.84      0.76      0.49      0.80      0.64       480

avg / total       0.89      0.77      0.83      0.80      0.80      0.63      3689



In [35]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['education_cat_lev'] > 2].drop(columns='education_cat_lev')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('more than high school')



Description: more than high school
Balanced accuracy score: 0.8257523605976385
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.80      0.85      0.86      0.83      0.68      2971
          1       0.68      0.85      0.80      0.76      0.83      0.68      1481

avg / total       0.84      0.82      0.83      0.82      0.83      0.68      4452



In [36]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['education_cat_lev'] > 3].drop(columns='education_cat_lev')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('college grad')



Description: college grad
Balanced accuracy score: 0.8021740414140619
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.79      0.82      0.80      0.80      0.64      1040
          1       0.78      0.82      0.79      0.80      0.80      0.65       977

avg / total       0.80      0.80      0.80      0.80      0.80      0.64      2017



In [37]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Female'].drop(columns='sex')
df_raw = df[df['education_cat_lev'] < 3].drop(columns='education_cat_lev')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('female high school')



Description: female high school
Balanced accuracy score: 0.8185490677261869
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.77      0.86      0.86      0.82      0.66      3209
          1       0.36      0.86      0.77      0.51      0.82      0.67       480

avg / total       0.89      0.78      0.85      0.82      0.82      0.66      3689



In [38]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Male'].drop(columns='sex')
df_raw = df[df['education_cat_lev'] < 3].drop(columns='education_cat_lev')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('male high school')



Description: male high school
Balanced accuracy score: 0.8189674872753714
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.77      0.87      0.86      0.82      0.66      3209
          1       0.36      0.87      0.77      0.51      0.82      0.67       480

avg / total       0.90      0.78      0.85      0.82      0.82      0.66      3689



In [39]:
df['marital_status'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [40]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['marital_status'] == ('Never-married' or 'Divorced' or 'Separated' or 'Widowed')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('single')



Description: single
Balanced accuracy score: 0.8613722224349402
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.85      0.87      0.92      0.86      0.74      2548
          1       0.22      0.87      0.85      0.35      0.86      0.74       123

avg / total       0.96      0.85      0.87      0.89      0.86      0.74      2671



In [41]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['marital_status'] == ('Married-civ-spouse' or 'Married-spouse-absent' or 'Married-AF-spouse')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('married')



Description: married
Balanced accuracy score: 0.7487868648628211
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.78      0.72      0.78      0.75      0.56      2071
          1       0.73      0.72      0.78      0.72      0.75      0.56      1673

avg / total       0.75      0.75      0.75      0.75      0.75      0.56      3744



In [42]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Male'].drop(columns='sex')
df_raw = df[df['marital_status'] == ('Never-married' or 'Divorced' or 'Separated' or 'Widowed')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('male single')



Description: male single
Balanced accuracy score: 0.8704276269607281
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.85      0.89      0.91      0.87      0.75      2548
          1       0.22      0.89      0.85      0.35      0.87      0.76       123

avg / total       0.96      0.85      0.89      0.89      0.87      0.75      2671



In [43]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Female'].drop(columns='sex')
df_raw = df[df['marital_status'] == ('Never-married' or 'Divorced' or 'Separated' or 'Widowed')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('female single')



Description: female single
Balanced accuracy score: 0.864119475182193
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.86      0.87      0.92      0.86      0.75      2548
          1       0.23      0.87      0.86      0.36      0.86      0.75       123

avg / total       0.96      0.86      0.87      0.89      0.86      0.75      2671



In [44]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Male'].drop(columns='sex')
df_raw = df[df['marital_status'] == ('Married-civ-spouse' or 'Married-spouse-absent' or 'Married-AF-spouse')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('male married')



Description: male married
Balanced accuracy score: 0.7486145596996984
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.78      0.72      0.78      0.75      0.56      2071
          1       0.73      0.72      0.78      0.72      0.75      0.56      1673

avg / total       0.75      0.75      0.75      0.75      0.75      0.56      3744



In [45]:
le = LabelEncoder()
df_raw = df[['age', 'workclass', 'education_cat_lev', 'marital_status', 'occupation', 
             'relationship', 'race', 'sex', 'hours_per_week', 'income_lev']]

df_raw = df[df['sex'] == 'Female'].drop(columns='sex')
df_raw = df[df['marital_status'] == ('Married-civ-spouse' or 'Married-spouse-absent' or 'Married-AF-spouse')].drop(columns='marital_status')

df_encoded = df_raw.copy()

for column in df_raw.columns:
    df_encoded[f'{column}'] = le.fit_transform(df_encoded[f'{column}'])
    
# make training/test sets

X = df_encoded.drop(columns=['income_lev'])
y = df_encoded['income_lev']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# run model

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred = eec.predict(X_test)

summary('female married')



Description: female married
Balanced accuracy score: 0.757224189797745
Classification report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.78      0.73      0.78      0.76      0.58      2071
          1       0.73      0.73      0.78      0.73      0.76      0.57      1673

avg / total       0.76      0.76      0.75      0.76      0.76      0.57      3744

