# Read Data

In [None]:
import pandas as pd

# Column names handed to read_csv via `names=`, so no row of either file is
# used as a header. "Income" is the column later split off as the label.
feature_names = [
    "Age",
    "Workclass",
    "Final Weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

# Training split.
df_train = pd.read_csv("../../data/adult.data", names=feature_names)

# Test split: the first line of the file is skipped before parsing.
df_test = pd.read_csv("../../data/adult.test", names=feature_names, skiprows=1)


# Data prep & problem definition

In [None]:

def df_prep(df):
    """Split a raw frame into a feature frame and a binary income target.

    Labels are compared after stripping surrounding whitespace and a trailing
    period, so " <=50K", " <=50K." and "<=50K" all become 0, and the matching
    ">50K" variants become 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Income" column. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``df`` without the "Income" column, and the integer 0/1 target that
        shares ``df``'s index.

    Raises
    ------
    ValueError
        If the "Income" column holds a value other than the two known classes
        (missing values included).
    """
    LABEL="Income"
    label_codes={"<=50K":0, ">50K":1}

    # Normalise first so the mapping does not depend on the exact padding or
    # trailing "." found in the raw files.
    normalized=df[LABEL].astype(str).str.strip().str.rstrip(".")

    # An explicit map (instead of Series.replace) avoids relying on pandas
    # silently downcasting an object column to integers.
    y=normalized.map(label_codes)

    unknown=y.isna()
    if unknown.any():
        raise ValueError("Unexpected %s label(s): %r"
                         % (LABEL, sorted(normalized[unknown].unique())))

    # drop() returns a new frame, so the caller's frame is left untouched.
    df_X=df.drop(LABEL, axis=1)
    return df_X,y.astype(int)

# Separate the features from the income label for both splits.
df_X_train, y_train = df_prep(df=df_train)
df_X_test, y_test = df_prep(df=df_test)

# Very basic feature/classifier pipeline

In [None]:
# Lazy approach :) -- the feature mapping is derived from the column dtypes instead of listing columns by hand

import numpy as np
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper
import sklearn.preprocessing as preprocessing

# `preprocessing.Imputer` was removed in scikit-learn 0.22. Prefer its
# replacement and fall back only on old releases without `sklearn.impute`.
# Both default to mean imputation.
try:
    from sklearn.impute import SimpleImputer as _NumericImputer
except ImportError:
    _NumericImputer = preprocessing.Imputer

# One (columns, transformer) pair per column, chosen by dtype:
# numeric columns get an imputer, object columns get a LabelBinarizer.
nums=[ ([c],_NumericImputer()) for c in df_X_train.select_dtypes([np.number]).columns]
cats=[ ([c],preprocessing.LabelBinarizer()) for c in df_X_train.select_dtypes(["object"]).columns]

# df_out=True makes the mapper return a DataFrame.
feature_mapper=DataFrameMapper(nums+cats,df_out=True)
feature_mapper

# Save model util

In [None]:
import os
import joblib

def save_model(pipeline,model_id,models_dir="../../models/"):
    """Dump ``pipeline`` with joblib and return the path it was written to.

    Parameters
    ----------
    pipeline : object
        Object handed to ``joblib.dump``.
    model_id : str
        File name stem; the file is written as ``<model_id>.pickle``.
    models_dir : str, optional
        Target directory, created if it does not exist. Defaults to the
        location this function previously hard-coded.

    Returns
    -------
    str
        Path of the written file.
    """
    os.makedirs(models_dir,exist_ok=True)

    # With the default directory this yields "../../models/<model_id>.pickle".
    save_path=os.path.join(models_dir,f"{model_id}.pickle")
    joblib.dump(pipeline,save_path)

    return save_path

# Logistic regression models (default and with PCA)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


# Baseline: mapped features fed directly into a logistic regression.
pipeline = Pipeline(
    steps=[
        ("featurize", feature_mapper),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)

# The value returned by fit() is what gets persisted.
logreg_clf = pipeline.fit(df_X_train, y_train)
save_model(logreg_clf, "adult_logreg_default")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

# Same as the baseline, with a PCA step (default settings) between the
# feature mapper and the classifier.
pipeline = Pipeline(
    steps=[
        ("featurize", feature_mapper),
        ("pca", PCA()),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)

# The value returned by fit() is what gets persisted.
logreg_clf = pipeline.fit(df_X_train, y_train)
save_model(logreg_clf, "adult_logreg_pca")

# Grid/Random search  

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split

# Imported here so the cell runs on a fresh kernel: the classifier is used
# below, but no import of it is visible in the preceding cells.
from sklearn.ensemble import RandomForestClassifier

# Quick randomized search
CV=3        # cross-validation folds
N_ITER=5    # number of parameter settings sampled

param_dist = {
              # Note n_estimators probably not a true hyperparameter,
              # in general more is better (aside performance/diminishing returns)
              "classifier__n_estimators": [20],
              # 'auto' is intentionally absent: for classifiers it was an alias of
              # 'sqrt' and newer scikit-learn releases reject it, which
              # error_score=0 below would silently turn into a score of 0.
              "classifier__max_features": ['sqrt', 'log2'],
              "classifier__max_depth": [1,2,4,8,16],
              "classifier__min_samples_leaf": [1,8,16],
              "classifier__bootstrap": [True,False],
              "classifier__class_weight": [None,"balanced","balanced_subsample"],
              "classifier__n_jobs":[-1],
              "classifier__criterion" :['gini', 'entropy']
             }

classifier = RandomForestClassifier(random_state=42)

pipeline=Pipeline([('featurize', feature_mapper),
                   ('classifier',classifier)])

search_cv = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                                   n_jobs=1,
                                   n_iter=N_ITER,
                                   scoring="roc_auc",
                                   error_score=0,cv=CV,verbose=5,
                                   # fixed seed so the sampled candidates are reproducible
                                   random_state=42,

                                   # will not be the default from sklearn 0.21 on
                                   return_train_score=True)

# Use subsample for grid search
GRID_TRAIN_PERC=.1

# The first and third return values are the *train* side of the split, so the
# fraction has to be passed as train_size. Passing it as test_size would keep
# 1-GRID_TRAIN_PERC of the rows instead of GRID_TRAIN_PERC.
(X_train_grid,_,y_train_grid,_)=train_test_split(df_X_train,
                                                 y_train,
                                                 train_size=GRID_TRAIN_PERC,
                                                 random_state=42,
                                                 stratify=y_train)

In [None]:
# Run the search on the split prepared for it.
search_cv.fit(X=X_train_grid, y=y_train_grid)

# Keep the best estimator found by the search and display it.
clf = search_cv.best_estimator_
clf

# Eval Grid Search results

In [None]:
eval_cv_metric = "mean_test_score"
train_cv_metric = "mean_train_score"

# Best-ranked candidates first; add the train-minus-test score gap, then
# replace missing values with the string "None".
cv_results_df = (
    pd.DataFrame(search_cv.cv_results_)
    .sort_values(by="rank_test_score")
    .assign(dif_test_train=lambda res: res[train_cv_metric] - res[eval_cv_metric])
    .fillna("None")
)

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from IPython.display import display as display, Markdown

score_result=eval_cv_metric
score_result2="dif_test_train"
split_col=""

display(Markdown("## %s,%s vs parameters (numeric)"%(score_result,score_result2)))

# Plot from a copy that carries a constant "all" column, used as the hue when
# no real split column is configured. Using a separate name keeps the builtin
# `all` usable and leaves cv_results_df itself unchanged.
plot_df=cv_results_df.assign(all="")

if split_col not in plot_df.keys():
    split_col="all"

# One figure per searched parameter that actually took more than one value.
for col in plot_df.columns:
    if col.startswith("param_") and len(plot_df[col].unique())>1:
        plt.figure(figsize=(12,6))

        # Left axis: distribution of the evaluation score per parameter value.
        sns.boxplot(x=col, y=score_result, hue=split_col,data=plot_df)
        sns.swarmplot(x=col, y=score_result, color="red",data=plot_df)
        plt.legend()

        # Right axis: train/test score gap for the same parameter values.
        ax2 = plt.twinx()
        sns.pointplot(x=col, y=score_result2,hue=split_col,ax=ax2, data=plot_df)


# Full fit (more estimators & full train set)

In [None]:
import sklearn
# Work on a clone of the search winner with the same parameters, except that
# the number of trees is raised to 500.
full_clf = sklearn.clone(clf).set_params(classifier__n_estimators=500)

# Train on the complete training frame rather than the search split.
full_clf.fit(df_X_train, y_train)

save_model(full_clf, "adult_randomforest")

# Model eval (test set)

In [None]:
from sklearn.metrics import roc_curve, auc

# Column 1 of predict_proba is used as the ranking score.
y_score = full_clf.predict_proba(df_X_test)[:, 1]

# ROC points on the test labels (thresholds are discarded) and the area under them.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_score)
roc_auc = auc(x=fpr, y=tpr)

In [None]:
lw = 2
fig, ax = plt.subplots()

# Model ROC curve, labelled with its AUC.
ax.plot(fpr, tpr, color='darkorange', lw=lw,
        label='ROC curve (area = %0.4f)' % roc_auc)
# Diagonal reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()