# Model Building
In this stage, you will build several machine learning models on the cleaned data set and try to train one that outperforms the baseline. What counts as "better" depends on the data set and the chosen evaluation metric; in this notebook, models are compared by 5-fold cross-validated ROC AUC.
## Imports

In [6]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import SCORERS, plot_precision_recall_curve, plot_roc_curve

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [8]:
# Resolve the sibling ../src/ directory to an absolute path so it can be imported from.
src_path = os.path.abspath('../src/')
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [11]:
 # setup input and output locations
# Data is read from, and checkpoints are written back to, the same folder.
output_loc = '../data/processed/'  # checkpoint destination
input_loc = '../data/processed/'   # source of the data loaded below

In [14]:
# Raise pandas' display limits so wide/long frames are shown in full in Jupyter.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## Data

In [12]:
# Load the pickled frame from the input folder.
# NOTE: unpickling executes arbitrary code -- only load pickles produced by this project.
micro_path = input_loc + 'all_missing_excluded.pkl'
micro = pd.read_pickle(micro_path)

## Data Splitting

In [17]:
# Target vector: the `at_least_one_alarm` column of the loaded frame.
y_micro = micro['at_least_one_alarm']

In [44]:
# Candidate feature names are picked by position: columns 17-22, then column 26
# up to (but excluding) the last four.
# NOTE(review): these offsets depend on the column order of the pickled frame -- confirm
# they still line up whenever the upstream data preparation changes.
all_columns = micro.columns.to_list()
X_micro_columns = all_columns[17:23] + all_columns[26:-4]
X_micro_columns

['additional_hazard_type',
 'people_served',
 'youth_served',
 'seniors_served',
 'veterans_military_members_and_military_family_members_served',
 'individuals_with_disabilities_access_or_functional_needs_served',
 'tot_population',
 'in_poverty',
 'inc_pct_poverty',
 'inc_pcincome',
 'age_pct_under25',
 'age_pct_25_64',
 'age_pct_over65',
 'educ_tot_pop',
 'educ_no_school',
 'educ_nursery_4th',
 'educ_5th_6th',
 'educ_7th_8th',
 'educ_9th',
 'educ_10th',
 'educ_11th',
 'educ_12th_no_diploma',
 'educ_high_school_grad',
 'educ_col_less_1_yr',
 'educ_some_col_no_grad',
 'educ_associates',
 'educ_bachelors',
 'educ_masters',
 'educ_professional',
 'educ_docterate',
 'total_pop_16_plus',
 'worked_past_12_mo',
 'did_not_work_past_12_mo',
 'race_pct_white',
 'race_pct_whitenh',
 'race_pct_nonwhite',
 'race_pct_nonwhitenh',
 'race_pct_amind',
 'race_pct_black',
 'race_pct_hisp',
 'house_tot_occ_cnt',
 'house_pct_family',
 'house_pct_family_married',
 'house_pct_family_male_hh',
 'house_pct_fa

In [48]:
# Drop every candidate whose name contains the substring 'tot'.
X_micro_columns = [column for column in X_micro_columns if 'tot' not in column]

In [49]:
# Feature frame: all rows, restricted to the selected feature columns.
X_micro = micro.loc[:, X_micro_columns]

In [69]:
# Every selected feature except the categorical hazard type is treated as numeric.
# Selecting by name (instead of the positional `[1:]`) keeps the split correct even
# if the column order of the source frame changes.
numerical_variables = [
    column for column in X_micro_columns if column != 'additional_hazard_type'
]
numerical_variables

['people_served',
 'youth_served',
 'seniors_served',
 'veterans_military_members_and_military_family_members_served',
 'individuals_with_disabilities_access_or_functional_needs_served',
 'in_poverty',
 'inc_pct_poverty',
 'inc_pcincome',
 'age_pct_under25',
 'age_pct_25_64',
 'age_pct_over65',
 'educ_no_school',
 'educ_nursery_4th',
 'educ_5th_6th',
 'educ_7th_8th',
 'educ_9th',
 'educ_10th',
 'educ_11th',
 'educ_12th_no_diploma',
 'educ_high_school_grad',
 'educ_col_less_1_yr',
 'educ_some_col_no_grad',
 'educ_associates',
 'educ_bachelors',
 'educ_masters',
 'educ_professional',
 'educ_docterate',
 'worked_past_12_mo',
 'did_not_work_past_12_mo',
 'race_pct_white',
 'race_pct_whitenh',
 'race_pct_nonwhite',
 'race_pct_nonwhitenh',
 'race_pct_amind',
 'race_pct_black',
 'race_pct_hisp',
 'house_pct_family',
 'house_pct_family_married',
 'house_pct_family_male_hh',
 'house_pct_family_female_hh',
 'house_pct_non_family',
 'house_pct_live_alone',
 'house_pct_no_live_alone',
 'house_pct_

In [73]:
# The single categorical feature, named explicitly rather than taken from position 0,
# so a change in column order cannot silently one-hot encode the wrong column.
categorical_variables = ['additional_hazard_type']
categorical_variables

['additional_hazard_type']

In [74]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
cat_steps = [('encode', OneHotEncoder(drop='first'))]
cat_pipeline = Pipeline(steps=cat_steps)

In [75]:
# Numeric branch: a single RobustScaler step.
num_steps = [('rbst_scaler', RobustScaler())]
num_pipeline = Pipeline(steps=num_steps)

In [76]:
# Route each column group through its own preprocessing pipeline.
full_pipeline = ColumnTransformer([
    ("cat", cat_pipeline, categorical_variables),
    ("num", num_pipeline, numerical_variables),
])
# Transform straight from `micro` rather than from `X_micro` itself: the original
# `X_micro = fit_transform(X_micro)` replaced the DataFrame with the transformed
# matrix, so running the cell a second time failed on the column-name lookup.
# NOTE(review): the transformers are fitted on the full data set before cross-validation,
# so held-out folds influence the scaling; putting the preprocessor and the estimator
# in one Pipeline passed to cross_val_score would avoid that.
X_micro = full_pipeline.fit_transform(micro[X_micro_columns])

## Baseline Model

In [77]:
%%time

clf = LogisticRegression(random_state=42, solver='saga', class_weight='balanced')

scores_clf = cross_val_score(clf, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_clf



Wall time: 8min 46s




array([0.64502473, 0.64605142, 0.62271309, 0.61799297, 0.63158908])

In [78]:
%%time
clf_tree = DecisionTreeClassifier(class_weight='balanced')
scores_tree = cross_val_score(clf_tree, X_micro, y_micro, cv=5, scoring='roc_auc')

Wall time: 6min 45s


In [79]:
# Per-fold ROC AUC scores from the 5-fold decision-tree cross-validation.
scores_tree

array([0.50735444, 0.5008593 , 0.48513809, 0.49292878, 0.52318234])

In [80]:
%%time
rfc = RandomForestClassifier(n_jobs=7, class_weight='balanced')
scores_rfc = cross_val_score(rfc, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_rfc    

Wall time: 20min 13s


array([0.59804201, 0.54366079, 0.51085927, 0.54686707, 0.6116126 ])

In [81]:
%%time
sgc_hinge = SGDClassifier(loss="hinge", penalty="l2", max_iter=500, class_weight="balanced")
scores_sgd_hinge = cross_val_score(sgc_hinge, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_sgd_hinge

Wall time: 2min 48s


array([0.60122585, 0.6213128 , 0.58877946, 0.6029857 , 0.62511996])

In [82]:
%%time

clf_harsh = LogisticRegression(random_state=42, penalty='l1', C=0.1, solver='saga', class_weight='balanced')

scores_clf_harsh = cross_val_score(clf_harsh, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_clf_harsh



Wall time: 11min 4s




array([0.64507731, 0.64604564, 0.6227025 , 0.61805863, 0.63158165])

In [83]:
%%time

clf_harsh = LogisticRegression(random_state=42, penalty='l1', C=0.01, solver='saga', class_weight='balanced', max_iter=500)

scores_clf_harsh = cross_val_score(clf_harsh, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_clf_harsh



Wall time: 1h 6s


array([0.64648461, 0.64705613, 0.62554257, 0.61384459, 0.63281974])

In [None]:
%%time

clf_harsh = LogisticRegression(random_state=42, penalty='l2', C=0.1, solver='saga', class_weight='balanced', max_iter=500)

scores_clf_harsh = cross_val_score(clf_harsh, X_micro, y_micro, cv=5, scoring='roc_auc')
scores_clf_harsh



## Model Improvement

### Scaling

### Feature Selection and Engineering

### Hyperparameter Tuning

### Additional Tuning, Processing, or Model-Improvement

## Outcome