In [1]:
import os

import numpy as np
import pandas as pd
from numpy import arange
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from lib.project_5_ADH import load_data_from_database, make_data_dict, general_model, general_transformer

# Step 3 - Build Model

### Domain and Data

The dataset, referred to as Madelon, has 2000 rows and 500 feature columns (plus 1 index column and 1 target column). It is an artificial dataset with a two-class target (-1, 1) and continuous input (parameter) variables.

### Problem Statement

Implement a machine learning pipeline using Logistic Regression and KNeighborsClassifier while transforming the data using SelectKBest.

### Solution Statement

Provide a Jupyter notebook with a pipeline (with regularization) that will show how the Logistic Regression and KNeighbors models work. Check how many salient features each model uses.

### Metric

I would like to reduce the number of salient features that I determined in the prior workbook (step 2).

### Benchmark

I would like to beat my test score from step 1 of ~55%.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/build_model.png" width="600px">

In [2]:
# Load the Madelon table into a DataFrame.
# Each connection setting can be overridden through an environment variable so
# credentials do not have to live in the notebook; the fallbacks reproduce the
# original hard-coded arguments, so behaviour is unchanged when nothing is set.
# NOTE(review): the argument meanings (user, password, host, port, database,
# table) are inferred from the values -- confirm against load_data_from_database.
madelon_df = load_data_from_database(
    os.environ.get('MADELON_DB_USER', 'dsi_student'),
    os.environ.get('MADELON_DB_PASSWORD', 'correct horse battery staple'),
    os.environ.get('MADELON_DB_HOST', 'joshuacook.me'),
    os.environ.get('MADELON_DB_PORT', '5432'),
    os.environ.get('MADELON_DB_NAME', 'dsi'),
    os.environ.get('MADELON_DB_TABLE', 'madelon'))

In [3]:
# Preview the first rows of the loaded frame to sanity-check the columns
# (index, feat_000 ... feat_499, label).
madelon_df.head()

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
0,0,485,477,537,479,452,471,491,476,475,...,481,477,485,511,485,481,479,475,496,-1
1,1,483,458,460,487,587,475,526,479,485,...,478,487,338,513,486,483,492,510,517,-1
2,2,487,542,499,468,448,471,442,478,480,...,481,492,650,506,501,480,489,499,498,-1
3,3,480,491,510,485,495,472,417,474,502,...,480,474,572,454,469,475,482,494,461,1
4,4,484,502,528,489,466,481,402,478,487,...,479,452,435,486,508,481,504,495,511,1


In [4]:
# Build the shared data dictionary from the raw frame with a fixed seed.
# NOTE(review): presumably test_size=0.20 holds out 20% of rows for testing --
# confirm against make_data_dict in lib.
data_dict = make_data_dict(
    madelon_df,
    test_size=0.20,
    random_state=40)

In [5]:
# Step 1 of the pipeline: pass the data through a StandardScaler.
# general_transformer returns the updated dictionary, which replaces data_dict.
# NOTE(review): presumably the scaler is fitted on the training split only --
# confirm in lib.
data_dict = general_transformer(StandardScaler(), data_dict)

In [6]:
# Step 2 of the pipeline: univariate feature selection.
# k=10 is SelectKBest's default, spelled out here so the number of retained
# features is visible at the call site.
data_dict = general_transformer(SelectKBest(k=10), data_dict)

In [7]:
# Retrieve the fitted SelectKBest step by type rather than by list position, so
# the lookup keeps working if cells are re-run and extra entries accumulate in
# data_dict['processes']. reversed() picks the most recently added match.
# NOTE(review): assumes data_dict['processes'] is an ordered list of the fitted
# steps, as the original positional lookup ([1]) implies -- confirm in lib.
K_best_selection = next(step for step in reversed(data_dict['processes'])
                        if isinstance(step, SelectKBest))

In [8]:
# Column positions of the features kept by SelectKBest: get_support() yields a
# boolean mask, and nonzero() turns it into a tuple holding the True indices.
np.nonzero(K_best_selection.get_support())

(array([ 48,  64, 105, 128, 241, 336, 338, 442, 472, 475]),)

In [9]:
# Baseline: LogisticRegression with default hyper-parameters on the selected
# features. The cell displays (train score, test score).
LR_scores = general_model(LogisticRegression(), data_dict)
tuple(LR_scores[key] for key in ('train score', 'test score'))

(0.61499999999999999, 0.59750000000000003)

In [10]:
# Coefficients of the plain (non-grid-search) LogisticRegression fitted above,
# looked up by type instead of by position in data_dict['processes'] so the cell
# survives re-runs that append further entries.
# There are only 10 coefficients because SelectKBest() was left at its default
# k=10, i.e. the count is a default, not something the selector decided.
# NOTE(review): assumes data_dict['processes'] is an ordered list of the fitted
# steps, as the original positional lookup ([2]) implies -- confirm in lib.
next(step for step in reversed(data_dict['processes'])
     if isinstance(step, LogisticRegression)).coef_.flatten()

array([ 0.13579799, -0.11706957,  0.05055887,  0.15962245,  0.16447294,
        0.21886389,  0.20087084, -0.31649912,  0.2762322 ,  0.40462379])

In [11]:
# Baseline: KNeighborsClassifier with default hyper-parameters on the selected
# features. The cell displays (train score, test score).
KNN_scores = general_model(KNeighborsClassifier(), data_dict)
tuple(KNN_scores[key] for key in ('train score', 'test score'))

(0.91062500000000002, 0.86250000000000004)

In [12]:
# Grid-search the regularization strength C of LogisticRegression over eight
# decades (1e-4 ... 1e3). The cell displays (train score, test score).
lg_param_grid = dict(C=[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3])
GSCV_LR_scores = general_model(
    GridSearchCV(LogisticRegression(), lg_param_grid),
    data_dict)
tuple(GSCV_LR_scores[key] for key in ('train score', 'test score'))

(0.61312500000000003, 0.59999999999999998)

In [13]:
# Retrieve the fitted LogisticRegression grid search by type (a GridSearchCV
# wrapping a LogisticRegression) rather than by list position, so the lookup
# keeps working if cells are re-run and data_dict['processes'] grows.
# reversed() picks the most recently added match.
# NOTE(review): assumes data_dict['processes'] is an ordered list of the fitted
# steps, as the original positional lookup ([4]) implies -- confirm in lib.
GSCV_LR = next(step for step in reversed(data_dict['processes'])
               if isinstance(step, GridSearchCV)
               and isinstance(step.estimator, LogisticRegression))

In [14]:
# Tabulate every cross-validation result from the LogisticRegression grid search.
# In the recorded run, C = 0.1 has rank_test_score 1 (mean_test_score 0.60375).
pd.DataFrame(GSCV_LR.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.003468,0.000451,0.59375,0.590936,0.0001,{u'C': 0.0001},7,0.595506,0.585366,0.594747,0.596064,0.590994,0.591378,0.000387,6.6e-05,0.001972,0.004379
1,0.002825,0.000361,0.591875,0.596875,0.001,{u'C': 0.001},8,0.586142,0.596623,0.590994,0.598875,0.598499,0.595127,0.000262,6e-06,0.005084,0.001541
2,0.002969,0.00036,0.600625,0.608437,0.01,{u'C': 0.01},2,0.580524,0.606942,0.613508,0.602624,0.60788,0.615745,0.000101,6e-06,0.014411,0.00546
3,0.003262,0.000401,0.60375,0.615623,0.1,{u'C': 0.1},1,0.578652,0.60788,0.611632,0.617619,0.621013,0.621368,0.000198,3.3e-05,0.018172,0.005685
4,0.003551,0.000368,0.600625,0.616248,1.0,{u'C': 1.0},2,0.578652,0.610694,0.609756,0.614808,0.613508,0.623243,0.000114,1.3e-05,0.015627,0.005223
5,0.003965,0.000516,0.59625,0.610938,10.0,{u'C': 10.0},4,0.576779,0.611632,0.604128,0.608247,0.60788,0.612933,0.000702,0.000194,0.013866,0.001975
6,0.004005,0.000395,0.595625,0.610313,100.0,{u'C': 100.0},5,0.576779,0.611632,0.604128,0.606373,0.606004,0.612933,0.000264,3.2e-05,0.013361,0.002836
7,0.003827,0.000376,0.595625,0.610313,1000.0,{u'C': 1000.0},5,0.576779,0.611632,0.604128,0.606373,0.606004,0.612933,0.000152,1.4e-05,0.013361,0.002836


In [15]:
# Grid-search n_neighbors for KNeighborsClassifier over the odd values 3..51.
# list(arange(...)) yields the same elements as the former identity
# comprehension without leaking a loop variable into the notebook namespace
# (list comprehensions do that on Python 2).
# The cell displays (train score, test score).
knn_param_grid = {'n_neighbors': list(arange(3, 52, 2))}
GSCV_knn_scores = general_model(GridSearchCV(KNeighborsClassifier(), knn_param_grid), data_dict)
GSCV_knn_scores['train score'], GSCV_knn_scores['test score']

(0.91062500000000002, 0.86250000000000004)

In [16]:
# Retrieve the fitted KNN grid search by type (a GridSearchCV wrapping a
# KNeighborsClassifier) rather than by list position, so the lookup keeps
# working if cells are re-run and data_dict['processes'] grows.
# reversed() picks the most recently added match.
# NOTE(review): assumes data_dict['processes'] is an ordered list of the fitted
# steps, as the original positional lookup ([5]) implies -- confirm in lib.
GSCV_KNN = next(step for step in reversed(data_dict['processes'])
                if isinstance(step, GridSearchCV)
                and isinstance(step.estimator, KNeighborsClassifier))

In [17]:
# Tabulate every cross-validation result from the KNN grid search.
# In the recorded run, n_neighbors = 5 has rank_test_score 1 (mean_test_score 0.85).
# NOTE(review): 5 is also KNeighborsClassifier's default, which would explain
# why the grid-search scores match the default model's scores -- confirm.
pd.DataFrame(GSCV_KNN.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.002745,0.007738,0.84875,0.931246,3,{u'n_neighbors': 3},2,0.853933,0.919325,0.840525,0.947516,0.851782,0.926898,0.000563,0.001857,0.005879,0.011913
1,0.002333,0.008025,0.85,0.903437,5,{u'n_neighbors': 5},1,0.861423,0.903377,0.834897,0.904405,0.853659,0.90253,1.8e-05,0.000891,0.011135,0.000766
2,0.00225,0.008466,0.844375,0.890624,7,{u'n_neighbors': 7},3,0.85206,0.888368,0.838649,0.893158,0.842402,0.890347,0.000105,0.000849,0.005651,0.001966
3,0.00214,0.008552,0.839375,0.874374,9,{u'n_neighbors': 9},4,0.844569,0.869606,0.840525,0.873477,0.833021,0.880037,3.3e-05,0.000529,0.004785,0.004306
4,0.002169,0.008288,0.82625,0.871249,11,{u'n_neighbors': 11},5,0.833333,0.86773,0.833021,0.870665,0.812383,0.875351,2.4e-05,0.000261,0.009802,0.003139
5,0.002156,0.008821,0.821875,0.863436,13,{u'n_neighbors': 13},6,0.827715,0.860225,0.833021,0.863168,0.804878,0.866917,1.3e-05,0.000256,0.012207,0.002738
6,0.002154,0.009115,0.815625,0.856873,15,{u'n_neighbors': 15},8,0.822097,0.848968,0.816135,0.859419,0.80863,0.862231,1.5e-05,0.000103,0.005511,0.005706
7,0.002197,0.009526,0.816875,0.848749,17,{u'n_neighbors': 17},7,0.820225,0.845216,0.818011,0.846298,0.812383,0.854733,3.8e-05,0.000107,0.003301,0.004254
8,0.003361,0.012593,0.81125,0.844061,19,{u'n_neighbors': 19},9,0.816479,0.837711,0.80863,0.843486,0.80863,0.850984,0.001095,0.002558,0.003701,0.005434
9,0.00312,0.011371,0.80875,0.834998,21,{u'n_neighbors': 21},10,0.810861,0.82833,0.810507,0.830366,0.804878,0.846298,0.000785,0.001534,0.00274,0.008033


## Results

K-Nearest Neighbors (KNN) substantially outperformed Logistic Regression (LR). With a grid search, the train and test scores are (0.91, 0.86) for KNN and (0.61, 0.60) for LR. Both test scores beat the ~55% benchmark from step 1.