In [1]:
import os

import numpy as np
import pandas as pd
from numpy import arange
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from lib.project_5_ADH import load_data_from_database, make_data_dict, general_model, general_transformer

# Step 3 - Build Model

### Domain and Data

The data, referred to as Madelon, has 2000 rows and 502 columns: 500 features plus an index column and a target (`label`) column. The dataset is artificial, with a two-class target (-1, 1) and continuous input (parameter) variables. 

### Problem Statement

Implement a machine learning pipeline using Logistic Regression and KNeighborsClassifier while transforming the data using SelectKBest.

### Solution Statement

Provide a Jupyter notebook with a pipeline (with regularization) that shows how the Logistic Regression and KNeighbors models perform. Check how many salient features each model uses.

### Metric

I would like to reduce the number of salient features that I determined in the prior workbook (step 2).

### Benchmark

I would like to beat my test score from step 1 of ~55%.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5_ADH.py`.

<img src="assets/build_model.png" width="600px">

In [2]:
# Load the Madelon table from the remote database into a DataFrame.
# The second positional argument appears to be the database password
# (TODO confirm against load_data_from_database). It is read from the
# MADELON_DB_PASSWORD environment variable so the secret does not have to
# live in the notebook. The literal fallback only keeps existing runs
# working; drop it once the variable is set wherever this notebook runs.
db_password = os.environ.get('MADELON_DB_PASSWORD', 'correct horse battery staple')
madelon_df = load_data_from_database('dsi_student', db_password, 'joshuacook.me',
                                     '5432', 'dsi', 'madelon')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
madelon_df.head(n=5)

Unnamed: 0,index,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_491,feat_492,feat_493,feat_494,feat_495,feat_496,feat_497,feat_498,feat_499,label
0,0,485,477,537,479,452,471,491,476,475,...,481,477,485,511,485,481,479,475,496,-1
1,1,483,458,460,487,587,475,526,479,485,...,478,487,338,513,486,483,492,510,517,-1
2,2,487,542,499,468,448,471,442,478,480,...,481,492,650,506,501,480,489,499,498,-1
3,3,480,491,510,485,495,472,417,474,502,...,480,474,572,454,469,475,482,494,461,1
4,4,484,502,528,489,466,481,402,478,487,...,479,452,435,486,508,481,504,495,511,1


In [4]:
# Build the shared data dictionary from the raw frame, holding out 20% of
# the rows for testing; the fixed random_state keeps the split repeatable.
data_dict = make_data_dict(
    madelon_df,
    test_size=0.20,
    random_state=40,
)

In [5]:
# Standardise the features. general_transformer returns the updated
# dictionary, which replaces data_dict, so run this cell only once per
# freshly built data_dict.
scaler = StandardScaler()
data_dict = general_transformer(scaler, data_dict)

In [6]:
# Univariate feature selection. k=10 is SelectKBest's default, written out
# here so the number of retained features is visible in the notebook.
k_best = SelectKBest(k=10)
data_dict = general_transformer(k_best, data_dict)

In [7]:
# Retrieve the SelectKBest step by its position in data_dict['processes']
# (index 1, i.e. the second recorded step).
# The lookup is positional, so re-running an earlier cell can shift what sits
# at this index; fail fast instead of silently inspecting the wrong object.
K_best_selection = data_dict['processes'][1]
assert isinstance(K_best_selection, SelectKBest), (
    "data_dict['processes'][1] is not the SelectKBest step; "
    "restart the kernel and run all cells in order"
)

In [8]:
# Positions of the features kept by the selector, taken from its boolean
# support mask. Displayed as a one-element tuple holding the index array.
support_mask = K_best_selection.get_support()
np.nonzero(support_mask)

(array([ 48,  64, 105, 128, 241, 336, 338, 442, 472, 475]),)

In [9]:
# Baseline: logistic regression with default hyper-parameters on the
# transformed data. The cell displays (train score, test score).
log_reg_model = LogisticRegression()
LR_scores = general_model(log_reg_model, data_dict)
tuple(LR_scores[key] for key in ('train score', 'test score'))

(0.61499999999999999, 0.59750000000000003)

In [10]:
# Coefficients of the logistic regression, looked up by its position in
# data_dict['processes'] (index 2). The lookup is positional, so verify the
# type before reading coef_ in case cells were re-run or run out of order.
log_reg = data_dict['processes'][2]
assert isinstance(log_reg, LogisticRegression), (
    "data_dict['processes'][2] is not the LogisticRegression step; "
    "restart the kernel and run all cells in order"
)
# There are only ten coefficients because SelectKBest() was used with its
# default k=10, so the model was fitted on ten features.
log_reg.coef_.flatten()

array([ 0.13579799, -0.11706957,  0.05055887,  0.15962245,  0.16447294,
        0.21886389,  0.20087084, -0.31649912,  0.2762322 ,  0.40462379])

In [11]:
# Baseline: k-nearest-neighbours classifier with default hyper-parameters
# on the same transformed data. The cell displays (train score, test score).
knn_model = KNeighborsClassifier()
KNN_scores = general_model(knn_model, data_dict)
tuple(KNN_scores[key] for key in ('train score', 'test score'))

(0.91062500000000002, 0.86250000000000004)

In [12]:
# Grid-search the inverse regularisation strength C of logistic regression
# over eight decades (1e-4 ... 1e3).
lg_param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}
# return_train_score=True is required because the results table built later
# reads 'mean_train_score'; since scikit-learn 0.21 train scores are not
# recorded in cv_results_ unless requested, and that column lookup raises
# KeyError.
lr_grid_search = GridSearchCV(LogisticRegression(), lg_param_grid,
                              return_train_score=True)
GSCV_LR_scores = general_model(lr_grid_search, data_dict)
GSCV_LR_scores['train score'], GSCV_LR_scores['test score']

(0.61312500000000003, 0.59999999999999998)

In [13]:
# Retrieve the logistic-regression grid search by its position in
# data_dict['processes'] (index 4). The lookup is positional, so check that
# the object really is a grid search wrapping LogisticRegression; re-running
# earlier cells can shift what sits at this index.
GSCV_LR = data_dict['processes'][4]
assert isinstance(GSCV_LR, GridSearchCV) and isinstance(GSCV_LR.estimator, LogisticRegression), (
    "data_dict['processes'][4] is not the LogisticRegression grid search; "
    "restart the kernel and run all cells in order"
)

In [14]:
# Tabulate the cross-validation results for every C that was tried and show
# only the columns needed to compare candidates.
lr_cv_columns = ['mean_test_score', 'mean_train_score', 'param_C', 'rank_test_score']
GSCV_LR_df = pd.DataFrame(GSCV_LR.cv_results_)
GSCV_LR_df.loc[:, lr_cv_columns]
# The best estimator (rank_test_score == 1) is C = 0.1

Unnamed: 0,mean_test_score,mean_train_score,param_C,rank_test_score
0,0.59375,0.590936,0.0001,7
1,0.591875,0.596875,0.001,8
2,0.600625,0.608437,0.01,2
3,0.60375,0.615623,0.1,1
4,0.600625,0.616248,1.0,2
5,0.59625,0.610938,10.0,4
6,0.595625,0.610313,100.0,5
7,0.595625,0.610313,1000.0,5


In [15]:
# Grid-search the number of neighbours over the odd values 3, 5, ..., 21.
# A plain range replaces the original comprehension over numpy.arange and
# yields the same candidate values.
knn_param_grid = {'n_neighbors': list(range(3, 22, 2))}
# return_train_score=True is required because the results table built later
# reads 'mean_train_score'; since scikit-learn 0.21 train scores are not
# recorded in cv_results_ unless requested, and that column lookup raises
# KeyError.
knn_grid_search = GridSearchCV(KNeighborsClassifier(), knn_param_grid,
                               return_train_score=True)
GSCV_knn_scores = general_model(knn_grid_search, data_dict)
GSCV_knn_scores['train score'], GSCV_knn_scores['test score']

(0.91062500000000002, 0.86250000000000004)

In [16]:
# Retrieve the k-nearest-neighbours grid search by its position in
# data_dict['processes'] (index 5). The lookup is positional, so check that
# the object really is a grid search wrapping KNeighborsClassifier;
# re-running earlier cells can shift what sits at this index.
GSCV_KNN = data_dict['processes'][5]
assert isinstance(GSCV_KNN, GridSearchCV) and isinstance(GSCV_KNN.estimator, KNeighborsClassifier), (
    "data_dict['processes'][5] is not the KNeighborsClassifier grid search; "
    "restart the kernel and run all cells in order"
)

In [17]:
# Tabulate the cross-validation results for every n_neighbors that was tried
# and show only the columns needed to compare candidates.
knn_cv_columns = ['mean_test_score', 'mean_train_score', 'param_n_neighbors', 'rank_test_score']
GSCV_KNN_df = pd.DataFrame(GSCV_KNN.cv_results_)
GSCV_KNN_df.loc[:, knn_cv_columns]
# The best estimator (rank_test_score == 1) is n_neighbors = 5

Unnamed: 0,mean_test_score,mean_train_score,param_n_neighbors,rank_test_score
0,0.84875,0.931246,3,2
1,0.85,0.903437,5,1
2,0.844375,0.890624,7,3
3,0.839375,0.874374,9,4
4,0.82625,0.871249,11,5
5,0.821875,0.863436,13,6
6,0.815625,0.856873,15,8
7,0.816875,0.848749,17,7
8,0.81125,0.844061,19,9
9,0.80875,0.834998,21,10


## Results

K-Nearest Neighbors (KNN) immensely outperformed Logistic Regression (LR). Train and test scores for KNN and LR are (0.91, 0.86) and (0.61, 0.60), respectively, when a grid search is performed. 