# Model Selection

## Imports and reading data

In [359]:
#imports
import sys
sys.path.append('..')

import importlib
import pandas as pd
import numpy as np
import scripts.preprocess as pre
import scripts.model_selection as ms

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


from sklearn.pipeline import Pipeline

In [360]:
# Reload the local helper modules (scripts.preprocess as `pre`, scripts.model_selection
# as `ms`) so that edits to them take effect without restarting the kernel.
# The last call is the cell's final expression, so its return value is displayed.
importlib.reload(pre)
importlib.reload(ms)

<module 'scripts.model_selection' from '/home/cita_zane/Documents/kood_johvi/ai/kaggle-titanic/notebooks/../scripts/model_selection.py'>

In [361]:
# Fetch the training data and preprocess it.
# The preprocess result is unpacked into X and y, which the model-selection
# helpers in the following cells receive.
data = pre.get_data('../data/train.csv')
X, y = pre.preprocess(data)


In [362]:
# Compute baseline (untuned) accuracy scores for the 5 models via cross-validation.
# The result becomes the "baseline" column of the score table built in the next cell.
baseline = ms.cross_validate_models(X,y)

In [363]:
# Collect the baseline scores in a DataFrame next to a placeholder column that the
# tuning cells fill in with their improved scores.
# The placeholder is created with a float dtype on purpose: the tuned scores are
# float accuracies, and writing a float into an int64 column via .loc is a silent
# upcast that pandas has deprecated (FutureWarning since 2.1).
upgraded = pd.Series(data={'lg': 0, 'rf': 0, 'knn': 0, 'svc': 0, 'gb': 0}, dtype=float)
model_scores = pd.DataFrame({"baseline": baseline, "upgraded": upgraded})

### Logistic Regression

In [364]:
# Logistic regression: standardize the features, then search over solver,
# regularization strength C and class weighting.
lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lg', LogisticRegression()),
])
lr_params = dict(
    lg__solver=['lbfgs', 'liblinear', 'saga'],
    lg__C=np.logspace(0, 4, num=10),  # 10 log-spaced values from 1 to 10**4
    lg__class_weight=['balanced', None],
    lg__max_iter=[10000],
)
# The helper's result is unpacked as (score, best model).
lg_updated_score, lg_best_model = ms.grid_search(lr, lr_params, X, y)


Params:  {'lg__C': 1.0, 'lg__class_weight': None, 'lg__max_iter': 10000, 'lg__solver': 'lbfgs'}


In [365]:
# Record the tuned logistic-regression score in the 'upgraded' column of the score table.
model_scores.loc['lg','upgraded'] = lg_updated_score

### KNN

In [366]:
# k-nearest neighbours: standardize the features, then search over the number of
# neighbours, the vote weighting, the Minkowski power p and the tree leaf size.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of the
# neighbour-search tree; consider dropping it to shrink the grid — confirm first.
knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
knn_params = dict(
    knn__n_neighbors=[1, 3, 5, 7, 9, 10, 12, 14, 18, 20],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2, 3],
    knn__leaf_size=[*range(1, 50, 5)],  # 1, 6, 11, ..., 46
)

# The helper's result is unpacked as (score, best model).
knn_updated_score, knn_best_model = ms.grid_search(knn, knn_params, X, y)

Params:  {'knn__leaf_size': 1, 'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'uniform'}


In [367]:
# Record the tuned KNN score in the 'upgraded' column of the score table.
model_scores.loc['knn','upgraded'] = knn_updated_score

### SVC

In [368]:
# Support vector classifier: standardize the features, then search over C, kernel,
# gamma and class weighting.
# 'svc__degree' is deliberately not part of the grid: scikit-learn only uses
# `degree` for the 'poly' kernel, and only 'rbf' and 'sigmoid' are searched here,
# so varying it would refit identical models and triple the search time.
svc = Pipeline([('scaler', StandardScaler()),
                ('svc', SVC())])
svc_params = {'svc__C': [0.1, 1, 10, 100, 1000],
              'svc__kernel': ['rbf', 'sigmoid'],
              'svc__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'svc__class_weight': ['balanced', None]}
# The helper's result is unpacked as (score, best model).
svc_updated_score, svc_best_model = ms.grid_search(svc, svc_params, X, y)

Params:  {'svc__C': 10, 'svc__class_weight': None, 'svc__degree': 1, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


In [369]:
# Record the tuned SVC score in the 'upgraded' column of the score table.
model_scores.loc['svc','upgraded'] = svc_updated_score

### Random Forest Classifier
*Tree-based models have many tunable parameters. To narrow the search, first run a randomized search, then use its best parameters to define a smaller grid for the grid search.*

In [370]:
#rf = RandomForestClassifier()

In [371]:
# rf_wide_params = {
#     'n_estimators':[10,100,200,500],
#     'criterion':['gini','entropy','log_loss'],
#     'max_depth':[10,50,None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features':['sqrt', 'log2',None]
# }
# rf_test_score = ms.randomize_search(rf,rf_wide_params,X,y)

In [372]:
# rf_test_score

In [373]:
# rf_params = {
#     'n_estimators':[70,100,150],
#     'criterion':['log_loss'],
#     'max_depth':[None,50],
#     'min_samples_split': [5,10,15],
#     'min_samples_leaf': [4],
#     'max_features':[ None]
# }
# rf_updated_score,rf_best_model = ms.grid_search(rf,rf_params,X,y)

In [374]:
# model_scores.loc['rf','upgraded'] = rf_updated_score

### Gradient Boosting
*Grid search takes a long time. Consider skipping it when auditing.*

In [375]:
# gb = GradientBoostingClassifier()

In [376]:
# gb_wide_params = {
#     'n_estimators':[10,100,200,500],
#     'criterion':['friedman_mse','squared_error'],
#     'loss':['log_loss','exponential'],
#     'learning_rate': [0.001, 0.01, 0.1,0.5],
#     'subsample': [0.1, 0.2,0.5,0.7,0.9],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_depth':[None, 1,3,5],
#     'warm_start':[True,False]
# }
# gb_test_score = ms.randomize_search(gb,gb_wide_params,X,y)
# gb_test_score

In [377]:
# gb_params = {
#     'n_estimators':[50,100,150],
#     'criterion':['squared_error'],
#     'loss':['exponential'],
#     'learning_rate': [ 0.01, 0.1,],
#     'subsample': [0.5,0.7],
#     'min_samples_split': [2,5, 10,],
#     'min_samples_leaf': [1, 2],
#     'max_depth':[None,5],
#     'warm_start':[False, True]
# }
# gb_updated_score, gb_best_model = ms.grid_search(gb,gb_params,X,y)

In [378]:
# model_scores.loc['gb','upgraded'] = gb_updated_score

### Performance - Accuracy

In [379]:
# Display the baseline vs. upgraded comparison table. Rows whose tuning cells were
# not run keep the placeholder value in the 'upgraded' column.
model_scores

Unnamed: 0,baseline,upgraded
lg,0.813318,0.829021
rf,0.802082,0.0
knn,0.802082,0.847019
svc,0.802063,0.844769
gb,0.822339,0.0


### Predict

In [380]:
# Load the test set and run it through the same preprocessing helper.
# NOTE(review): the second positional argument (True) presumably switches
# preprocess into a test-set mode — confirm in scripts.preprocess.
# The second returned value is discarded. This rebinds `data`, which held the
# training data until now.
data = pre.get_data('../data/test.csv')
X_test, _ = pre.preprocess(data,True)

In [381]:
# NOTE(review): presumably aligns the test columns with the training features X
# (e.g. adds columns missing from the test set) — confirm in scripts.preprocess.
# X_test is rebound to the helper's result.
X_test = pre.find_missing_cols(X_test,X)

In [382]:
# Predict on the test features with the model returned by the SVC grid search.
y_pred = svc_best_model.predict(X_test)

In [383]:
# Save the predictions for submission: one row per PassengerId from the raw test
# file, paired with the predicted Survived value.
temp = pd.read_csv("../data/test.csv")[['PassengerId']].assign(Survived=y_pred)
temp.to_csv("../solutions/submission_svc_15.csv", index=False)