# Model Selection

## Imports and reading data

In [78]:
#imports
import sys
sys.path.append('..')

import importlib
import pandas as pd
import numpy as np
import scripts.preprocess as pre
import scripts.model_selection as ms

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

In [115]:
# reload after changing files
importlib.reload(pre)
importlib.reload(ms)

<module 'scripts.model_selection' from '/home/cita_zane/Documents/kood_johvi/ai/kaggle-titanic/notebooks/../scripts/model_selection.py'>

In [81]:
# fetch data and preproces
data = pre.get_data('../data/train.csv')
X, y = pre.preprocess(data)


In [82]:
# calculate base model accuracy scores based on cross validation (5 models)
baseline = ms.cross_validate_models(X,y)

In [83]:
# save base scores in Dataframe and add extra column for upgraded scores
upgraded = pd.Series(data={'lg':0, 'rf':0, 'knn': 0, 'svc':0, 'gb':0})
model_scores = pd.DataFrame({"baseline":baseline,"upgraded":upgraded})

### Logistic Regression

In [84]:
# logistic regression CV
lr = Pipeline([('scaler', StandardScaler()),
               ('lg', LogisticRegression())])
lr_params = {'lg__solver': ['lbfgs', 'liblinear', 'saga'],
            'lg__C':np.logspace(0, 4, num=10),
            'lg__class_weight':['balanced', None],
            'lg__max_iter':[10000]}
lg_updated_score,lg_best_model = ms.grid_search(lr,lr_params,X,y)


Params:  {'lg__C': 2.7825594022071245, 'lg__class_weight': None, 'lg__max_iter': 10000, 'lg__solver': 'lbfgs'}


In [85]:
model_scores.loc['lg','upgraded'] = lg_updated_score

### KNN

In [86]:
knn = Pipeline([('scaler', StandardScaler()),
               ('knn',KNeighborsClassifier())])
knn_params = {'knn__n_neighbors': [1,3, 5, 7],
            'knn__weights':['uniform','uniform'],
            'knn__p':[1,2,3]}
knn_updated_score,knn_best_model = ms.grid_search(knn,knn_params,X,y)

Params:  {'knn__n_neighbors': 5, 'knn__p': 1, 'knn__weights': 'uniform'}


In [87]:
model_scores.loc['knn','upgraded'] = knn_updated_score

### SVC

In [95]:
svc = Pipeline([('scaler', StandardScaler()),
               ('svc',SVC())])
svc_params = {'svc__C': [1.0,2.0, 5.0],
            'svc__kernel':['rbf','sigmoid','poly'],
            'svc__degree':[1,3,5],
            'svc__gamma':['scale', 'auto'],
            'svc__shrinking':[True, False],
            'svc__class_weight':['balanced', None]}
svc_updated_score,svc_best_model = ms.grid_search(svc,svc_params,X,y)

Params:  {'svc__C': 2.0, 'svc__class_weight': None, 'svc__degree': 1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__shrinking': False}


In [96]:
model_scores.loc['svc','upgraded'] = svc_updated_score

### Random Forest Classifier

In [97]:
rf = RandomForestClassifier()

In [98]:
rf_wide_params = {
    'n_estimators':[10,100,200,500],
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[10,50,None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features':['sqrt', 'log2',None]
}
rf_test_score = ms.randomize_search(rf,rf_wide_params,X,y)

Params:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10, 'criterion': 'gini'}


In [67]:
rf_test_score

0.905511811023622

In [99]:
rf_params = {
    'n_estimators':[150,200,250],
    'criterion':['gini','entropy'],
    'max_depth':[None],
    'min_samples_split': [7, 10,15],
    'min_samples_leaf': [1, 2],
    'max_features':['sqrt', None]
}
rf_updated_score,rf_best_model = ms.grid_search(rf,rf_params,X,y)

Params:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 200}


In [100]:
model_scores.loc['rf','upgraded'] = rf_updated_score

### Gradient Boosting

In [71]:
gb = GradientBoostingClassifier()

In [74]:
gb_wide_params = {
    'n_estimators':[10,100,200,500],
    'criterion':['friedman_mse','squared_error'],
    'loss':['log_loss','exponential'],
    'learning_rate': [0.001, 0.01, 0.1,0.5],
    'subsample': [0.1, 0.2,0.5,0.7,0.9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_depth':[None, 1,3,5],
    'warm_start':[True,False]
}
gb_test_score = ms.randomize_search(gb,gb_wide_params,X,y)
gb_test_score

Params:  {'warm_start': False, 'subsample': 0.7, 'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5, 'loss': 'exponential', 'learning_rate': 0.01, 'criterion': 'friedman_mse'}


0.8537682789651294

In [75]:
gb_params = {
    'n_estimators':[50,100,150],
    'criterion':['squared_error'],
    'loss':['exponential'],
    'learning_rate': [ 0.01, 0.1,],
    'subsample': [0.5,0.7],
    'min_samples_split': [2,5, 10,],
    'min_samples_leaf': [1, 2],
    'max_depth':[None,5],
    'warm_start':[False, True]
}
gb_updated_score, gb_best_model = ms.grid_search(gb,gb_params,X,y)

Params:  {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.1, 'warm_start': True}


In [76]:
model_scores.loc['gb','upgraded'] = gb_updated_score

In [101]:
model_scores

Unnamed: 0,baseline,upgraded
lg,0.78519,0.8009
rf,0.811046,0.929134
knn,0.806532,0.863892
svc,0.82343,0.843645
gb,0.826795,0.0


### Predict

In [123]:
data = pre.get_data('../data/test.csv')
X_test, _ = pre.preprocess(data,True)

Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Family_Size    0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [124]:
#predict on the model
y_pred= svc_best_model.predict(X_test)

In [125]:
# save prediction for sumbission
temp = pd.DataFrame(pd.read_csv("../data/test.csv")['PassengerId'])
temp['Survived'] = y_pred
temp.to_csv("../solutions/submission_svc_1.csv", index = False)