# Introduction to Scikit-Learn (sklearn)

0. An end to end sklearn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions or our data
4. Evaluating the model 
5. Improve the model
6. =>Save and load trained model
7. Putting it all together!

## 6. Save and load trained model
### There are three ways of tunning parameters
1. using pickle
2. using joblib

In [2]:
## how to get hyperparameters in the model
import pandas as pd
import numpy as np


In [3]:
from sklearn.metrics import (accuracy_score,
                             f1_score, 
                             recall_score,
                             precision_score,
                             r2_score, 
                             mean_absolute_error, 
                             mean_squared_error)


def evaluate_classiification_model(y_test, y_pred):
    
    accuracy = accuracy_score(y_test, y_pred) * 100
    precission = precision_score(y_test, y_pred) * 100
    recall = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    
    metric_dict = {'accuracy': round(accuracy, 2),
                  'precission': round(precission, 2),
                  'recall': round(recall, 2),
                  'f1': round(f1, 2)}
    [print(each+":", metric_dict[each]) for each in metric_dict.keys()]
    return metric_dict
    
def evaluate_regression_model(y_test, x_pred):
    
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    metric_dict = {'r2': round(r2, 2),
                  'mae': round(mae, 2),
                  'mse': round(mse, 2)}
    [print(each+":", metric_dict[each]) for each in metric_dict.keys()]
    return metric_dict


In [4]:
from sklearn.ensemble import RandomForestClassifier

# load the data 
heart_disease = pd.read_csv("dataset/heart-disease.csv")

# split into X and y
X = heart_disease.drop("target", axis=1)
y = heart_disease['target']

# train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
from sklearn.model_selection import (train_test_split,
                                     RandomizedSearchCV)

grid = {"n_estimators": [10, 100, 200, 500, 1200],
       "max_depth": [None, 5, 10, 20, 30],
       "max_features": ["auto", "sqrt"],
       "min_samples_split": [2, 4, 6],
       "min_samples_leaf": [1, 2, 4]}


np.random.seed(42)

# choose a model
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                           param_distributions=grid,
                           n_iter=10, # number of models to try
                           cv=5, # cross validation
                           verbose=2)

clf.fit(X_train, y_train)
rs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time=   0.4s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time=   0.4s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time=   0.8s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=100; t

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=1),
                   param_distributions={'max_depth': [None, 5, 10, 20, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [10, 100, 200, 500,
                                                         1200]},
                   verbose=2)

In [7]:
grid2= {"n_estimators": [100, 200, 1000],
       "max_depth": [20, 30],
       "max_features": ["auto", "sqrt"],
       "min_samples_split": [4, 6],
       "min_samples_leaf": [1, 2]}

In [8]:
from sklearn.model_selection import GridSearchCV

grid_clf = GridSearchCV(estimator=clf,
                       param_grid=grid2,
                       cv=5,
                       verbose=2)

grid_clf.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=20

[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features

[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.2s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=200; total time=   0.3s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   1.6s
[CV] END max_depth=30, max_features=aut

[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   2.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   1.9s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=6, n_estimators=1000; total time=   1.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=s

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=1),
             param_grid={'max_depth': [20, 30],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [4, 6],
                         'n_estimators': [100, 200, 1000]},
             verbose=2)

## Using Pickle


In [13]:
# dump the model
import pickle

with open("heart-disease-model-baseline.pkl", "wb") as base:
    pickle.dump(clf, base)


with open("heart-disease-model-random.pkl", "wb") as base:
    pickle.dump(rs_clf, base)


with open("heart-disease-model-grid.pkl", "wb") as base:
    pickle.dump(grid_clf, base)



In [17]:
# Load and use the model
with open("heart-disease-model-baseline.pkl", "rb") as base:
    clf = pickle.load(base)


with open("heart-disease-model-random.pkl", "rb") as base:
    rs_clf = pickle.load(base)

with open("heart-disease-model-grid.pkl", "rb") as base:
    grid_clf = pickle.load(base)

y_clf_pred = clf.predict(X_test)
y_rs_pred = rs_clf.predict(X_test)
y_gird_pred = grid_clf.predict(X_test)

evaluate_classiification_model(y_test, y_clf_pred)
print("")
evaluate_classiification_model(y_test, y_rs_pred)
print("")
evaluate_classiification_model(y_test, y_gird_pred);

accuracy: 85.25
precission: 90.62
recall: 82.86
f1: 86.57

accuracy: 88.52
precission: 91.18
recall: 88.57
f1: 89.86

accuracy: 86.89
precission: 90.91
recall: 85.71
f1: 88.24


## Using Joblib

In [20]:
# dump
from joblib import load, dump
dump(clf, '1.joblib')
dump(rs_clf, '2.joblib')
dump(grid_clf, '3.joblib');

In [21]:
# Load
clf = load('1.joblib')
rs_clf = load('2.joblib')
grid_clf = load('3.joblib')

# use loaded model
y_clf_pred = clf.predict(X_test)
y_rs_pred = rs_clf.predict(X_test)
y_gird_pred = grid_clf.predict(X_test)

evaluate_classiification_model(y_test, y_clf_pred)
print("")
evaluate_classiification_model(y_test, y_rs_pred)
print("")
evaluate_classiification_model(y_test, y_gird_pred);

accuracy: 85.25
precission: 90.62
recall: 82.86
f1: 86.57

accuracy: 88.52
precission: 91.18
recall: 88.57
f1: 89.86

accuracy: 86.89
precission: 90.91
recall: 85.71
f1: 88.24
