# Overview
This notebook is used to create tools to automate the training, tuning and saving of different sklearn models


# Preliminary: Imports and constants

In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# import models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

# import tuning modules
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# import metrics and scoring modules
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, balanced_accuracy_score

import math 
import pickle

In [3]:
from sklearn.metrics import get_scorer_names
score_function = {"accuray": accuracy_score, "precision": precision_score, "recall": recall_score, 
                  "f1": f1_score, "balanced_accuracy": balanced_accuracy_score}

# constant used for cross validation
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
# get_scorer_names()

# systemizing ML processes
In this block we split the most common functionalities used in Machine learning in function block for re-usibility. 

In [4]:
X, Y = make_classification(n_samples=4000, n_features=20, n_classes=3, random_state = 18, n_informative=8)
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [5]:
def tune_model(model, params_grid, X_train, y_train, cv = None, scoring=None):
    # scoring should be determined depending on the nature of the classification problem
    if scoring is None:
        # if the classification problem is binary
        if len(np.unique(y_train)) == 2:
            scoring='f1'
        else:
            scoring='balanced_accuracy'
        
    if cv is None:
        cv = CV
        
    searcher = GridSearchCV(model, param_grid=params_grid, cv=cv, scoring=scoring, n_jobs=-1)
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_


In [6]:
def evaluate_tuned_model(tuned_model, X_train, X_test, y_train, y_test, train=True, metrics=['accuracy']):

    if isinstance(metrics, str):
        metrics = [metrics]
        
    # train the model
    if train:
        tuned_model.fit(X_train, y_train)
        
    # predict on the test dataset
    y_pred = tuned_model.predict(X_test)
    # evluate the model
    scores = dict(list(zip(metrics, [score_function[m](y_test, y_pred) for m in metrics])))    
    return tuned_model, scores


In [7]:
def save_model(tuned_model, path):
    with open(path, 'wb') as f:
        pickle.dump(tuned_model, f)

def load_model(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


In [8]:
  
def try_model(model, X, y, params_grid, save=True, save_path=None, test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    # the dataset passed is assumed to be ready to be processed
    # all its features are numerical and all its missing values are imputed/discarded
    
    if save and save_path is None:
        raise ValueError("Please pass a path to save the model or set the 'save' parameter to False")
    
    # split the dataset into train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,random_state=11, stratify=y)
    
    # tune the model
    tuned_model = tune_model(model, params_grid, X_train, y_train, cv=cv, scoring =tune_metric)
    
    # evluate teh tuned model
    model, results = evaluate_tuned_model(tuned_model, X_train, y_train, X_test, y_test, test_metrics)    
    # save the model to the passed path
    if save:
        save_model(tuned_model, save_path)
     
    return model, results


# Common Machine Learning models
In this subsection, we customize the ML processes considered above for the most common Machine Learning models:
* Logistic Regression
* Linear SVM
* DecisionTreeClassifier
* RandomForestClassifier
* XGBoostClassifier

## Logistic Regression

In [9]:
lr_basic = LogisticRegression(max_iter=5000)

LR_grid = {"C": [0.1]}

def try_LR(X, y, lr_model=lr_basic, params_grid=LR_grid, save=True, save_path=None, test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    return try_model(lr_model, X, y, params_grid, save=save, save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)
    
lr, results = try_LR(X, Y, save=False)

22.71s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


KeyboardInterrupt: 

In [None]:


param_grid = {
    'C': [0.5, 1.0, 1.5]
}

Log_clf = LogisticRegression()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
grid = GridSearchCV(Log_clf, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
pickle.dump(grid.best_estimator_, open('log_reg_clf', 'wb'))
print(accuracy_score(y_test, y_pred))

In [12]:
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


param_grid = {
    'max_depth': range(10, 20),
    'min_samples_split': [2, 3, 4]
}

tree=DecisionTreeClassifier()
grid = GridSearchCV(tree, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
pickle.dump(grid.best_estimator_, open('dec_tree_clf', 'wb'))
print(accuracy_score(y_test, y_pred))

Fitting 15 folds for each of 30 candidates, totalling 450 fits
{'max_depth': 10, 'min_samples_split': 4}
0.855


In [13]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'max_depth': range(8, 13),
    'min_samples_split': [2, 3, 4],
    'n_estimators': range(95, 111, 5)
}


forest=RandomForestClassifier()
grid = GridSearchCV(forest, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
pickle.dump(grid.best_estimator_, open('rand_forest_clf', 'wb'))
print(accuracy_score(y_test, y_pred))

Fitting 15 folds for each of 60 candidates, totalling 900 fits
{'max_depth': 10, 'min_samples_split': 3, 'n_estimators': 100}
0.93


In [14]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': range(5, 15)
}

model = KNeighborsClassifier()
grid = GridSearchCV(model, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('knn_clf', 'wb'))
y_pred = grid.predict(X_test)
print(accuracy_score(y_test, y_pred))

Fitting 15 folds for each of 10 candidates, totalling 150 fits
{'n_neighbors': 5}
0.96


In [15]:
##SVM
from sklearn.svm import SVC


param_grid = {
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'C': [0.5, 1.0, 2.0]
}

svmclassifier = SVC(kernel = 'rbf', random_state = 0)
grid = GridSearchCV(svmclassifier, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('svm_clf', 'wb'))
y_pred = grid.predict(X_test)
print(accuracy_score(y_test, y_pred))

Fitting 15 folds for each of 18 candidates, totalling 270 fits
{'C': 2.0, 'degree': 2, 'gamma': 'scale'}
0.975


In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
import numpy as np

model = XGBClassifier(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

param_grid = {
    'base_score': [0.5, 1, 1.5, 2], 
    'max_depth': [3, 4, 5],
    'colsample_bylevel': [1, 2, 3]
}

grid = GridSearchCV(model, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('xgb_clf', 'wb'))
y_pred = grid.predict(X_test)
print(accuracy_score(y_test, y_pred))

Fitting 15 folds for each of 36 candidates, totalling 540 fits


360 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/sklearn.py", line 1516, in fit
    self._Booster = train(
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)


{'base_score': 0.5, 'colsample_bylevel': 1, 'max_depth': 5}
0.94
