# Overview
This notebook is used to create tools to automate the training, tuning and saving of different sklearn models


# Preliminary: Imports and constants

In [2]:
# import packages
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [3]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVC

# import tuning modules
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# import metrics and scoring modules
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, balanced_accuracy_score

import math 
import pickle

In [4]:
from sklearn.metrics import get_scorer_names
score_function = {"accuracy": accuracy_score, "precision": precision_score, "recall": recall_score, 
                  "f1": f1_score, "balanced_accuracy": balanced_accuracy_score}

# constant used for cross validation
CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
# get_scorer_names()

# Systemizing ML processes
In this block we split the most common functionalities used in Machine learning in function block for re-usibility. 

In [5]:
X, Y = make_classification(n_samples=4000, n_features=20, n_classes=3, random_state = 18, n_informative=8)
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [6]:
def tune_model(model, params_grid, X_train, y_train, cv=None, scoring=None):
    # scoring should be determined depending on the nature of the classification problem
    if scoring is None:
        # if the classification problem is binary
        if len(np.unique(y_train)) == 2:
            scoring = 'f1'
        else:
            scoring = 'balanced_accuracy'

    if cv is None:
        cv = CV

    searcher = GridSearchCV(model, param_grid=params_grid, cv=cv, scoring=scoring, n_jobs=-1)
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_


In [7]:
def evaluate_tuned_model(tuned_model, X_train, X_test, y_train, y_test, train=True, metrics=None):
    if metrics is None:
        metrics = ['accuracy']
    if isinstance(metrics, str):
        metrics = [metrics]

    # train the model
    if train:
        tuned_model.fit(X_train, y_train)

    # predict on the test dataset
    y_pred = tuned_model.predict(X_test)
    # evaluate the model
    scores = dict(list(zip(metrics, [score_function[m](y_test, y_pred) for m in metrics])))
    return tuned_model, scores


In [8]:
def save_model(tuned_model, path):
    with open(path, 'wb') as f:
        pickle.dump(tuned_model, f)


def load_model(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


In [9]:
  
def try_model(model, X, y, params_grid, save=True, save_path=None, test_size=0.2, tune_metric=None,
              test_metrics=None, cv=None):
    # the dataset passed is assumed to be ready to be processed
    # all its features are numerical and all its missing values are imputed/discarded

    if test_metrics is None:
        test_metrics = ['accuracy']

    if save and save_path is None:
        raise ValueError("Please pass a path to save the model or set the 'save' parameter to False")

    # split the dataset into train and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=11, stratify=y)

    # tune the model
    tuned_model = tune_model(model, params_grid, X_train, y_train, cv=cv, scoring=tune_metric)

    # evaluate teh tuned model
    model, results = evaluate_tuned_model(tuned_model, X_train, X_test, y_train, y_test, test_metrics)
    # save the model to the passed path
    if save:
        save_model(tuned_model, save_path)

    return model, results


# Common Machine Learning models
In this subsection, we customize the ML processes considered above for the most common Machine Learning models:
* Logistic Regression
* Linear SVM
* DecisionTreeClassifier
* RandomForestClassifier
* XGBoostClassifier

## Logistic Regression

In [10]:
lr_basic = LogisticRegression(max_iter=5000)
# lsvc_basic = LinearSVC(max_iter=5000)

LR_grid = {"C": np.logspace(-5, 5, 20)}
# LSVC_grid = {}

def try_LR(X, y, lr_model=lr_basic, params_grid=None, save=True, save_path=None, 
           test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    if params_grid is None:
        params_grid = LR_grid
    if test_metrics is None:
        test_metrics = ['accuracy']
    return try_model(lr_model, X, y, params_grid, save=save, 
    save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

# def try_LSVM(X, y, lsvc_model = lsvc_basic, params_grid=)
    
lr, results = try_LR(X, Y, save=False)

print(results)

{'accuracy': 0.66625}


## Linear SVC

In [17]:
lsvc_model = LinearSVC(max_iter=4000)
lsvc_parameters = {'C': [100, 10, 1.0, 0.1, 0.001], 'penalty': ['l1', 'l2'], 'loss': ['hinge', 'squared_hinge'],
                   'dual':[True, False], 'fit_intercept': [True, False]}

def try_LSVC(X, y, lr_model=lsvc_model, params_grid=None, save=True, save_path=None, 
           test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    if params_grid is None:
        params_grid = lsvc_parameters
    if test_metrics is None:
        test_metrics = ['accuracy']
    return try_model(lr_model, X, y, params_grid, save=save, 
    save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lsvc, results = try_LSVC(X, Y, save=False, params_grid = lsvc_parameters)

print(results)    


200 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_t

{'accuracy': 0.65625}




## Decision Tree Classifier

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_parameters = {'criterion': ['gini', 'entropy'], 'splitter':['random', 'best'], 'max_depth':list(range(1, 30)),
                            'min_samples_split': list(range(1, 20))}

def try_Decision_Tree(X, y, dtc_model=decision_tree_model, params_grid=None, save=True, save_path=None, 
           test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    if params_grid is None:
        params_grid = decision_tree_parameters
    if test_metrics is None:
        test_metrics = ['accuracy']
    return try_model(dtc_model, X, y, params_grid, save=save, 
    save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lsvc, results = try_Decision_Tree(X, Y, save=False, params_grid = decision_tree_parameters)

print(results)    

580 fits failed out of a total of 11020.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
580 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 969, in fit
    super().fit(
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\tree\_classes.py", line 265, in fit
    check_scalar(
  File "c:\Users\user\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\valid

{'accuracy': 0.80625}


## Random Forest Classifier

In [None]:
random_forest_model = RandomForestClassifier()
random_forest_parameters = {'max_leaf_nodes':[2, 3, 4, 5, 6, 7], 'min_samples_split':[5, 10, 20, 50], 'max_depth': [5,10,15,20],
                            'max_features':[3, 4, 5], 'n_estimators': [50, 100, 200]}

def try_RFC(X, y, rfc_model=random_forest_model, params_grid=None, save=True, save_path=None, 
           test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    if params_grid is None:
        params_grid = random_forest_parameters
    if test_metrics is None:
        test_metrics = ['accuracy']
    return try_model(rfc_model, X, y, params_grid, save=save, 
    save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lsvc, results = try_RFC(X, Y, save=False, params_grid = random_forest_parameters)

print(results)

{'accuracy': 0.74625}


## XGBoostClassifier

In [None]:
xgboost_model = XGBClassifier()
xgboost_parameters = {'booster': ['gbtree', 'gblinear', 'dart'], 'eta': [0.01, 0.05, 0.1, 0.15, 0.2]}

def try_xgboost(X, y, xgb_model=xgboost_model, params_grid=None, save=True, save_path=None, 
           test_size=0.2, tune_metric=None, test_metrics=['accuracy'], cv=None):
    if params_grid is None:
        params_grid = xgboost_parameters
    if test_metrics is None:
        test_metrics = ['accuracy']
    return try_model(xgboost_model, X, y, params_grid, save=save, 
    save_path=save_path, test_size=test_size, tune_metric=tune_metric, test_metrics=test_metrics, cv=cv)

lsvc, results = try_xgboost(X, Y, save=False, params_grid = xgboost_parameters)

print(results)

{'accuracy': 0.8925}
