# This notebook is for optimizing the pipeline hyperparameters

## Imports

### Basic imports

In [96]:
import numpy as np
import pandas as pd
import pickle
import os

In [97]:
from sklearn.model_selection import RandomizedSearchCV, cross_validate, KFold
from sklearn.decomposition import FastICA, PCA
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from skopt.space import Real, Categorical, Integer

In [98]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Functions for storing the data

In [99]:
def pickle_dump(obj, fname):
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)

def pickle_load(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)
    
def create_directory(dir_path="path/to/file"):
    """
    Wrapper function for the os.makedirs method.

    Args:
        dir_path (Any): The data to be pickled.

    Returns:
        Either the desired directory is created or a message stating it already exists.
    """
    try:
        os.makedirs(dir_path)
    except FileExistsError:
        print(f"Path to {dir_path} already exists, yo.")

## Setting hyperparameter grids for random search

In [100]:
def sklearn_setup_hp_grid(number_grid):
    # Set up possible values of parameters to optimize over, follows autosklearn and assignment 2
    params_dt = { 'classifier__criterion' : ['gini', 'entropy'],
                'classifier__min_samples_leaf': np.linspace(1, 20, num=number_grid, dtype=int),
                'classifier__min_samples_split': np.linspace(2, 20, num=number_grid, dtype=int) }

    params_gb = {'classifier__learning_rate' : np.linspace(0.01, 1.0, num=number_grid), 
                'classifier__max_leaf_nodes' : np.linspace(3, 2047, num=number_grid, dtype=int), 
                'classifier__min_samples_leaf' : np.linspace(1, 200, num=number_grid, dtype=int),
                'classifier__n_iter_no_change' : np.linspace(1, 20, num=number_grid, dtype=int),
                'classifier__validation_fraction' : np.linspace(0.01, 0.4, num=number_grid)}

    params_knn = {'classifier__n_neighbors' : np.linspace(1,100, num=number_grid, dtype=int),
                'classifier__p' : [1,2],
                'classifier__weights' : ['uniform', 'distance']}

    params_log = {'classifier__C' : np.linspace(0.03125, 5, num=number_grid), ## Restricted the upper bound from 32768 to 5, to mimic autosklearn and preserve runtime
                'classifier__tol' : np.linspace(0.00001, 0.1, num=number_grid),
                'classifier__penalty' : ['l1', 'l2']}

    params_rf = {'classifier__bootstrap' : [True, False],
                'classifier__criterion' : ['gini', 'entropy'],
                'classifier__min_samples_leaf': np.linspace(1, 20, num=number_grid, dtype=int), 
                'classifier__min_samples_split': np.linspace(2, 20, num=number_grid, dtype=int) }
    
    param_grids = [params_dt, params_gb, params_knn, params_log, params_rf]
    return param_grids

## Setting hyperparameter grids that are compatible with BayesianSearchCV

In [101]:
def skopt_setup_hp_grid():
    # Set up possible values of parameters to optimize over, follows autosklearn and assignment 2
    params_dt = { 'classifier__criterion' : Categorical(['gini', 'entropy']),
                'classifier__min_samples_leaf': Integer(1,20),
                'classifier__min_samples_split':  Integer(2,20) }

    params_gb = {
                'classifier__learning_rate' : Real(0.01, 1.0), 
                'classifier__max_leaf_nodes' :  Integer(3,2047), 
                'classifier__min_samples_leaf' :  Integer(1,200),
                # 'classifier__n_iter_no_change' :  Integer(1,20), ## This throws errors and I don't know why >:(
                'classifier__validation_fraction' : Real(0.01, 0.4)
                }

    params_knn = {'classifier__n_neighbors' :  Integer(1,100),
                'classifier__p' : Categorical([1,2]),
                'classifier__weights' : Categorical(['uniform', 'distance'])}

    params_log = {'classifier__C' : Real(0.03125, 5.0), ## Restricted the upper bound from 32768 to 5, to mimic autosklearn and preserve runtime
                'classifier__tol' : Real(0.00001, 0.1),
                'classifier__penalty' : Categorical(['l1', 'l2'])}

    params_rf = {'classifier__bootstrap' : Categorical([True, False]),
                'classifier__criterion' : Categorical(['gini', 'entropy']),
                'classifier__min_samples_leaf':  Integer(1,20), 
                'classifier__min_samples_split':  Integer(2,20) }
    
    param_grids = [params_dt, params_gb, params_knn, params_log, params_rf]
    return param_grids

## Defining your own "steps" in the pipeline

In [102]:
class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, option='none'):
        self.option = option

    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        feature_selection = ["alcohol", "density", "chlorides", "volatile acidity",
                              "total sulfur dioxide", "fixed acidity", "pH"]
        
        no_feature_selection = ["alcohol", "density", "chlorides", "volatile acidity",
                                 "total sulfur dioxide", "fixed acidity", "pH",
                                "residual sugar", "sulphates", "citric acid", "free sulfur dioxide"]
        
        new_X = pd.DataFrame.copy(X)
        if self.option == 'none':
            new_X = new_X[no_feature_selection]
        elif self.option == 'yes':
            new_X = new_X[feature_selection]
        else:
            print("Wrong option specified")
            return
        
        return new_X
    
class FeaturePreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self, method = 'none'):
        self.method = method

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.method == 'none':
            return X

        if self.method == 'pca':
            transformer = PCA(random_state=5)
            X = transformer.fit_transform(X)
        elif self.method == 'fast_ica':
            ## Had to modify this to allow for it to converge and lessen warnings
            transformer = FastICA(random_state=5, whiten='unit-variance', max_iter=5000, tol=0.5)
            X = transformer.fit_transform(X)
        else:
            print("Wrong option specified")
            return 
        return X

## Function for Randomized Search

In [103]:
def sklearn_pipeline(X, y, hpo_method):
    classifier_names = ['decision_tree', 'gradient_boosting', 'k_nearest_neighbors',
                        'logistic_regression', 'random_forest']
    # Here are our classifiers
    classifiers = [ DecisionTreeClassifier(random_state= 5),
                    GradientBoostingClassifier(random_state= 5),
                    KNeighborsClassifier(),
                    LogisticRegression(random_state= 5, solver='liblinear'),
                    RandomForestClassifier(random_state= 5)]

    # Loop for each trial
    all_data_accuracy = {}
    all_data_hyperparams = {}

    # hyperparameter ranges for classifiers:
    params_grid_rs = sklearn_setup_hp_grid(30)
    params_grid_bo = skopt_setup_hp_grid()


    for j, classifier in enumerate(classifiers):
        classifier_name = classifier_names[j]

        # Setting the pipelines
        steps = [('feature_selection', FeatureSelection()),
                ('feature_preprocessor', FeaturePreprocessing() ),
                ('classifier', classifier)]
        pipeline = Pipeline(steps)

        # Getting the pipeline hyperparameters:
        if hpo_method == 'rs':
            hyperparams = { 'feature_selection__option': ['none', 'yes'],
                            'feature_preprocessor__method': ['pca', 'fast_ica', 'none']}
            classifier_grid = params_grid_rs[j]
            
        if hpo_method == 'bo':
            hyperparams = { 'feature_selection__option': Categorical(['none', 'yes']),
                            'feature_preprocessor__method': Categorical(['pca', 'fast_ica', 'none'])}
            classifier_grid = params_grid_bo[j]
        
        # Adding the classifier hyperparameters:
        for key in classifier_grid:
            hyperparams[key] = classifier_grid[key]

        

        inner_cv = KFold(n_splits=5, shuffle=True, random_state=5)
        outer_cv = KFold(n_splits=5, shuffle=True, random_state=5)
        if hpo_method == 'rs':
            clf = RandomizedSearchCV(pipeline, hyperparams, cv=inner_cv, scoring= 'accuracy',
                                     random_state= 5, n_iter = 30, n_jobs=3)
        elif hpo_method == 'bo':
            clf = BayesSearchCV(pipeline, hyperparams, cv=inner_cv, scoring= 'accuracy',
                                random_state= 5, n_iter = 30, n_jobs = 3)
        else:
            print("Wrong hpo method given")
            return
        
        cv_result = cross_validate(clf, X=X, y=y, cv=outer_cv, return_estimator=True)

        all_data_accuracy[classifier_name] =  cv_result['test_score']

        hyperparams_runs = [cv_result['estimator'][i].best_params_ for i in range(5)]
        all_data_hyperparams[classifier_name] = hyperparams_runs
        print(f'classifier {classifier_name} is done')
        
    dir_path = './data_manual'
    create_directory(dir_path)
    pickle_dump(all_data_accuracy, f'{dir_path}/{hpo_method}_accuracy.pkl')
    pickle_dump(all_data_hyperparams, f'{dir_path}/{hpo_method}_hyperparameters.pkl')  
    return 

In [None]:
fl = "winequality-white.csv"
table = pd.read_csv(fl, delimiter = ";", header='infer')

## yes, this gets all the features, it is redundant but it will fix itself in the pipeline
no_feature_selection = ["alcohol", "density", "chlorides", "volatile acidity",
                        "total sulfur dioxide", "fixed acidity", "pH",
                        "residual sugar", "sulphates", "citric acid", "free sulfur dioxide"]
X = table[no_feature_selection]
y = table.quality

# Need to do this:
# pip install "numpy < 1.24.0"
for hpo_method in ['bo', 'rs']:
    sklearn_pipeline(X, y, hpo_method)
    print(f'Done with method {hpo_method}')

In [105]:
pickle_load('./data_manual/bo_accuracy.pkl')

{'decision_tree': array([0.61938776, 0.61734694, 0.58877551, 0.57609806, 0.58937692]),
 'gradient_boosting': array([0.5744898 , 0.58979592, 0.63061224, 0.61389173, 0.52298264]),
 'k_nearest_neighbors': array([0.65612245, 0.63163265, 0.63877551, 0.62206333, 0.64249234]),
 'logistic_regression': array([0.5622449 , 0.52142857, 0.54693878, 0.51787538, 0.52808989]),
 'random_forest': array([0.69591837, 0.69081633, 0.70816327, 0.66496425, 0.67313585])}

In [106]:
temp = pickle_load('./data_manual/bo_hyperparameters.pkl')
for ordered_dict in temp.values():
    for ord_dict in ordered_dict:
        print(dict(ord_dict))

{'classifier__criterion': 'entropy', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_preprocessor__method': 'none', 'feature_selection__option': 'none'}
{'classifier__criterion': 'gini', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_preprocessor__method': 'none', 'feature_selection__option': 'none'}
{'classifier__criterion': 'gini', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_preprocessor__method': 'none', 'feature_selection__option': 'none'}
{'classifier__criterion': 'gini', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_preprocessor__method': 'none', 'feature_selection__option': 'none'}
{'classifier__criterion': 'gini', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_preprocessor__method': 'none', 'feature_selection__option': 'yes'}
{'classifier__learning_rate': 0.7479753502026856, 'classifier__max_leaf_nodes': 1680, 'c

In [107]:
pickle_load('./data_manual/rs/accuracy.pkl')

{'decision_tree': array([0.55204082, 0.51326531, 0.53367347, 0.52502554, 0.52093973]),
 'gradient_boosting': array([0.58163265, 0.55612245, 0.52755102, 0.52911134, 0.53728294]),
 'k_nearest_neighbors': array([0.6255102 , 0.61020408, 0.61938776, 0.57303371, 0.59652707]),
 'logistic_regression': array([0.53571429, 0.49489796, 0.52653061, 0.49336057, 0.48927477]),
 'random_forest': array([0.61326531, 0.6       , 0.61734694, 0.59141982, 0.57201226])}