# This notebook is for optimizing the pipeline hyperparameters

## Imports

### Basic imports

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
import autosklearn.classification
from sklearn.model_selection import RandomizedSearchCV, cross_validate, KFold
from sklearn.decomposition import FastICA, PCA

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Functions for storing the data

In [4]:
def pickle_dump(obj, fname):
    """Serialize `obj` to the file `fname` with pickle.

    Missing parent directories of `fname` are created first, so the results of
    a long-running search are not lost at the very last step just because the
    output folder was never made.

    Parameters
    ----------
    obj : object
        Any picklable object.
    fname : str or os.PathLike
        Destination file; an existing file is overwritten.
    """
    parent = os.path.dirname(fname)
    if parent:  # a bare file name has no directory component to create
        os.makedirs(parent, exist_ok=True)
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)

def pickle_load(fname):
    """Read back and return the object that was pickled into the file `fname`.

    NOTE: unpickling can execute arbitrary code, so only load files written by
    this notebook or another trusted source.
    """
    with open(fname, mode='rb') as source:
        restored = pickle.load(source)
    return restored

## Autosklearn run with Bayesian Optimization

In [5]:
def bayesian_autoskl(X, y, feature_preprocessor, minute_run):
    """Run an auto-sklearn search and pickle its cross-validation results.

    Parameters
    ----------
    X, y :
        Features and target, passed unchanged to ``AutoSklearnClassifier.fit``.
    feature_preprocessor : str
        Name of the single auto-sklearn feature preprocessor the search may
        use; also part of the output file name.
    minute_run : int
        Time budget of the search in minutes; also part of the output file name.

    Returns
    -------
    None
        ``automl.cv_results_`` is written to
        ``./data/bayesian_fs/accuracy_{minute_run}_minutes_{feature_preprocessor}.pkl``.
    """
    fname = f'./data/bayesian_fs/accuracy_{minute_run}_minutes_{feature_preprocessor}.pkl'
    # Create the results folder *before* the search starts: otherwise a missing
    # folder only surfaces once the whole time budget has been spent, and the
    # results of that run are lost.
    os.makedirs(os.path.dirname(fname), exist_ok=True)

    automl = autosklearn.classification.AutoSklearnClassifier(
            include = {
                # Classifiers the search may choose from
                'classifier': ["decision_tree", "random_forest", "gradient_boosting", "liblinear_svc", "k_nearest_neighbors"],
                # Exactly one feature preprocessor, fixed by the caller
                'feature_preprocessor': [feature_preprocessor],
            },
            # Inner cross validation
            resampling_strategy="cv",
            resampling_strategy_arguments={"folds":5},

            # The budget is given in minutes; auto-sklearn expects seconds
            time_left_for_this_task=60*minute_run,
            tmp_folder=None,
            memory_limit=None,
            seed=3,
            # 0 = start the optimizer without meta-learned configurations
            initial_configurations_via_metalearning=0,

            n_jobs = 3,

            metric=autosklearn.metrics.accuracy,
        )
    # Run the automl model
    automl.fit(X, y)

    # Store results
    pickle_dump(automl.cv_results_, fname)

## Setting hyperparameter grids for random search

In [6]:
def sklearn_setup_hp_grid(number_grid):
    """Build the hyperparameter grids used by the randomized search.

    Every numeric hyperparameter gets `number_grid` evenly spaced candidate
    values between fixed bounds (ranges follow autosklearn and assignment 2).
    Integer-valued hyperparameters are produced with ``dtype=int``, so
    fractional steps are rounded down and values may repeat when `number_grid`
    is larger than the number of integers in the range.

    Returns
    -------
    list of dict
        Five grids, in this order: decision tree, gradient boosting,
        k-nearest neighbours, logistic regression, random forest.
    """
    def int_steps(low, high):
        # `number_grid` evenly spaced values, cast to integers
        return np.linspace(low, high, num=number_grid, dtype=int)

    def float_steps(low, high):
        # `number_grid` evenly spaced real values
        return np.linspace(low, high, num=number_grid)

    params_dt = {
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': int_steps(1, 20),
        'min_samples_split': int_steps(2, 20),
    }

    params_gb = {
        'learning_rate': float_steps(0.01, 1.0),
        'max_leaf_nodes': int_steps(3, 2047),
        'min_samples_leaf': int_steps(1, 200),
        'n_iter_no_change': int_steps(1, 20),
        'validation_fraction': float_steps(0.01, 0.4),
    }

    params_knn = {
        'n_neighbors': int_steps(1, 100),
        'p': [1, 2],
        'weights': ['uniform', 'distance'],
    }

    params_log = {
        # Upper bound restricted from 32768 to 5, to mimic autosklearn and preserve runtime
        'C': float_steps(0.03125, 5),
        'tol': float_steps(0.00001, 0.1),
        'penalty': ['l1', 'l2'],
    }

    params_rf = {
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': int_steps(1, 20),
        'min_samples_split': int_steps(2, 20),
    }

    return [params_dt, params_gb, params_knn, params_log, params_rf]

## Function for Randomized Search

In [7]:
def sklearn_rand_search(X, y, feature_preprocessor, number_grid):
    """Nested-CV randomized search for five classifiers; pickles scores and best params.

    For each classifier a ``RandomizedSearchCV`` (inner 5-fold CV) is evaluated
    with an outer 5-fold ``cross_validate``.

    The optional feature preprocessor is part of the estimator pipeline, so it
    is fit on the training part of every split only. Fitting it once on all of
    `X` before cross-validation would let the held-out folds influence the
    transformation (data leakage) and bias the reported accuracies.

    Parameters
    ----------
    X, y :
        Features and target.
    feature_preprocessor : str
        ``"fast_ica"`` or ``"pca"`` to transform the features first; any other
        value (e.g. ``"no_preprocessing"``) applies no transformation. Also
        part of the output file names.
    number_grid : int
        Number of candidate values per numeric hyperparameter, forwarded to
        ``sklearn_setup_hp_grid``; also part of the output file names.

    Returns
    -------
    None
        Two files are written to ``./data/rs_fs``:
        ``accuracy_{number_grid}_numgrid_{feature_preprocessor}.pkl`` maps each
        classifier name to its array of outer-fold test scores, and
        ``hyperparameters_{number_grid}_numgrid_{feature_preprocessor}.pkl``
        maps each classifier name to the list of best parameters found in each
        outer fold.
    """
    # Local import: Pipeline is not among the notebook's top-level imports.
    from sklearn.pipeline import Pipeline

    out_dir = './data/rs_fs'
    # Create the results folder up front so a missing folder cannot discard
    # the results after all searches have finished.
    os.makedirs(out_dir, exist_ok=True)

    # Classifiers, listed in the same order as the grids returned by
    # sklearn_setup_hp_grid (decision tree, gradient boosting, kNN,
    # logistic regression, random forest).
    named_classifiers = [
        ('decision_tree', DecisionTreeClassifier(random_state= 5)),
        ('gradient_boosting', GradientBoostingClassifier(random_state= 5)),
        ('k_nearest_neighbors', KNeighborsClassifier()),
        ('logistic_regression', LogisticRegression(random_state= 5, solver='liblinear')),
        ('random_forest', RandomForestClassifier(random_state= 5)),
    ]
    param_grids = sklearn_setup_hp_grid(number_grid)

    # Choose preprocessing (Use defaults for each, kept random state)
    if feature_preprocessor == "fast_ica":
        preprocessing_steps = [('feature_preprocessor', FastICA(random_state=5))]
    elif feature_preprocessor == "pca":
        preprocessing_steps = [('feature_preprocessor', PCA(random_state=5))]
    else:
        preprocessing_steps = []

    # With an integer random_state each splitter yields the same folds on every
    # call, so one pair can be shared by all classifiers.
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=5)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=5)

    # Pipeline parameters are addressed as "<step name>__<parameter>".
    step_prefix = 'classifier__'

    all_data_accuracy = {}
    all_data_hyperparams = {}

    for (classifier_name, classifier), param_grid in zip(named_classifiers, param_grids):
        pipeline = Pipeline(preprocessing_steps + [('classifier', classifier)])
        search_space = {step_prefix + name: values for name, values in param_grid.items()}

        clf = RandomizedSearchCV(pipeline, search_space, cv=inner_cv, scoring= 'accuracy', random_state= 5)
        cv_result = cross_validate(clf, X=X, y=y, cv=outer_cv, return_estimator=True)

        all_data_accuracy[classifier_name] = cv_result['test_score']
        # One best-parameter dict per outer fold; the pipeline prefix is
        # stripped so the stored keys are the plain classifier parameter names.
        all_data_hyperparams[classifier_name] = [
            {name[len(step_prefix):]: value for name, value in fitted_search.best_params_.items()}
            for fitted_search in cv_result['estimator']
        ]

    pickle_dump(all_data_accuracy, f'{out_dir}/accuracy_{number_grid}_numgrid_{feature_preprocessor}.pkl')
    pickle_dump(all_data_hyperparams, f'{out_dir}/hyperparameters_{number_grid}_numgrid_{feature_preprocessor}.pkl')

## Running both searches: auto-sklearn for varying time budgets and randomized search for varying grid sizes

In [None]:
fl = "winequality-white.csv"
table = pd.read_csv(fl, delimiter = ";", header='infer')

# Feature selection options
# All features:
no_feature_selection = ["alcohol", "density", "chlorides", "volatile acidity", "total sulfur dioxide", "fixed acidity", "pH",
                        "residual sugar", "sulphates", "citric acid", "free sulfur dioxide"]
# Selected features from first two assignments:
feature_selection = ["alcohol", "density", "chlorides", "volatile acidity", "total sulfur dioxide", "fixed acidity", "pH"]

# Hyperparameters for pipeline:
hpo_methods = ["Bayesian Optimization", "Randomized Search"]
feature_preprocessors = [ "no_preprocessing", "fast_ica", "pca"]
minutes_run = [5, 15, 30, 60]
numbers_grid = [5, 15, 50, 100]
feature_selection_options = [no_feature_selection, feature_selection]

# bayesian_autoskl and sklearn_rand_search always write to ./data/bayesian_fs and
# ./data/rs_fs, using file names that do not mention the feature set. Without
# further care the second feature set would therefore overwrite every result of
# the first one. To keep both, the files produced for a feature set are moved
# to that feature set's own folders once its pass is complete.
# One {written folder: final folder} mapping per entry of feature_selection_options.
# NOTE(review): presumably the un-suffixed folders are where the all-features
# results belong (the cells further down read from ./data/rs/) - confirm.
result_relocations = [
    {"./data/bayesian_fs": "./data/bayesian", "./data/rs_fs": "./data/rs"},  # all features
    {},                                                                    # selected features stay in *_fs
]


def _pkl_mtimes(directory):
    """Return {file name: modification time in ns} for the .pkl files in `directory` ({} if it is missing)."""
    if not os.path.isdir(directory):
        return {}
    return {name: os.stat(os.path.join(directory, name)).st_mtime_ns
            for name in os.listdir(directory) if name.endswith('.pkl')}


def _move_new_pickles(src_dir, dst_dir, mtimes_before):
    """Move the .pkl files created or rewritten in `src_dir` since `mtimes_before` into `dst_dir`."""
    os.makedirs(dst_dir, exist_ok=True)
    for name, mtime in _pkl_mtimes(src_dir).items():
        if mtimes_before.get(name) != mtime:
            # os.replace overwrites a same-named file left over from an earlier run
            os.replace(os.path.join(src_dir, name), os.path.join(dst_dir, name))


# Explicit nested loops over every combination of pipeline options: they make
# visible which option applies to which search method (time budgets only to
# Bayesian optimization, grid sizes only to randomized search).
# Sklearn's grid search doesn't work as abstract as desired.
y = table.quality  # the target is the same for every feature set
for features, relocations in zip(feature_selection_options, result_relocations):
    X = table[features]
    # Remember what is already in the output folders so that only files
    # written during this pass are moved afterwards.
    mtimes_before = {src: _pkl_mtimes(src) for src in relocations}
    for hpo_method in hpo_methods:
        for feature_preprocessor in feature_preprocessors:
            if hpo_method == "Bayesian Optimization":
                # Do bayesian for each minutes_run with 
                for minute_run in minutes_run:
                    bayesian_autoskl(X, y, feature_preprocessor, minute_run)
                    print(f'Minute {minute_run} is done')
            elif hpo_method == "Randomized Search":
                for number_grid in numbers_grid:
                    sklearn_rand_search(X, y, feature_preprocessor, number_grid)
                    print(f'number {number_grid} is done')
            else:
                print('Incorrect hpo method given')

            print(f'Feature preprocessor {feature_preprocessor} is done')
        print(f'Method {hpo_method} is done')
    for src_dir, dst_dir in relocations.items():
        _move_new_pickles(src_dir, dst_dir, mtimes_before[src_dir])
    print(f'Feature selection option {features} is done')
print('All done!')

In [None]:
# Accuracy results stored for the randomized search with a grid size of 5 and
# no feature preprocessing, shown as a table with one column per classifier.
randomized_search_acc_5 = pickle_load(
    './data/rs/accuracy_5_numgrid_no_preprocessing.pkl'
)
pd.DataFrame(randomized_search_acc_5)

Unnamed: 0,decision_tree,gradient_boosting,k_nearest_neighbors,logistic_regression,random_forest
0,0.545918,0.584694,0.645918,0.52449,0.659184
1,0.558163,0.614286,0.640816,0.509184,0.636735
2,0.557143,0.516327,0.629592,0.512245,0.671429
3,0.555669,0.494382,0.606742,0.491318,0.629213
4,0.580184,0.494382,0.632278,0.492339,0.624106


In [None]:
# Accuracy results stored for the randomized search with a grid size of 15 and
# no feature preprocessing, shown as a table with one column per classifier.
randomized_search_acc_15 = pickle_load(
    './data/rs/accuracy_15_numgrid_no_preprocessing.pkl'
)
pd.DataFrame(randomized_search_acc_15)

Unnamed: 0,decision_tree,gradient_boosting,k_nearest_neighbors,logistic_regression,random_forest
0,0.560204,0.628571,0.627551,0.557143,0.697959
1,0.541837,0.583673,0.647959,0.519388,0.691837
2,0.567347,0.602041,0.622449,0.542857,0.706122
3,0.54239,0.550562,0.592441,0.517875,0.663943
4,0.522983,0.550562,0.609806,0.521961,0.664964
