# Yacine Mahdid May 20 2020
In this notebook, the goal is to augment the analysis by running a grid-search model selection inside the inner loop of `classify_loso`. It should use all the white-box models defined previously and reuse the same pipeline as before.

- [X] Define a pipeline that lets us use Grid Search
- [X] Define all the classifiers used and the parameter spaces to search
- [X] Augment classify_loso to do model selection, and test it so that it outputs all the model parameters selected by grid search
- [x] Bundle all of this into an easy to reuse function and put it into ml_tools

## Classifiers to Test Out
- Linear SVM
- Logistic Regression
- Decision Trees

In [6]:
# Pipeline with grid search taken from: https://stackoverflow.com/questions/38555650/try-multiple-estimator-in-one-grid-search

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GroupKFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from ml_tools.classification import classify_loso
from ml_tools.pre_processing import pre_process


def classify_loso_model_selection(X, y, group, gs):
    """ Do classification using LOSO while also doing model selection inside each fold

        For every left-out participant, `gs` is fitted on the remaining
        participants (their group ids are forwarded to `gs.fit`), then
        `gs.predict` is scored on the left-out participant.

        Args:
            X (numpy matrix): this is the feature matrix with each row being a data point
            y (numpy vector): this is the label vector with each row belonging to a data point
            group (numpy vector): this is the group vector (which is the participant id)
            gs (sklearn GridSearchCV): this is a grid search object that will output the best model

        Returns:
            accuracies (list): the accuracy for each left-out participant
            best_params (list): the `gs.best_params_` selected for each left-out
                participant, in the same order as `accuracies`
    """
    logo = LeaveOneGroupOut()

    accuracies = []
    best_params = []
    for train_index, test_index in logo.split(X, y, group):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Only the training groups are needed: they are handed to the grid
        # search so that its own cross-validation can split on them.
        group_train = group[train_index]

        gs.fit(X_train, y_train, groups=group_train)
        y_hat = gs.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_hat))
        # NOTE(review): an estimator object stored inside best_params_ may be
        # shared between folds; the 'clf__*' entries look like the reliable
        # record of the selected hyperparameters -- confirm.
        best_params.append(gs.best_params_)
    return accuracies, best_params


def create_gridsearch_pipeline():
    """ Helper function to create a grid search with a search space containing classifiers

        The pipeline is: mean imputation of NaN values, standard scaling, then
        a 'clf' step that each entry of the search space replaces with an
        actual classifier (logistic regression, linear SVM or decision tree).

        Returns:
            gs (sklearn GridSearchCV): this is the grid search object wrapping the pipeline
    """
    # Create LOSO Grid Search to search amongst many classifiers
    class DummyEstimator(BaseEstimator):
        """Dummy estimator to allow the grid search to test many estimators"""
        # The signatures follow the scikit-learn estimator convention so the
        # placeholder stays a well-formed final pipeline step.
        def fit(self, X, y=None):
            return self

        def score(self, X, y=None):
            pass

    # Create a pipeline
    pipe = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler', StandardScaler()),
        ('clf', DummyEstimator())])  # Placeholder Estimator

    # Candidate learning algorithms and their hyperparameters
    search_space = [{'clf': [LogisticRegression()],  # Actual Estimator
                     'clf__penalty': ['l1', 'l2'],
                     'clf__solver': ['liblinear'],
                     'clf__C': np.logspace(0, 4, 10)},

                    {'clf': [SVC()],  # Actual Estimator
                     'clf__kernel': ['linear'],
                     'clf__C': [1, 10, 100, 1000]},

                    {'clf': [DecisionTreeClassifier()],  # Actual Estimator
                     'clf__criterion': ['gini', 'entropy']}]

    # The inner cross-validation is leave-one-group-out, so the caller has to
    # pass `groups=` when fitting the returned object.
    gs = GridSearchCV(pipe, search_space, cv=LeaveOneGroupOut())
    return gs


# Experiment: LOSO classification with grid-search model selection on one data file
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
input_filename = '/home/yacine/Documents/BIAPT/data_window_10.csv'

gs = create_gridsearch_pipeline()
# pre_process is imported from ml_tools.pre_processing; presumably "HEALTHY"
# selects which participants/labels are kept -- TODO confirm against ml_tools
X,y,group = pre_process(input_filename, "HEALTHY")
# One accuracy and one set of selected parameters per left-out participant
accuracies, best_params = classify_loso_model_selection(X, y, group, gs)



In [7]:
# Mean accuracy over the left-out participants
# NOTE(review): the recorded output below only shows best_params; presumably
# the notebook displays just the last expression of a cell -- confirm
np.mean(accuracies)
# Parameters selected by the grid search for each left-out participant
best_params

[{'clf': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='linear', max_iter=-1, probability=False, random_state=None,
      shrinking=True, tol=0.001, verbose=False),
  'clf__C': 1,
  'clf__kernel': 'linear'},
 {'clf': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='linear', max_iter=-1, probability=False, random_state=None,
      shrinking=True, tol=0.001, verbose=False),
  'clf__C': 1,
  'clf__kernel': 'linear'},
 {'clf': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='linear', max_iter=-1, probability=False, random_state=None,
      shrinking=True, tol=0.001, verbose=False),
  'clf__C': 100,
  'clf__kernel': 'linear'},
 {'clf': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_