# Yacine Mahdid June 12 2020
The goal of this notebook is to load the data from features.csv (generated by MATLAB), create a dataframe, and filter the data depending on the input parameter:
- aec (baseline vs emf5)
    - func
    - wei
    - bin
    - func-wei
    - func-bin
- pli (baseline vs emf5)
    - func
    - wei
    - bin
    - func-wei
    - func-bin
- aec (baseline vs eml5)
    - func
    - wei
    - bin
    - func-wei
    - func-bin
- pli (baseline vs eml5)
    - func
    - wei
    - bin
    - func-wei
    - func-bin
These are 20 different binary classifiers that we need to construct.


In [8]:
# The input parameter should be constructed as such:
# [metric_type]_[epoch]_[feature_group]
# metric_type can be {aec,pli}
# epoch can be {emf5, eml5}
# feature_group can be {func, wei, bin, func-wei, func-bin}

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import LeaveOneGroupOut

from ml_tools.classification import classify_loso_model_selection
import config as cfg

# Hard-coded here; the intent is for srun (from the bash file) to supply it.
# Layout: [metric_type]_[epoch]_[feature_group]
arg = "pli_eml5_func"
graph, epoch, feature_group = arg.split("_")


# Load the full feature table
df = pd.read_csv(cfg.DF_FILE_PATH)

# Retain the rows whose graph code equals the position of `graph` in
# cfg.GRAPHS
graph_code = cfg.GRAPHS.index(graph)
df = df[df["graph"] == graph_code]

# Retain the rows belonging either to the requested epoch or to 'ec1'
# (presumably the baseline condition -- confirm against config)
is_target_epoch = df["epoch"] == cfg.EPOCHS[epoch]
is_ec1_epoch = df["epoch"] == cfg.EPOCHS['ec1']
df = df[is_target_epoch | is_ec1_epoch]

# Remove every column whose name matches the regex configured for this
# feature group, leaving only the features of interest
unwanted_columns = df.filter(regex=cfg.FILTER_REGEX[feature_group]).columns
df = df.drop(columns=unwanted_columns)

# Build the feature matrix (all columns except the bookkeeping ones), the
# label vector (epoch code) and the group ids (participant id)
bookkeeping_columns = ['p_id', 'frequency', 'epoch', 'graph', 'window']
X = df.drop(columns=bookkeeping_columns).to_numpy()
y = df["epoch"].to_numpy()
group = df["p_id"].to_numpy()

# Create a LOSO grid search to search amongst many classifiers
class DummyEstimator(BaseEstimator):
    """Placeholder estimator that lets a grid search test many estimators.

    It learns nothing and produces no score; it exists only so that a
    pipeline step can be declared up front and later replaced by a real
    classifier through the parameter grid.

    The methods follow the scikit-learn estimator signatures
    (``fit(X, y)`` / ``score(X, y)``) so that calling them with data does not
    raise ``TypeError``. Every argument is optional, so argument-less calls
    keep working.
    """

    def fit(self, X=None, y=None):
        """Ignore the training data and return this estimator unchanged.

        Parameters
        ----------
        X : optional
            Training features; unused.
        y : optional
            Training labels; unused.

        Returns
        -------
        DummyEstimator
            ``self``, per the scikit-learn ``fit`` convention.
        """
        return self

    def score(self, X=None, y=None):
        """Ignore the evaluation data; there is no model to score.

        Parameters
        ----------
        X : optional
            Evaluation features; unused.
        y : optional
            Evaluation labels; unused.

        Returns
        -------
        None
        """
        return None

# Pipeline: mean-impute missing values, standardize the features, then
# classify. The final 'clf' step only holds a placeholder; the search space
# below substitutes the actual estimators through the 'clf' parameter.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scaler = StandardScaler()
pipe = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('clf', DummyEstimator()),  # Placeholder Estimator
])

# Candidate learning algorithms, each with its own hyperparameter grid
logistic_grid = {
    'clf': [LogisticRegression()],  # Actual Estimator
    'clf__penalty': ['l2'],
    'clf__solver': ['lbfgs'],
    'clf__max_iter': [1000],
    'clf__C': np.logspace(0, 4, 10),
}
linear_svc_grid = {
    'clf': [LinearSVC()],  # Actual Estimator
    'clf__C': [1, 10, 100, 1000],
}
decision_tree_grid = {
    'clf': [DecisionTreeClassifier()],  # Actual Estimator
    'clf__criterion': ['gini', 'entropy'],
}
search_space = [logistic_grid, linear_svc_grid, decision_tree_grid]


# n_jobs=-1 asks the grid search to use every available processor
gs = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=LeaveOneGroupOut(),
    n_jobs=-1,
)

# Hand the data, the participant groups and the grid search to the project
# helper, then unpack the four results it returns
loso_results = classify_loso_model_selection(X, y, group, gs)
accuracies, f1s, cms, best_params = loso_results