# Yacine Mahdid June 12 2020
The goal of this notebook is to load data from the 20 binary classifiers that were made and to generate bootstrap confidence intervals for them using the best classifier.


In [24]:
# The input parameter should be constructed as such:
import pickle

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import LeaveOneGroupOut

import config as cfg

def load_pickle(filename):
    '''Helper function to unpickle the pickled python obj.

    Args:
        filename: path of the pickle file to read.

    Returns:
        The Python object stored in the file.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the content is not a valid pickle.
    '''
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as file:
        return pickle.load(file)

def find_best_model(best_params):
    """Helper to find the best model given the best parameters.

    Counts how often each (classifier type, hyper-parameters) configuration
    appears in ``best_params``, prints the tally, and returns a new
    classifier instance built from the most frequent configuration. When
    several configurations are tied, the one seen first wins.

    Args:
        best_params: iterable of mappings, one per entry. Each mapping holds
            an estimator instance under ``'clf'`` and the hyper-parameters
            of that estimator under ``'clf__<name>'`` keys.

    Returns:
        A new LogisticRegression, LinearSVC, DecisionTreeClassifier,
        RandomForestClassifier or LinearDiscriminantAnalysis instance.

    Raises:
        ValueError: if ``best_params`` is empty or an entry holds an
            estimator of an unsupported type.
        KeyError: if an entry lacks a hyper-parameter its estimator needs.
    """

    # Printed label -> number of occurrences (insertion order is kept so
    # ties resolve to the first configuration encountered).
    models_occurence = {}
    # Printed label -> (estimator class, constructor kwargs). The real
    # hyper-parameter values are stored here so the winner is rebuilt from
    # them instead of from fragments of the label string.
    models_config = {}

    for param in best_params:

        clf = param['clf']
        if isinstance(clf, LogisticRegression):
            penalty = param['clf__penalty']
            c = param['clf__C']
            key = f"log-penality={penalty}_C={c}"
            # NOTE(review): penalty/solver/max_iter stay hard-coded as in the
            # previous implementation; only C comes from the entry. Confirm
            # whether the stored penalty should be used instead.
            config = (LogisticRegression,
                      {"penalty": "l2", "solver": "lbfgs",
                       "max_iter": 1000, "C": c})
        elif isinstance(clf, LinearSVC):
            c = param['clf__C']
            key = f"svc-kernel=linear_C= {c}"
            config = (LinearSVC, {"C": c})
        elif isinstance(clf, DecisionTreeClassifier):
            criterion = param['clf__criterion']
            key = f"dec-criterion{criterion}"
            config = (DecisionTreeClassifier, {"criterion": criterion})
        elif isinstance(clf, RandomForestClassifier):
            n_estimators = param['clf__n_estimators']
            max_depth = param['clf__max_depth']
            min_samples_split = param['clf__min_samples_split']
            min_samples_leaf = param['clf__min_samples_leaf']
            key = f"rand-n_estimators={n_estimators}-max_depth={max_depth}-min_samples_split={min_samples_split}-min_samples_leaf={min_samples_leaf}"
            config = (RandomForestClassifier,
                      {"n_estimators": n_estimators,
                       "max_depth": max_depth,
                       "min_samples_split": min_samples_split,
                       "min_samples_leaf": min_samples_leaf})
        elif isinstance(clf, LinearDiscriminantAnalysis):
            solver = param['clf__solver']
            key = f"lda-solver={solver}"
            config = (LinearDiscriminantAnalysis, {"solver": solver})
        else:
            # Without this branch an unknown estimator would be counted
            # under the previous iteration's label.
            raise ValueError(
                f"Unsupported classifier type: {type(clf).__name__}")

        models_occurence[key] = models_occurence.get(key, 0) + 1
        models_config.setdefault(key, config)

    if not models_occurence:
        raise ValueError("best_params is empty; no model to select")

    for key, value in models_occurence.items():
        print(f"{key} : {value}")

    # max() returns the first label reaching the highest count.
    best_clf_params = max(models_occurence, key=models_occurence.get)

    estimator_cls, estimator_kwargs = models_config[best_clf_params]
    return estimator_cls(**estimator_kwargs)

# Driver: load one pickled result file and rebuild its most frequent
# classifier configuration.
# This will be given by the srun in the bash file; the file name is
# hard-coded here as a placeholder for that argument.
arg = "best_clf_pli_emf5_func-wei.pickle"
# Absolute, machine-specific directory holding the pickled results.
best_clf_filename =  f"/home/yacine/Documents/BIAPT/testing/{arg}"

# The unpickled object is indexed by 'best_params'; that entry is what
# find_best_model tallies to pick the classifier to return.
best_clf_data = load_pickle(best_clf_filename)
clf = find_best_model(best_clf_data['best_params'])

log-penality=l2_C=1.0 : 4
lda-solver=svd : 5


LinearDiscriminantAnalysis()