# Hyperparameter search for double classification task

The purpose of this notebook is to search for the optimal hyperparameters for each classifier involved. 

A dictionary (`dict_classifiers`) is written to 'classifier_dict.pkl' so that it can be reused later on for testing. 
Running the hyperparameter search once here avoids having to repeat an exhaustive search during testing / development. 

For each dataset the following classifiers are saved: 

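* An initial classifier (KNN, LR, RF, SVM) that is trained on the entire training set of the original partition (0.8 train, 0.2 test).
* An ensemble of helper classifiers for the secondary classification task, built for each of KNN, LR, RF and SVM, with one ensemble member per fold of a stratified k-fold split of the training set. Note: this text originally stated n = 5 members and k = 5 folds, but `get_ensemble_helping_classifier` below uses `n_splits=3`, so each ensemble holds at most 3 members.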

In [1]:
# Maps a dataset name to the nested classifier dictionary produced for it by
# the search functions in this notebook; filled in by the per-dataset cells.
dict_classifiers = {}

In [4]:
import numpy as np
import pandas as pd
import sys
import os
module_path = os.path.abspath(os.path.join('../..'))
sys.path.insert(1, module_path + '/src/')
from sklearn.model_selection import KFold
import pickle


import warnings
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning, FitFailedWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sktime.transformations.panel.rocket import Rocket
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sktime.utils.data_io import load_from_arff_to_dataframe
import time


import utility
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns
import matplotlib
from matplotlib.colors import LinearSegmentedColormap
font = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 10, weight = 1000)
font_small = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 8, weight = 1000)


from sktime.datasets import load_UCR_UEA_dataset
from sktime.transformations.panel.catch22_features import Catch22



def get_dataset_partitions(X, y, ft):
    '''Split a feature matrix 80/20, scale it and optionally select features.

    The split uses a fixed seed (``random_state = 1``). A ``MinMaxScaler`` is
    fitted on the training partition only and then applied to both partitions,
    so the test partition is scaled with the training statistics. When
    ``ft == 'rocket'`` the ``k`` highest scoring features according to a
    chi-squared test on the training partition are kept, and the same columns
    are selected from the test partition.

    Parameters
    ----------
    X : feature matrix with one row per sample.
    y : labels aligned with the rows of ``X``.
    ft : str
        Name of the feature transform; only the value ``'rocket'`` triggers
        feature selection.

    Returns
    -------
    tuple
        ``(X_train_full, y_train_full, X_test, y_test)`` where both feature
        partitions are pandas DataFrames.
    '''
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
    norm = MinMaxScaler()
    X_train_full = pd.DataFrame(norm.fit_transform(X_train_full))
    # Only transform here: re-fitting the scaler on the test partition would
    # scale train and test differently and leak test statistics.
    X_test = pd.DataFrame(norm.transform(X_test))
    k = min([X_test.shape[0], 100])
    print('Chosen number of components: ', k)
    if (ft == 'rocket'):
        # chi2 is fitted on the training partition only; the selected column
        # indices are then re-used to slice the test partition.
        select = SelectKBest(chi2, k=k)
        X_train_full = select.fit_transform(X_train_full, y_train_full)
        indices = select.get_support(indices = True)
        X_test = X_test[indices]
    return pd.DataFrame(X_train_full), y_train_full, pd.DataFrame(X_test), y_test


def get_initial_classifier(X_train_full,y_train_full, clf, grid, no_grid_search = False):
    '''Tune ``clf`` over ``grid`` with a 5-fold cross-validated grid search.

    Parameters
    ----------
    X_train_full, y_train_full : training features and labels.
    clf : estimator to tune.
    grid : dict
        Parameter grid handed to ``GridSearchCV``.
    no_grid_search : bool
        Accepted for interface compatibility; currently ignored.

    Returns
    -------
    The ``best_estimator_`` found by the search.
    '''
    search = GridSearchCV(clf, grid, cv=5).fit(X_train_full, y_train_full)
    return search.best_estimator_

def get_helping_classifier(X_train, y_train, X_val, y_val, clf, grid ):
    '''Train a helper that predicts whether ``clf`` misclassifies a sample.

    A private copy of ``clf`` is fitted on ``(X_train, y_train)`` and used to
    predict ``X_val``. The helper is an L1-regularised, class-balanced logistic
    regression trained on ``X_val`` with the boolean target
    "prediction differs from ``y_val``".

    Parameters
    ----------
    X_train, y_train : data used to fit the base classifier.
    X_val, y_val : held-out data on which the base classifier's errors are
        measured and on which the helper is trained.
    clf : estimator with ``fit`` / ``predict``. It is never modified; a deep
        copy is fitted instead, so an already fitted estimator passed in by
        the caller keeps its fitted state.
    grid : unused; kept so existing call sites keep working.

    Returns
    -------
    The fitted helper classifier, or ``None`` when the error labels contain a
    single class only (e.g. the base classifier made no mistakes on
    ``X_val``), because a binary helper cannot be trained in that case.
    '''
    import copy

    # Fit a copy so the caller's estimator (possibly the tuned, fully trained
    # initial classifier) is not silently refitted on a fold subset.
    base_clf = copy.deepcopy(clf)
    base_clf.fit(X_train, y_train)
    y_pred = base_clf.predict(X_val)

    # True where the base classifier got the validation sample wrong.
    new_y = pd.Series(np.asarray(y_pred != y_val))
    if new_y.nunique() < 2:
        return None

    helper = LogisticRegression(random_state=0, solver = 'liblinear',penalty = 'l1', class_weight = 'balanced')
    return helper.fit(X_val, new_y)

def get_ensemble_helping_classifier(X_train, y_train, clf, grid):
    '''Build one helping classifier per fold of a stratified 3-fold split.

    For every fold, ``get_helping_classifier`` is called with the fold's
    training part as training data and the fold's validation part as
    validation data. Folds for which it returns ``None`` are skipped.

    Parameters
    ----------
    X_train, y_train : pandas objects supporting ``.iloc`` row selection.
    clf : base estimator forwarded to ``get_helping_classifier``.
    grid : forwarded unchanged to ``get_helping_classifier``.

    Returns
    -------
    list
        The helping classifiers, at most one per fold (possibly empty).
    '''
    kf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
    classifiers = []
    for train_index, val_index in kf.split(X_train, y_train):
        clf_temp = get_helping_classifier(
            X_train.iloc[train_index], y_train.iloc[train_index],
            X_train.iloc[val_index], y_train.iloc[val_index],
            clf, grid)
        # Identity test: "== None" would defer to any __eq__ the estimator defines.
        if clf_temp is None:
            continue
        classifiers.append(clf_temp)
    return classifiers

def make_ensemble_classification(X_test, ensemble):
    '''Combine score-weighted predictions of ``(classifier, score)`` pairs.

    Each member's prediction ``p`` is mapped to ``score * (p + 1) - 1``. The
    result is the rounded per-sample mean of these values cast to int, with
    every sample whose summed value is strictly positive forced to 1.

    Parameters
    ----------
    X_test : samples handed to each member's ``predict``.
    ensemble : sequence whose items hold the classifier at index 0 and its
        score at index 1.

    Returns
    -------
    numpy.ndarray of int with one entry per sample.
    '''
    weighted = np.array([
        (member[1] * (member[0].predict(X_test) + 1)) - 1
        for member in ensemble
    ])
    positive = np.nonzero(weighted.sum(axis = 0) > 0)[0]

    combined = np.round(weighted.mean(axis = 0)).astype(int)
    combined[positive] = 1
    return combined

def make_ensemble_classification_2(X_test, ensemble):
    '''Flag a sample when at least one ensemble member predicts it positive.

    Parameters
    ----------
    X_test : samples handed to each member's ``predict``; must support ``len``.
    ensemble : sequence of fitted classifiers whose predictions can be cast
        to int (0 / 1 or False / True).

    Returns
    -------
    numpy.ndarray of float
        1.0 where any member predicted a positive, 0.0 elsewhere. An empty
        ensemble flags nothing and yields all zeros (previously this raised
        because no prediction was available to size the result).
    '''
    if len(ensemble) == 0:
        return np.zeros(len(X_test))

    votes = np.array([clf.predict(X_test).astype(int) for clf in ensemble])
    # A single positive vote is enough to flag the sample.
    flagged = votes.sum(axis = 0) >= 1
    return flagged.astype(float)

def filter_test_set_ensemble(X_test, y_test, ensemble):
    '''Drop the test samples that the helper ensemble flags.

    The positions flagged by ``make_ensemble_classification_2`` are printed
    and removed from both ``X_test`` and ``y_test`` after their indices have
    been reset to 0..n-1.

    Returns
    -------
    tuple
        ``(X_test_new, y_test_new)`` containing only the samples that were
        not flagged.
    '''
    flags = make_ensemble_classification_2(X_test, ensemble)
    to_del = np.where(flags == 1)[0]
    print('Indices to delete: ')
    print(to_del)

    # Positional indices from the ensemble only line up after a reset.
    X_reset = X_test.reset_index(drop = True)
    y_reset = y_test.reset_index(drop = True)
    keep_X = ~X_reset.index.isin(to_del)
    keep_y = ~y_reset.index.isin(to_del)
    return X_reset[keep_X], y_reset[keep_y]


def plot_cm(clf, X_test, y_test, class_names, color_index = 2):
    '''Plot a row-normalised confusion matrix of ``clf`` on ``(X_test, y_test)``.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    X_test, y_test : samples and their true labels.
    class_names : tick labels used on both heatmap axes.
    color_index : int
        Index into the local ``colors`` palette; the heatmap colormap runs
        from white to that colour.

    The figure is shown with ``plt.show()``; nothing is returned.
    '''
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    colors = ["#F94144", "#F3722C", '#F8961E', '#F9C74F','#90BE6D', '#43AA8B','#577590']

    # Local font objects (these shadow the module-level ``font`` / ``font_small``).
    font = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 10, weight = 1000)

    # Two-colour ramp: white -> the selected palette colour.
    colors_2 = ['#FFFFFF', colors[color_index]]
    cmap_name = 'my colormap'
    font_small = FontProperties(fname =  module_path + '/src/visualization/CharterRegular.ttf', size = 6, weight = 1000)

    cm_map = LinearSegmentedColormap.from_list(cmap_name, colors_2)


    f, ax = plt.subplots(1,1) # 1 x 1 array , can also be any other size
    f.set_size_inches(5, 5)

    # Divide every row by its sum so the cells show fractions per row.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    ax = sns.heatmap(cm, annot=True,
                fmt='.2%', cmap=cm_map, xticklabels=class_names,yticklabels=class_names )
    # Colorbar attached to the heatmap's first collection.
    cbar = ax.collections[0].colorbar
    for label in ax.get_yticklabels() :
        label.set_fontproperties(font_small)
    for label in ax.get_xticklabels() :
        label.set_fontproperties(font_small)
    ax.set_ylabel('True Label', fontproperties = font)
    ax.set_xlabel('Predicted Label', fontproperties = font)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 0)

    # Apply the regular font to every Text artist that is a direct child of
    # the axes (presumably the cell annotations -- confirm if styling changes).
    for child in ax.get_children():
        if isinstance(child, matplotlib.text.Text):
            child.set_fontproperties(font)
    for l in cbar.ax.yaxis.get_ticklabels():
        l.set_fontproperties(font_small)
        
    plt.show()


def print_and_plot_to_compare(X_test, y_test, X_test_2, y_test_2, clf, class_names):
    '''Report accuracy and confusion matrix of ``clf`` on two test sets.

    ``(X_test, y_test)`` is reported as the initial classification and
    ``(X_test_2, y_test_2)`` as the double classification. For each set the
    accuracy is printed and ``plot_cm`` is called.

    Returns
    -------
    tuple
        ``(acc_1, acc_2)``: accuracy on the first and on the second test set.
    '''
    acc_1 = accuracy_score(y_test, clf.predict(X_test))
    print(f'Accuracy score for the initial classification: {acc_1} ')
    plot_cm(clf, X_test, y_test, class_names)

    acc_2 = accuracy_score(y_test_2, clf.predict(X_test_2))
    print(f'Accuracy score for the double classification: {acc_2} ')
    plot_cm(clf, X_test_2, y_test_2, class_names, color_index = 5)
    return acc_1 , acc_2
    
def compare_single_double_classifier(name_UCR, class_names, feature_transform = 'rocket'):
    '''Tune the initial classifiers and build helper ensembles for one dataset.

    The dataset is loaded with ``load_UCR_UEA_dataset``, transformed with
    ROCKET (``feature_transform == 'rocket'``) or Catch22 (any other value),
    and partitioned with ``get_dataset_partitions``. For each of KNN, LR, SVM
    and RF the grid-searched estimator is stored under ``'init'`` and the list
    returned by ``get_ensemble_helping_classifier`` under ``'ensemble'``.

    Parameters
    ----------
    name_UCR : str
        Dataset name understood by ``load_UCR_UEA_dataset``.
    class_names : unused here; kept so existing call sites keep working.
    feature_transform : str
        ``'rocket'`` (default) or anything else for Catch22.

    Returns
    -------
    dict
        The template loaded from ``empty_temp_dict.pkl`` (assumed to hold an
        inner dict for each of the keys 'KNN', 'LR', 'SVM' and 'RF'), filled
        in with the fitted objects.
    '''
    from sklearn.base import clone

    # NOTE: pickle.load can execute arbitrary code; only load project-owned files.
    with open(module_path + '/src/dictionaries/empty_temp_dict.pkl', 'rb') as a_file:
        temp_dict = pickle.load(a_file)

    X, y = load_UCR_UEA_dataset(name_UCR, return_X_y=True)
  
    if (feature_transform == 'rocket'):
        rocket = Rocket(random_state=1)  # by default, ROCKET uses 10,000 kernels
        rocket.fit(X)
        X = rocket.transform(X)
    else:
        c22f = Catch22()
        c22f.fit(X, y)
        X = c22f.transform(X)

    X_train_full, y_train_full , X_test, y_test = get_dataset_partitions(X,y, feature_transform)
    
    ############ K nearest neighbours 
    max_nn = round((0.8*X.shape[0])//8)
    k_range = list(range(1,max_nn))
    weight_options = ["uniform", "distance"]
    grid = dict(n_neighbors = k_range, weights = weight_options)
    clf = KNeighborsClassifier()
    
    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['KNN']['init'] = clf_full
    # Hand over an unfitted clone with the tuned hyperparameters: passing
    # clf_full itself would let the fold-wise fitting overwrite the stored
    # 'init' model, which must stay trained on the full training set.
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clone(clf_full), grid)
    temp_dict['KNN']['ensemble'] = clf_ensemble

    ############ Logistic Regression
    grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}
    clf=LogisticRegression()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['LR']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['LR']['ensemble'] = clf_ensemble

  
    ############ SVM
    grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
    clf = SVC()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['SVM']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['SVM']['ensemble'] = clf_ensemble

    ############# Random Forest
    grid = {
    'n_estimators'      : [100,200,300],
    'max_depth'         : [8, 10, 12],
    'random_state'      : [0],
    }
    clf = RandomForestClassifier()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['RF']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['RF']['ensemble'] = clf_ensemble
    return temp_dict

def convert_arff_to_ts(filepath, filename):
    '''Convert an ``.arff`` time-series file into a ``.ts`` file next to it.

    The file ``filepath/filename`` is read with
    ``load_from_arff_to_dataframe``. The output file has the same name with
    the last four characters replaced by ``'ts'`` and is written to the same
    directory. Only the first column of every row is written, matching the
    ``@univariate true`` header.

    Parameters
    ----------
    filepath : str
        Directory containing the input file (no trailing slash).
    filename : str
        Input file name; the part before the first ``'_'`` is used as the
        problem name in the header.

    The new file name, the problem name and the class labels are printed.
    '''
    X, y = load_from_arff_to_dataframe(filepath + '/' + filename)
    new_filename = filename[:-4] + 'ts'
    print(new_filename)
    dataset = filename.split('_')[0]
    print(dataset)

    labels = np.unique(y).astype(str)
    # Every label is followed by a single space, including the last one.
    label_str = ''.join(label + ' ' for label in labels)
    print(label_str)

    # The context manager guarantees the file is flushed and closed, also
    # when a row fails to serialise.
    with open(filepath + '/' + new_filename, 'w') as w:
        w.write(f'@problemName {dataset} \n')
        w.write('@timeStamps false \n')
        w.write('@univariate true \n')
        w.write(f'@classLabel true {label_str} \n')
        w.write('@data \n')
        for (idx, row) in X.iterrows():
            new_row = (list(row)[0]).tolist()
            # "[1, 2, 3]" -> "1,2,3:<label>"
            new_row = str(new_row)[1:-1].replace(' ', '') + ':' + y[idx] + '\n'
            w.write(new_row)


# Arrowhead

In [5]:
name_UCR = 'ArrowHead'
class_names = ['Avonlea', 'Clovis','Mix']

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  43
Time used to find hyperparameters: 24.61294937133789


# OSULeaf

In [6]:
name_UCR = 'OSULeaf'
class_names = [
    'Acer Circinatum', 'Acer Glabrum', 'Acer Macrophyllum',
    'Acer Negundo', 'Quercus Garryanaand', 'Quercus Kelloggii',
]

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  89
Time used to find hyperparameters: 42.38264441490173


# ECG 200

In [8]:
name_UCR = 'ECG200'
class_names = ['normal','myocardial']

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  40
Time used to find hyperparameters: 24.800294637680054


# ChlorineConcentration

In [9]:
name_UCR = 'ChlorineConcentration'
class_names = ['0', '1', '2']

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  100
Time used to find hyperparameters: 2391.799001932144


# PhalangesOutlinesCorrect

In [10]:
name_UCR = 'PhalangesOutlinesCorrect'
class_names = ['0', '1']

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  100
Time used to find hyperparameters: 530.091997385025


# DistalPhalanxOutlineCorrect

In [11]:
name_UCR = 'DistalPhalanxOutlineCorrect'
class_names = ['0', '1']

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  100
Time used to find hyperparameters: 70.74565196037292


# CricketZ

In [12]:
name_UCR = 'CricketZ'
n_classes = 12
# Class labels '0' .. str(n_classes - 1).
class_names = [str(label) for label in range(n_classes)]

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  100
Time used to find hyperparameters: 107.69919204711914


# ERing

In [13]:
name_UCR = 'ERing'
n_classes = 6
# Class labels '0' .. str(n_classes - 1).
class_names = [str(label) for label in range(n_classes)]

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  60
Time used to find hyperparameters: 31.55928325653076


# Colposcopy

In [14]:
import sktime.datasets

name_UCR = 'Colposcopy'

# Locate the dataset inside the installed sktime package instead of relying on
# an absolute, user-specific site-packages path.
filepath = os.path.join(os.path.dirname(sktime.datasets.__file__), 'data', name_UCR)
filename  = 'Colposcopy_TEST.arff'
convert_arff_to_ts(filepath, filename)

n_classes = 6
class_names = np.linspace(0,n_classes -1 ,n_classes).astype(int).astype(str).tolist()

start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Colposcopy_TEST.ts
Colposcopy
0 1 2 3 4 5 
Chosen number of components:  40
Time used to find hyperparameters: 24.2416570186615


# Epilepsy

In [15]:
name_UCR = 'Epilepsy'
n_classes = 4
# Class labels '0' .. str(n_classes - 1).
class_names = [str(label) for label in range(n_classes)]

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  55
Time used to find hyperparameters: 31.684996366500854


# EyesOpenShut

In [16]:
import sktime.datasets

name_UCR = 'EyesOpenShut'

# Code below is to fix the issue that this dataset does not have .ts files yet.
# The dataset directory is resolved relative to the installed sktime package
# instead of an absolute, user-specific site-packages path.
filepath = os.path.join(os.path.dirname(sktime.datasets.__file__), 'data', name_UCR)
filename  = f'{name_UCR}_TEST.arff'
convert_arff_to_ts(filepath, filename)
filename  = f'{name_UCR}_TRAIN.arff'
convert_arff_to_ts(filepath, filename)
n_classes = 2
class_names = np.linspace(0,n_classes -1 ,n_classes).astype(int).astype(str).tolist()


start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

EyesOpenShut_TEST.ts
EyesOpenShut
0 1 
EyesOpenShut_TRAIN.ts
EyesOpenShut
0 1 
Chosen number of components:  20
Time used to find hyperparameters: 18.894746780395508


# FingerMovements

In [17]:
name_UCR = 'FingerMovements'
n_classes = 2
# Class labels '0' .. str(n_classes - 1).
class_names = [str(label) for label in range(n_classes)]

# Time the full search (initial classifiers + helper ensembles) for this dataset.
start = time.time()
dict_classifiers[name_UCR]= compare_single_double_classifier(name_UCR, class_names)
print(f'Time used to find hyperparameters: {time.time() - start}')

Chosen number of components:  84
Time used to find hyperparameters: 40.28056812286377


### Will not run on the following datasets 

Because of the computational effort of using catch22/ROCKET for feature extraction, the following time series problems either do not run or take far too long to run: 

* Electric Devices
* Face Detection
* ECG5000

ECGFiveDays is also skipped: it is already classified too accurately.


Maybe try out ROCKET instead 

# Lung Sound Data

In [186]:
def compare_single_double_classifier_LS():    
    '''Tune the initial classifiers and build helper ensembles for lung sounds.

    Features and labels come from ``utility.get_X_y('noDecomp',
    feature_type = 'all')`` and are partitioned with
    ``get_dataset_partitions`` (feature-transform name ``'lungsound'``). For
    each of KNN, LR, SVM and RF the grid-searched estimator is stored under
    ``'init'`` and the list returned by ``get_ensemble_helping_classifier``
    under ``'ensemble'``.

    Returns
    -------
    dict
        The template loaded from ``empty_temp_dict.pkl`` (assumed to hold an
        inner dict for each of the keys 'KNN', 'LR', 'SVM' and 'RF'), filled
        in with the fitted objects.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load project-owned files.
    with open(module_path + '/src/dictionaries/empty_temp_dict.pkl', 'rb') as a_file:
        temp_dict = pickle.load(a_file)
    
    class_names = ['crackle', 'no-crackle']
    X, y =  utility.get_X_y('noDecomp', feature_type = 'all')
    X_train_full, y_train_full, X_test, y_test = get_dataset_partitions(X,y,'lungsound')

    ############ K nearest neighbours 
    max_nn = round((0.8*X.shape[0])//8)
    k_range = list(range(1,max_nn))
    weight_options = ["uniform", "distance"]
    grid = dict(n_neighbors = k_range, weights = weight_options)
    clf = KNeighborsClassifier()
    
    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['KNN']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['KNN']['ensemble'] = clf_ensemble

    ############ Logistic Regression
    grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}
    clf=LogisticRegression()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['LR']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['LR']['ensemble'] = clf_ensemble

  
    ############ SVM
    grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
    clf = SVC()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['SVM']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['SVM']['ensemble'] = clf_ensemble

    ############# Random Forest
    grid = {
    'n_estimators'      : [100,200,300],
    'max_depth'         : [8, 10, 12],
    'random_state'      : [0],
    }
    clf = RandomForestClassifier()

    clf_full = get_initial_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['RF']['init'] = clf_full
    clf_ensemble = get_ensemble_helping_classifier(X_train_full, y_train_full, clf, grid)
    temp_dict['RF']['ensemble'] = clf_ensemble
 

    return temp_dict

In [188]:
# Run the lung-sound search, store the result under 'LungSound' and print the
# elapsed wall-clock time in seconds.
start = time.time()
dict_classifiers['LungSound'] = compare_single_double_classifier_LS()
print(f'Time used to find hyperparameters: {time.time() - start} ')

Chosen number of components:  100


TypeError: unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'

## Write classifiers with found hyper-parameters to file

In [11]:
# Display the classifiers collected so far.
dict_classifiers

{'FingerMovements': {'SVM': {'init': SVC(C=10, gamma=1),
   'ensemble': [(SVC(C=0.1, gamma=1), 0.5857707509881422),
    (SVC(C=0.1, gamma=1), 0.6577075098814229),
    (SVC(C=0.1, gamma=1), 0.6454545454545454)]},
  'LR': {'init': LogisticRegression(C=0.1),
   'ensemble': [(LogisticRegression(C=0.001), 0.5857707509881422),
    (LogisticRegression(C=0.001), 0.6035573122529644),
    (LogisticRegression(), 0.6181818181818182)]},
  'RF': {'init': RandomForestClassifier(max_depth=8, random_state=0),
   'ensemble': [(RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
     0.5411067193675889),
    (RandomForestClassifier(max_depth=8, random_state=0), 0.6221343873517787),
    (RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
     0.6)]},
  'KNN': {'init': KNeighborsClassifier(n_neighbors=15),
   'ensemble': [(KNeighborsClassifier(n_neighbors=21), 0.5857707509881422),
    (KNeighborsClassifier(n_neighbors=19), 0.667193675889328),
    (KNeighborsClassifi

In [12]:
# NOTE: pickle.load can execute arbitrary code; only load project-owned files.
with open(module_path + '/src/dictionaries/classifier_dict.pkl', 'rb') as a_file:
    dict_classifiers_2 = pickle.load(a_file)
# On duplicate dataset names the entry loaded from disk wins, because it is
# unpacked last.
joined = {**dict_classifiers, **dict_classifiers_2}

In [15]:
# Display the merged dictionary (freshly computed + previously saved entries).
joined

{'FingerMovements': {'SVM': {'init': SVC(C=10, gamma=1),
   'ensemble': [(SVC(C=0.1, gamma=1), 0.5857707509881422),
    (SVC(C=0.1, gamma=1), 0.6577075098814229),
    (SVC(C=0.1, gamma=1), 0.6454545454545454)]},
  'LR': {'init': LogisticRegression(C=0.1),
   'ensemble': [(LogisticRegression(C=0.001), 0.5857707509881422),
    (LogisticRegression(C=0.001), 0.6035573122529644),
    (LogisticRegression(), 0.6181818181818182)]},
  'RF': {'init': RandomForestClassifier(max_depth=8, random_state=0),
   'ensemble': [(RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
     0.5411067193675889),
    (RandomForestClassifier(max_depth=8, random_state=0), 0.6221343873517787),
    (RandomForestClassifier(max_depth=10, n_estimators=200, random_state=0),
     0.6)]},
  'KNN': {'init': KNeighborsClassifier(n_neighbors=15),
   'ensemble': [(KNeighborsClassifier(n_neighbors=21), 0.5857707509881422),
    (KNeighborsClassifier(n_neighbors=19), 0.667193675889328),
    (KNeighborsClassifi

In [19]:
# Display the classifiers currently held in memory before writing them to disk.
dict_classifiers

{'ArrowHead': {'SVM': {'init': SVC(C=100, gamma=0.1),
   'ensemble': [LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear'),
    LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear'),
    LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear')]},
  'LR': {'init': LogisticRegression(C=100.0),
   'ensemble': [LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear'),
    LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear'),
    LogisticRegression(class_weight='balanced', penalty='l1', random_state=0,
                       solver='liblinear')]},
  'RF': {'init': RandomForestClassifier(max_depth=8, random_state=0),
   'ensemble': [LogisticRegression(class_weight='balanced', penalty='l

In [20]:
# The context manager closes (and flushes) the file even if pickling fails.
with open(module_path + '/src/dictionaries/classifier_dict_lr_helper.pkl', 'wb') as a_file:
    pickle.dump(dict_classifiers, a_file)

In [16]:
# The context manager closes (and flushes) the file even if pickling fails.
with open(module_path + '/src/dictionaries/classifier_dict.pkl', 'wb') as a_file:
    pickle.dump(joined, a_file)