In [None]:
import os
import pandas as pd
import numpy as np
import random


%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever spelling this install provides.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')


from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, ShuffleSplit, learning_curve, validation_curve
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, auc
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler
import itertools
import timeit



In [None]:
# import dill
# # # from dill.settings import settings
# dill.dump_session('avila-nt.db')

Data2

In [None]:
# Column layout of the data: ten feature columns F1..F10 followed by the label 'y'.
Fs = ['F{}'.format(idx) for idx in range(1, 11)]
names = list(Fs)
names.append('y')
print(names)

In [None]:
# Load the data, report its dimensions, and flag any missing values.
df_avil = pd.read_csv('avila.csv')

n_rows, n_cols = df_avil.shape
print("Data has", n_rows, "rows and", n_cols, "columns.")
has_missing = df_avil.isnull().values.any()
if has_missing:
    print("have data missing")

# Summary statistics for every column (cell output).
df_avil.describe(include='all')

In [None]:
# Sorted distinct values of the label column, captured before the column is
# label-encoded below; used later as the confusion-matrix row/column names.
class_names = sorted(df_avil.y.unique())

In [None]:
# Grid of pairwise plots over the columns of df_avil for a first visual look at the data.
sns.pairplot(df_avil)

In [None]:
from sklearn import preprocessing

# Fit a label encoder on the class column, then preview the encoded values (cell output).
le = preprocessing.LabelEncoder().fit(df_avil['y'])
le.transform(df_avil['y'])

In [None]:
# Overwrite the class column with its encoded values. Not idempotent: re-running
# this cell feeds the already-encoded column back into the encoder.
df_avil['y'] = le.transform(df_avil['y'])
# Map the codes back through the encoder as a round-trip check (cell output).
le.inverse_transform(df_avil['y'])

In [None]:
# Min-max scale every feature column.
# NOTE(review): the scaler is fit on all rows, before the train/test split made
# later in the notebook, so held-out rows influence the scaling; consider fitting
# on the training rows only.
min_max_scaler = preprocessing.MinMaxScaler()
# Move 'y' to the front of `names`: names[0] is the label, names[1:] the features.
names.insert(0, names.pop(names.index('y')))
df_num = df_avil[names[1:]]
x_scaled = min_max_scaler.fit_transform(df_num)

In [None]:
# Wrap the scaled array back into a DataFrame carrying the feature column names.
df_scaled = pd.DataFrame(x_scaled, columns=names[1:])
df_scaled.head()

In [None]:



# Rebuild df_avil as [label column, scaled feature columns], replacing the raw frame.
# NOTE(review): concat with axis=1 aligns on the row index; presumably both frames
# carry the same default index — confirm.
df_avil = pd.concat([df_avil[names[0]],df_scaled],axis=1)


In [None]:
df_avil.describe(include='all')

In [None]:
df_avil.shape

In [None]:
# class_names = le.inverse_transform( sorted(df_avil.y.unique()) )

In [None]:
df_avil.y.value_counts()

In [None]:
# Bar chart of the number of samples per encoded class label.
plt.title('Counts of Avila Labels')

# Pass the series explicitly as `x=`: seaborn's plotting functions take their
# variables as keywords, and a bare positional Series is not reliably read as x.
sns.countplot(x=df_avil.y.rename('Labels'))


Now we have successfully loaded and processed the Avila dataset. We are ready to start the ML!

### helper funcs

In [None]:
#src: sklearn
def pllc(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 20)):
    """Plot an F1-macro learning curve for ``estimator`` and return its data.

    Parameters
    ----------
    estimator : estimator object handed to ``learning_curve``.
    title : str, figure title (also stored in the returned dict).
    X, y : training features and labels.
    ylim : optional ``(ymin, ymax)`` pair applied to the y axis.
    cv : cross-validation spec forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes (or fractions) to evaluate.

    Returns
    -------
    (plt, results) : the pyplot module and a dict with keys ``'sizes'``,
        ``'tr_scores'`` (mean train score per size), ``'val_scores'``
        (mean cross-validation score per size) and ``'title'``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-macro")

    # Forward the caller's n_jobs; it used to be ignored in favour of a hard-coded -1.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='f1_macro')

    # Scores have one row per training size and one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    results ={'sizes':train_sizes,'tr_scores':train_scores_mean, 'val_scores': test_scores_mean, 'title':title}

    plt.grid(True)

    # Shaded bands show +/- one standard deviation across the CV splits.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="g")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="r")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="g",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="r",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.tight_layout()

    return plt, results

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

def tune_hp(estimator, X_train, y_train, title, param_name,param_range, xlabel,xvals=None, cv=5):
    """Plot train vs. validation F1-macro while sweeping one hyperparameter.

    Parameters
    ----------
    estimator : estimator object handed to ``validation_curve``.
    X_train, y_train : training features and labels.
    title : str, plot title.
    param_name : str, name of the estimator parameter to vary.
    param_range : sequence of values tried for ``param_name``.
    xlabel : str, x-axis label.
    xvals : optional sequence used for the x axis instead of ``param_range``
        (e.g. when the swept values are not directly plottable).
    cv : cross-validation spec forwarded to ``validation_curve``.

    Returns
    -------
    None; the curves are drawn on the current matplotlib figure.
    """
    # param_name / param_range are keyword-only in current scikit-learn releases,
    # so they must be passed by name.
    train_scores, val_scores = validation_curve(
        estimator, X_train, y_train, param_name=param_name,
        param_range=param_range, cv=cv, n_jobs=-1, scoring='f1_macro')

    # One row per parameter value, one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)

    if xvals is not None:
        param_range=xvals

    plt.grid(True)
    plt.plot(param_range, train_scores_mean, 'o-', color = 'g', label='Train Score')
    plt.plot(param_range, val_scores_mean, 'o-', color='r', label='Validation Score')
    plt.ylabel('F1-macro')
    plt.xlabel(xlabel)

    plt.title(title)
    plt.legend(loc='best')
    plt.tight_layout()

In [None]:




def get_cr_data():
    """Split the notebook-level ``df_avil`` frame into features and labels.

    Returns
    -------
    (x, y) : NumPy arrays; ``y`` is column 0 of the frame and ``x`` holds
        all remaining columns.
    """
    table = df_avil.values
    features = np.array(table[:, 1:])
    labels = np.array(table[:, 0])
    return features, labels




    
    


## ML- DT

In [None]:
avX, avY = get_cr_data()
# Hold out 20% of the rows for the final evaluation. The split is seeded so the
# tuning results and test metrics below are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(np.array(avX),np.array(avY), test_size=0.20, random_state=1)

In [None]:
# Sweep min_samples_leaf over 1..19 for a tree with max_depth fixed at 15 and
# plot train vs. validation F1-macro.
tune_hp(estimator=DecisionTreeClassifier(max_depth=15), X_train= X_train, y_train=y_train, title="Hypertunning Decision Tree Min Leaf Size - Avila Data",
        param_name='min_samples_leaf',param_range=np.arange(1,20,1), xlabel="Min Leaf Size")

In [None]:
# Search grid for the tree: min_samples_leaf fixed at 1, max_depth from 5 to 29.
param_grid_DT = {'min_samples_leaf':np.array([1]), 'max_depth':np.arange(5,30)}

In [None]:
# 5-fold grid search over param_grid_DT, scored by macro F1.
GSTree = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid_DT,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
GSTree.fit(X_train, y_train)

# Keep the winning values for the learning-curve cell.
max_depth = GSTree.best_params_['max_depth']
min_samples_leaf = GSTree.best_params_['min_samples_leaf']
print("Tree chosen parameters: ")
print(GSTree.best_params_)

# Tree Grid Search chosen parameters: 
# {'max_depth': 19, 'min_samples_leaf': 1}

In [None]:
# Learning curve for the tuned tree over 10 seeded shuffle splits (20% held out each).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
estimator = DecisionTreeClassifier(
    criterion='gini',
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    random_state=1,
)
title = "Learning Curve DT- Avila Data"
_ , DTLC_results = pllc(estimator, title, X_train, y_train, ylim=None, cv=cv, n_jobs=-1)

In [None]:

DTLC_results

# NN

In [None]:
# Network used for the activation sweep; layer sizes and learning rate are fixed.
mlp = MLPClassifier(solver='adam',random_state=1, verbose=10,
                    learning_rate_init=.01, hidden_layer_sizes= (100,200,80,) ,)


# The swept hyperparameter is the activation function, so the title and x-axis
# label name it (they previously described the number of hidden nodes).
tune_hp(estimator=mlp, X_train= X_train, y_train=y_train, title="Hypertunning Neural Network Activation Function - Avila Data",
        param_name='activation',param_range=['relu', 'logistic', 'tanh'], 
        xlabel="Activation function", xvals=None )


In [None]:
# Grid for the network search: only the initial learning rate is varied.
param_grid_nn = {'learning_rate_init':[0.01,0.1,1]}
# Base network for the grid search.
# NOTE(review): hidden_layer_sizes here is (10,200,100) while every other network
# in this notebook uses (100,200,80) — confirm whether the difference is intended.
mlp1 = MLPClassifier(solver='adam',random_state=1, verbose=10, hidden_layer_sizes=(10,200,100,), activation='relu')


# Fixed-configuration network used for the learning curve and loss curve below.
# Its hyperparameters are hard-coded here rather than read from a grid search.
estimator_nn =  MLPClassifier(solver='adam',random_state=1, verbose=0, hidden_layer_sizes=(100,200,80,),
                          learning_rate_init= 0.01, activation= 'relu')

In [None]:
# 5-fold grid search over param_grid_nn, scored by macro F1.
GSmlp = GridSearchCV(estimator = mlp1, param_grid=param_grid_nn, cv=5, scoring='f1_macro', n_jobs=-1)
GSmlp.fit(X_train, y_train)
learning_rate_init = GSmlp.best_params_['learning_rate_init']
# 'activation' is not part of the search grid, so best_params_ has no such key
# (indexing it raised KeyError). Read it from the refitted best estimator instead.
activation = GSmlp.best_estimator_.get_params()['activation']

print(GSmlp.best_params_)


In [None]:
# Learning curve for estimator_nn over 10 seeded shuffle splits (20% held out each).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
title = "Learning Curve NN"
_ , NNLC_results = pllc(estimator_nn, title, X_train, y_train, ylim=None, cv=cv, n_jobs=-1)

In [None]:
NNLC_results

The final section for the neural network plots the training loss curve of the network on the Avila data over the iterations.

In [None]:

# Fit the network on the whole training split and plot the loss it recorded
# at each training iteration.
estimator_nn.fit(X_train, y_train)
loss_iter = estimator_nn.loss_curve_

plt.figure()
plt.plot(loss_iter, 'o-', color="g", markersize=4)
plt.title("NN Loss Curve- Avila Data")
plt.xlabel("Number of Iterations")
plt.ylabel("Log Loss")

plt.show()

#### Boosted

In [None]:
# Seeded gradient-boosting classifier with otherwise default settings; reused as
# the base estimator by the validation-curve and grid-search cells below.
GBC = GradientBoostingClassifier( random_state=1,)



In [None]:

# Sweep max_depth over 1..30 for the boosted trees and plot train vs. validation F1-macro.
tune_hp(
    estimator=GBC,
    X_train=X_train,
    y_train=y_train,
    title="Hypertunning GradBoosted Tree Max Depth - Avila Data",
    param_name='max_depth',
    param_range=np.arange(1, 31),
    xlabel="Max Depth",
)

In [None]:
# Coarse grid for the boosted trees: three depths and three ensemble sizes,
# with min_samples_leaf fixed at 1.
param_grid = {'min_samples_leaf': np.array([1]),
              'max_depth': np.linspace(3,25,3).round().astype('int'),
              'n_estimators': np.linspace(80,300,3).round().astype('int')}

# Score with macro F1 like every other search and curve in this notebook; without
# an explicit `scoring` the search falls back to the estimator's default score.
boost = GridSearchCV(estimator = GBC, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
boost.fit(X_train, y_train)
print("Per Hyperparameter tuning, best parameters are:")
print(boost.best_params_)

In [None]:
# Learning curve for a boosted model with hand-set hyperparameters, over
# 5 seeded shuffle splits (20% held out each).
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
estimator = GradientBoostingClassifier(
    max_depth=4,
    min_samples_leaf=1,
    n_estimators=200,
    random_state=1,
)
title = 'Learning Curve for Grad-Boosted- Avila Data'
_ , GBDTLC_results = pllc(estimator, title, X_train, y_train, ylim=None, cv=cv, n_jobs=-1)

In [None]:
GBDTLC_results

# SVM

In [None]:
# Standard scaler fitted on the training split only.
scaler_st = StandardScaler().fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits; these standardized copies
# are what the SVM cells use.
X_train_sc = scaler_st.transform(X_train)
X_test_sc = scaler_st.transform(X_test)

In [None]:
# Sweep gamma for an RBF SVM (C fixed at 90) on the standardized training data,
# using 3-fold CV, and plot train vs. validation F1-macro.
# NOTE(review): the later SVM estimators in this notebook use C=80 — confirm which is intended.
tune_hp(estimator=SVC(random_state=1, kernel='rbf', C=90), X_train= X_train_sc, y_train=y_train, title="Hypertunning SVM Gamma value- Avila Data",
        param_name='gamma',param_range=[0.1,0.2,0.3,1], xlabel="Gamma", cv=3)

In [None]:
# Learning curve for the RBF SVM on the standardized training data, over
# 10 seeded shuffle splits (20% held out each).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
estimator = SVC(random_state=1, kernel='rbf', C=80)
title = "Learning Curve SVM- Avila Data"
_ , SVM_results = pllc(estimator, title, X_train_sc, y_train, ylim=None, cv=cv, n_jobs=-1)

In [None]:
SVM_results

# KNN

In [None]:
# NOTE(review): this repeats the SVM results display from the end of the SVM
# section; it looks like a leftover cell under the KNN heading.
SVM_results
    

In [None]:

# Sweep n_neighbors over 1..5 on the (unstandardized) training split and plot
# train vs. validation F1-macro.
tune_hp(estimator=KNeighborsClassifier(n_jobs=-1), X_train= X_train, y_train=y_train, title="Hypertunning KNN # of Neighbors- Avila Data",
        param_name='n_neighbors',param_range=[1,2,3,4,5], xlabel="K-neighbors")

In [None]:
# Learning curve for 1-nearest-neighbour over 10 seeded shuffle splits (20% held out each).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
estimator = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
title = "Learning Curve KNN- Avila Data"
_ , KNN_results = pllc(estimator, title, X_train, y_train, ylim=None, cv=cv, n_jobs=-1)

In [None]:
KNN_results

## Comparison Plots

In [None]:

#results = [DTLC_results, NNLC_results, GBDTLC_results, SVM_results, KNN_results]

def plot_LRs(sizes,DTLC_results, NNLC_results, GBDTLC_results, SVM_results, KNN_results, title,scr):
    """Overlay the learning curves of the five models on one figure.

    Parameters
    ----------
    sizes : sequence of training-set sizes shared by all curves (x axis).
    DTLC_results, NNLC_results, GBDTLC_results, SVM_results, KNN_results :
        result dicts as returned by ``pllc`` for the decision tree, neural
        network, gradient boosting, SVM and KNN models respectively.
    title : str, figure title.
    scr : str, key of the score series to plot from each dict
        (``'tr_scores'`` or ``'val_scores'``).

    Returns
    -------
    None; the curves are drawn on a new matplotlib figure.
    """
    plt.figure()
    plt.title(title)
    plt.xlabel("Training Examples")
    plt.ylabel("Model F1 Score")
    # Legend labels must match the model each series comes from; the decision-tree
    # curve used to be mislabelled as the neural network.
    plt.plot(sizes, DTLC_results[scr], '-', color="r", label="Decision Tree")
    plt.plot(sizes, NNLC_results[scr] , '-', color="g", label="NN")
    plt.plot(sizes, GBDTLC_results[scr] , '-', color="b", label="Grad-Boosted")
    plt.plot(sizes, SVM_results[scr] , '-', color="k", label="SVM")
    plt.plot(sizes, KNN_results[scr] , '-', color="y", label="KNN")
    plt.legend(loc="best")

In [None]:
# Pass the per-model result dicts by name: the `results` list is only built in a
# later cell, so indexing it here fails with NameError on a fresh kernel.
plot_LRs(DTLC_results['sizes'], DTLC_results, NNLC_results, GBDTLC_results, SVM_results, KNN_results,
         "Validation Learning Rates - Avila Data", 'val_scores')

In [None]:

# Collect the five learning-curve result dicts in a fixed order:
# decision tree, neural network, gradient boosting, SVM, KNN.
results = [DTLC_results, NNLC_results, GBDTLC_results, SVM_results, KNN_results]

print(results)


In [None]:
def testing_func(clf,X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and report its held-out performance.

    Prints the fit and predict wall-clock times, macro F1, accuracy, macro
    precision and macro recall, then draws the confusion matrix as a heatmap.

    Parameters
    ----------
    clf : unfitted estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    None.
    """
    start_time = timeit.default_timer()
    clf.fit(X_train, y_train)
    training_time = timeit.default_timer() - start_time

    start_time = timeit.default_timer()
    y_pred = clf.predict(X_test)
    pred_time = timeit.default_timer() - start_time

    f1 = f1_score(y_test,y_pred, average='macro')
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred, average='macro')
    recall = recall_score(y_test,y_pred, average='macro')

    # Fix the label set from both splits. Otherwise a class that is absent from
    # y_test (and never predicted) shrinks the matrix, and it no longer matches
    # the length of class_names when the DataFrame is built.
    labels = np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Use the readable class names when they line up one-to-one with the sorted
    # label values; otherwise fall back to the raw label values.
    tick_names = class_names if len(class_names) == len(labels) else labels
    df_cm = pd.DataFrame(cm, index=tick_names, columns=tick_names)

    print("Model Training Time (s):   "+"{:.5f}".format(training_time))
    print("Model Prediction Time (s): "+"{:.5f}\n".format(pred_time))
    print("F1 Score:  "+"{:.2f}".format(f1))
    print("Accuracy:  "+"{:.2f}".format(accuracy))
    print("Precision: "+"{:.2f}".format(precision))
    print(" Recall: "+"{:.2f}".format(recall))

    print("Matrix")

    sns.heatmap(df_cm,cmap="YlGnBu", annot=True, fmt="d")

In [None]:
# Final estimators evaluated on the held-out test split. All hyperparameters are
# hard-coded here rather than read from the searches above.
# NOTE(review): max_depth=24 for the tree differs from the 19 recorded in the
# grid-search comment earlier — confirm which value is intended.
f_DT  = DecisionTreeClassifier(criterion='gini',max_depth=24, min_samples_leaf= 1, random_state=1)
f_NN = MLPClassifier(solver='adam',random_state=1, verbose=0, hidden_layer_sizes=(100,200,80,),
                          learning_rate_init= 0.01, activation= 'relu')

# NOTE(review): max_depth=5 here vs. max_depth=4 in the boosted learning-curve cell — confirm.
f_GBC = GradientBoostingClassifier(max_depth=5 , min_samples_leaf=1 , n_estimators=200,random_state=1,)
f_SVM = SVC(random_state=1, kernel='rbf', C=80)
f_KNN = KNeighborsClassifier(n_jobs=-1, n_neighbors=1,)



In [None]:
testing_func(f_DT,X_train, X_test, y_train, y_test)

In [None]:
testing_func(f_NN,X_train, X_test, y_train, y_test)

In [None]:
testing_func(f_GBC,X_train, X_test, y_train, y_test)

In [None]:
testing_func(f_SVM,X_train_sc, X_test_sc, y_train, y_test)

In [None]:
# Evaluate KNN on the same feature representation it was tuned on: the KNN
# validation and learning curves above were run on X_train, not the standardized copy.
testing_func(f_KNN,X_train, X_test, y_train, y_test)