# Model testbench


In this jupyter notebook I have all the model tests documented. Written in Python3 and SKlearn. Don't "run all" this notebook. It's computationally complex and can take a very long time to complete.

Dependences: \
-Numpy  \
-Pandas \
-Matplotlib \
-ScyPy \
-Sklearn \
-Lightgbm 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis, NearestCentroid
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import lightgbm as lgb
from scipy.fft import fft, ifft
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.metrics import roc_auc_score, f1_score

In [3]:
dataset = pd.read_csv("dataset.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
dataset.shape

(301, 12003)

## Basic classifications (baseline)

In [124]:
clf = LogisticRegression(random_state=0, penalty='none').fit(X_train, y_train)
y_test_prediction=clf.predict_proba(X_test)
y_test_class=clf.predict(X_test)
clf.score(X_test, y_test), roc_auc_score(y_test, y_test_prediction,multi_class='ovr'), f1_score(y_test,y_test_class,average='micro')

(0.7912087912087912, 0.9473690734094266, 0.7912087912087912)

In [127]:
clf2=LinearSVC(penalty='l2',random_state=0, tol=1e-5,max_iter=9000,C=1,loss='squared_hinge').fit(X_train,y_train)
y_test_class=clf2.predict(X_test)
clf2.score(X_test, y_test), f1_score(y_test,y_test_class,average='micro')

(0.8021978021978022, 0.8021978021978022)

In [17]:
clf3 = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
clf3.score(X_test, y_test)

0.4444444444444444

In [18]:
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train, y_train)
    if sc<clf3.score(X_test,y_test):
        sc=clf3.score(X_test,y_test)
        iteration=neig
print (sc, iteration)

0.6222222222222222 1


In [19]:
red5=PCA(.90)
X_train_PCA=red5.fit_transform(X_train)
X_test_PCA=red5.transform(X_test)
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train_PCA, y_train)
    if sc<clf3.score(X_test_PCA,y_test):
        sc=clf3.score(X_test_PCA,y_test)
        iteration=neig
print (sc, iteration)

0.6444444444444445 1


In [128]:
clf4 = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},])
gs=GridSearchCV(clf4,params).fit(X_train,y_train)
y_test_prediction=gs.predict_proba(X_test)
y_test_class=gs.predict(X_test)
gs.score(X_test, y_test), roc_auc_score(y_test, y_test_prediction,multi_class='ovr'), f1_score(y_test,y_test_class,average='micro')

(0.8461538461538461, 0.9893147930753768, 0.8461538461538461)

In [131]:
gs.best_estimator_, gs.best_index_, gs.best_params_, gs.best_score_

(LogisticRegression(max_iter=10000, random_state=0, solver='newton-cg'),
 4,
 {'penalty': 'l2', 'solver': 'newton-cg'},
 0.8571428571428571)

## SVC

In [139]:
clf4=LinearSVC(penalty='l2',random_state=0, tol=1e-5,max_iter=9000,C=2,loss='squared_hinge').fit(X_train,y_train)
clf4.score(X_test, y_test)

0.7692307692307693

In [42]:
clf6 = SVC(random_state=1, kernel='rbf', C=80, gamma=0.0001).fit(X_train,y_train)
clf6.score(X_test, y_test)
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
gs=GridSearchCV(clf6,param_grid).fit(X_train,y_train)
gs.best_estimator_, gs.best_params_, gs.best_score_

(SVC(C=10, gamma=0.0001, random_state=1),
 {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'},
 0.8619047619047618)

In [43]:
param_grid = [
  {'C': [40,100,400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},
 ]

gs2=GridSearchCV(clf6,param_grid).fit(X,y)
gs2.best_estimator_, gs2.best_params_, gs2.best_score_

(SVC(C=40, gamma=0.0001, random_state=1),
 {'C': 40, 'gamma': 0.0001, 'kernel': 'rbf'},
 0.8936065573770492)

In [4]:
param_grid3= [
  {'C': [1,10,80,100,400,1000], 'loss': ['hinge','squared_hinge']},]
gs3=GridSearchCV(LinearSVC(), param_grid3).fit(X,y)
gs3.best_estimator_, gs3.best_params_, gs3.best_score_

(LinearSVC(C=1, loss='hinge'), {'C': 1, 'loss': 'hinge'}, 0.7604918032786886)

In [45]:
red=PCA(.90)
X_train_PCA=red.fit_transform(X_train)
X_test_PCA=red.transform(X_test)

clf5 = SVC(random_state=1, kernel='rbf', C=80, gamma=0.0001).fit(X_train,y_train)
clf5.score(X_test, y_test)

0.9010989010989011

In [46]:
red2=KernelPCA(.90)
X_train_KPCA=red.fit_transform(X_train)
X_test_KPCA=red.transform(X_test)
param_grid = [{'C': [40,100,400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},]
gs4=GridSearchCV(clf5,param_grid).fit(X_train_KPCA,y_train)
gs4.best_estimator_, gs4.best_score_

(SVC(C=40, gamma=0.001, random_state=1), 0.8285714285714285)

In [47]:
red4=PCA(.90)
sc=StandardScaler()
X_train_sc=sc.fit_transform(X_train)
X_test_sc=sc.transform(X_test)
X_train_PCA=red4.fit_transform(X_train_sc)
X_test_PCA=red4.transform(X_test_sc)
sc=StandardScaler()
sc.fit_transform(X_train)
sc.transform(X_test)
clf6=SVC()
param_grid = [{'C': [0.1, 40, 100, 400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},
              {'C': [0.1, 40, 100, 400], 'kernel': ['linear'], 'gamma': [0.001, 0.0001, 0.00001]},
              {'C': [0.1, 40, 100, 400], 'kernel': ['poly'], 'gamma': [0.001, 0.0001, 0.00001],},]

gs=GridSearchCV(clf6,param_grid).fit(X_train_PCA,y_train)
gs.best_estimator_, gs.best_score_, gs.best_params_

(SVC(C=40, gamma=1e-05),
 0.8714285714285713,
 {'C': 40, 'gamma': 1e-05, 'kernel': 'rbf'})

In [48]:
red4=PCA()
sc2=StandardScaler()
X_train_sc=sc2.fit_transform(X_train)
X_test_sc=sc2.transform(X_test)
X_train_PCA=red4.fit_transform(X_train_sc)
X_test_PCA=red4.transform(X_test_sc)

clf6=LinearSVC(max_iter=5000)
param_grid = [{'C': [0.1, 40, 100, 400], 'penalty': ['l2'], 'loss': ['hinge','squared_hinge']},]
gs=GridSearchCV(clf6,param_grid).fit(X_train_PCA,y_train)
gs.best_estimator_, gs.best_score_, gs.best_params_

(LinearSVC(C=0.1, loss='hinge', max_iter=5000),
 0.8142857142857143,
 {'C': 0.1, 'loss': 'hinge', 'penalty': 'l2'})

## Ensemble methods

In [6]:
dataset = pd.read_csv("dataset.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [52]:
ensemble1=RandomForestClassifier().fit(X_train, y_train)
ensemble1.score(X_test, y_test)

0.8021978021978022

In [53]:
ensemble2=AdaBoostClassifier().fit(X_train, y_train)
ensemble2.score(X_test, y_test)

0.6703296703296703

In [5]:
ensemble3=GradientBoostingClassifier().fit(X_train, y_train)
ensemble3.score(X_test,y_test)

0.9010989010989011

In [55]:
red4=PCA(0.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

In [56]:
ensemble1=RandomForestClassifier().fit(X_train_PCA, y_train)
ensemble1.score(X_test_PCA, y_test)

0.9120879120879121

In [57]:
ensemble2=AdaBoostClassifier().fit(X_train_PCA, y_train)
ensemble2.score(X_test_PCA, y_test)

0.7362637362637363

In [58]:
ensemble3=GradientBoostingClassifier().fit(X_train_PCA, y_train)
ensemble3.score(X_test_PCA,y_test)

0.8571428571428571

In [59]:
dataset = pd.read_csv("dataset.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
dataset.shape

(301, 12003)

In [60]:
ensemble4 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(ensemble4, param_grid, cv=3)
gbm.fit(X_train, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

(LGBMClassifier(n_estimators=10000, num_leaves=25),
 0.7571428571428571,
 10,
 {'learning_rate': 0.1, 'n_estimators': 10000, 'num_leaves': 25})

In [62]:
ensemble5 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [21,22,23,24,25,26],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100]
}
gbm2 = GridSearchCV(ensemble5, param_grid, cv=3)
gbm2.fit(X_train, y_train)
gbm2.best_estimator_, gbm2.best_score_, gbm2.best_index_, gbm2.best_params_

(LGBMClassifier(num_leaves=21),
 0.719047619047619,
 6,
 {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 21})

In [65]:
gbm = lgb.LGBMClassifier(num_leaves=33,
                        learning_rate=0.0001,
                        n_estimators=2000,
                        max_depth=7)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['multi_error','multiclass'],
        early_stopping_rounds=70)
gbm.score(X_test, y_test)

[1]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.1015
Training until validation scores don't improve for 70 rounds
[2]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10143
[3]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10135
[4]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10127
[5]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10119
[6]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10111
[7]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10104
[8]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10096
[9]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10088
[10]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.1008
[11]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10072
[12]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10064
[13]	valid_0's multi_error: 0.692308	valid_0's multi_logloss: 1.10057
[14]	valid_0's multi_error: 0.692308	val

0.3076923076923077

In [63]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)
ensemble6 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [40,38,37,34,32,30,28,25],
    'learning_rate': [0.01,0.001],
    'n_estimators': [100],
    'max_depth': [3,4,5,6,7,8,9,10]
}
gbm3 = GridSearchCV(ensemble6, param_grid, cv=3)
gbm3.fit(X_train_PCA, y_train)
gbm3.best_estimator_, gbm3.best_score_, gbm3.best_index_, gbm3.best_params_

(LGBMClassifier(learning_rate=0.01, max_depth=5, num_leaves=40),
 0.8523809523809525,
 16,
 {'learning_rate': 0.01,
  'max_depth': 5,
  'n_estimators': 100,
  'num_leaves': 40})

## Naive bayes

In [31]:
nb1=GaussianNB().fit(X_train, y_train)
nb1.score(X_test,y_test)

0.6813186813186813

In [32]:
nb2=ComplementNB().fit(X_train, y_train)
nb2.score(X_test,y_test)

0.7142857142857143

In [123]:
#0.075
nb4=BernoulliNB(alpha=0.075, binarize=0, fit_prior=True).fit(X_train, y_train)
y_test_prediction=nb4.predict_proba(X_test)
y_test_class=nb4.predict(X_test)
nb4.score(X_test,y_test), roc_auc_score(y_test, y_test_prediction,multi_class='ovr'), f1_score(y_test,y_test_class,average='micro')

(0.9120879120879121, 0.968923484290085, 0.9120879120879121)

In [34]:
clf6 = NearestCentroid().fit(X_train, y_train)
clf6.score(X_test,y_test)

0.6813186813186813

In [81]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

In [87]:
nb1=GaussianNB().fit(X_train_PCA, y_train)
nb1.score(X_test_PCA,y_test)

0.8571428571428571

In [92]:
nb2=ComplementNB().fit(np.abs(X_train_PCA), y_train)
nb2.score(np.abs(X_test_PCA),y_test)

0.7472527472527473

In [93]:
nb4=BernoulliNB(alpha=0.075, binarize=0, fit_prior=True).fit(X_train_PCA, y_train)
nb4.score(X_test_PCA,y_test)

0.8571428571428571

In [94]:
clf6 = NearestCentroid().fit(X_train_PCA, y_train)
clf6.score(X_test_PCA,y_test)

0.7032967032967034

In [None]:
#Couldn't run this test in some computers...

#nca = NeighborhoodComponentsAnalysis().fit_transform(X_train, y_train) 
#nca.transform(X_test,y_test)
#knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
#knn.score(X_test, y_test)

## Preprocessing: FFT (fast fourier transform)

In [11]:
X_fourier=fft(X[0:11999])
X_fourier_abs=np.abs(X_fourier)
X_fourier=pd.DataFrame.from_records(X_fourier_abs)
X_f=pd.concat([X_fourier,X[X.columns[12000]], X[X.columns[12001]]], axis=1, sort=False)
X_fourier.shape
#from here X_train X_test... Are fourier transformed

(301, 12002)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_f, y, test_size=0.30)

In [14]:
clf_fft= LogisticRegression(random_state=0, penalty='none').fit(X_train, y_train)
clf_fft.score(X_test, y_test)

0.8681318681318682

In [None]:
clf2_fft = SVC()
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
gs_fft=GridSearchCV(clf2_fft,param_grid).fit(X,y)
gs_fft.cv_results_
gs_fft.best_score_, gs_fft.best_params_, gs_fft.best_estimator_, gs_fft.best_index_

In [None]:
clf3_fft = LinearSVC()
param_grid = [{'C': [0.5, 1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge']},]
gs2_fft=GridSearchCV(clf3,param_grid).fit(X,y)
gs2_fft.best_score_, gs2_fft.best_params_, gs2_fft.best_estimator_, gs2_fft.best_index_

In [15]:
#KNC n_neighbors iterators to test the best number of it
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train, y_train)
    if sc<clf3.score(X_test,y_test):
        sc=clf3.score(X_test,y_test)
        iteration=neig
print (sc, iteration)

0.8131868131868132 56


In [16]:
red5=PCA(.90)
X_train_PCA=red5.fit_transform(X_train)
X_test_PCA=red5.transform(X_test)
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train_PCA, y_train)
    if sc<clf3.score(X_test_PCA,y_test):
        sc=clf3.score(X_test_PCA,y_test)
        iteration=neig
print (sc, iteration)

0.8241758241758241 56


In [None]:
clf_kn_fft= KNeighborsClassifier(iteration).fit(X_train, y_train)
clf_kn_fft.score(X_test, y_test)

In [None]:
nb4=BernoulliNB(alpha=1).fit(X_train, y_train)
nb4.score(X_test,y_test)

In [None]:
clf4_fft = LogisticRegression(random_state=0)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['elasticnet'], 'solver': ['saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},])

gs=GridSearchCV(clf4_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

In [None]:
clf5_fft=LogisticRegression(penalty='none', solver='sag', max_iter=10000).fit(X_train, y_train)
clf5_fft.score(X_test, y_test)

In [None]:
clf6_fft = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear'], 'C': [0.1, 1, 100]},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.1, 1, 100]},])

gs=GridSearchCV(clf6_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

In [None]:
clf6_fft_b=LogisticRegression(random_state=0, max_iter=10000, C=150, penalty'l1', solver='liblinear')

In [None]:
clf7_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='multinomial').fit(X_train, y_train)
clf8_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='ovr').fit(X_train, y_train)
clf7_fft.score(X_test, y_test), clf8_fft.score(X_test, y_test)

In [None]:
clf10_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='elasticnet', solver='saga')
params=([{'multi_class': ['ovr', 'multinomial'], 'l1_ratio':[0, 0.25, 0.5, 0.75, 1]}])
gs=GridSearchCV(clf10_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

In [None]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)
clf8_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='ovr').fit(X_train_PCA, y_train)
clf8_fft.score(X_test_PCA, y_test)

In [None]:
clf11_fft_PCA = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'sag','saga','liblinear'], 'C': [0.1, 1, 100]},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.1, 1, 100]},])

gs=GridSearchCV(clf11_fft_PCA,params).fit(X_train_PCA,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

In [None]:
lgbm_fft= lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(lgbm_fft, param_grid, cv=3)
gbm.fit(X_train, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

In [None]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

lgbm_fft_PCA= lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(lgbm_fft_PCA, param_grid)
gbm.fit(X_train_PCA, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

In [None]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

gbm = lgb.LGBMClassifier(num_leaves=15,
                        learning_rate=0.001,
                        n_estimators=20000)#                        max_depth=10)             
gbm.fit(X_train_PCA, y_train,
eval_set=[(X_test_PCA, y_test)],
eval_metric=['multi_error','multiclass'],
early_stopping_rounds=100)


gbm.score(X_test_PCA, y_test)