# Model testbench


In this jupyter notebook I have all the model tests documented. Written in Python3 and SKlearn. Don't "run all" this notebook. It's computationally complex and can take a very long time to complete.

Dependences: \
-Numpy  \
-Pandas \
-Matplotlib \
-ScyPy \
-Sklearn \
-Lightgbm 

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis, NearestCentroid
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import lightgbm as lgb
from scipy.fft import fft, ifft
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

In [71]:
dataset = pd.read_csv("dataset_new_features.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
dataset.shape

(400, 12003)

## Basic classifications (baseline)

In [6]:
clf = LogisticRegression(random_state=0, penalty='none').fit(X_train, y_train)
clf.score(X_test, y_test)

0.4

In [10]:
clf2=LinearSVC(penalty='l2',random_state=0, tol=1e-5,max_iter=9000,C=1,loss='squared_hinge').fit(X_train,y_train)
clf2.score(X_test, y_test)

0.4

In [11]:
clf3 = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
clf3.score(X_test, y_test)

0.3416666666666667

In [14]:
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train, y_train)
    if sc<clf3.score(X_test,y_test):
        sc=clf3.score(X_test,y_test)
        iteration=neig
print (sc, iteration)

0.4083333333333333 1


In [15]:
red5=PCA(.90)
X_train_PCA=red5.fit_transform(X_train)
X_test_PCA=red5.transform(X_test)
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train_PCA, y_train)
    if sc<clf3.score(X_test_PCA,y_test):
        sc=clf3.score(X_test_PCA,y_test)
        iteration=neig
print (sc, iteration)

0.4166666666666667 1


In [None]:
clf4 = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['elasticnet'], 'solver': ['saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},])
gs=GridSearchCV(clf8,params).fit(X_train,y_train)

## SVC

In [17]:
clf4=LinearSVC(penalty='l2',random_state=0, tol=1e-5,max_iter=9000,C=1,loss='squared_hinge').fit(X_train,y_train)
clf4.score(X_test, y_test)

0.4

In [18]:
clf6 = SVC(random_state=1, kernel='rbf', C=80, gamma=0.0001).fit(X_train,y_train)
clf6.score(X_test, y_test)
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
gs=GridSearchCV(clf6,param_grid).fit(X_train,y_train)
gs.best_estimator_, gs.best_params_, gs.best_score_

(SVC(C=1, gamma=0.0001, random_state=1),
 {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'},
 0.49642857142857144)

In [19]:
param_grid = [
  {'C': [40,100,400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},
 ]

gs2=GridSearchCV(clf6,param_grid).fit(X,y)
gs2.best_estimator_, gs2.best_params_, gs2.best_score_

(SVC(C=40, gamma=0.0001, random_state=1),
 {'C': 40, 'gamma': 0.0001, 'kernel': 'rbf'},
 0.4574999999999999)

In [20]:
param_grid3= [
  {'C': [1,10,80,100,400,1000], 'loss': ['hinge','squared_hinge']},]
gs3=GridSearchCV(LinearSVC(), param_grid3).fit(X,y)

In [21]:
red=PCA(.90)
X_train_PCA=red.fit_transform(X_train)
X_test_PCA=red.transform(X_test)

clf5 = SVC(random_state=1, kernel='rbf', C=80, gamma=0.0001).fit(X_train,y_train)
clf5.score(X_test, y_test)

0.44166666666666665

In [22]:
red2=KernelPCA(.90)
X_train_KPCA=red.fit_transform(X_train)
X_test_KPCA=red.transform(X_test)
param_grid = [{'C': [40,100,400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},]
gs4=GridSearchCV(clf5,param_grid).fit(X_train_KPCA,y_train)
gs4.best_estimator_, gs4.best_score_

(SVC(C=40, gamma=1e-05, random_state=1), 0.4714285714285714)

In [26]:
red4=PCA(.90)
sc=StandardScaler()
X_train_sc=sc.fit_transform(X_train)
X_test_sc=sc.transform(X_test)
X_train_PCA=red4.fit_transform(X_train_sc)
X_test_PCA=red4.transform(X_test_sc)
sc=StandardScaler()
sc.fit_transform(X_train)
sc.transform(X_test)
clf6=SVC()
param_grid = [{'C': [0.1, 40, 100, 400], 'gamma': [0.001, 0.0001, 0.00001], 'kernel': ['rbf']},
              {'C': [0.1, 40, 100, 400], 'kernel': ['linear'], 'gamma': [0.001, 0.0001, 0.00001]},
              {'C': [0.1, 40, 100, 400], 'kernel': ['poly'], 'gamma': [0.001, 0.0001, 0.00001],},]

gs=GridSearchCV(clf6,param_grid).fit(X_train_PCA,y_train)
gs.best_estimator_, gs.best_score_, gs.best_params_

(SVC(C=100, gamma=1e-05),
 0.46071428571428574,
 {'C': 100, 'gamma': 1e-05, 'kernel': 'rbf'})

In [None]:
red4=PCA()
sc2=StandardScaler()
X_train_sc=sc2.fit_transform(X_train)
X_test_sc=sc2.transform(X_test)
X_train_PCA=red4.fit_transform(X_train_sc)
X_test_PCA=red4.transform(X_test_sc)

clf6=LinearSVC(max_iter=5000)
param_grid = [{'C': [0.1, 40, 100, 400], 'penalty': ['l2'], 'loss': ['hinge','squared_hinge']},]
gs=GridSearchCV(clf6,param_grid).fit(X_train_PCA,y_train)
gs.best_estimator_, gs.best_score_, gs.best_params_

## Ensemble methods

In [38]:
dataset = pd.read_csv("dataset_new_features.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [39]:
ensemble1=RandomForestClassifier().fit(X_train, y_train)
ensemble1.score(X_test, y_test)

0.575

In [40]:
ensemble2=AdaBoostClassifier().fit(X_train, y_train)
ensemble2.score(X_test, y_test)

0.4666666666666667

In [41]:
ensemble3=GradientBoostingClassifier().fit(X_train, y_train)
ensemble3.score(X_test,y_test)

0.5

In [42]:
red4=PCA(0.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

In [43]:
ensemble1=RandomForestClassifier().fit(X_train_PCA, y_train)
ensemble1.score(X_test_PCA, y_test)

0.55

In [44]:
ensemble2=AdaBoostClassifier().fit(X_train_PCA, y_train)
ensemble2.score(X_test_PCA, y_test)

0.48333333333333334

In [45]:
ensemble3=GradientBoostingClassifier().fit(X_train_PCA, y_train)
ensemble3.score(X_test_PCA,y_test)

0.5333333333333333

In [68]:
dataset = pd.read_csv("dataset_new_features.csv")
y=dataset.iloc[:,-1]
X=dataset.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [47]:
ensemble4 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(ensemble4, param_grid, cv=3)
gbm.fit(X_train, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

(LGBMClassifier(learning_rate=0.001, n_estimators=10000, num_leaves=25),
 0.3608251353618546,
 0,
 {'learning_rate': 0.001, 'n_estimators': 10000, 'num_leaves': 25})

In [49]:
ensemble5 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [21,22,23,24,25,26],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100]
}
gbm2 = GridSearchCV(ensemble5, param_grid, cv=3)
gbm2.fit(X_train, y_train)
gbm2.best_estimator_, gbm2.best_score_, gbm2.best_index_, gbm2.best_params_

(LGBMClassifier(learning_rate=0.01, num_leaves=21),
 0.4716312056737588,
 0,
 {'learning_rate': 0.01, 'n_estimators': 100, 'num_leaves': 21})

In [50]:
gbm = lgb.LGBMClassifier(num_leaves=,
                        learning_rate=0.0001,
                        n_estimators=2000,
                        max_depth=7)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['multi_error','multiclass'],
        early_stopping_rounds=70)
gbm.score(X_test, y_test)

[1]	valid_0's multi_error: 0.7	valid_0's multi_logloss: 1.39241
Training until validation scores don't improve for 70 rounds
[2]	valid_0's multi_error: 0.7	valid_0's multi_logloss: 1.38761
[3]	valid_0's multi_error: 0.7	valid_0's multi_logloss: 1.38294
[4]	valid_0's multi_error: 0.7	valid_0's multi_logloss: 1.37834
[5]	valid_0's multi_error: 0.708333	valid_0's multi_logloss: 1.37416
[6]	valid_0's multi_error: 0.691667	valid_0's multi_logloss: 1.36986
[7]	valid_0's multi_error: 0.675	valid_0's multi_logloss: 1.36586
[8]	valid_0's multi_error: 0.666667	valid_0's multi_logloss: 1.362
[9]	valid_0's multi_error: 0.658333	valid_0's multi_logloss: 1.35846
[10]	valid_0's multi_error: 0.65	valid_0's multi_logloss: 1.35499
[11]	valid_0's multi_error: 0.591667	valid_0's multi_logloss: 1.35103
[12]	valid_0's multi_error: 0.575	valid_0's multi_logloss: 1.34762
[13]	valid_0's multi_error: 0.575	valid_0's multi_logloss: 1.34414
[14]	valid_0's multi_error: 0.566667	valid_0's multi_logloss: 1.34085
[15

0.4583333333333333

In [66]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)
ensemble6 = lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [40,38,37,34,32,30,28,25],
    'learning_rate': [0.01,0.001],
    'n_estimators': [100],
    'max_depth': [3,4,5,6,7,8,9,10]
}
gbm3 = GridSearchCV(ensemble6, param_grid, cv=3)
gbm3.fit(X_train_PCA, y_train)
gbm3.best_estimator_, gbm3.best_score_, gbm3.best_index_, gbm3.best_params_

(LGBMClassifier(learning_rate=0.01, max_depth=4, num_leaves=40),
 0.4356363913673454,
 8,
 {'learning_rate': 0.01,
  'max_depth': 4,
  'n_estimators': 100,
  'num_leaves': 40})

## Naive bayes

In [53]:
nb1=GaussianNB().fit(X_train, y_train)
nb1.score(X_test,y_test)

0.39166666666666666

In [54]:
nb2=ComplementNB().fit(X_train, y_train)
nb2.score(X_test,y_test)

0.475

In [55]:
nb4=BernoulliNB(alpha=0.075, binarize=0, fit_prior=True).fit(X_train, y_train)
nb4.score(X_test,y_test)

0.45

In [56]:
clf6 = NearestCentroid().fit(X_train, y_train)
clf6.score(X_test,y_test)

0.4083333333333333

In [57]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

In [58]:
nb1=GaussianNB().fit(X_train, y_train)
nb1.score(X_test,y_test)

0.39166666666666666

In [59]:
nb2=ComplementNB().fit(X_train, y_train)
nb2.score(X_test,y_test)

0.475

In [60]:
nb4=BernoulliNB(alpha=0.075, binarize=0, fit_prior=True).fit(X_train, y_train)
nb4.score(X_test,y_test)

0.45

In [61]:
clf6 = NearestCentroid().fit(X_train, y_train)
clf6.score(X_test,y_test)

0.4083333333333333

In [None]:
#Couldn't run this test in some computers...

#nca = NeighborhoodComponentsAnalysis().fit_transform(X_train, y_train) 
#nca.transform(X_test,y_test)
#knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
#knn.score(X_test, y_test)

## Preprocessing: FFT (fast fourier transform)

In [73]:
X_fourier=fft(X[0:12000])
X_fourier_abs=np.abs(X_fourier)
X_fourier=pd.DataFrame.from_records(X_fourier_abs)
X_f=pd.concat([X_fourier, X["threshold_pas"], X["peak_number"]], axis=1, sort=False)
X_fourier.shape
#from here X_train X_test... Are fourier transformed

(400, 12002)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_f, y, test_size=0.30)

In [None]:
clf_fft= LogisticRegression(random_state=0, penalty='none').fit(X_train, y_train)
clf_fft.score(X_test, y_test)
clf_fft.classes_

In [76]:
clf2_fft = SVC()
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
gs_fft=GridSearchCV(clf2_fft,param_grid).fit(X,y)
gs_fft.cv_results_
gs_fft.best_score_, gs_fft.best_params_, gs_fft.best_estimator_, gs_fft.best_index_

(0.4575, {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}, SVC(C=1, gamma=0.0001), 5)

In [None]:
clf3_fft = LinearSVC()
param_grid = [{'C': [0.5, 1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge']},]
gs2_fft=GridSearchCV(clf3,param_grid).fit(X,y)
gs2_fft.best_score_, gs2_fft.best_params_, gs2_fft.best_estimator_, gs2_fft.best_index_

In [77]:
#KNC n_neighbors iterators to test the best number of it
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train, y_train)
    if sc<clf3.score(X_test,y_test):
        sc=clf3.score(X_test,y_test)
        iteration=neig
print (sc, iteration)

0.6166666666666667 1


In [78]:
red5=PCA(.90)
X_train_PCA=red5.fit_transform(X_train)
X_test_PCA=red5.transform(X_test)
sc=0
iteration=0
for neig in range(1,200):
    clf3 = KNeighborsClassifier(n_neighbors=neig).fit(X_train_PCA, y_train)
    if sc<clf3.score(X_test_PCA,y_test):
        sc=clf3.score(X_test_PCA,y_test)
        iteration=neig
print (sc, iteration)

0.775 2


In [79]:
clf_kn_fft= KNeighborsClassifier(iteration).fit(X_train, y_train)
clf_kn_fft.score(X_test, y_test)

0.6166666666666667

In [82]:
nb4=BernoulliNB(alpha=1).fit(X_train, y_train)
nb4.score(X_test,y_test)

0.21666666666666667

In [83]:
clf4_fft = LogisticRegression(random_state=0)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['elasticnet'], 'solver': ['saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},])

gs=GridSearchCV(clf4_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1312, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also re



(0.7607142857142857,
 LogisticRegression(random_state=0, solver='liblinear'),
 {'penalty': 'l2', 'solver': 'liblinear'})

In [84]:
clf5_fft=LogisticRegression(penalty='none', solver='sag', max_iter=10000).fit(X_train, y_train)
clf5_fft.score(X_test, y_test)

0.8416666666666667

In [85]:
clf6_fft = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga','liblinear'], 'C': [0.1, 1, 100]},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.1, 1, 100]},])

gs=GridSearchCV(clf6_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

(0.8178571428571428,
 LogisticRegression(C=100, max_iter=10000, penalty='l1', random_state=0,
                    solver='liblinear'),
 {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'})

In [None]:
clf6_fft_b=LogisticRegression(random_state=0, max_iter=10000, C=150, penalty'l1', solver='liblinear')

In [86]:
clf7_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='multinomial').fit(X_train, y_train)
clf8_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='ovr').fit(X_train, y_train)
clf7_fft.score(X_test, y_test), clf8_fft.score(X_test, y_test)

(0.8416666666666667, 0.8333333333333334)

In [87]:
clf10_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='elasticnet', solver='saga')
params=([{'multi_class': ['ovr', 'multinomial'], 'l1_ratio':[0, 0.25, 0.5, 0.75, 1]}])
gs=GridSearchCV(clf10_fft,params).fit(X_train,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

(0.7821428571428573,
 LogisticRegression(l1_ratio=0, max_iter=10000, multi_class='ovr',
                    penalty='elasticnet', random_state=0, solver='saga'),
 {'l1_ratio': 0, 'multi_class': 'ovr'})

In [88]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)
clf8_fft=LogisticRegression(random_state=0, max_iter=10000, penalty='none', solver='sag', multi_class='ovr').fit(X_train_PCA, y_train)
clf8_fft.score(X_test_PCA, y_test)

0.7583333333333333

In [89]:
clf11_fft_PCA = LogisticRegression(random_state=0, max_iter=10000)
params=([{'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag','saga']},
        {'penalty': ['l2'], 'solver': ['newton-cg', 'sag','saga','liblinear'], 'C': [0.1, 1, 100]},
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.1, 1, 100]},])

gs=GridSearchCV(clf11_fft_PCA,params).fit(X_train_PCA,y_train)
gs.best_score_, gs.best_estimator_, gs.best_params_

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1312, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)



(0.7464285714285714,
 LogisticRegression(max_iter=10000, penalty='none', random_state=0, solver='sag'),
 {'penalty': 'none', 'solver': 'sag'})

In [None]:
lgbm_fft= lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(lgbm_fft, param_grid, cv=3)
gbm.fit(X_train, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

In [None]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

lgbm_fft_PCA= lgb.LGBMClassifier()
param_grid = {
    'num_leaves': [25,28,31,33,35],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'n_estimators': [10000] #max
}
gbm = GridSearchCV(lgbm_fft_PCA, param_grid)
gbm.fit(X_train_PCA, y_train)
gbm.best_estimator_, gbm.best_score_, gbm.best_index_, gbm.best_params_

In [None]:
red4=PCA(.95)
X_train_PCA=red4.fit_transform(X_train)
X_test_PCA=red4.transform(X_test)

gbm = lgb.LGBMClassifier(num_leaves=15,
                        learning_rate=0.001,
                        n_estimators=20000)#                        max_depth=10)             
gbm.fit(X_train_PCA, y_train,
eval_set=[(X_test_PCA, y_test)],
eval_metric=['multi_error','multiclass'],
early_stopping_rounds=100)


gbm.score(X_test_PCA, y_test)