# Running all Classification Models to get the best predictions
## Then, moving on to Neural Networks to get "Better Predictions"

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC

from dummies_bins_test_train_cv import *

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the analysis dataset (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../data/use_for_analysis.csv')

In [3]:
# Split the data with the project helper (star-imported from
# dummies_bins_test_train_cv). The helper prints the resulting shapes.
# NOTE(review): the meaning of the two fractions (0.98, 0.99) is defined by
# the helper and is not visible here -- confirm against its source.
(
    X_train,
    X_test,
    y_train,
    y_test,
    X,
    y,
    df_clean,
) = get_Xy_train_test(df, 0.98, 0.99)

y Shape: (2076,)
X Shape: (2076, 20)
X_train Shape: (2039, 20)
X_test Shape: (37, 20)
y_train Shape: (2039,)
y_test Shape: (37,)


### Linear Discriminant Analysis

In [4]:
# Linear Discriminant Analysis with the least-squares solver and automatic
# shrinkage, fitted on the training split.
LDA_clf = LinearDiscriminantAnalysis(
    solver='lsqr',
    shrinkage='auto',
)
LDA_clf.fit(X_train, y_train)

# Evaluate with the project helper.
# NOTE(review): the helper receives the held-out split (X_test, y_test) with
# cv=11, so the cross-validation presumably runs on the test rows only --
# confirm this is intended.
LDA_scores = cross_validation_process(LDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=93.94%
Standard_Deviation=0.08
True_Score(Mean/SD)=11.717

Prediction_Confusion_Matrix=[6|9]:[4|18]
Prediction_Accuracy=64.86%


### Quadratic Discriminant Analysis

Best results with:
- reg_param = 0.26055
- reg_param = 0.36968
- reg_param = 0.96127
- reg_param = 0.83353
- reg_param = 0.77121

In [5]:
# Quadratic Discriminant Analysis using the first of the reg_param values
# listed above, fitted on the training split.
QDA_clf = QuadraticDiscriminantAnalysis(
    reg_param=0.26055,
)
QDA_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
QDA_scores = cross_validation_process(QDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=86.36%
Standard_Deviation=0.152
True_Score(Mean/SD)=5.686

Prediction_Confusion_Matrix=[6|9]:[3|19]
Prediction_Accuracy=67.57%


### Gaussian Process Classifier

In [6]:
# Gaussian Process classifier. No kernel is passed, so the estimator's
# default kernel is used; the optimizer is restarted 10 times.
gpc_rbf_clf = GaussianProcessClassifier(
    n_jobs=-2,
    n_restarts_optimizer=10,
    random_state=9,
)
gpc_rbf_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
gpc_rbf_score = cross_validation_process(gpc_rbf_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=92.42%
Standard_Deviation=0.083
True_Score(Mean/SD)=11.137

Prediction_Confusion_Matrix=[9|6]:[4|18]
Prediction_Accuracy=72.97%


### Logistic Regression

In [7]:
# L2-penalised logistic regression with strong regularisation (C=1e-3) and
# balanced class weights, fitted on the training split.
lgst_reg_clf = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    random_state=9,
    max_iter=5000,
    C=1e-3,
    solver='lbfgs',
    n_jobs=8,
    multi_class='auto',
)
lgst_reg_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
lgst_reg_score = cross_validation_process(lgst_reg_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=90.91%
Standard_Deviation=0.083
True_Score(Mean/SD)=10.954

Prediction_Confusion_Matrix=[6|9]:[5|17]
Prediction_Accuracy=62.16%


### Logistic Regression CV

Best results with:

- Cs = 10, cv = 6
- Cs = 16, cv = 4
- Cs = 19, cv = 6
- Cs = 19, cv = 12
- Cs = 25, cv = 6
- Cs = 25, cv = 12

In [9]:
# Logistic regression with built-in cross-validated choice of C, using the
# first (Cs, cv) combination listed above.
lgst_reg_cv_clf = LogisticRegressionCV(
    Cs=10,
    penalty='l2',
    cv=6,
    class_weight='balanced',
    random_state=9,
    solver='newton-cg',
    n_jobs=-2,
)
lgst_reg_cv_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
lgst_reg_cv_score = cross_validation_process(lgst_reg_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=88.64%
Standard_Deviation=0.089
True_Score(Mean/SD)=9.96

Prediction_Confusion_Matrix=[6|9]:[5|17]
Prediction_Accuracy=62.16%


### Ada Boost Classifier

In [22]:
# AdaBoost with many estimators and a small learning rate, fitted on the
# training split.
ada_clf = AdaBoostClassifier(
    n_estimators=274,
    learning_rate=0.013,
    random_state=9,
)
ada_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
ada_scores = cross_validation_process(ada_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.58%
Standard_Deviation=0.14
True_Score(Mean/SD)=5.911

Prediction_Confusion_Matrix=[8|7]:[4|18]
Prediction_Accuracy=70.27%


### SGD Classifier

In [21]:
# Linear classifier trained with SGD: hinge loss, elastic-net penalty and
# balanced class weights. Shuffling is disabled, so samples are visited in
# their stored order on every epoch.
# max_iter is given as an integer: scikit-learn documents it as an int, and
# releases with parameter validation reject the float literal 1e4 at fit().
SGD_clf = SGDClassifier(loss='hinge', penalty='elasticnet', max_iter=10000,
                        shuffle=False, n_jobs=8, random_state=9,
                        class_weight='balanced').fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
SGD_score = cross_validation_process(SGD_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=92.42%
Standard_Deviation=0.083
True_Score(Mean/SD)=11.137

Prediction_Confusion_Matrix=[7|8]:[2|20]
Prediction_Accuracy=72.97%


### Random Forest Classifier

In [14]:
# Random forest of 1000 entropy-criterion trees with balanced class weights
# and at least two samples per leaf, fitted on the training split.
rand_frst_clf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    n_jobs=-2,
    min_samples_leaf=2,
    random_state=9,
    class_weight='balanced',
)
rand_frst_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
rand_frst_score = cross_validation_process(rand_frst_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=93.94%
Standard_Deviation=0.08
True_Score(Mean/SD)=11.717

Prediction_Confusion_Matrix=[11|4]:[6|16]
Prediction_Accuracy=72.97%


### Ridge Classifier

In [46]:
# Ridge classifier with balanced class weights, fitted on the training split.
ridge_clf = RidgeClassifier(
    class_weight='balanced',
    random_state=9,
)
ridge_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
ridge_score = cross_validation_process(ridge_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=93.94%
Standard_Deviation=0.08
True_Score(Mean/SD)=11.717

Prediction_Confusion_Matrix=[6|9]:[4|18]
Prediction_Accuracy=64.86%


### Ridge Classifier CV

In [47]:
# Ridge classifier with built-in 20-fold cross-validation, scored by average
# precision, with balanced class weights.
ridge_cv_clf = RidgeClassifierCV(
    scoring='average_precision',
    cv=20,
    class_weight='balanced',
)
ridge_cv_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
ridge_cv_score = cross_validation_process(ridge_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=93.94%
Standard_Deviation=0.08
True_Score(Mean/SD)=11.717

Prediction_Confusion_Matrix=[6|9]:[4|18]
Prediction_Accuracy=64.86%


### K Neighbors Classifier

In [72]:
# k-nearest-neighbours classifier voting over the 19 closest training points.
KNN_clf = KNeighborsClassifier(
    n_neighbors=19,
    leaf_size=88,
    n_jobs=8,
)
KNN_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
KNN_score = cross_validation_process(KNN_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=85.61%
Standard_Deviation=0.155
True_Score(Mean/SD)=5.514

Prediction_Confusion_Matrix=[8|7]:[4|18]
Prediction_Accuracy=70.27%


### Multi-layer Perceptron classifier

In [79]:
# Multi-layer perceptron: one hidden layer of 64 logistic units, optimised
# with L-BFGS, fitted on the training split.
# NOTE(review): per the scikit-learn documentation, batch_size,
# learning_rate, shuffle, validation_fraction and n_iter_no_change are not
# used with solver='lbfgs' -- confirm whether they can be dropped.
MLP_clf = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation='logistic',
    solver='lbfgs',
    alpha=0.0001,
    batch_size=8,
    learning_rate='constant',
    max_iter=5000,
    shuffle=False,
    random_state=9,
    validation_fraction=0.1,
    n_iter_no_change=30,
)
MLP_clf.fit(X_train, y_train)

# Evaluate with the project helper on the held-out split.
MLP_score = cross_validation_process(MLP_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=84.85%
Standard_Deviation=0.132
True_Score(Mean/SD)=6.424

Prediction_Confusion_Matrix=[7|8]:[5|17]
Prediction_Accuracy=64.86%


### Best for small dataset:

15 for test (each confusion matrix below totals 18 samples, 15 of them on the diagonal, i.e. 15/18 ≈ 83%):
1. KNN = 83% [[4 1][2 11]]
2. Random Forest = 83% [[5 0][3 10]]
3. Ada Boost Classifier = 83% [[4 1][2 11]]
4. Gaussian Process Classifier = 83% [[4 1][2 11]]

In [None]:
# classification_scoring = ['accuracy', 'balanced_accuracy', 'average_precision',
#                           'brier_score_loss', 'f1', 'f1_micro', 'f1_macro',
#                           'f1_weighted', 'neg_log_loss', 'precision',
#                           'recall', 'roc_auc']