# Running all Classification Models to get the best predictions
## Then, moving on to Neural Networks to get "Better Predictions"

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC

from dummies_bins_test_train_cv import *

from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Load the modelling dataset from the project's data directory
# (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/use_for_predictions.csv')

In [4]:
# Build the feature/target arrays and the train/test split in one call.
# get_Xy_train_test is not defined in this notebook — presumably it comes
# from the star-import of dummies_bins_test_train_cv.
# NOTE(review): the meaning of the two positional fractions (0.98, 0.99)
# is not visible here; confirm against the helper's signature.
(
    X_train, X_test,
    y_train, y_test,
    X, y,
    df_clean,
) = get_Xy_train_test(df, 0.98, 0.99)

y Shape: (2082,)
X Shape: (2082, 20)
X_train Shape: (2053, 20)
X_test Shape: (29, 20)
y_train Shape: (2053,)
y_test Shape: (29,)


### Linear Discriminant Analysis

In [5]:
# Linear Discriminant Analysis: least-squares solver with automatic shrinkage.
LDA_clf = LinearDiscriminantAnalysis(
    solver='lsqr',
    shrinkage='auto',
)
LDA_clf.fit(X_train, y_train)

# Score the fitted model with the project's cross_validation_process helper,
# passing the held-out split and cv=11.
# NOTE(review): the split shapes printed earlier show only 29 test rows; if
# the helper cross-validates on the arrays it is given, 11 folds leave 2-3
# rows per fold — confirm that this is intended.
LDA_scores = cross_validation_process(LDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=90.15%
Standard_Deviation=0.154
True_Score(Mean/SD)=5.849

Prediction_Confusion_Matrix=[7|6]:[2|14]
Prediction_Accuracy=72.41%


### Quadratic Discriminant Analysis

Best results with:
- reg_param = 0.26055
- reg_param = 0.36968
- reg_param = 0.96127
- reg_param = 0.83353
- reg_param = 0.77121

In [6]:
# Quadratic Discriminant Analysis using the first of the reg_param values
# listed as best-performing in the notes above.
QDA_clf = QuadraticDiscriminantAnalysis(reg_param=0.26055)
QDA_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
QDA_scores = cross_validation_process(QDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.58%
Standard_Deviation=0.215
True_Score(Mean/SD)=3.849

Prediction_Confusion_Matrix=[7|6]:[1|15]
Prediction_Accuracy=75.86%


### Gaussian Process Classifier

In [7]:
# Gaussian Process classifier with 10 optimizer restarts and a fixed seed.
# No `kernel` argument is passed, so the estimator's default kernel is used
# (the imported RBF class is not referenced here).
gpc_rbf_clf = GaussianProcessClassifier(
    n_jobs=-2,
    n_restarts_optimizer=10,
    random_state=9,
)
gpc_rbf_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
gpc_rbf_score = cross_validation_process(gpc_rbf_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=77.27%
Standard_Deviation=0.249
True_Score(Mean/SD)=3.104

Prediction_Confusion_Matrix=[9|4]:[3|13]
Prediction_Accuracy=75.86%


### Logistic Regression

In [8]:
# L2-penalised logistic regression with strong regularisation (C=1e-3),
# balanced class weights and a generous iteration cap for the lbfgs solver.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`;
# confirm against the version this notebook is pinned to.
lgst_reg_clf = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    random_state=9,
    max_iter=5000,
    C=1e-3,
    solver='lbfgs',
    n_jobs=8,
    multi_class='auto',
)
lgst_reg_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
lgst_reg_score = cross_validation_process(lgst_reg_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=86.36%
Standard_Deviation=0.186
True_Score(Mean/SD)=4.654

Prediction_Confusion_Matrix=[7|6]:[3|13]
Prediction_Accuracy=68.97%


### Logistic Regression CV

Best results with:

- Cs = 10, cv = 6
- Cs = 16, cv = 4
- Cs = 19, cv = 6
- Cs = 19, cv = 12
- Cs = 25, cv = 6
- Cs = 25, cv = 12

In [9]:
# Logistic regression with built-in cross-validated choice of C:
# Cs=10 and cv=6 are the first combination listed as best-performing above.
lgst_reg_cv_clf = LogisticRegressionCV(
    Cs=10,
    penalty='l2',
    cv=6,
    class_weight='balanced',
    random_state=9,
    solver='newton-cg',
    n_jobs=-2,
)
lgst_reg_cv_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
lgst_reg_cv_score = cross_validation_process(lgst_reg_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=84.09%
Standard_Deviation=0.202
True_Score(Mean/SD)=4.154

Prediction_Confusion_Matrix=[7|6]:[3|13]
Prediction_Accuracy=68.97%


### Ada Boost Classifier

In [10]:
# AdaBoost ensemble: 274 boosting rounds with a small learning rate (0.013)
# and a fixed seed for reproducibility.
ada_clf = AdaBoostClassifier(
    n_estimators=274,
    learning_rate=0.013,
    random_state=9,
)
ada_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
ada_scores = cross_validation_process(ada_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=85.61%
Standard_Deviation=0.207
True_Score(Mean/SD)=4.126

Prediction_Confusion_Matrix=[8|5]:[4|12]
Prediction_Accuracy=68.97%


### SGD Classifier

In [20]:
# Linear classifier trained with SGD (hinge loss, L2 penalty), balanced class
# weights, no per-epoch shuffling and a fixed seed.
# max_iter is written as the integer 1000 instead of the float literal 1e3:
# scikit-learn documents max_iter as an int, and releases that validate
# constructor parameters reject 1000.0 at fit time.
SGD_clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000,
                        shuffle=False, n_jobs=8, random_state=9,
                        class_weight='balanced').fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
SGD_score = cross_validation_process(SGD_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=87.12%
Standard_Deviation=0.193
True_Score(Mean/SD)=4.525

Prediction_Confusion_Matrix=[7|6]:[5|11]
Prediction_Accuracy=62.07%


### Random Forest Classifier

In [21]:
# Random forest: 1000 entropy-criterion trees, at least 2 samples per leaf,
# balanced class weights and a fixed seed.
rand_frst_clf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    n_jobs=-2,
    min_samples_leaf=2,
    random_state=9,
    class_weight='balanced',
)
rand_frst_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
rand_frst_score = cross_validation_process(rand_frst_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=87.12%
Standard_Deviation=0.193
True_Score(Mean/SD)=4.525

Prediction_Confusion_Matrix=[9|4]:[3|13]
Prediction_Accuracy=75.86%


### Ridge Classifier

In [22]:
# Ridge classifier with balanced class weights; every other setting is left
# at the estimator's default.
ridge_clf = RidgeClassifier(class_weight='balanced', random_state=9)
ridge_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
ridge_score = cross_validation_process(ridge_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=85.61%
Standard_Deviation=0.207
True_Score(Mean/SD)=4.126

Prediction_Confusion_Matrix=[7|6]:[2|14]
Prediction_Accuracy=72.41%


### Ridge Classifier CV

In [23]:
# Ridge classifier that selects its regularisation strength internally:
# 20-fold CV scored by average precision. No `alphas` are passed, so the
# estimator's default grid is used.
ridge_cv_clf = RidgeClassifierCV(
    scoring='average_precision',
    cv=20,
    class_weight='balanced',
)
ridge_cv_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
ridge_cv_score = cross_validation_process(ridge_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=80.3%
Standard_Deviation=0.199
True_Score(Mean/SD)=4.041

Prediction_Confusion_Matrix=[7|6]:[2|14]
Prediction_Accuracy=72.41%


### K Neighbors Classifier

In [24]:
# k-nearest-neighbours classifier voting over the 19 closest training rows.
KNN_clf = KNeighborsClassifier(
    n_neighbors=19,
    leaf_size=88,
    n_jobs=8,
)
KNN_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
KNN_score = cross_validation_process(KNN_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=77.27%
Standard_Deviation=0.216
True_Score(Mean/SD)=3.571

Prediction_Confusion_Matrix=[9|4]:[3|13]
Prediction_Accuracy=75.86%


### Multi-layer Perceptron classifier

In [49]:
# Multi-layer perceptron: one hidden layer of 64 logistic units, trained with
# the lbfgs solver, a fixed seed and verbose progress output.
# NOTE(review): scikit-learn documents batch_size as applying to the
# stochastic solvers and validation_fraction as applying only with early
# stopping, so both look inert with solver='lbfgs' — confirm before tuning.
MLP_clf = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation='logistic',
    solver='lbfgs',
    alpha=0.0001,
    batch_size=8,
    max_iter=5000,
    random_state=9,
    validation_fraction=0.1,
    verbose=True,
)
MLP_clf.fit(X_train, y_train)

# Evaluate on the held-out split through the project helper (cv=11).
MLP_score = cross_validation_process(MLP_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=87.12%
Standard_Deviation=0.211
True_Score(Mean/SD)=4.123

Prediction_Confusion_Matrix=[9|4]:[2|14]
Prediction_Accuracy=79.31%


In [46]:
# classification_scoring = ['accuracy', 'balanced_accuracy', 'average_precision',
#                           'brier_score_loss', 'f1', 'f1_micro', 'f1_macro',
#                           'f1_weighted', 'neg_log_loss', 'precision',
#                           'recall', 'roc_auc']