# Running all Classification Models to get the best predictions
## Then, moving on to Neural Networks to get "Better Predictions"

In [25]:
import pandas as pd
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC

from dummies_bins_test_train_cv import *

from warnings import filterwarnings
filterwarnings('ignore')

In [26]:
# Read the analysis CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/use_for_analysis.csv')

In [48]:
# Split df into train/test features and labels with the project helper.
# NOTE(review): the meaning of the two fractions (0.98, 0.99) is defined in
# dummies_bins_test_train_cv — confirm there. The output recorded for this cell
# reports 2013 training rows and 35 test rows, so anything scored on
# X_test/y_test from this split rests on a very small hold-out set.
X_train, X_test, y_train, y_test = get_Xy_train_test(df, 0.98, 0.99)

y Shape: (2048,)
X Shape: (2048, 31)
X_train Shape: (2013, 31)
X_test Shape: (35, 31)
y_train Shape: (2013,)
y_test Shape: (35,)


### Linear Discriminant Analysis

In [49]:
# LDA_clf = LinearDiscriminantAnalysis(solver = 'lsqr', shrinkage='auto').fit(X_train, y_train)
# LDA_scores = cross_validation_process(LDA_clf, X_test, y_test, cv=11)

### Quadratic Discriminant Analysis

Best results with:
- reg_param = 0.26055
- reg_param = 0.36968
- reg_param = 0.96127
- reg_param = 0.83353
- reg_param = 0.77121

In [50]:
# reg_param = [0.96127, 0.83353, 0.77121, 0.26055, 0.36968]
# for num in reg_param:
#     print(num)
#     QDA_clf = QuadraticDiscriminantAnalysis(reg_param=num).fit(X_train, y_train)
#     QDA_scores = cross_validation_process(QDA_clf, X_test, y_test, cv=11)

### Gaussian Process Classifier

In [51]:
# Gaussian process classifier; no kernel is passed, so the library default is used.
gpc_rbf_clf = GaussianProcessClassifier(
    max_iter_predict=100,
    n_restarts_optimizer=10,
    n_jobs=-2,
    random_state=9,
)
gpc_rbf_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
gpc_rbf_score = cross_validation_process(gpc_rbf_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=77.27%
Standard_Deviation=0.216
Scores(average_precision)=[0.5   0.75  0.5   0.583 0.583 1.    1.    0.583 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[ 9  5]
 [ 6 15]]
Prediction_Accuracy=68.57%


### Logistic Regression

In [52]:
# L2-penalised logistic regression with balanced class weights.
lgst_reg_clf = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    multi_class='auto',
    class_weight='balanced',
    n_jobs=-2,
    random_state=9,
)
lgst_reg_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
lgst_reg_score = cross_validation_process(lgst_reg_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.58%
Standard_Deviation=0.206
Scores(average_precision)=[0.583 1.    0.5   0.833 1.    0.583 1.    0.583 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[10  4]
 [ 6 15]]
Prediction_Accuracy=71.43%


### Logistic Regression CV

Best results with:

- Cs = 10, cv = 6
- Cs = 16, cv = 4
- Cs = 19, cv = 6
- Cs = 19, cv = 12
- Cs = 25, cv = 6
- Cs = 25, cv = 12

In [43]:
# Maps the LogisticRegressionCV `Cs` setting to the internal `cv` fold count(s)
# to try. A scalar means one fold count; a list means several for the same Cs.
Cs_cv = {10: 6, 16: 4, 19: [6, 12], 25: [6, 12]}
for n_cs, folds in Cs_cv.items():
    # Normalise scalars to one-element lists so both cases share a single code path
    # (previously the estimator construction was duplicated in an if/else).
    if not isinstance(folds, list):
        folds = [folds]
    for n_folds in folds:
        lgst_reg_cv_clf = LogisticRegressionCV(Cs=n_cs, penalty='l2', cv=n_folds,
                                               class_weight='balanced', random_state=9,
                                               solver='newton-cg', n_jobs=-2).fit(X_train, y_train)
        # Evaluate each fitted configuration on the held-out split via the project helper.
        lgst_reg_cv_score = cross_validation_process(
            lgst_reg_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.0%
Standard_Deviation=0.091
Scores(average_precision)=[0.737 0.838 0.788 0.787 0.823 0.812 0.948 0.938 0.757 0.643 0.948]
No Feature Importances
Prediction_Confusion_Matrix=
[[57 26]
 [21 61]]
Prediction_Accuracy=71.52%
Average_Accuracy(average_precision)=81.98%
Standard_Deviation=0.085
Scores(average_precision)=[0.737 0.838 0.886 0.787 0.735 0.812 0.898 0.938 0.757 0.681 0.948]
No Feature Importances
Prediction_Confusion_Matrix=
[[55 28]
 [22 60]]
Prediction_Accuracy=69.7%
Average_Accuracy(average_precision)=81.47%
Standard_Deviation=0.091
Scores(average_precision)=[0.665 0.838 0.788 0.787 0.823 0.812 0.924 0.938 0.757 0.681 0.948]
No Feature Importances
Prediction_Confusion_Matrix=
[[57 26]
 [21 61]]
Prediction_Accuracy=71.52%
Average_Accuracy(average_precision)=79.59%
Standard_Deviation=0.107
Scores(average_precision)=[0.737 0.838 0.657 0.787 0.651 0.812 0.948 0.938 0.757 0.681 0.948]
No Feature Importances
Prediction_Confusion_Matrix=
[[56 27]

### AdaBoost Classifier

Best results with:

- n_estimators=235, learning_rate=1.442
- n_estimators=52, learning_rate=0.155
- n_estimators=274, learning_rate=0.013
- n_estimators=162, learning_rate=0.767

In [53]:
# Maps n_estimators -> learning_rate for each AdaBoost configuration to evaluate.
n_estimators_learning_rate = {235: 1.442, 52: 0.155, 274: 0.013, 162: 0.767}
for n_trees, rate in n_estimators_learning_rate.items():
    ada_clf = AdaBoostClassifier(n_estimators=n_trees, learning_rate=rate, random_state=9)
    ada_clf.fit(X_train, y_train)
    # Hand the fitted model and the held-out split to the project's evaluation helper.
    ada_scores = cross_validation_process(ada_clf, X_test, y_test, cv=5)

Average_Accuracy(average_precision)=60.71%
Standard_Deviation=0.103
Scores(average_precision)=[0.66  0.733 0.43  0.646 0.567]
Feature importance = [0.01276596 0.87659574 0.00425532 0.00851064 0.         0.
 0.00425532 0.00425532 0.00425532 0.00425532 0.00851064 0.00425532
 0.00425532 0.00425532 0.00425532 0.00425532 0.00851064 0.00425532
 0.         0.00425532 0.         0.00425532 0.00851064 0.
 0.00425532 0.         0.         0.         0.01276596 0.
 0.00425532]
Prediction_Confusion_Matrix=
[[11  3]
 [ 5 16]]
Prediction_Accuracy=77.14%
Average_Accuracy(average_precision)=66.67%
Standard_Deviation=0.128
Scores(average_precision)=[0.72  0.817 0.43  0.667 0.7  ]
Feature importance = [0.11538462 0.63461538 0.         0.01923077 0.         0.
 0.01923077 0.         0.         0.         0.         0.07692308
 0.         0.         0.         0.         0.09615385 0.
 0.         0.         0.03846154 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0. 

### SGD Classifier

Best results with:

- max_iter = 28782, n_iter_no_change = 50
- max_iter = 32421, n_iter_no_change = 54
- max_iter = 43631, n_iter_no_change = 74
- max_iter = 37278, n_iter_no_change = 70
- max_iter = 38125, n_iter_no_change = 122

In [45]:
# Maps max_iter -> n_iter_no_change for each SGD configuration to evaluate.
max_iter_no_change = {28782: 50, 32421: 54, 43631: 74, 37278: 70, 38125: 122}
for iter_cap, patience in max_iter_no_change.items():
    SGD_clf = SGDClassifier(
        loss='squared_hinge',
        max_iter=iter_cap,
        n_iter_no_change=patience,
        shuffle=False,
        class_weight='balanced',
        n_jobs=-2,
        random_state=9,
    )
    SGD_clf.fit(X_train, y_train)

    # Hand the fitted model and the held-out split to the project's evaluation helper.
    SGD_score = cross_validation_process(SGD_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=81.18%
Standard_Deviation=0.11
Scores(average_precision)=[0.655 0.89  0.763 0.817 0.732 0.86  0.938 0.898 0.804 0.605 0.968]
No Feature Importances
Prediction_Confusion_Matrix=
[[64 19]
 [28 54]]
Prediction_Accuracy=71.52%
Average_Accuracy(average_precision)=81.16%
Standard_Deviation=0.11
Scores(average_precision)=[0.651 0.885 0.763 0.812 0.732 0.86  0.938 0.898 0.816 0.605 0.968]
No Feature Importances
Prediction_Confusion_Matrix=
[[64 19]
 [28 54]]
Prediction_Accuracy=71.52%
Average_Accuracy(average_precision)=80.8%
Standard_Deviation=0.103
Scores(average_precision)=[0.651 0.885 0.763 0.812 0.732 0.848 0.895 0.898 0.816 0.619 0.968]
No Feature Importances
Prediction_Confusion_Matrix=
[[65 18]
 [29 53]]
Prediction_Accuracy=71.52%
Average_Accuracy(average_precision)=80.8%
Standard_Deviation=0.103
Scores(average_precision)=[0.651 0.885 0.763 0.812 0.732 0.848 0.895 0.898 0.816 0.619 0.968]
No Feature Importances
Prediction_Confusion_Matrix=
[[64 19]
 

### Extra Trees Classifier

In [109]:
# Extra-trees ensemble: 1000 entropy-criterion trees with balanced class weights.
XTsC_clf = ExtraTreesClassifier(
    n_estimators=1000,
    criterion='entropy',
    class_weight='balanced',
    n_jobs=-2,
    random_state=9,
)
XTsC_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
XTsC_score = cross_validation_process(XTsC_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=70.01%
Standard_Deviation=0.141
Scores(average_precision)=[0.821 0.594 0.679 0.826 0.836 0.563 0.711 0.521 0.789 0.458 0.903]
Feature importance = [0.03060234 0.65005562 0.01421729 0.01500371 0.00488075 0.00121955
 0.0022092  0.00219584 0.00531976 0.00849126 0.00764247 0.0102056
 0.01037916 0.0116754  0.00861654 0.01122099 0.00809361 0.01307338
 0.00941251 0.00947067 0.00679895 0.00917978 0.00820033 0.0080446
 0.00788828 0.02442886 0.02262717 0.02160814 0.01647008 0.01916307
 0.02160509]
Prediction_Confusion_Matrix=
[[38 14]
 [20 40]]
Prediction_Accuracy=69.64%


### Random Forest Classifier

In [54]:
# Random forest: 4000 entropy-criterion trees (no class weighting is passed here).
rand_frst_clf = RandomForestClassifier(
    n_estimators=4000,
    criterion='entropy',
    n_jobs=-2,
    random_state=9,
)
rand_frst_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
rand_frst_score = cross_validation_process(rand_frst_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=78.79%
Standard_Deviation=0.231
Scores(average_precision)=[0.417 1.    0.417 0.833 0.583 0.833 1.    0.583 1.    1.    1.   ]
Feature importance = [0.03799435 0.557336   0.01556679 0.0193183  0.00602155 0.0009961
 0.00162014 0.00266387 0.00562044 0.01252442 0.01160935 0.01436175
 0.01286127 0.01376815 0.01258245 0.01441936 0.01231022 0.01690866
 0.01385695 0.01507053 0.0124893  0.01254792 0.01055795 0.01071132
 0.01063345 0.02806966 0.02677807 0.02251057 0.02199995 0.0217764
 0.02451477]
Prediction_Confusion_Matrix=
[[11  3]
 [ 7 14]]
Prediction_Accuracy=71.43%


### Ridge Classifier

Best results with:

- alpha = 10.13
- alpha = 7.37
- alpha = 13.69

In [55]:
# Regularisation strengths to evaluate for the ridge classifier.
alphas = [10.13, 7.37, 13.69]
for alpha_value in alphas:
    ridge_clf = RidgeClassifier(
        alpha=alpha_value,
        solver='auto',
        class_weight='balanced',
        random_state=9,
    )
    ridge_clf.fit(X_train, y_train)
    # Hand the fitted model and the held-out split to the project's evaluation helper.
    ridge_score = cross_validation_process(ridge_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.58%
Standard_Deviation=0.206
Scores(average_precision)=[0.583 1.    0.5   0.833 1.    0.583 1.    0.583 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[ 9  5]
 [ 5 16]]
Prediction_Accuracy=71.43%
Average_Accuracy(average_precision)=82.58%
Standard_Deviation=0.206
Scores(average_precision)=[0.583 1.    0.5   0.833 1.    0.583 1.    0.583 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[ 9  5]
 [ 5 16]]
Prediction_Accuracy=71.43%
Average_Accuracy(average_precision)=86.36%
Standard_Deviation=0.196
Scores(average_precision)=[1.    1.    0.5   0.833 1.    0.583 1.    0.583 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[ 9  5]
 [ 5 16]]
Prediction_Accuracy=71.43%


### Ridge Classifier CV

In [56]:
# Ridge classifier with built-in CV (20 folds, scored by average precision).
ridge_cv_clf = RidgeClassifierCV(
    cv=20,
    scoring='average_precision',
    class_weight='balanced',
)
ridge_cv_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
ridge_cv_score = cross_validation_process(ridge_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=78.03%
Standard_Deviation=0.202
Scores(average_precision)=[0.417 1.    0.75  0.833 0.583 0.583 1.    0.583 0.833 1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[ 9  5]
 [ 5 16]]
Prediction_Accuracy=71.43%


### K Neighbors Classifier

Best results with:

- n_neighbors = 19, leaf_size = 28
- n_neighbors = 17, leaf_size = 135
- n_neighbors = 19, leaf_size = 88
- n_neighbors = 14, leaf_size = 88
- n_neighbors = 18, leaf_size = 88

In [57]:
# Maps n_neighbors -> leaf_size value(s) to evaluate. A scalar means one leaf
# size; a list means several leaf sizes for the same n_neighbors.
# n_neighbors=17 is paired with leaf_size=135 to match the "Best results" list
# in this section (the dict previously had 88 for it).
n_neighbors_leaf_size = {19: [28, 88], 17: 135, 14: 88, 18: 88}
for n_neighbors, leaf_sizes in n_neighbors_leaf_size.items():
    # Normalise scalars to one-element lists so both cases share a single code path
    # (previously the estimator construction was duplicated in an if/else).
    if not isinstance(leaf_sizes, list):
        leaf_sizes = [leaf_sizes]
    for leaf_size in leaf_sizes:
        KNN_clf = KNeighborsClassifier(n_neighbors=n_neighbors, leaf_size=leaf_size,
                                       n_jobs=-2).fit(X_train, y_train)
        # Evaluate each fitted configuration on the held-out split via the project helper.
        KNN_score = cross_validation_process(KNN_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=81.82%
Standard_Deviation=0.191
Scores(average_precision)=[1.    1.    0.5   0.833 1.    0.583 0.833 0.583 1.    0.667 1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[11  3]
 [ 5 16]]
Prediction_Accuracy=77.14%
Average_Accuracy(average_precision)=81.82%
Standard_Deviation=0.191
Scores(average_precision)=[1.    1.    0.5   0.833 1.    0.583 0.833 0.583 1.    0.667 1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[11  3]
 [ 5 16]]
Prediction_Accuracy=77.14%
Average_Accuracy(average_precision)=89.39%
Standard_Deviation=0.163
Scores(average_precision)=[1.    1.    0.5   0.833 1.    0.667 1.    0.833 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[11  3]
 [ 5 16]]
Prediction_Accuracy=77.14%
Average_Accuracy(average_precision)=81.06%
Standard_Deviation=0.178
Scores(average_precision)=[0.75  1.    0.5   0.833 0.583 0.583 0.833 0.833 1.    1.    1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[10  

### Multi-layer Perceptron Classifier

In [58]:
# Single hidden layer (200 logistic units) trained with the L-BFGS solver.
# NOTE(review): per the scikit-learn MLPClassifier documentation, batch_size,
# learning_rate, shuffle and n_iter_no_change are only used by the 'sgd'/'adam'
# solvers, and validation_fraction only when early_stopping=True — so with
# solver='lbfgs' they are presumably inert here. Confirm before tuning them.
MLP_clf = MLPClassifier(
    hidden_layer_sizes=(200,),
    activation='logistic',
    solver='lbfgs',
    alpha=0.0001,
    max_iter=500,
    batch_size=8,
    learning_rate='constant',
    shuffle=False,
    validation_fraction=0.05,
    n_iter_no_change=30,
    random_state=9,
)
MLP_clf.fit(X_train, y_train)

# Hand the fitted model and the held-out split to the project's evaluation helper.
MLP_score = cross_validation_process(MLP_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=77.27%
Standard_Deviation=0.192
Scores(average_precision)=[0.417 1.    0.833 0.833 0.583 0.833 1.    0.583 0.583 0.833 1.   ]
No Feature Importances
Prediction_Confusion_Matrix=
[[10  4]
 [ 6 15]]
Prediction_Accuracy=71.43%


In [18]:
# classification_scoring = ['accuracy', 'balanced_accuracy', 'average_precision',
#                           'brier_score_loss', 'f1', 'f1_micro', 'f1_macro',
#                           'f1_weighted', 'neg_log_loss', 'precision',
#                           'recall', 'roc_auc']

In [19]:
# classifiers = [
#     KNeighborsClassifier(3),
#     SVC(kernel="linear", C=0.025),
#     SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
#     DecisionTreeClassifier(max_depth=5),
#     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     MLPClassifier(alpha=1),
#     AdaBoostClassifier(),
#     GaussianNB(),
#     QuadraticDiscriminantAnalysis()]

In [None]:
# clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',
#                           random_state=1)
# clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
# clf3 = GaussianNB()

# eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
#     scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
# clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,
#     random_state=0)
# scores = cross_val_score(clf, X, y, cv=5)
# scores.mean()


# clf = RandomForestClassifier(n_estimators=10, max_depth=None,
#     min_samples_split=2, random_state=0)
# scores = cross_val_score(clf, X, y, cv=5)
# scores.mean()


# clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,
#     min_samples_split=2, random_state=0)
# scores = cross_val_score(clf, X, y, cv=5)
# scores.mean() > 0.999

In [None]:
# est = GradientBoostingRegressor
# mean_squared_error(y_test, est.predict(X_test))  
# clf.score(X_test, y_test)