# Running all Classification Models to get the best predictions
## Then, moving on to Neural Networks to get "Better Predictions"

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC

from dummies_bins_test_train_cv import get_Xy_train_test, cross_validation_process
from clean_chess_game_log import main_cleanup

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Run the project's cleanup routine on the raw PGN game log. It returns three
# values, none of which are used in this notebook, so they are unpacked into
# throwaway names.
# NOTE(review): presumably this step (re)generates the CSV read in the next
# cell — confirm in clean_chess_game_log.main_cleanup.
_, _, _ = main_cleanup('../data/dest.pgn')

In [35]:
df = pd.read_csv('../data/use_for_predictions.csv')

In [36]:
df = df[500:]

In [37]:
# Build the modelling matrices from the loaded frame with the project helper.
# NOTE(review): the meaning of the 0.95 / 0.97 arguments is defined in
# dummies_bins_test_train_cv.get_Xy_train_test — confirm before tuning them.
(
    X_train, X_test,
    y_train, y_test,
    X, y,
    df_clean,
) = get_Xy_train_test(df, 0.95, 0.97)

y Shape: (1624,)
X Shape: (1624, 20)
X_train Shape: (1555, 20)
X_test Shape: (69, 20)
y_train Shape: (1555,)
y_test Shape: (69,)


### Linear Discriminant Analysis

In [38]:
# Linear discriminant analysis using the least-squares solver with automatic
# shrinkage, fitted on the training split.
LDA_clf = LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr')
LDA_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
LDA_scores = cross_validation_process(LDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=84.31%
Standard_Deviation=0.146
True_Score(Mean/SD)=5.767

Prediction_Confusion_Matrix=[17|12]:[10|30]
Prediction_Accuracy=68.12%


### Quadratic Discriminant Analysis

In [39]:
# Quadratic discriminant analysis with a fixed regularisation parameter,
# fitted on the training split.
QDA_clf = QuadraticDiscriminantAnalysis(reg_param=0.26055)
QDA_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
QDA_scores = cross_validation_process(QDA_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=81.64%
Standard_Deviation=0.133
True_Score(Mean/SD)=6.149

Prediction_Confusion_Matrix=[14|15]:[6|34]
Prediction_Accuracy=69.57%


### Gaussian Process Classifier

In [40]:
# Gaussian process classifier; no kernel is passed, so the library default is
# used. Seeded for repeatability and fitted on the training split.
gpc_rbf_clf = GaussianProcessClassifier(
    n_restarts_optimizer=10,
    random_state=9,
    n_jobs=-2,
)
gpc_rbf_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
gpc_rbf_score = cross_validation_process(gpc_rbf_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=87.95%
Standard_Deviation=0.112
True_Score(Mean/SD)=7.885

Prediction_Confusion_Matrix=[21|8]:[9|31]
Prediction_Accuracy=75.36%


### Logistic Regression

In [41]:
# L2-penalised logistic regression with balanced class weights, fitted on the
# training split. One keyword per line so each setting is easy to diff.
lgst_reg_clf = LogisticRegression(
    C=1e-3,
    penalty='l2',
    solver='lbfgs',
    multi_class='auto',
    class_weight='balanced',
    max_iter=5000,
    n_jobs=8,
    random_state=9,
)
lgst_reg_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
lgst_reg_score = cross_validation_process(lgst_reg_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=86.94%
Standard_Deviation=0.082
True_Score(Mean/SD)=10.539

Prediction_Confusion_Matrix=[16|13]:[11|29]
Prediction_Accuracy=65.22%


### Logistic Regression CV

In [42]:
# Logistic regression with built-in cross-validated selection over 10 values
# of C (6 internal folds), fitted on the training split.
lgst_reg_cv_clf = LogisticRegressionCV(
    Cs=10,
    cv=6,
    penalty='l2',
    solver='newton-cg',
    class_weight='balanced',
    n_jobs=-2,
    random_state=9,
)
lgst_reg_cv_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
lgst_reg_cv_score = cross_validation_process(lgst_reg_cv_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=86.6%
Standard_Deviation=0.091
True_Score(Mean/SD)=9.552

Prediction_Confusion_Matrix=[15|14]:[11|29]
Prediction_Accuracy=63.77%


### Ada Boost Classifier

In [43]:
# AdaBoost ensemble (274 estimators, small learning rate), seeded and fitted
# on the training split.
ada_clf = AdaBoostClassifier(
    n_estimators=274,
    learning_rate=0.013,
    random_state=9,
)
ada_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
ada_scores = cross_validation_process(ada_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.1%
Standard_Deviation=0.151
True_Score(Mean/SD)=5.443

Prediction_Confusion_Matrix=[17|12]:[12|28]
Prediction_Accuracy=65.22%


### SGD Classifier

In [44]:
# Linear classifier trained with stochastic gradient descent (hinge loss, L2
# penalty, balanced class weights), fitted on the training split.
#
# max_iter is given as the integer 1000. The previous value, 1e3, is the float
# 1000.0; scikit-learn documents max_iter as an int and releases with
# parameter validation reject a float here.
SGD_clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000,
                        shuffle=False, n_jobs=8, random_state=9,
                        class_weight='balanced').fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
SGD_score = cross_validation_process(SGD_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=81.81%
Standard_Deviation=0.155
True_Score(Mean/SD)=5.268

Prediction_Confusion_Matrix=[17|12]:[12|28]
Prediction_Accuracy=65.22%


### Random Forest Classifier

In [45]:
# Random forest of 1000 entropy-criterion trees with balanced class weights,
# seeded and fitted on the training split.
rand_frst_clf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    min_samples_leaf=2,
    class_weight='balanced',
    n_jobs=-2,
    random_state=9,
)
rand_frst_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
rand_frst_score = cross_validation_process(rand_frst_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=82.87%
Standard_Deviation=0.152
True_Score(Mean/SD)=5.463

Prediction_Confusion_Matrix=[21|8]:[9|31]
Prediction_Accuracy=75.36%


### Ridge Classifier

In [46]:
# Ridge classifier with balanced class weights, seeded and fitted on the
# training split.
ridge_clf = RidgeClassifier(random_state=9, class_weight='balanced')
ridge_clf.fit(X_train, y_train)

# Score the fitted model with the project helper on the held-out split.
ridge_score = cross_validation_process(ridge_clf, X_test, y_test, cv=11)

Average_Accuracy(average_precision)=81.03%
Standard_Deviation=0.178
True_Score(Mean/SD)=4.562

Prediction_Confusion_Matrix=[15|14]:[8|32]
Prediction_Accuracy=68.12%


### Ridge Classifier CV

In [47]:
# Ridge classifier with built-in cross-validation (20 folds, selected by
# average precision), fitted on the training split.
ridge_cv_clf = RidgeClassifierCV(
    cv=20,
    scoring='average_precision',
    class_weight='balanced',
)
ridge_cv_clf.fit(X_train, y_train)

# Score with the project helper on the held-out split; this call asks for the
# F1 metric rather than relying on the helper's default scoring.
ridge_cv_score = cross_validation_process(
    ridge_cv_clf, X_test, y_test, cv=11, scoring='f1'
)

Average_Accuracy(f1)=63.72%
Standard_Deviation=0.254
True_Score(Mean/SD)=2.507

Prediction_Confusion_Matrix=[15|14]:[9|31]
Prediction_Accuracy=66.67%


### K Neighbors Classifier

In [48]:
# k-nearest-neighbours classifier (k=19), fitted on the training split.
KNN_clf = KNeighborsClassifier(
    n_neighbors=19,
    leaf_size=88,
    n_jobs=8,
)
KNN_clf.fit(X_train, y_train)

# Score with the project helper on the held-out split, using the F1 metric.
KNN_score = cross_validation_process(
    KNN_clf, X_test, y_test, cv=11, scoring='f1'
)

Average_Accuracy(f1)=76.02%
Standard_Deviation=0.142
True_Score(Mean/SD)=5.348

Prediction_Confusion_Matrix=[20|9]:[10|30]
Prediction_Accuracy=72.46%


### Multi-layer Perceptron classifier

In [49]:
# Multi-layer perceptron with a single hidden layer of 64 logistic units,
# trained with the L-BFGS solver and fitted on the training split.
#
# The former batch_size=8 and validation_fraction=0.1 arguments were removed:
# scikit-learn documents that L-BFGS does not use mini-batches and that
# validation_fraction is only used when early_stopping=True (not enabled
# here), so they had no effect on the fitted model and only suggested
# behaviour that never happened.
MLP_clf = MLPClassifier(hidden_layer_sizes=(64,), activation='logistic', solver='lbfgs',
                        alpha=0.0001, max_iter=5000,
                        random_state=9,
                        verbose=True).fit(X_train, y_train)

# Score with the project helper on the held-out split, using the F1 metric.
MLP_score = cross_validation_process(MLP_clf, X_test, y_test, cv=11, scoring='f1')

Average_Accuracy(f1)=63.85%
Standard_Deviation=0.152
True_Score(Mean/SD)=4.195

Prediction_Confusion_Matrix=[18|11]:[14|26]
Prediction_Accuracy=63.77%


In [34]:
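# Reference only (not executed): scoring names that can be tried through the
# `scoring=` argument used in the cells above.
# classification_scoring = ['accuracy', 'balanced_accuracy', 'average_precision',
#                           'brier_score_loss', 'f1', 'f1_micro', 'f1_macro',
#                           'f1_weighted', 'neg_log_loss', 'precision',
#                           'recall', 'roc_auc']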