# Running all Classification Models to get the best predictions

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import BaseEnsemble
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import VotingClassifier

In [None]:
AdaBoostClassifier(
    base_estimator=[], n_estimators=[10, 50, 100], learning_rate=[0.5, 1.0, 2.0],
    algorithm=['SAMME', 'SAMME.R'], random_state=5)

BaggingClassifier(
    base_estimator=[], n_estimators=[10, 50, 100], max_features=[0.25, 0.5, 0.75, 1.0],
    bootstrap=[True, False], bootstrap_features=[True, False], random_state=5)

ExtraTreesClassifier(
    n_estimators=[50, 100, 200, 400], criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 100], min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 5, 10, 20], max_features=['auto', 'sqrt', 'log2'],
    max_leaf_nodes=[None, 5, 20, 100], min_impurity_decrease=[0.0, 0.05, 0.1],
    bootstrap=[True, False], random_state=5, class_weight=['balanced', None])

GradientBoostingClassifier(
    loss=['deviance', 'exponential'], learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[100, 200, 400], min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10], max_depth=[3, 6, 15], min_impurity_decrease=[0.0, 0.05, 0.10],
    random_state=5, max_features=['auto', 'sqrt', 'log2'], max_leaf_nodes=[None, 5, 20, 100],
    tol=[0.01, 0.001, 0.0001])

IsolationForest(n_estimators=[100, 200, 400], contamination=[0.05, 0.1, 0.2, 'auto'],
                max_features=[0.25, 0.5, 0.75, 1.0], bootstrap=[True, False],
                behaviour=['old', 'new'], random_state=5)

VotingClassifier(estimators=[], voting='soft', n_jobs=-1)

In [17]:
# from dummies_bins_test_train_cv import initial_df
# from dummies_bins_test_train_cv import bin_df_get_y
# from dummies_bins_test_train_cv import partial_df
# from dummies_bins_test_train_cv import xy_custom

# from col_info import all_cols

from pandas import read_csv, DataFrame

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from random import randint

In [18]:
from clean_chess_game_log import main_cleanup
_, _, _ = main_cleanup('../data/dest.pgn')

In [15]:
df, df_len = initial_df('../data/use_for_predictions.csv')

NameError: name 'initial_df' is not defined

In [4]:
def get_df_splits_dict(df):
    LDF = len(df)

    sfr = int((LDF/4)*.2)
    sfv = int((LDF/5)*.2)
    ssx = int((LDF/6)*.2)
    sght = int((LDF/8)*.2)
    slvn = int((LDF/11)*.2)
    sftn = int((LDF/15)*.2)
    stwnt = int((LDF/20)*.2)

    split_dict = {1/4: [sfr-(randint(6, 9)*i) for i in range(0,7)],
                  1/5: [sfv-(randint(5, 7)*i) for i in range(0,7)],
                  1/6: [ssx-(randint(3, 4)*i) for i in range(0,7)],
                  1/8: [sght-(randint(2, 4)*i) for i in range(0,7)],
                  1/11: [slvn-(randint(2, 3)*i) for i in range(0,7)],
                  1/15: [sftn-(randint(1, 2)*i) for i in range(0,7)],
                  1/20: [stwnt-(1*i) for i in range(0,7)]}
    
    return split_dict

In [5]:
# result_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])

## Linear Discriminant Analysis

In [6]:
LDA_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            LDA_clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'
                                                 ).fit(X_train, y_train)
            y_pred = LDA_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if LDA_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'LDA'
                        LDA_df = LDA_df.append([results], ignore_index=True)
print(len(LDA_df))
LDA_df.to_csv('../data/LDA_results.csv', index=False)

{0.25: [110, 103, 98, 86, 82, 70, 56], 0.2: [88, 83, 78, 70, 64, 58, 46], 0.16666666666666666: [73, 69, 67, 64, 57, 53, 49], 0.125: [55, 52, 49, 46, 43, 45, 37], 0.09090909090909091: [40, 37, 34, 31, 28, 30, 22], 0.06666666666666667: [29, 28, 25, 23, 25, 24, 23], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
6185


## Logistic Regression

In [7]:
LRC_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            lgst_reg_clf = LogisticRegression(penalty='l2', class_weight='balanced', random_state=8,
                                              max_iter=5000, C=1e-3, solver='lbfgs',
                                              multi_class='auto').fit(X_train, y_train)
            y_pred = lgst_reg_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if lgst_reg_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'LRC'
                        LRC_df = LRC_df.append([results], ignore_index=True)
print(len(LRC_df))
LRC_df.to_csv('../data/LRC_results.csv', index=False)

{0.25: [110, 103, 92, 89, 82, 70, 74], 0.2: [88, 83, 78, 73, 60, 58, 52], 0.16666666666666666: [73, 70, 67, 61, 57, 53, 49], 0.125: [55, 53, 49, 49, 47, 35, 37], 0.09090909090909091: [40, 38, 34, 31, 28, 25, 28], 0.06666666666666667: [29, 27, 27, 26, 25, 24, 17], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
7258


## K Neighbors Classifier

In [8]:
KNN_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            KNN_clf = KNeighborsClassifier(n_neighbors=11, algorithm='auto',
                                           leaf_size=int(len(df_s)*0.04),
                                           metric='manhattan',
                                           p=1).fit(X_train, y_train)
            
            y_pred = KNN_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            
            if (cm[0][0] + cm[1][0] > 1):
                if (cm[0][1] + cm[1][1] > 1):
                    if KNN_clf.score(X_test, y_test) >= .6:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'KNN'
                        KNN_df = KNN_df.append([results], ignore_index=True)
print(len(KNN_df))
KNN_df.to_csv('../data/KNN_results.csv', index=False)

{0.25: [110, 104, 96, 83, 78, 80, 68], 0.2: [88, 83, 78, 67, 60, 53, 46], 0.16666666666666666: [73, 70, 67, 64, 61, 53, 49], 0.125: [55, 51, 49, 49, 47, 35, 37], 0.09090909090909091: [40, 37, 34, 31, 28, 25, 28], 0.06666666666666667: [29, 27, 27, 26, 21, 19, 17], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
7107


## SGD Classifier

In [9]:
SGD_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            SGD_clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1e3,
                                    shuffle=False, random_state=8,
                                    class_weight='balanced').fit(X_train, y_train)
            y_pred = SGD_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if SGD_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'SGD'
                        SGD_df = SGD_df.append([results], ignore_index=True)
print(len(SGD_df))
SGD_df.to_csv('../data/SGD_results.csv', index=False)

{0.25: [110, 102, 98, 89, 78, 70, 56], 0.2: [88, 83, 76, 70, 60, 58, 58], 0.16666666666666666: [73, 70, 65, 61, 57, 53, 49], 0.125: [55, 51, 47, 43, 47, 45, 43], 0.09090909090909091: [40, 38, 34, 31, 28, 30, 22], 0.06666666666666667: [29, 27, 27, 23, 25, 19, 23], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
5249


## Ridge Classifier

In [10]:
RDC_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            ridge_clf = RidgeClassifier(class_weight='balanced', random_state=8
                                        ).fit(X_train, y_train)
            y_pred = ridge_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 1):
                if (cm[0][1] + cm[1][1] > 1):
                    if ridge_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'RDG'
                        RDC_df = RDC_df.append([results], ignore_index=True)
print(len(RDC_df))
RDC_df.to_csv('../data/RDC_results.csv', index=False)

{0.25: [110, 102, 98, 86, 86, 75, 62], 0.2: [88, 82, 74, 73, 68, 63, 46], 0.16666666666666666: [73, 70, 67, 64, 61, 58, 49], 0.125: [55, 53, 47, 43, 47, 45, 31], 0.09090909090909091: [40, 37, 34, 31, 32, 25, 28], 0.06666666666666667: [29, 27, 25, 23, 25, 24, 17], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
6234


## Gaussian Process Classifier

In [11]:
GPC_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            gpc_rbf_clf = GaussianProcessClassifier(n_restarts_optimizer=10,
                                                    random_state=9).fit(X_train, y_train)
            y_pred = gpc_rbf_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if gpc_rbf_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'GPC'
                        GPC_df = GPC_df.append([results], ignore_index=True)
print(len(GPC_df))
GPC_df.to_csv('../data/GPC_results.csv', index=False)

{0.25: [110, 101, 94, 92, 74, 80, 68], 0.2: [88, 82, 74, 70, 60, 63, 46], 0.16666666666666666: [73, 69, 67, 64, 61, 58, 49], 0.125: [55, 51, 47, 49, 43, 35, 37], 0.09090909090909091: [40, 38, 34, 34, 32, 30, 28], 0.06666666666666667: [29, 28, 27, 26, 25, 19, 17], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
2434


## Random Forest Classifier

In [12]:
RFC_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            rand_frst_clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                                   min_samples_leaf=2, random_state=8,
                                                   class_weight='balanced').fit(X_train, y_train)
            y_pred = rand_frst_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if rand_frst_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'RFC'
                        RFC_df = RFC_df.append([results], ignore_index=True)
print(len(RFC_df))
RFC_df.to_csv('../data/RFC_results.csv', index=False)

{0.25: [110, 102, 98, 83, 74, 75, 74], 0.2: [88, 82, 74, 70, 68, 63, 52], 0.16666666666666666: [73, 70, 67, 61, 61, 53, 49], 0.125: [55, 52, 47, 49, 43, 40, 37], 0.09090909090909091: [40, 38, 36, 31, 32, 30, 22], 0.06666666666666667: [29, 28, 25, 23, 21, 19, 17], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
6477


## Ada Boost Classifier

In [13]:
ABC_df = DataFrame(columns=['cols', 'df_split', 'test_split', 'cm', 'model'])
split_dict = get_df_splits_dict(df)
print(split_dict)
for k, v in split_dict.items():
    print(round(k, 4))
    df_s = partial_df(df, k)
    df_s, y = bin_df_get_y(df_s)
    for i in v:
        for clm in all_cols:
            results = {}
            X_train, X_test, y_train, y_test, X = xy_custom(df_s, y, i, clm)
            ada_clf = AdaBoostClassifier(n_estimators=150, learning_rate=0.01,
                                         random_state=8).fit(X_train, y_train)
            y_pred = ada_clf.predict(X_test)
            y_pred = (y_pred > 0.5)
            cm = confusion_matrix(y_test, y_pred)
            if (cm[0][0] + cm[1][0] > 0):
                if (cm[0][1] + cm[1][1] > 0):
                    if ada_clf.score(X_test, y_test) >= .66:
                        results['cols'] = clm
                        results['df_split'] = round(k, 3)
                        results['test_split'] = i
                        results['cm'] = list(cm)
                        results['model'] = 'ABC'
                        ABC_df = ABC_df.append([results], ignore_index=True)
print(len(ABC_df))
ABC_df.to_csv('../data/ABC_results.csv', index=False)

{0.25: [110, 102, 98, 92, 74, 65, 68], 0.2: [88, 82, 76, 67, 60, 58, 46], 0.16666666666666666: [73, 69, 65, 61, 61, 53, 49], 0.125: [55, 52, 51, 46, 43, 40, 37], 0.09090909090909091: [40, 38, 36, 34, 32, 30, 28], 0.06666666666666667: [29, 28, 27, 23, 25, 19, 23], 0.05: [22, 21, 20, 19, 18, 17, 16]}
0.25
0.2
0.1667
0.125
0.0909
0.0667
0.05
6820
