In [1]:
import pandas as pd
import database_load
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def dataframe_creation(cursor, query):
    #load query results into pd df
    cursor.execute(query)
    colnames = [col[0] for col in cursor.description]
    df = pd.DataFrame(cursor.fetchall(), columns=colnames)
    return df


db_connection = database_load.connect_database()
db_cursor = db_connection.cursor()

df = dataframe_creation(db_cursor, 'select * from games')

#close database connection
db_cursor.close()
db_connection.close()

In [17]:
#Question of whether player 1, faced with a higher rank, will resign
#Columns: white, black, rank_diff, who won, resigned

results_df = pd.DataFrame(columns=['white', 'black', 'rank_diff', 'winner', 'resigned'])
results_df['white'] = df['white_rating']
results_df['black'] = df['black_rating']
results_df['rank_diff'] = df['white_rating'] - df['black_rating']
results_df['winner'] = np.select([df['winner'] == 'white', df['winner'] == 'black'], [1,-1], default=0)
results_df['resigned'] = np.select([df['victory_status']=='resign'], [1], default=0)
results_df['expected_win'] = np.sign(results_df['winner']/results_df['rank_diff']).fillna(1)
print(results_df)

       white  black  rank_diff  winner  resigned  expected_win
0       1500   1191        309       1         0           1.0
1       1322   1261         61      -1         1          -1.0
2       1496   1500         -4       1         0          -1.0
3       1439   1454        -15       1         0          -1.0
4       1523   1469         54       1         0           1.0
...      ...    ...        ...     ...       ...           ...
19108   1691   1220        471       1         1           1.0
19109   1233   1196         37      -1         0          -1.0
19110   1219   1286        -67       1         0          -1.0
19111   1360   1227        133       1         1           1.0
19112   1235   1339       -104      -1         0           1.0

[19113 rows x 6 columns]


In [37]:
#Columns: player_1, player_2, rank_diff, player_1_resigned

class_df = pd.DataFrame(columns=['player_1','player_2', 'rank_diff', 'resigned'])
class_df['player_1'] = results_df[["white", "black"]].min(axis=1)
class_df['player_2'] = results_df[["white", "black"]].max(axis=1)
class_df['rank_diff'] = class_df['player_2']- class_df['player_1']
#This returns 1 if player 1 resigned, -1 if player 2 resigned, and 0 for anything else
class_df['resigned'] = results_df['expected_win']*results_df['resigned']
class_df['resigned'] = class_df['resigned'].replace(-1,0)
class_df['resigned_int'] = class_df['resigned'].astype(int)
print(class_df)

       player_1  player_2  rank_diff  resigned  resigned_int
0          1191      1500        309       0.0             0
1          1261      1322         61       0.0             0
2          1496      1500          4      -0.0             0
3          1439      1454         15      -0.0             0
4          1469      1523         54       0.0             0
...         ...       ...        ...       ...           ...
19108      1220      1691        471       1.0             1
19109      1196      1233         37      -0.0             0
19110      1219      1286         67      -0.0             0
19111      1227      1360        133       1.0             1
19112      1235      1339        104       0.0             0

[19113 rows x 5 columns]


In [127]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score, accuracy_score
from sklearn import metrics

In [128]:
x_train , x_test, y_train, y_test = train_test_split(class_df[['rank_diff']], class_df[['resigned_int']], test_size=0.2)

In [129]:
def print_statistics(model, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test):
    y_pred = model.fit(x_train,y_train).predict(x_test)
    print('Accuracy', accuracy_score(y_test, y_pred))
    print('R2', r2_score(y_test, y_pred))
    print('F1', f1_score(y_test, y_pred))
    print('Confusion Matrix')
    print(metrics.confusion_matrix(y_test, y_pred))

In [130]:
from sklearn.naive_bayes import GaussianNB

In [131]:
gnb = GaussianNB()
print_statistics(gnb)

Accuracy 0.6298718284070102
R2 -0.5995721302119832
F1 0.15421398684997012
Confusion Matrix
[[2279  154]
 [1261  129]]


  y = column_or_1d(y, warn=True)


In [141]:
from sklearn.linear_model import SGDClassifier

In [150]:
sgd = SGDClassifier(loss='log_loss', alpha=0.01)
print_statistics(sgd)

Accuracy 0.5291655767721685
R2 -1.0347914023898017
F1 0.49636261891438166
Confusion Matrix
[[1136 1297]
 [ 503  887]]


  y = column_or_1d(y, warn=True)


In [156]:
sgd = SGDClassifier(loss='log_loss', alpha=0.02)
print_statistics(sgd)

Accuracy 0.6235940360973058
R2 -0.6267026822438471
F1 0.2586295723853684
Confusion Matrix
[[2133  300]
 [1139  251]]


  y = column_or_1d(y, warn=True)


In [152]:
from sklearn.neighbors import KNeighborsClassifier

In [135]:
#default neighbour =5
knn = KNeighborsClassifier()
print_statistics(knn)

Accuracy 0.5786031912110907
R2 -0.8211383051388725
F1 0.3432531593966572
Confusion Matrix
[[1791  642]
 [ 969  421]]


  return self._fit(X, y)


In [136]:
knn = KNeighborsClassifier(n_neighbors=1)
print_statistics(knn)

Accuracy 0.5430290347894324
R2 -0.9748780999861022
F1 0.3957108267035628
Confusion Matrix
[[1504  929]
 [ 818  572]]


  return self._fit(X, y)


In [137]:
knn = KNeighborsClassifier(n_neighbors=10)
print_statistics(knn)

Accuracy 0.61182317551661
R2 -0.6775724673035921
F1 0.208955223880597
Confusion Matrix
[[2143  290]
 [1194  196]]


  return self._fit(X, y)


In [144]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg_model = reg.fit(x_train, y_train)
y_pred = reg_model.predict(x_test)
reg_r2 = r2_score(y_test, y_pred)
print(reg_r2)

0.010674700683430793
