In [24]:
# Librerías
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score)
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
# Load the chess games dataset and print a summary of its columns and dtypes
df_path = "../bridge_project/chess_games.csv"
df = pd.read_csv(filepath_or_buffer=df_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   game_id            20058 non-null  int64 
 1   rated              20058 non-null  bool  
 2   turns              20058 non-null  int64 
 3   victory_status     20058 non-null  object
 4   winner             20058 non-null  object
 5   time_increment     20058 non-null  object
 6   white_id           20058 non-null  object
 7   white_rating       20058 non-null  int64 
 8   black_id           20058 non-null  object
 9   black_rating       20058 non-null  int64 
 10  moves              20058 non-null  object
 11  opening_code       20058 non-null  object
 12  opening_moves      20058 non-null  int64 
 13  opening_fullname   20058 non-null  object
 14  opening_shortname  20058 non-null  object
 15  opening_response   1207 non-null   object
 16  opening_variation  14398 non-null  object

In [26]:
# "opening_variation" holds the variation played on top of the traditional opening.
# According to the exploratory analysis (done outside this section), an empty value
# means the traditional opening itself was used, so missing entries are filled
# with that label.
df["opening_variation"] = df["opening_variation"].fillna(value="traditional opening")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   game_id            20058 non-null  int64 
 1   rated              20058 non-null  bool  
 2   turns              20058 non-null  int64 
 3   victory_status     20058 non-null  object
 4   winner             20058 non-null  object
 5   time_increment     20058 non-null  object
 6   white_id           20058 non-null  object
 7   white_rating       20058 non-null  int64 
 8   black_id           20058 non-null  object
 9   black_rating       20058 non-null  int64 
 10  moves              20058 non-null  object
 11  opening_code       20058 non-null  object
 12  opening_moves      20058 non-null  int64 
 13  opening_fullname   20058 non-null  object
 14  opening_shortname  20058 non-null  object
 15  opening_response   1207 non-null   object
 16  opening_variation  20058 non-null  object

In [27]:
def codificar(column_name, df_target):
    """Label-encode a column and store the codes in a new ``<column_name>_cod`` column.

    A fresh ``LabelEncoder`` is fitted on ``df_target[column_name]`` and the
    resulting codes are written into ``df_target``, which is modified in place.
    The encoder itself is local to this call and is not returned.

    Args:
        column_name (str): Name of the column to encode.
        df_target (pandas.DataFrame): Frame that holds the column and receives
            the encoded one.

    Returns:
        tuple: ``(column_name, new_column_name)`` - the original column name
        and the name of the newly added encoded column.
    """
    encoded_name = f"{column_name}_cod"
    codes = LabelEncoder().fit_transform(df_target[column_name])
    df_target[encoded_name] = codes
    return column_name, encoded_name

In [28]:
# Encode "opening_variation" as integer codes (adds "opening_variation_cod" to df)
column_name, new_column_name = codificar(column_name="opening_variation", df_target=df)
# Keep only the original and encoded columns and drop duplicates,
# so each existing class is shown once next to its code
df.loc[:, [column_name, new_column_name]].drop_duplicates()

Unnamed: 0,opening_variation,opening_variation_cod
0,Exchange Variation,181
1,Kennedy Variation,263
2,Leonardis Variation,299
3,Zukertort Variation,610
4,traditional opening,612
...,...,...
18870,Richter Variation,454
19118,Dutch Defense,170
19120,Semi-Leningrad Variation,488
19607,Spielmann Variation,519


In [29]:
# Encode "rated" as integer codes (adds "rated_cod" to df)
column_name, new_column_name = codificar(column_name="rated", df_target=df)
# Keep only the original and encoded columns and drop duplicates,
# so each existing class is shown once next to its code
df.loc[:, [column_name, new_column_name]].drop_duplicates()


Unnamed: 0,rated,rated_cod
0,False,0
1,True,1


In [30]:
# Encode "opening_code" as integer codes (adds "opening_code_cod" to df)
column_name, new_column_name = codificar(column_name="opening_code", df_target=df)
# Keep only the original and encoded columns and drop duplicates,
# so each existing class is shown once next to its code
df.loc[:, [column_name, new_column_name]].drop_duplicates()

Unnamed: 0,opening_code,opening_code_cod
0,D10,248
1,B00,71
2,C20,171
3,D02,241
4,C41,192
...,...,...
19527,E35,327
19532,E27,322
19566,E48,334
19605,C75,225


In [31]:
# Encode "opening_shortname" as integer codes (adds "opening_shortname_cod" to df)
column_name, new_column_name = codificar(column_name="opening_shortname", df_target=df)
# Keep only the original and encoded columns and drop duplicates,
# so each existing class is shown once next to its code
df.loc[:, [column_name, new_column_name]].drop_duplicates()

Unnamed: 0,opening_shortname,opening_shortname_cod
0,Slav Defense,110
1,Nimzowitsch Defense,74
2,King's Pawn Game,61
3,Queen's Pawn Game,94
4,Philidor Defense,83
...,...,...
10257,King's Indian,56
10807,Barnes Opening,6
12072,Canard Opening,16
13092,Pterodactyl Defense,89


In [32]:
# Predictor columns fed to the classifiers.
# NOTE(review): the raw boolean "rated" column is used here even though a
# "rated_cod" column was created earlier - confirm which one was intended.
x_names = [
    "rated",
    "turns",
    "opening_code_cod",
    "opening_moves",
    "opening_shortname_cod",
    "opening_variation_cod",
    "black_rating",
    "white_rating",
]

# Target column: the winner of the game
y_names = ["winner"]

# Both selections use a list of names, so X and Y are DataFrames
X = df.loc[:, x_names]
Y = df.loc[:, y_names]

In [33]:
def eval_perform(Y,Yhat):
    """Print accuracy, weighted precision and weighted recall for a set of predictions.

    Args:
        Y: True labels.
        Yhat: Predicted labels, compared against ``Y`` by the scikit-learn metrics.

    Returns:
        None. The three scores are printed as a small tab-separated table,
        each with three decimals.
    """
    report = '\n \t Accu \t Prec \t Reca\n Eval \t %0.3f \t %0.3f \t %0.3f'
    scores = (
        accuracy_score(Y, Yhat),
        precision_score(Y, Yhat, average='weighted'),
        recall_score(Y, Yhat, average='weighted'),
    )
    print(report % scores)

In [34]:
# Split the data into training (70%) and test (30%) sets.
# A fixed random_state makes the split - and every score computed from it -
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y.winner,
                                                    test_size=0.3,
                                                    random_state=42)

In [35]:
# Build three SVM classifiers: linear, polynomial (degree 2) and radial basis function kernels.
# NOTE(review): no feature scaling is visible before these models are fitted;
# SVMs are scale-sensitive, so confirm whether standardisation is needed.
mod_linear = svm.SVC(
    kernel='linear',
    C=1,
    probability=True,
)
mod_poly = svm.SVC(
    kernel='poly',
    degree=2,
    C=1,
    probability=True,
)
mod_rbf = svm.SVC(
    kernel='rbf',
    C=1,
    gamma='auto',
    probability=True,
)

In [36]:
# Fit the linear-kernel classifier on the training split
mod_linear.fit(X=X_train, y=Y_train)

In [None]:
# Fit the polynomial-kernel classifier on the training split
mod_poly.fit(X=X_train, y=Y_train)

In [None]:
# Fit the RBF-kernel classifier on the training split
mod_rbf.fit(X=X_train, y=Y_train)

In [None]:
# Y "hat" (ŷ) denotes the estimated predictions.
Yhat_linear = mod_linear.predict(X_test)
# The predictions cover X_test only, so they must be scored against Y_test;
# the full Y frame has a different number of rows.
eval_perform(Y_test,Yhat_linear)

In [None]:
# Predict on the test split with the polynomial-kernel model
Yhat_poly = mod_poly.predict(X_test)
# The predictions cover X_test only, so they must be scored against Y_test;
# the full Y frame has a different number of rows.
eval_perform(Y_test,Yhat_poly)

In [None]:
# Predict on the test split with the RBF-kernel model
Yhat_rbf = mod_rbf.predict(X_test)
# The predictions cover X_test only, so they must be scored against Y_test;
# the full Y frame has a different number of rows.
eval_perform(Y_test,Yhat_rbf)

In [None]:
# Comparison with logistic regression, using L1 regularisation.
# LogisticRegression is already imported in the notebook's top import cell,
# so it is not re-imported here.
# random_state is fixed because the liblinear solver shuffles the data,
# which would otherwise make the fitted coefficients vary between runs.
reg_log = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000,
                             random_state=42)
reg_log.fit(X_train,Y_train)
# Y "hat" (ŷ) denotes the estimated predictions, computed on both splits
# so training and test accuracy can be compared.
Yhat_log_test = reg_log.predict(X_test)
Yhat_log_train = reg_log.predict(X_train)
print(f'Entrenamiento accuracy score: {accuracy_score(Y_train,Yhat_log_train):0.2f}')
print(f'Prueba accuracy score: {accuracy_score(Y_test,Yhat_log_test):0.2f}')

Entrenamiento accuracy score: 0.62
Prueba accuracy score: 0.62
