In [None]:
"""
Dataset: Chess Game Dataset from Lichess
    
    20,000+ Lichess games
    This is a set of just over 20,000 games collected from a selection of users on the site Lichess.org. 
    
    This set contains the:

    Game ID;
    Rated (T/F);
    Start Time;
    End Time;
    Number of Turns;
    Game Status;
    Winner;
    Time Increment;
    White Player ID;
    White Player Rating;
    Black Player ID;
    Black Player Rating;
    All Moves in Standard Chess Notation;
    Opening Eco (Uses ECO Codes);
    Opening Name;
    Opening Ply (Number of moves in the opening phase)
"""

In [None]:
"""
QUESTION 1: 
                DO CERTAIN OPENINGS CONSISTENTLY LEAD TO BETTER OUTCOMES?
"""

In [7]:
# Load dataset
import os
from pathlib import Path

import pandas as pd

# Location of the Lichess games CSV. Set the CHESS_GAMES_CSV environment
# variable to run the notebook on another machine; the fallback is the
# original local path, so existing setups keep working unchanged.
DATA_PATH = Path(
    os.environ.get("CHESS_GAMES_CSV", "/Users/exequielfleitas/data/chess_games.csv")
)

games = pd.read_csv(DATA_PATH)
games.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [None]:
# MODEL 1: Logistic Regression
# Encode "winner" as a binary variable (White wins = 1, White loses or draws = 0)
# Calculate and create variable rating_diff = white_rating - black_rating
# Encode "rated" as a binary variable (True = 1 and False = 0)
# Encode the opening ECO code A,B,C,D,E (Something more broad)
# NaNs; Numeric columns = median, Categorical columns = "Unknown"
# Time control we will use "increment_code" (ex: 3+0 = blitz, 5+3 = rapid)

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report


# Target: 1 when White won, 0 otherwise (Black wins and draws both map to 0)
games["white_win"] = games["winner"].eq("white").astype(int)

# Rating gap from White's point of view (positive = White is higher rated)
games["rating_diff"] = games["white_rating"].sub(games["black_rating"])

# Broad opening family = first character of the ECO code (e.g. "D10" -> "D")
games["opening_family"] = games["opening_eco"].str.get(0)

games.head(2)

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply,white_win,rating_diff,opening_family
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5,1,309,D
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4,0,61,B


In [11]:
# Model inputs: rating gap, opening descriptors, rated flag and time control
features = ["rating_diff", "opening_ply", "rated", "opening_family", "increment_code"]

# Design matrix and binary target (1 = White win)
X = games.loc[:, features]
y = games.loc[:, "white_win"]

In [13]:
# Hold out 20% of the games for testing; stratify on the target so both
# splits keep the same share of White wins, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [15]:
# Preprocessing: one transformer per feature type, combined column-wise
numeric_features = ["rating_diff", "opening_ply"]
binary_features = ["rated"]
categorical_features = ["opening_family", "increment_code"]

# Numeric columns: fill missing values with the median, then standardise
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: missing values become their own explicit "Unknown"
# level, as stated in the modelling plan, instead of being silently merged
# into the most frequent category.
# NOTE(review): per the scikit-learn OneHotEncoder docs, with drop="first" a
# category unseen during fit is encoded as all zeros, i.e. the same encoding
# as the dropped reference level -- confirm this is acceptable for rare
# increment codes that only appear in the test split.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# "rated" is passed through untouched; it is already a True/False column
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [17]:
# Full model: shared preprocessing followed by a logistic regression.
# Step names are looked up later via model.named_steps, so keep them stable.
log_reg = LogisticRegression(max_iter=1000)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", log_reg)])

In [19]:
# Fit the preprocessing steps and the classifier on the training split only
model.fit(X=X_train, y=y_train)

In [21]:
# Evaluate on the held-out test split
y_pred = model.predict(X_test)
# Second probability column = predicted probability of class 1 (White win)
y_prob = model.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("ROC AUC:", test_auc)
print("\nClassification Report:\n")
print(test_report)

Accuracy: 0.6306081754735793
ROC AUC: 0.6884829771371769

Classification Report:

              precision    recall  f1-score   support

           0       0.63      0.64      0.64      2012
           1       0.63      0.62      0.62      2000

    accuracy                           0.63      4012
   macro avg       0.63      0.63      0.63      4012
weighted avg       0.63      0.63      0.63      4012





In [23]:
# Baseline without opening information (no opening_ply / opening_family).
# Comparing it against the full model isolates what the openings add.
features_rating_only = ["rating_diff", "rated", "increment_code"]

X_r = games.loc[:, features_rating_only]
y = games.loc[:, "white_win"]

# Same test_size, seed and stratification as the full model's split
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_r,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [25]:
# Preprocessing for the ratings-only baseline (no opening features)
numeric_features_r = ["rating_diff"]
binary_features_r = ["rated"]
categorical_features_r = ["increment_code"]

preprocessor_r = ColumnTransformer(
    transformers=[
        # Numeric: median imputation, then standardisation
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_features_r),
        # "rated" is passed through untouched
        ("bin", "passthrough", binary_features_r),
        # Categorical: missing values become an explicit "Unknown" level, as
        # stated in the modelling plan, rather than being merged into the
        # most frequent category.
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
            ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
        ]), categorical_features_r)
    ]
)

# Ratings-only baseline: its preprocessing followed by a logistic regression
model_ratings_only = Pipeline(steps=[
    ("preprocessor", preprocessor_r),
    ("classifier", LogisticRegression(max_iter=1000)),
])
model_ratings_only.fit(Xr_train, yr_train)

# Predicted probability of class 1 (White win) on the baseline's test split
yr_prob = model_ratings_only.predict_proba(Xr_test)[:, 1]

auc_ratings_only = roc_auc_score(yr_test, yr_prob)
# y_test / y_prob come from the full model's evaluation cell
auc_with_openings = roc_auc_score(y_test, y_prob)

print("Ratings-only ROC AUC:", auc_ratings_only)
print("Ratings + Openings ROC AUC:", auc_with_openings)

Ratings-only ROC AUC: 0.6911048707753479
Ratings + Openings ROC AUC: 0.6884829771371769




In [27]:
# Pair each transformed feature with its logistic-regression coefficient

fitted_preprocessor = model.named_steps["preprocessor"]
fitted_classifier = model.named_steps["classifier"]

# Names of the columns produced by the preprocessor
feature_names = fitted_preprocessor.get_feature_names_out()

# First (and, for this binary target, only) row of the coefficient matrix
coefficients = fitted_classifier.coef_[0]

coef_df = pd.DataFrame({"feature": feature_names, "coefficient": coefficients})
coef_df = coef_df.sort_values(by="coefficient", ascending=False)

# Ten largest (most positive) coefficients
coef_df.head(10)

Unnamed: 0,feature,coefficient
214,cat__increment_code_25+4,1.233331
27,cat__increment_code_10+12,1.175112
145,cat__increment_code_17+3,1.135904
72,cat__increment_code_12+8,1.123538
89,cat__increment_code_13+40,1.103692
165,cat__increment_code_19+0,1.069577
206,cat__increment_code_25+10,1.061833
328,cat__increment_code_60+30,0.973763
299,cat__increment_code_5+25,0.944947
0,num__rating_diff,0.930318
