In [None]:
"""
QUESTION 2:
                DO OPENINGS AFFECT GAME LENGTH OR DRAW LIKELIHOOD?   
"""

In [None]:
#Load dataset 

import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Location of the chess games CSV. It can be overridden with the
# CHESS_GAMES_CSV environment variable so the notebook runs on other machines;
# the original absolute path is kept as the fallback so existing setups behave
# exactly as before.
DATA_PATH = os.environ.get(
    "CHESS_GAMES_CSV", "/Users/exequielfleitas/data/chess_games.csv"
)

games = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the load worked.
games.head()

In [None]:
# Add "rating_diff": white's rating minus black's rating
# (positive values mean white is the higher-rated player).
games = games.assign(
    rating_diff=games["white_rating"] - games["black_rating"]
)

In [None]:
# Clean the dataset
#
# One method chain, applied in order:
#   1. drop rows with a missing "turns" value;
#   2. coerce the key numeric columns (unparseable entries become NaN);
#   3. drop rows whose "turns" became NaN during coercion;
#   4. cast "rated" to integers so it can be used as a 0/1 feature.
games = (
    games
    .dropna(subset=["turns"])
    .assign(
        turns=lambda df: pd.to_numeric(df["turns"], errors="coerce"),
        rating_diff=lambda df: pd.to_numeric(df["rating_diff"], errors="coerce"),
        opening_ply=lambda df: pd.to_numeric(df["opening_ply"], errors="coerce"),
    )
    .dropna(subset=["turns"])
    .assign(rated=lambda df: df["rated"].astype(int))
)

games.head(2)

In [None]:
# Collapse uncommon ECO codes (those appearing in fewer than MIN_ECO_COUNT
# games) into a single "Other" bucket.

MIN_ECO_COUNT = 100

# Number of games per ECO code.
eco_counts = games["opening_eco"].value_counts()
common_ecos = eco_counts[eco_counts.ge(MIN_ECO_COUNT)].index

# Keep the ECO code where it is common; everything else is labelled "Other".
is_common_eco = games["opening_eco"].isin(common_ecos)
games["opening_eco_filtered"] = games["opening_eco"].where(is_common_eco, "Other")

In [None]:
# Build the design matrix X and the target y

# Predictor columns fed to the models.
features = ["rating_diff", "opening_ply", "rated", "increment_code", "opening_eco_filtered"]

# Copies, so later changes to X / y never touch `games`.
X = games.loc[:, features].copy()
y = games.loc[:, "turns"].copy()  # target: game length in turns

In [None]:
# Train/test split: hold out 20% of the games, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Processing
# Column groups, each routed to its own preprocessing branch.
numeric_features = ["rating_diff", "opening_ply"]
binary_features = ["rated"]
categorical_features = ["increment_code", "opening_eco_filtered"]

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode.
# No level is dropped on purpose: with handle_unknown="ignore" a category that
# was not seen during fit is encoded as all zeros, and drop="first" would make
# that encoding identical to the dropped reference level (older scikit-learn
# releases reject the drop + ignore combination outright).
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine the branches; "rated" passes through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("bin", "passthrough", binary_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [None]:
# Linear Regression

# Preprocessing followed by a linear regression on the transformed features.
linreg_model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)

# Fit on the training split, then predict the held-out split.
y_pred_lr = linreg_model.fit(X_train, y_train).predict(X_test)

# Held-out metrics: R² and RMSE (square root of the mean squared error).
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

print("=== Linear Regression ===")
print("R²:", lr_r2)
print("RMSE (turns):", lr_rmse)

In [None]:
# Random Forest Regressor

# 300 trees, at least 5 samples per leaf, all CPU cores, fixed seed.
rf_regressor = RandomForestRegressor(
    n_estimators=300, min_samples_leaf=5, n_jobs=-1, random_state=42
)
rf_model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", rf_regressor)]
)

# Fit on the training split, then predict the held-out split.
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Held-out metrics: R² and RMSE (square root of the mean squared error).
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)

print("\n=== Random Forest Regressor ===")
print("R²:", rf_r2)
print("RMSE (turns):", rf_rmse)

In [None]:
# Side-by-side summary of the two models' held-out metrics
comparison_lines = [
    "\n=== Comparison ===",
    f"Linear Regression  -> R²: {lr_r2:.4f}, RMSE: {lr_rmse:.2f}",
    f"Random Forest      -> R²: {rf_r2:.4f}, RMSE: {rf_rmse:.2f}",
]
print(*comparison_lines, sep="\n")

In [None]:
# Rank the (filtered) ECO codes by average game length.
# Per-ECO game count, mean turns and median turns, longest average first.
turns_by_eco = games.groupby("opening_eco_filtered")["turns"]
eco_summary = turns_by_eco.agg(["count", "mean", "median"]).sort_values(
    by="mean", ascending=False
)

print("\n=== Top 10 openings by average turns ===")
print(eco_summary.head(10))

print("\n=== Bottom 10 openings by average turns ===")
print(eco_summary.tail(10))