# Intro

In [1]:
# increase the width of the notebook
from IPython.display import display, HTML, Markdown

# NOTE(review): the rule targets the `.container` CSS class; presumably this only
# has an effect in front ends that use that class for the page body — confirm.
wide_page_css = HTML("<style>.container { width:90% !important; }</style>")
display(wide_page_css)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw dataset (comma is already the default separator for read_csv).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, presumably a
# previously saved index; it is kept as a regular column here — confirm whether it
# should be dropped before the train/test CSVs are written.
df = pd.read_csv("data_ML.csv")

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,WhiteElo,BlackElo,EloDif,Opening_name,Time_format,Increment_binary,Score
0,1851,1901,-50,Alekhine's defense,classical,Yes,1.0
1,2060,2111,-51,French Defense,blitz,Yes,0.0
2,2307,2290,17,Philidor Defense,blitz,No,0.5
3,2380,2419,-39,Sicilian defense,rapid,No,0.0
4,2686,2848,-162,Ruy Lopez,rapid,No,0.0


In [5]:
# "BlackElo" is redundant: in the preview rows EloDif equals WhiteElo - BlackElo,
# so it can be recovered from the two columns that are kept.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned from itself, so a
# second execution would otherwise raise KeyError because the column is already gone.
df = df.drop(columns=["BlackElo"], errors="ignore")

### Keep the top 10 openings.

In [6]:
# Count how often each opening occurs, then keep the labels of the 10 most frequent.
opening_counts = df["Opening_name"].value_counts()
top10_openings = opening_counts.nlargest(10).index

In [7]:
# Collapse every opening that is not in the top 10 into a single "Other" label.
is_rare_opening = ~df["Opening_name"].isin(top10_openings)
df["Opening_name"] = df["Opening_name"].mask(is_rare_opening, "Other")

In [8]:
# Sanity check: row count per remaining opening label (the top 10 plus "Other").
df["Opening_name"].value_counts()

Other                                            27530
Sicilian defense                                 14435
Queen's Pawn Game                                 8721
French Defense                                    5398
English Opening                                   5181
Caro-Kann defense                                 3747
Irregular Openings                                3565
Queen's Gambit                                    3413
Scandinavian Defense (Center-Counter Defense)     3046
Closed Game, Irregular Responses                  2584
Zukertort Opening                                 2380
Name: Opening_name, dtype: int64

## I want to make sure that the same train and test sets will be used in all Jupyter notebooks.

In [9]:
# Hold out 10,000 rows as the test set. The fixed random_state makes the split
# repeatable, and stratifying on Score keeps the outcome proportions aligned
# between the two parts.
train, test = train_test_split(
    df,
    test_size=10_000,
    stratify=df["Score"],
    random_state=42,
)

In [10]:
# Persist both splits (without the pandas index) so other notebooks can reload them.
for csv_path, split in (("train.csv", train), ("test.csv", test)):
    split.to_csv(csv_path, index=False)

## Create Preprocessing pipelines

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

In [12]:
# Preprocessing pipelines
# Column groups routed to each branch of the ColumnTransformer.
numeric_features = ["WhiteElo", "EloDif"]
categorical_features = ["Opening_name", "Time_format", "Increment_binary"]

# Numeric branch: standardise the Elo-based columns.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown="ignore" means a category not
# seen during fit does not raise at transform time.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Only the columns listed above are passed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [13]:
# Save the still-unfitted preprocessor to disk; the call is left as the cell's last
# expression so the list of written file names is echoed.
joblib.dump(value=preprocessor, filename="preprocessor_unfitted.joblib")

['preprocessor_unfitted.joblib']

## Create two dummy regression baselines

In [14]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [15]:
# Separate the target column ("Score") from the remaining columns for each split.
X_train, y_train = train.drop(columns="Score"), train["Score"]
X_test, y_test = test.drop(columns="Score"), test["Score"]

In [16]:
# Baseline #1: a DummyRegressor that always predicts the mean of y_train.
# fit() returns the estimator itself, so construction and fitting can be chained.
dummy_mean = DummyRegressor(strategy="mean").fit(X_train, y_train)
y_pred_mean = dummy_mean.predict(X_test)

In [17]:
# Report the test-set error metrics of the mean baseline, one line per metric.
print("=== Dummy (mean) Regressor ===")
for label, metric in (
    ("MSE: ", mean_squared_error),
    ("MAE: ", mean_absolute_error),
    ("R²:  ", r2_score),
):
    print(label, metric(y_test, y_pred_mean))

=== Dummy (mean) Regressor ===
MSE:  0.23547751127551014
MAE:  0.47204675714285715
R²:   -5.4166962026869214e-09


In [18]:
# Baseline #2: a DummyRegressor that always predicts the median of y_train.
# fit() returns the estimator itself, so construction and fitting can be chained.
dummy_median = DummyRegressor(strategy="median").fit(X_train, y_train)
y_pred_median = dummy_median.predict(X_test)

In [19]:
# Report the test-set error metrics of the median baseline, one line per metric.
print("\n=== Dummy (median) Regressor ===")
for label, metric in (
    ("MSE: ", mean_squared_error),
    ("MAE: ", mean_absolute_error),
    ("R²:  ", r2_score),
):
    print(label, metric(y_test, y_pred_median))


=== Dummy (median) Regressor ===
MSE:  0.23585
MAE:  0.4717
R²:   -0.0015818495787560671
