# Random Forest and Feature Importance

In [1]:
# increase the width of the notebook
from IPython.display import display, HTML, Markdown

# Inject a CSS override so the notebook container uses 90% of the browser width.
notebook_width_css = "<style>.container { width:90% !important; }</style>"
display(HTML(notebook_width_css))

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Base (untuned) forest handed to the grid search below; the fixed seed keeps
# repeated runs comparable.
rf = RandomForestClassifier(
    random_state=42,
)

In [5]:
# Read the train and test sets from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# "Score" is the prediction target; every other column stays in the feature frame.
# `train` and `test` themselves are left unmodified (drop returns a new frame).
X_train, y_train = train.drop(columns="Score"), train["Score"]
X_test, y_test = test.drop(columns="Score"), test["Score"]

## Transformations

In [6]:
from sklearn.pipeline import Pipeline

In [7]:
# Columns routed to each branch of the preprocessing step.
numeric_features = ["WhiteElo", "EloDif"]
categorical_features = ["Opening_name", "Time_format", "Increment_binary"]

# Numeric branch: standardise each column.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode; categories unseen at fit time are ignored
# instead of raising an error.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# The step names ("num", "cat", "onehot") are looked up again later when the
# encoded feature names are recovered, so they must stay as they are.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [8]:
# Fit the preprocessor on the training features only, then reuse that fitted
# preprocessor to transform the test features (the test set is never fitted on).
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed  = preprocessor.transform(X_test)


## Train

In [9]:

# Hyper-parameter grid: 3 * 2 * 2 * 2 = 24 candidate settings.
param_grid_rf = {
    'n_estimators':    [50, 100, 200],
    'max_depth':       [None, 10],
    'min_samples_split':[2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive search over the grid, scoring each candidate by 3-fold CV accuracy.
grid_search_rf = GridSearchCV(
    rf,
    param_grid_rf,
    cv=3,
    scoring='accuracy',
    n_jobs=1,
)

# Fit on the preprocessed training set
grid_search_rf.fit(X_train_transformed, y_train)

# Inspect CV results
print("Best RF parameters:", grid_search_rf.best_params_)
print(f"Best RF CV accuracy: {grid_search_rf.best_score_:.3f}")

# Evaluate the selected model on the held-out test set
y_pred_rf = grid_search_rf.predict(X_test_transformed)
print(f"Test set accuracy (RF): {accuracy_score(y_test, y_pred_rf):.3f}")

# zero_division=0 keeps the reported value (0.00) for classes that are never
# predicted, but without emitting an UndefinedMetricWarning per average.
print(
    "\nClassification Report (RF):\n",
    classification_report(y_test, y_pred_rf, zero_division=0),
)


Best RF parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best RF CV accuracy: 0.546
Test set accuracy (RF): 0.548

Classification Report (RF):
               precision    recall  f1-score   support

   Black Win       0.55      0.45      0.50      4524
        Draw       0.00      0.00      0.00       566
   White Win       0.55      0.70      0.61      4910

    accuracy                           0.55     10000
   macro avg       0.37      0.38      0.37     10000
weighted avg       0.52      0.55      0.53     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


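### The Random Forest model achieved a test accuracy of 0.548, the best performance observed so far. Note, however, that it never predicts the Draw class (precision and recall of 0.00 on 566 test games), so the macro-averaged F1 is only 0.37.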

In [10]:
import joblib
# Persist the refit best forest.
# NOTE(review): only the classifier is saved; it was trained on the output of
# `preprocessor`, so the fitted preprocessor must be stored or rebuilt separately
# before this file can score raw data.
best_rf_path = 'best_random_forest.joblib'
joblib.dump(grid_search_rf.best_estimator_, best_rf_path)

['best_random_forest.joblib']

## Feature Importance

In [14]:
# Best forest found by the grid search; its importances are inspected below.
best_rf_model = grid_search_rf.best_estimator_

# Recover the column names produced by the preprocessor, in the same order as
# its output: numeric columns first, then the one-hot encoded columns.
ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(ohe, "get_feature_names_out"):
    encoded_categorical_features = ohe.get_feature_names_out(categorical_features)
else:
    encoded_categorical_features = ohe.get_feature_names(categorical_features)

feature_names = numeric_features + list(encoded_categorical_features)

In [15]:
# Print feature importances
if hasattr(best_rf_model, 'feature_importances_'):
    for score, name in zip(best_rf_model.feature_importances_, feature_names):
        print(f"{round(score, 2)}: {name}")
else:
    print("The best estimator does not have feature_importances_ attribute.")

0.29: WhiteElo
0.61: EloDif
0.0: Opening_name_Caro-Kann defense
0.0: Opening_name_Closed Game, Irregular Responses
0.0: Opening_name_English Opening
0.0: Opening_name_French Defense
0.0: Opening_name_Irregular Openings
0.01: Opening_name_Other
0.0: Opening_name_Queen's Gambit
0.01: Opening_name_Queen's Pawn Game
0.0: Opening_name_Scandinavian Defense (Center-Counter Defense)
0.0: Opening_name_Sicilian defense
0.0: Opening_name_Zukertort Opening
0.01: Time_format_ blitz 
0.01: Time_format_ bullet 
0.01: Time_format_ classical 
0.01: Time_format_ rapid 
0.01: Increment_binary_No
0.01: Increment_binary_Yes


### 0.61: EloDif (Elo Difference): This is by far the most important feature according to the model.
### 0.29: WhiteElo (White's Elo): White's individual Elo rating also has a significant impact, although less than the Elo difference. This suggests that even when the Elo difference is the same, the absolute strength of White can still influence the outcome.
### The remaining features each have a very small, almost negligible importance score (0.01 or less).