In [2]:
import pandas as pd
import numpy as np

In [3]:
data =  pd.read_csv("../data/csv/matches.csv")
data.head()

Unnamed: 0,id,player_id,game_id,role,is_new,is_new_role,division,team_avg_sr,opponent_avg_sr,target,cum_wins,cum_matches,win_rate,team_chemistry,opponent_chemistry
0,1,1,1,1,True,True,11,800.0,714.0,False,0,1,0.0,0,0
1,2,2,1,1,True,True,17,800.0,714.0,False,0,1,0.0,0,0
2,3,3,1,2,True,True,11,800.0,714.0,False,0,1,0.0,0,0
3,4,4,1,2,True,True,17,800.0,714.0,False,0,1,0.0,0,0
4,5,5,1,3,True,True,7,800.0,714.0,False,0,1,0.0,0,0


In [7]:
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split, StratifiedKFold)

needed = [
    "role",
    "is_new",
    "is_new_role",
    "division",
    "team_avg_sr",
    "opponent_avg_sr",
    "win_rate",
    "team_chemistry",
    "opponent_chemistry"
]

y = data["target"]

X = data[needed]

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=17
)


In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

X_train_scaled, X_holdout_scaled

(array([[ 1.09392713, -0.34712955, -0.44883056, ..., -0.2202618 ,
          1.11475089, -0.953872  ],
        [ 1.09392713,  2.88076888,  2.22801228, ...,  0.37486268,
         -0.06894833,  0.03795702],
        [ 1.09392713, -0.34712955, -0.44883056, ..., -0.83026439,
          1.38047929, -0.953872  ],
        ...,
        [-0.19583094, -0.34712955, -0.44883056, ...,  0.06386214,
          0.55913697,  1.22815185],
        [-0.19583094, -0.34712955, -0.44883056, ..., -0.98840324,
          0.14846581,  1.62488346],
        [ 1.09392713, -0.34712955, -0.44883056, ..., -0.19225594,
          0.55913697,  1.82324926]]),
 array([[-1.48558902, -0.34712955, -0.44883056, ...,  0.16938492,
          1.13890802, -0.953872  ],
        [ 1.09392713, -0.34712955, -0.44883056, ..., -0.03022205,
         -0.14141971,  0.59999347],
        [-1.48558902, -0.34712955, -0.44883056, ...,  0.86375804,
         -0.02063408,  1.72406636],
        ...,
        [-0.19583094, -0.34712955, -0.44883056, ..., -

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, random_state=42, class_weight="balanced"
)

parameters = {
    "max_features": [1, 2, 4],
    "min_samples_leaf": [3, 5, 7, 9],
    "max_depth": [5, 10, 15],
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

rf_grid_search = GridSearchCV(
    rf, parameters, n_jobs=-1, scoring="roc_auc", cv=skf, verbose=True
)
rf_grid_search = rf_grid_search.fit(X_train_scaled, y_train)

rf_grid_search.best_score_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


np.float64(0.8402812321337935)

In [11]:
pd.DataFrame(
    {"feat": needed, "coef": rf_grid_search.best_estimator_.feature_importances_}
).sort_values(by="coef", ascending=False)

Unnamed: 0,feat,coef
8,opponent_chemistry,0.337448
6,win_rate,0.254612
7,team_chemistry,0.180868
5,opponent_avg_sr,0.076907
4,team_avg_sr,0.073076
3,division,0.043657
0,role,0.014051
1,is_new,0.012682
2,is_new_role,0.0067


In [13]:
from sklearn.metrics import mean_squared_error

print(
    "Mean squared error (test): %.3f"
    % mean_squared_error(y_holdout, rf_grid_search.predict(X_holdout_scaled))
)

Mean squared error (test): 0.264


In [15]:
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=sum(y_train==0)/sum(y_train==1), n_estimators=200)
xgb.fit(X_train_scaled, y_train)

In [17]:
print(
    "Mean squared error (test): %.3f"
    % mean_squared_error(y_holdout, xgb.predict(X_holdout_scaled))
)

Mean squared error (test): 0.173


In [22]:
pd.DataFrame(
    {"feat": needed, "coef": xgb.feature_importances_}
).sort_values(by="coef", ascending=False)

Unnamed: 0,feat,coef
8,opponent_chemistry,0.214115
6,win_rate,0.207074
7,team_chemistry,0.143605
1,is_new,0.122381
4,team_avg_sr,0.09112
5,opponent_avg_sr,0.089364
3,division,0.046145
2,is_new_role,0.043524
0,role,0.042673


In [19]:
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

xgb_grid = GridSearchCV(
    XGBClassifier(
        scale_pos_weight=sum(y_train==0)/sum(y_train==1),
        n_estimators=200
    ),
    params,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1
)

xgb_grid.fit(X_train_scaled, y_train)

print(f"Best ROC AUC: {xgb_grid.best_score_:.4f}")
print(f"Best params: {xgb_grid.best_params_}")

Best ROC AUC: 0.9131
Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'subsample': 1.0}


In [21]:
pd.DataFrame(
    {"feat": needed, "coef": xgb_grid.best_estimator_.feature_importances_}
).sort_values(by="coef", ascending=False)

Unnamed: 0,feat,coef
6,win_rate,0.224088
8,opponent_chemistry,0.219253
7,team_chemistry,0.150601
1,is_new,0.100288
4,team_avg_sr,0.091567
5,opponent_avg_sr,0.086281
3,division,0.045783
2,is_new_role,0.043847
0,role,0.038293


In [24]:
print(
    "Mean squared error (test): %.3f"
    % mean_squared_error(y_holdout, xgb_grid.predict(X_holdout_scaled))
)

Mean squared error (test): 0.179


In [26]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    xgb,
    X_holdout_scaled, 
    y_holdout,
    cv=skf,
    scoring='roc_auc'
)


print(f"Best ROC AUC xgb_grid: {xgb_grid.best_score_:.4f}")
print(f"Best ROC AUC xgb: {cv_scores.mean():.4f}")
print(f"Best ROC AUC rf_grid: {rf_grid_search.best_score_:.4f}")

Best ROC AUC xgb_grid: 0.9131
Best ROC AUC xgb: 0.8493
Best ROC AUC rf_grid: 0.8403


In [28]:
from sklearn.metrics import roc_auc_score

y_holdout_proba = xgb_grid.best_estimator_.predict_proba(X_holdout_scaled)[:, 1]
holdout_auc = roc_auc_score(y_holdout, y_holdout_proba)
print(f"Holdout ROC AUC: {holdout_auc:.4f}")

Holdout ROC AUC: 0.9215
