In [57]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
# Read the match dataset from the CSV one directory above the notebook
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../match_data.csv')
df.head(n=5)

Unnamed: 0,Date,Team1,Team2,Score,Team1_Position,Team1_Win_Percentage,Team2_Position,Team2_Win_Percentage,Team1_Player1_Rating,Team1_Player1_Kills_Per_Round,...,Team2_Player3_Deaths_Per_Round,Team2_Player3_Headshots,Team2_Player4_Rating,Team2_Player4_Kills_Per_Round,Team2_Player4_Deaths_Per_Round,Team2_Player4_Headshots,Team2_Player5_Rating,Team2_Player5_Kills_Per_Round,Team2_Player5_Deaths_Per_Round,Team2_Player5_Headshots
0,28/03/2024,spirit,faze,0,4,79.2,1,72.4,1.03,0.63,...,0.63,54.0,1.12,0.71,0.6,52.5,1.18,0.74,0.59,35.1
1,28/03/2024,spirit,natus_vincere,1,4,79.2,2,77.3,1.03,0.63,...,0.65,65.9,1.16,0.73,0.65,51.2,1.17,0.74,0.57,38.4
2,28/03/2024,spirit,imperial,1,4,79.2,18,72.0,1.03,0.63,...,0.64,48.6,1.14,0.73,0.61,52.2,1.11,0.69,0.66,53.5
3,28/03/2024,spirit,cloud9,1,4,79.2,7,63.0,1.03,0.63,...,0.65,35.8,1.09,0.65,0.67,48.0,1.12,0.68,0.57,45.9
4,28/03/2024,spirit,metizport,1,4,79.2,30,62.0,1.03,0.63,...,0.66,55.2,1.1,0.68,0.68,55.5,1.09,0.66,0.66,52.3


In [22]:
def standardize_features(X):
    """Z-score the columns of ``X``: subtract the mean, divide by the std.

    Both statistics are taken along axis 0 with ``np.mean`` / ``np.std``.

    Columns whose standard deviation is exactly zero (constant features)
    are only centred, so they come out as all zeros. Dividing by the raw
    zero would instead fill the column with NaN (0 / 0) and silently
    corrupt the feature matrix.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Numeric data with one sample per row.

    Returns
    -------
    Same kind of object as ``X - np.mean(X, axis=0)``, holding the
    standardized values.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # Adding the boolean mask turns every exact-zero std into 1.0 and leaves
    # all other entries (including NaN) untouched. It works for scalars,
    # ndarrays and pandas Series alike, and keeps Series labels for alignment.
    scale = sigma + (sigma == 0)
    return (X - mu) / scale

In [23]:
# Feature matrix: every column except the date, the two team names and
# 'Score' (which a later cell uses as the target).
X = df.drop(
    columns=[
        'Date',
        'Team1',
        'Team2',
        'Score',
    ]
)
X.head(n=5)

Unnamed: 0,Team1_Position,Team1_Win_Percentage,Team2_Position,Team2_Win_Percentage,Team1_Player1_Rating,Team1_Player1_Kills_Per_Round,Team1_Player1_Deaths_Per_Round,Team1_Player1_Headshots,Team1_Player2_Rating,Team1_Player2_Kills_Per_Round,...,Team2_Player3_Deaths_Per_Round,Team2_Player3_Headshots,Team2_Player4_Rating,Team2_Player4_Kills_Per_Round,Team2_Player4_Deaths_Per_Round,Team2_Player4_Headshots,Team2_Player5_Rating,Team2_Player5_Kills_Per_Round,Team2_Player5_Deaths_Per_Round,Team2_Player5_Headshots
0,4,79.2,1,72.4,1.03,0.63,0.64,56.1,1.21,0.74,...,0.63,54.0,1.12,0.71,0.6,52.5,1.18,0.74,0.59,35.1
1,4,79.2,2,77.3,1.03,0.63,0.64,56.1,1.21,0.74,...,0.65,65.9,1.16,0.73,0.65,51.2,1.17,0.74,0.57,38.4
2,4,79.2,18,72.0,1.03,0.63,0.64,56.1,1.21,0.74,...,0.64,48.6,1.14,0.73,0.61,52.2,1.11,0.69,0.66,53.5
3,4,79.2,7,63.0,1.03,0.63,0.64,56.1,1.21,0.74,...,0.65,35.8,1.09,0.65,0.67,48.0,1.12,0.68,0.57,45.9
4,4,79.2,30,62.0,1.03,0.63,0.64,56.1,1.21,0.74,...,0.66,55.2,1.1,0.68,0.68,55.5,1.09,0.66,0.66,52.3


In [24]:
# Replace the raw feature frame with its z-scored version.
# NOTE(review): the mean/std are computed over all rows here, i.e. before
# the train/test split performed in a later cell.
X = X.pipe(standardize_features)
X.head(n=5)

Unnamed: 0,Team1_Position,Team1_Win_Percentage,Team2_Position,Team2_Win_Percentage,Team1_Player1_Rating,Team1_Player1_Kills_Per_Round,Team1_Player1_Deaths_Per_Round,Team1_Player1_Headshots,Team1_Player2_Rating,Team1_Player2_Kills_Per_Round,...,Team2_Player3_Deaths_Per_Round,Team2_Player3_Headshots,Team2_Player4_Rating,Team2_Player4_Kills_Per_Round,Team2_Player4_Deaths_Per_Round,Team2_Player4_Headshots,Team2_Player5_Rating,Team2_Player5_Kills_Per_Round,Team2_Player5_Deaths_Per_Round,Team2_Player5_Headshots
0,-1.051453,1.65133,-1.264441,1.042065,-0.231966,-0.272536,-0.313153,0.900032,1.167393,0.97813,...,-0.208777,0.361315,0.501169,0.620592,-1.28011,0.208757,0.64147,0.585975,-1.200167,-1.82837
1,-1.051453,1.65133,-1.206761,1.412826,-0.231966,-0.272536,-0.313153,0.900032,1.167393,0.97813,...,0.254064,1.548783,1.014442,0.981164,-0.023992,0.052045,0.54632,0.585975,-1.644802,-1.443074
2,-1.051453,1.65133,-0.283884,1.011798,-0.231966,-0.272536,-0.313153,0.900032,1.167393,0.97813,...,0.022644,-0.177535,0.757806,0.981164,-1.028887,0.172593,-0.024579,-0.08294,0.356056,0.319949
3,-1.051453,1.65133,-0.918362,0.330808,-0.231966,-0.272536,-0.313153,0.900032,1.167393,0.97813,...,0.254064,-1.454811,0.116215,-0.461122,0.478456,-0.333706,0.070571,-0.216723,-1.644802,-0.5674
4,-1.051453,1.65133,0.408274,0.255143,-0.231966,-0.272536,-0.313153,0.900032,1.167393,0.97813,...,0.485485,0.48106,0.244533,0.079735,0.72968,0.570399,-0.214879,-0.48429,0.356056,0.179842


In [25]:
# Target vector: the 'Score' column of the loaded frame.
y = df.loc[:, 'Score']

In [26]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.4,
)

In [6]:
# Round 1 search space.
# In top-to-bottom order this grid is overwritten by the later rounds
# before any search object reads `param_grid`.
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.05],
    gamma=[0, 0.25, 1.0],
    reg_lambda=[0.1, 1.0, 10.0],
    scale_pos_weight=[1, 3, 5],
)

In [27]:
# Round 2 search space: learning rate and gamma are pinned to one value,
# while depth, L2 regularisation and class weight are still varied.
# In top-to-bottom order this grid is overwritten by the later rounds
# before any search object reads `param_grid`.
param_grid = dict(
    max_depth=[1, 2, 3],
    learning_rate=[0.1],
    gamma=[1.0],
    reg_lambda=[5.0, 10.0, 15.0],
    scale_pos_weight=[1, 2, 3],
)

In [30]:
# Round 3 search space: every hyper-parameter is pinned to a single value,
# so a search over this grid evaluates exactly one configuration.
# In top-to-bottom order this grid is overwritten by round 4 before any
# search object reads `param_grid`.
param_grid = dict(
    max_depth=[2],
    learning_rate=[0.1],
    gamma=[1.0],
    reg_lambda=[5.0],
    scale_pos_weight=[3],
)

In [42]:
# Round 4 search space: three candidate values per hyper-parameter
# (3**5 = 243 combinations). This is the last assignment to `param_grid`,
# so in top-to-bottom order it is the grid the search objects below see.
param_grid = dict(
    max_depth=[1, 2, 3],
    learning_rate=[0.05, 0.1, 0.2],
    gamma=[0, 0.5, 1.0],
    reg_lambda=[1.0, 5.0, 10.0],
    scale_pos_weight=[1, 2, 3],
)

In [46]:
# Exhaustive search over `param_grid`, scored by ROC-AUC with 3-fold CV.
# The estimator does not set n_estimators, so each candidate is trained
# with the library's default number of boosting rounds.
optimal_params = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        seed=42,
        eval_metric='aucpr',
        use_label_encoder=False,
    ),
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    verbose=0,
)

In [58]:
# Randomised counterpart of the grid search: draws 100 candidate
# configurations from `param_grid`, scored by ROC-AUC with 3-fold CV.
# `random_state` fixes which candidates are sampled.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        seed=42,
        eval_metric='aucpr',
        use_label_encoder=False,
    ),
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    verbose=1,
)

In [47]:
# Run the exhaustive grid search on the training split only, then show
# the winning hyper-parameter combination.
optimal_params.fit(X=X_train, y=y_train)

print(optimal_params.best_params_)

# Ideas for fighting overfitting that are not part of the grid yet:
#   - colsample_bytree
#   - subsample

{'gamma': 0, 'learning_rate': 0.05, 'max_depth': 2, 'reg_lambda': 5.0, 'scale_pos_weight': 2}


In [59]:
# Run the randomised search on the training split only, then show the
# best sampled hyper-parameter combination.
random_search.fit(X=X_train, y=y_train)
print(random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'scale_pos_weight': 2, 'reg_lambda': 5.0, 'max_depth': 2, 'learning_rate': 0.05, 'gamma': 1.0}


In [52]:
# Score the grid-search winner with 5-fold CV (ROC-AUC) on the training split.
# NOTE(review): this uses n_estimators=10000, whereas the search estimator
# never set n_estimators; confirm the tuned parameters are meant to be
# evaluated with a different number of boosting rounds.
model = XGBClassifier(
    **optimal_params.best_params_,
    n_estimators=10000,
    eval_metric='logloss',
    use_label_encoder=False,
)
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)
print("Average ROC-AUC: ", scores.mean())

Average ROC-AUC:  0.6006167195865753


In [73]:
# Untuned reference model: only learning_rate and n_estimators are set
# explicitly; none of the searched hyper-parameters are passed.
# Scored with 5-fold CV (ROC-AUC) on the training split.
model = XGBClassifier(
    n_estimators=10000,
    learning_rate=0.1,
    eval_metric='logloss',
    use_label_encoder=False,
)
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)
print("Average ROC-AUC: ", scores.mean())

Average ROC-AUC:  0.5995876235388842


In [61]:
# Score the randomised-search winner with 5-fold CV (ROC-AUC) on the
# training split.
# NOTE(review): this uses n_estimators=10000, whereas the search estimator
# never set n_estimators; confirm the tuned parameters are meant to be
# evaluated with a different number of boosting rounds.
model = XGBClassifier(
    **random_search.best_params_,
    n_estimators=10000,
    eval_metric='logloss',
    use_label_encoder=False,
)
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)
print("Average ROC-AUC: ", scores.mean())

Average ROC-AUC:  0.6627257122766181


In [74]:
# Fit whichever `model` was assigned last on the full training split and
# list its per-feature importances, largest first.
# NOTE(review): the saved execution counts show this cell (In [74]) ran
# directly after the cell numbered In [73], so the stored output probably
# came from that model rather than from the cell placed just above this one.
# Re-run top-to-bottom to confirm.
model.fit(X_train, y_train)
importances = pd.Series(
    data=model.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
print(importances)

Team1_Position                    0.044786
Team2_Position                    0.037263
Team1_Player2_Kills_Per_Round     0.035212
Team1_Player5_Deaths_Per_Round    0.029955
Team2_Player1_Deaths_Per_Round    0.029735
Team1_Player2_Deaths_Per_Round    0.028845
Team1_Player4_Rating              0.027070
Team1_Player2_Rating              0.025448
Team2_Win_Percentage              0.025102
Team1_Player3_Rating              0.024967
Team2_Player3_Kills_Per_Round     0.024909
Team2_Player5_Headshots           0.024850
Team1_Player5_Kills_Per_Round     0.024608
Team2_Player4_Kills_Per_Round     0.023747
Team2_Player3_Headshots           0.023720
Team1_Win_Percentage              0.023605
Team2_Player2_Deaths_Per_Round    0.023577
Team2_Player3_Rating              0.023314
Team1_Player3_Deaths_Per_Round    0.022825
Team1_Player1_Headshots           0.022743
Team1_Player4_Kills_Per_Round     0.022725
Team1_Player1_Kills_Per_Round     0.022456
Team2_Player4_Rating              0.021997
Team1_Playe

In [75]:
# Predict labels for the held-out test split with the most recently
# fitted `model`.
y_pred = model.predict(X_test)

In [76]:
# Per-class precision/recall/F1 on the test split, followed by the
# confusion matrix as a labelled table (rows = actual, columns = predicted).
print(classification_report(y_true=y_test, y_pred=y_pred))
confusion_matr = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix_df = pd.DataFrame(
    data=confusion_matr,
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)
print(conf_matrix_df)

              precision    recall  f1-score   support

           0       0.57      0.52      0.54       520
           1       0.54      0.59      0.56       502

    accuracy                           0.55      1022
   macro avg       0.55      0.55      0.55      1022
weighted avg       0.55      0.55      0.55      1022

                 Predicted Negative  Predicted Positive
Actual Negative                 272                 248
Actual Positive                 208                 294


In [77]:
# Same report and confusion matrix as the previous cell, plus each cell
# of the matrix expressed as a rounded percentage of all test samples.
print(classification_report(y_true=y_test, y_pred=y_pred))
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix_df = pd.DataFrame(
    data=confusion_m,
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)
print(conf_matrix_df)

# Row-major flattening of the 2x2 matrix visits [0,0], [0,1], [1,0], [1,1],
# which lines up with the label order below.
print({
    label: round(100*count/len(y_test))
    for label, count in zip(
        ('true negatives', 'false positives', 'false negatives', 'true positives'),
        confusion_m.ravel(),
    )
})

              precision    recall  f1-score   support

           0       0.57      0.52      0.54       520
           1       0.54      0.59      0.56       502

    accuracy                           0.55      1022
   macro avg       0.55      0.55      0.55      1022
weighted avg       0.55      0.55      0.55      1022

                 Predicted Negative  Predicted Positive
Actual Negative                 272                 248
Actual Positive                 208                 294
{'true negatives': 27, 'false positives': 24, 'false negatives': 20, 'true positives': 29}
