In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Selected stats

In [2]:
# Load the match table from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code if the file is untrusted —
# confirm this file is produced by the project's own pipeline.
df = pd.read_pickle("../Datasets/matches_full.pkl")
df.head()

Unnamed: 0,date,comp,round,venue,result,gf,ga,opponent,xg,xga,...,crspa_average_opponent,tkl_average_opponent,tklw_average_opponent,blocks_average_opponent,int_average_opponent,clr_average_opponent,touches_average_opponent,att_y_average_opponent,succ_average_opponent,succ%_average_opponent
0,2019-09-20,Bundesliga,Matchweek 5,Away,L,1.0,2.0,Schalke 04,0.8,1.0,...,1.75,20.0,13.75,14.5,11.75,18.0,567.25,13.5,8.25,63.775
1,2019-09-20,Bundesliga,Matchweek 5,Home,W,2.0,1.0,Mainz 05,1.0,0.8,...,1.5,17.75,11.25,11.25,11.5,21.5,528.75,15.25,7.75,49.675
2,2019-09-21,Bundesliga,Matchweek 5,Away,D,1.0,1.0,Freiburg,1.0,1.3,...,2.0,14.75,8.25,11.75,6.5,19.25,505.0,16.0,9.5,59.925
3,2019-09-21,Bundesliga,Matchweek 5,Home,L,0.0,3.0,RB Leipzig,0.7,1.5,...,1.25,19.75,11.5,11.75,13.5,26.0,632.75,18.25,12.0,65.65
4,2019-09-21,Bundesliga,Matchweek 5,Away,W,3.0,0.0,Werder Bremen,1.5,0.7,...,1.25,11.5,8.0,7.0,5.75,15.75,650.0,9.5,6.0,63.825


In [3]:
# Dimensions of the loaded table as (rows, columns).
df.shape

(2706, 145)

In [4]:
# Hold-out split: the first ~70% of rows train the model, the remainder tests it.
# Rows are taken in their stored order (no shuffling).
# NOTE(review): this is only a chronological split if `df` is sorted by date — confirm.
#
# Positional slicing (`iloc`) is used instead of label slicing (`loc`) so the
# split does not depend on `df` carrying a default 0..n-1 integer index.
# `loc` includes its end label, hence the +1: the boundary reproduces the row
# counts the label-based slice gives on a default index.
split_at = int(df.shape[0] * 0.7) + 1
train = df.iloc[:split_at]
test = df.iloc[split_at:]

In [5]:
# selected stats from team
# Team-side columns used as model inputs: the `*_average` statistics followed
# by the encoded venue and team columns.
selected_stats_from_team = [
    'gf_average',
    'ga_average',
    'poss_average',
    'sot%_average',
    'saves_average',
    'stp_average',
    '#opa_average',
    'kp_average',
    'crspa_average',
    'tkl_average',
    'blocks_average',
    'int_average',
    'att_y_average',
    'succ%_average',
    'venue_encoded',
    'team_encoded',
]

In [6]:
# Build the full predictor list: every team-side column followed by its
# `_opponent`-suffixed counterpart.
selected_stats_from_opponent = ["{}_opponent".format(stat) for stat in selected_stats_from_team]
predictors = [*selected_stats_from_team, *selected_stats_from_opponent]
# Drop the opponent-side venue column from the predictors only;
# `selected_stats_from_opponent` still contains it.
# NOTE(review): presumably redundant with `venue_encoded` — confirm.
predictors.remove('venue_encoded_opponent')
predictors

['gf_average',
 'ga_average',
 'poss_average',
 'sot%_average',
 'saves_average',
 'stp_average',
 '#opa_average',
 'kp_average',
 'crspa_average',
 'tkl_average',
 'blocks_average',
 'int_average',
 'att_y_average',
 'succ%_average',
 'venue_encoded',
 'team_encoded',
 'gf_average_opponent',
 'ga_average_opponent',
 'poss_average_opponent',
 'sot%_average_opponent',
 'saves_average_opponent',
 'stp_average_opponent',
 '#opa_average_opponent',
 'kp_average_opponent',
 'crspa_average_opponent',
 'tkl_average_opponent',
 'blocks_average_opponent',
 'int_average_opponent',
 'att_y_average_opponent',
 'succ%_average_opponent',
 'team_encoded_opponent']

# GridSearchCV

In [7]:
# Base estimator and hyperparameter grid for the grid search.
# random_state is fixed so that re-running the notebook rebuilds the same
# forests; without it every run draws different bootstrap samples and feature
# subsets, and the search results are not reproducible.
rf = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': [50, 100, 200, 300],
    'min_samples_split': [2, 5, 10, 15, 20],
    'max_features': ['sqrt', 'log2', None],
}

In [None]:
# Exhaustive search over every combination in `parameters`, scored with the
# estimator's default metric under the default cross-validation scheme.
# n_jobs=-1 runs the candidate fits on all available CPU cores; the fits are
# independent of each other, so this only changes the wall-clock time.
clf = GridSearchCV(rf, param_grid=parameters, n_jobs=-1)
clf.fit(train[predictors], train['result_encoded'])

In [None]:
# Best hyperparameter combination found by the grid search.
clf.best_params_

In [None]:
# Mean cross-validated score achieved by the best hyperparameter combination.
clf.best_score_

In [None]:
# Rows of the cross-validation results whose test score ranks first
# (more than one row is shown when candidates tie for rank 1).
# The results table is built once and filtered through a callable, rather than
# constructing the same DataFrame twice.
pd.DataFrame(clf.cv_results_).loc[lambda results: results['rank_test_score'] == 1]

# Evaluation

In [None]:
# Refit a forest with fixed hyperparameters on the training rows, then compare
# accuracy on the held-out rows with accuracy on the training rows.
# NOTE(review): n_estimators / min_samples_split are hard-coded here rather than
# read from `clf.best_params_` — presumably copied from the grid search; confirm.
# random_state is fixed so the reported numbers are reproducible across runs.
rf = RandomForestClassifier(n_estimators=300, min_samples_split=10, random_state=42)
rf.fit(train[predictors], train['result_encoded'])

# Class probabilities and hard class predictions for the test rows.
prob_preds = rf.predict_proba(test[predictors])
preds = rf.predict(test[predictors])

print("Accuracy on test set", accuracy_score(test['result_encoded'], preds))

# Training accuracy for comparison; a large gap to the test accuracy
# indicates overfitting.
preds_on_training = rf.predict(train[predictors])
print("Accuracy on training set", accuracy_score(train['result_encoded'], preds_on_training))

In [None]:
# Confusion matrix on the test rows: rows are true classes, columns are
# predicted classes.
# The class order is pinned explicitly so the matrix is always 3x3 and the tick
# labels always line up, even if a class never occurs in the test rows.
# Encoding of `result_encoded` per the notebook's legend: Draw = 0, Lose = 1, Win = 2.
class_ids = [0, 1, 2]
class_names = ['Draw', 'Lose', 'Win']
cm = confusion_matrix(test['result_encoded'], preds, labels=class_ids)

# Visualize the confusion matrix with named classes on both axes so the figure
# can be read without cross-referencing the encoding.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

- Win = 2
- Draw = 0
- Lose = 1

In [None]:
# Report precision, recall and F1 on the test rows under each averaging scheme.
metrics = [precision_score, recall_score, f1_score]
averages = ['micro', 'macro', 'weighted']

for metric in metrics:
    # Display name derived from the function name, e.g. precision_score -> "precision";
    # it is the same for every averaging mode, so compute it once per metric.
    name = metric.__name__.replace("_score","")
    for average in averages:
        score = metric(test['result_encoded'], preds, average=average)
        print(f"{name} of the model with average = {average}:",  score, "\n")

# Feature importance

In [None]:
# Importance score of each predictor, in the column order the forest was fitted on.
importances = rf.feature_importances_

# Positions of the predictors ordered from most to least important
# (ascending argsort, reversed).
indices = importances.argsort()[::-1]

In [None]:
# Print the predictors from most to least important.
# `indices` holds column positions sorted by descending importance, so both the
# name and the score must be looked up through it. Indexing `predictors` by the
# rank itself would pair each score with the wrong feature name.
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. {predictors[feature_idx]} ({importances[feature_idx]})")