# Model testing (Random Forest)
In this notebook we try to train a model using random forest algorithm and the preprocessed data

In [23]:
# Importing the pickle package for the storing of the model so that in future execution 
# we will not be forced to recreate the model losing time.
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


df = pd.read_csv("../data/preprocessed/matches.csv")

print(pd.DataFrame(df).iloc[0])
model = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)


predictors = ["venue_code", "opp_code", "hour", "day_code"]
targets = "target"


X_train = (df[df["date"] < '2022-01-01'])[predictors]
X_test = (df[df["date"] > "2022-01-01"])[predictors]

Y_train = (df[df["date"] < "2022-01-01"])[targets]
Y_test = (df[df["date"] > "2022-01-01"])[targets]

model.fit(X_train[predictors], Y_train)


date                 2021-08-15
time                      16:30
round               Matchweek 1
day                         Sun
venue                      Away
result                        L
gf                          0.0
ga                          1.0
opponent              Tottenham
xg                          1.9
xga                         1.3
poss                       64.0
attendance              58262.0
captain             Fernandinho
formation                 4-3-3
referee          Anthony Taylor
match report       Match Report
sh                         18.0
sot                         4.0
dist                       16.9
fk                          1.0
pk                          0.0
pkatt                       0.0
season                     2022
team            Manchester City
target                        0
venue_code                    0
opp_code                     18
hour                         16
day_code                      6
Name: 0, dtype: object


# Prediction and metrics/accuracy section

In [28]:
from sklearn.metrics import accuracy_score

# Generating model predictions.
preds = model.predict(X_test[predictors])

error = accuracy_score(Y_test, preds)

print("The error with the given model is", error)

The error with the given model is 0.6123188405797102


# Visualizing the prediction formats

In [29]:
combination = pd.DataFrame(dict(actual = Y_test, predicted = preds))
pd.crosstab(index=combination["actual"], columns=combination["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


# Problem
--> Notice that the error is actually not bad, but since we want to predict the win (predicted = 1, actual = 1) we had poor performance, in the code to detect this possible erro in the binary classification problem we use precision score package.

In [31]:
from sklearn.metrics import precision_score

precision = precision_score(Y_test, preds)

print("The precisione score is:", precision)

The precisione score is: 0.4745762711864407


# Team grouping test
In This section we try to do the  grouping by teams to detect score based on the team playing.

In [37]:
grouped_matches = df.groupby("team")
group = grouped_matches.get_group("Manchester City").sort_values("date")
print(group)

def rolling_averages(group, cols, new_cols):
    """
        This function is used to compute the rolling avarages of the groups.
        --> rolling avarages means to take the elements in some columns remove them and save the mean.
    """
    # Sorting the group elements by date.
    group = group.sort_values("date")
    
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group


cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

           date   time         round  day venue result   gf   ga  \
629  2020-09-21  20:15   Matchweek 2  Mon  Away      W  3.0  1.0   
630  2020-09-27  16:30   Matchweek 3  Sun  Home      L  2.0  5.0   
631  2020-10-03  17:30   Matchweek 4  Sat  Away      D  1.0  1.0   
632  2020-10-17  17:30   Matchweek 5  Sat  Home      W  1.0  0.0   
633  2020-10-24  12:30   Matchweek 6  Sat  Away      D  1.0  1.0   
..          ...    ...           ...  ...   ...    ...  ...  ...   
28   2022-03-14  20:00  Matchweek 29  Mon  Away      D  0.0  0.0   
29   2022-04-02  15:00  Matchweek 31  Sat  Away      W  2.0  0.0   
30   2022-04-10  16:30  Matchweek 32  Sun  Home      D  2.0  2.0   
31   2022-04-20  20:00  Matchweek 30  Wed  Home      W  3.0  0.0   
32   2022-04-23  15:00  Matchweek 34  Sat  Home      W  5.0  1.0   

           opponent   xg  ...   fk   pk  pkatt season             team target  \
629          Wolves  1.9  ...  2.0  1.0    1.0   2021  Manchester City      1   
630  Leicester City  

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
632,2020-10-17,17:30,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,1.5,...,17,5,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
633,2020-10-24,12:30,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,1.1,...,12,5,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
634,2020-10-31,12:30,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,1.5,...,12,5,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
635,2020-11-08,16:30,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,1.6,...,16,6,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
636,2020-11-21,17:30,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,1.3,...,17,5,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,2022-03-14,20:00,Matchweek 29,Mon,Away,D,0.0,0.0,Crystal Palace,2.3,...,20,0,2.333333,1.333333,19.000000,7.000000,15.366667,0.333333,0.333333,0.333333
29,2022-04-02,15:00,Matchweek 31,Sat,Away,W,2.0,0.0,Burnley,1.8,...,15,5,1.666667,0.333333,18.333333,7.333333,16.000000,0.333333,0.000000,0.000000
30,2022-04-10,16:30,Matchweek 32,Sun,Home,D,2.0,2.0,Liverpool,2.0,...,16,6,2.000000,0.333333,20.000000,6.666667,16.133333,0.333333,0.000000,0.000000
31,2022-04-20,20:00,Matchweek 30,Wed,Home,W,3.0,0.0,Brighton,1.2,...,20,2,1.333333,0.666667,15.666667,4.666667,16.700000,0.333333,0.000000,0.000000


In [54]:
matches_rolling = df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.head()

matches_rolling.to_csv("../data/preprocessed/matches_rolling.csv")

  matches_rolling = df.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [56]:
# Making new prediction with the new processed data. 

predictors = predictors + new_cols

X_train = (matches_rolling[matches_rolling["date"] < '2022-01-01'])[predictors]
X_test = (matches_rolling[matches_rolling["date"] > "2022-01-01"])[predictors]

Y_train = (matches_rolling[matches_rolling["date"] < "2022-01-01"])[targets]
Y_test = (matches_rolling[matches_rolling["date"] > "2022-01-01"])[targets]


model_rolling = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)
model_rolling.fit(X_train[predictors], Y_train)

# Generating model predictions.
preds = model_rolling.predict(X_test[predictors])

error = accuracy_score(Y_test, preds)

combination = pd.DataFrame(dict(actual = Y_test, predicted = preds))
combination = combination.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

print("The new error with the given model is", error)

print("Printing the result of the training on the rolling avarages :", combination.head(10))

The new error with the given model is 0.6159420289855072
Printing the result of the training on the rolling avarages :     actual  predicted        date     team        opponent result
55       0          1  2022-01-23  Arsenal         Burnley      D
56       1          0  2022-02-10  Arsenal          Wolves      W
57       1          0  2022-02-19  Arsenal       Brentford      W
58       1          1  2022-02-24  Arsenal          Wolves      W
59       1          1  2022-03-06  Arsenal         Watford      W
60       1          1  2022-03-13  Arsenal  Leicester City      W
61       0          1  2022-03-16  Arsenal       Liverpool      L
62       1          0  2022-03-19  Arsenal     Aston Villa      W
63       0          0  2022-04-04  Arsenal  Crystal Palace      L
64       0          0  2022-04-09  Arsenal        Brighton      L


# Saving to file
In this section we store the model produced to a pickles file, so that in case we need to do a computation with it we can just load it instead of retraining. 

In [59]:
# Save the model to a file in pickle format.
with open('../models/random_forest.pkl', 'wb') as f:
    pickle.dump(model, f)


with open('../models/random_forest_rolling.pkl', 'wb') as f:
    pickle.dump(model_rolling, f)
