# Building a Machine Learning Model to predict Serie A match winners

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
data = pd.read_csv("data.csv", index_col=0)

In [3]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,Match Report,,25.0,8.0,15.2,0.0,0,0,2023,Napoli
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,Match Report,,22.0,5.0,15.3,1.0,0,0,2023,Napoli
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,Match Report,,13.0,2.0,14.7,1.0,0,0,2023,Napoli
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,Match Report,,19.0,7.0,17.7,0.0,0,0,2023,Napoli
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,Match Report,,19.0,7.0,16.1,0.0,0,0,2023,Napoli


There are 38 games each season, 20 teams in Seria A, and we have data for 3 seasons. Thus, we should have 38 * 20 * 3 = 2280 rows in our dataframe. But looking at the dataframe shape, we notice we have two more rows.

In [4]:
data.shape

(2282, 27)

In [5]:
data["Team"].value_counts()

Spezia            115
Hellas Verona     115
Sampdoria         114
Lazio             114
Sassuolo          114
Udinese           114
Torino            114
Napoli            114
Fiorentina        114
Juventus          114
Roma              114
Atalanta          114
Milan             114
Internazionale    114
Bologna           114
Genoa              76
Cagliari           76
Empoli             76
Salernitana        76
Cremonese          38
Lecce              38
Monza              38
Venezia            38
Benevento          38
Crotone            38
Parma              38
Name: Team, dtype: int64

The extra two rows are as a result of a relegation tie-breaker.

In [6]:
data["Round"].value_counts()

Matchweek 1               60
Matchweek 30              60
Matchweek 23              60
Matchweek 24              60
Matchweek 25              60
Matchweek 26              60
Matchweek 27              60
Matchweek 28              60
Matchweek 29              60
Matchweek 31              60
Matchweek 21              60
Matchweek 32              60
Matchweek 33              60
Matchweek 34              60
Matchweek 35              60
Matchweek 36              60
Matchweek 37              60
Matchweek 38              60
Matchweek 22              60
Matchweek 20              60
Matchweek 2               60
Matchweek 10              60
Matchweek 3               60
Matchweek 4               60
Matchweek 5               60
Matchweek 6               60
Matchweek 7               60
Matchweek 8               60
Matchweek 9               60
Matchweek 11              60
Matchweek 19              60
Matchweek 12              60
Matchweek 13              60
Matchweek 14              60
Matchweek 15  

In [7]:
data.loc[data['Round'] == "Relegation tie-breaker"]

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
41,2023-06-11,20:45,Serie A,Relegation tie-breaker,Sun,Home,L,1,3,Hellas Verona,...,Match Report,,16.0,8.0,18.0,1.0,0,1,2023,Spezia
39,2023-06-11,20:45,Serie A,Relegation tie-breaker,Sun,Away,W,3,1,Spezia,...,Match Report,,9.0,5.0,16.8,0.0,0,0,2023,Hellas Verona


Now we need to do some data cleanup before we apply machine learning.

In [8]:
data.dtypes

Date             object
Time             object
Comp             object
Round            object
Day              object
Venue            object
Result           object
GF                int64
GA                int64
Opponent         object
xG              float64
xGA             float64
Poss            float64
Attendance      float64
Captain          object
Formation        object
Referee          object
Match Report     object
Notes            object
Sh              float64
SoT             float64
Dist            float64
FK              float64
PK                int64
PKatt             int64
Season            int64
Team             object
dtype: object

Converting the Date object in the dateframe into a datetime object will make it easier to compute predictors based on that column. 

In [9]:
# convert existing date column to a date time object
data["Date"] = pd.to_datetime(data["Date"])

In [10]:
data.dtypes

Date            datetime64[ns]
Time                    object
Comp                    object
Round                   object
Day                     object
Venue                   object
Result                  object
GF                       int64
GA                       int64
Opponent                object
xG                     float64
xGA                    float64
Poss                   float64
Attendance             float64
Captain                 object
Formation               object
Referee                 object
Match Report            object
Notes                   object
Sh                     float64
SoT                    float64
Dist                   float64
FK                     float64
PK                       int64
PKatt                    int64
Season                   int64
Team                    object
dtype: object

We convert the venue to a numeric value as well.

In [11]:
# convert into categorical data type and then convert that into numbers
data["Venue-Code"] = data["Venue"].astype("category").cat.codes

In [12]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,Venue-Code
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,,25.0,8.0,15.2,0.0,0,0,2023,Napoli,0
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,,22.0,5.0,15.3,1.0,0,0,2023,Napoli,1
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,,13.0,2.0,14.7,1.0,0,0,2023,Napoli,0
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,,19.0,7.0,17.7,0.0,0,0,2023,Napoli,1
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,,19.0,7.0,16.1,0.0,0,0,2023,Napoli,0


We convert the opponent into a numerical value.

In [13]:
data["Opponent-Code"] = data["Opponent"].astype("category").cat.codes

In [14]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Sh,SoT,Dist,FK,PK,PKatt,Season,Team,Venue-Code,Opponent-Code
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,25.0,8.0,15.2,0.0,0,0,2023,Napoli,0,9
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,22.0,5.0,15.3,1.0,0,0,2023,Napoli,1,15
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,13.0,2.0,14.7,1.0,0,0,2023,Napoli,0,7
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,19.0,7.0,17.7,0.0,0,0,2023,Napoli,1,13
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,19.0,7.0,16.1,0.0,0,0,2023,Napoli,0,12


We will look at the time column, and remove the minutes and just keep the hours. Certain teams may play better at certain times of the day.

In [15]:
data["Hour"] = data["Time"].str.replace(":.+", "", regex=True).astype("int")

In [16]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,SoT,Dist,FK,PK,PKatt,Season,Team,Venue-Code,Opponent-Code,Hour
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,8.0,15.2,0.0,0,0,2023,Napoli,0,9,18
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,5.0,15.3,1.0,0,0,2023,Napoli,1,15,18
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,2.0,14.7,1.0,0,0,2023,Napoli,0,7,20
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,7.0,17.7,0.0,0,0,2023,Napoli,1,13,20
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,7.0,16.1,0.0,0,0,2023,Napoli,0,12,20


We will convert the day in which a particular match occurs to a numerical value i.e., 0 - Monday, 1 - Tuesday and so on.

In [17]:
data["Day-Code"] = data["Date"].dt.dayofweek

In [18]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,Season,Team,Venue-Code,Opponent-Code,Hour,Day-Code
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,15.2,0.0,0,0,2023,Napoli,0,9,18,0
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,15.3,1.0,0,0,2023,Napoli,1,15,18,6
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,14.7,1.0,0,0,2023,Napoli,0,7,20,6
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,17.7,0.0,0,0,2023,Napoli,1,13,20,2
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,16.1,0.0,0,0,2023,Napoli,0,12,20,5


We can start training our machine learning model. First we set up our target, that is to be able to predict whether a team wins or not. Win - 1, Loss/Draw = 0.

In [19]:
data["Target"] = (data["Result"] == "W").astype("int")

In [20]:
data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,FK,PK,PKatt,Season,Team,Venue-Code,Opponent-Code,Hour,Day-Code,Target
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,0.0,0,0,2023,Napoli,0,9,18,0,1
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,1.0,0,0,2023,Napoli,1,15,18,6,1
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,1.0,0,0,2023,Napoli,0,7,20,6,0
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,0.0,0,0,2023,Napoli,1,13,20,2,0
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,0.0,0,0,2023,Napoli,0,12,20,5,1


We will be using a Random Forest Classifier. This can pick up non-linearities in the data. By non-linearities, for example, observe the opponent-code. A higher code doesn't imply that the opponent is more difficult than a team with a lower code. A linear model would not be able to pick up the non-linearities.

Random Forest operate on multiple decision trees. This works by first generating bootstrapped datasets, and for each, we individually train a decision tree. We randomly select a subset of features for each tree and use only those features for training the corresponding tree. The resulting target prediction on a test input is then the aggregation of the results on each tree.

Why bootstrapping and feature selection? We are not using the same data in every tree, so it allows us to be less sensitive to our original training data, and the random feature selection helps reduce the correlation between different features.

In [21]:
# n_estimators is the number of individual decision trees we want to train
# min_samples_split is the minimum number of samples required to split an internal node
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

Note the following metrics we will be computing when making predictions:
- Accuracy is a measure of how many predictions your model got correct out of the total number of predictions. It is calculated as: (Number of Correct Predictions) / (Total Number of Predictions).
- Precision is a measure of how many of the positive predictions made by your model were actually correct. It is calculated as: (True Positives) / (True Positives + False Positives).

In [22]:
def makePredictions(data, predictors):
    """
    Input: dataframe containing our data and the column name predictors
    Output: dataframe containing the actual target values and the predicted target values,
            the accuracy and precision of the predicted target values compared
            to the actual target values
    """
    train = data[data["Date"] <= "2023-01-01"]
    test = data[data["Date"] > "2023-01-01"]
    rf.fit(train[predictors], train["Target"])
    predictions = rf.predict(test[predictors])
    results = pd.DataFrame(dict(actual=test["Target"], predicted=predictions), index=test.index)
    accuracy = accuracy_score(test["Target"], predictions)
    precision = precision_score(test["Target"], predictions)
    return results, accuracy, precision

def mergeResultsOn(results, data, fields):
    return results.merge(data[fields], left_index=True, right_index=True)

As our first attempt, let's use the following predictors:
- Venue-Code (Home/Away)
- Opponent-Code
- Hour
- Day-Code

In [23]:
predictors = ["Venue-Code", "Opponent-Code", "Hour", "Day-Code"]
results, accuracy, precision = makePredictions(data, predictors)

In [24]:
results

Unnamed: 0,actual,predicted
21,0,0
22,1,0
23,1,0
25,1,1
26,1,0
...,...,...
36,0,0
37,0,1
38,0,0
39,0,0


To get additional information we merge our results with our original data on certain fields at the corresponding indices. In parcticular we focus on the 'Date', 'Team', 'Opponent' and 'Result' fields.

In [25]:
fields = ["Date", "Team", "Opponent", "Result"]
mergedResults = mergeResultsOn(results, data, fields)
mergedResults

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
15,0,1,2023-01-04,Atalanta,Spezia,D
15,0,1,2022-10-17,Fiorentina,Lecce,D
15,0,1,2022-11-09,Bologna,Inter,L
15,0,1,2022-11-09,Torino,Sampdoria,W
15,0,1,2022-11-10,Monza,Lazio,L
...,...,...,...,...,...,...
55,0,0,2023-06-03,Internazionale,Torino,W
55,0,0,2023-06-04,Juventus,Udinese,W
55,0,0,2023-05-21,Fiorentina,Torino,D
57,1,1,2023-05-27,Fiorentina,Roma,W


Let's check the accuracy and precision of the model.

In [26]:
print(f"The accuracy of the model is: {round(accuracy*100, 2)}%.")
print(f"The precision of the model is: {round(precision*100, 2)}%.")

The accuracy of the model is: 63.42%.
The precision of the model is: 49.14%.


Let's try to observe when our accuracy was high and when it was low.

In [27]:
pd.crosstab(index=results["actual"], columns=results["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,236,59
1,110,57


When we predicted a loss/draw (i.e., 0), we were correct 236 times and we were wrong 110 times, and when we predicted a win (i.e., 1), we were correct 59 times and we were wrong 57 times. This resulted in an accuracy of 63.4%. Note, while accuracy is a useful measure, we are more interested in precision as we want to successfully determine whether a team wins or not, i.e., if it is a 1.

We will improve precision by grouping data by team and then use rolling averages. In other words, suppose AC Milan lost Matchweek 1 to 5, then we are more likely to expect that AC Milan will also lose Matchweek 6. Thus, we want to incorporate this in our machine learning model. For rolling averages, we take into account the following: goals for, goals against, shots taken, shots on target, distance that each shot travels, free kicks, penalty kicks and penalty kick attempts.

In [28]:
def rollingAvg(group, columns, newColumns):
    # Inputs the group i.e., team, columns for which we want to compute rolling averages for, newColumns to which we assign rolling averages
    group = group.sort_values("Date")
    # computes rolling avg of 3 weeks, closed="left" ensures we don't include 'future' data
    rollingStats = group[columns].rolling(5, closed="left").mean()
    group[newColumns] = rollingStats
    # drop any missing values
    group = group.dropna(subset=newColumns)
    return group

In [29]:
# goals for, goals against, shots taken, shots on target, distance that each shot travels, free kicks, penalty kicks and penalty kick attempts
columns = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
newColumns = [f"{c}Rolling" for c in columns]

In [30]:
# we group our data by team i.e., create a dataframe for each team
# we then apply the rollingAvg function to each team
dataRolling = data.groupby("Team").apply(lambda x: rollingAvg(x, columns, newColumns))

In [31]:
dataRolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day-Code,Target,GFRolling,GARolling,ShRolling,SoTRolling,DistRolling,FKRolling,PKRolling,PKattRolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Atalanta,7,2020-10-31,15:00,Serie A,Matchweek 6,Sat,Away,W,2,1,Crotone,...,5,1,3.0,2.4,16.2,6.6,18.02,1.0,0.2,0.2
Atalanta,9,2020-11-08,15:00,Serie A,Matchweek 7,Sun,Home,D,1,1,Inter,...,6,0,2.6,2.2,16.4,6.4,17.36,0.8,0.2,0.2
Atalanta,10,2020-11-21,18:00,Serie A,Matchweek 8,Sat,Away,D,0,0,Spezia,...,5,0,2.0,2.2,16.2,5.6,17.96,1.0,0.2,0.2
Atalanta,12,2020-11-28,20:45,Serie A,Matchweek 9,Sat,Home,L,0,2,Hellas Verona,...,5,0,1.0,1.8,14.2,4.0,17.84,0.8,0.2,0.2
Atalanta,15,2020-12-13,15:00,Serie A,Matchweek 11,Sun,Home,W,3,0,Fiorentina,...,6,1,0.8,1.4,15.6,4.2,17.16,0.8,0.2,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezia,36,2022-05-01,12:30,Serie A,Matchweek 35,Sun,Away,L,1,2,Juventus,...,6,0,0.4,1.8,9.4,2.2,17.32,1.0,0.0,0.0
Venezia,37,2022-05-05,18:00,Serie A,Matchweek 20,Thu,Away,L,1,2,Salernitana,...,3,0,0.6,1.8,8.6,2.6,18.92,1.2,0.0,0.0
Venezia,38,2022-05-08,15:00,Serie A,Matchweek 36,Sun,Home,W,4,3,Bologna,...,6,1,0.8,2.0,7.8,2.8,17.52,1.2,0.0,0.0
Venezia,39,2022-05-14,20:45,Serie A,Matchweek 37,Sat,Away,D,1,1,Roma,...,5,0,1.4,2.2,9.2,3.4,18.50,1.0,0.2,0.4


In [32]:
# we are not interested in the team multi-level heading
dataRolling = dataRolling.droplevel("Team")
# we want unique indices for each row
dataRolling.index = range(dataRolling.shape[0])

In [33]:
dataRolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Day-Code,Target,GFRolling,GARolling,ShRolling,SoTRolling,DistRolling,FKRolling,PKRolling,PKattRolling
0,2020-10-31,15:00,Serie A,Matchweek 6,Sat,Away,W,2,1,Crotone,...,5,1,3.0,2.4,16.2,6.6,18.02,1.0,0.2,0.2
1,2020-11-08,15:00,Serie A,Matchweek 7,Sun,Home,D,1,1,Inter,...,6,0,2.6,2.2,16.4,6.4,17.36,0.8,0.2,0.2
2,2020-11-21,18:00,Serie A,Matchweek 8,Sat,Away,D,0,0,Spezia,...,5,0,2.0,2.2,16.2,5.6,17.96,1.0,0.2,0.2
3,2020-11-28,20:45,Serie A,Matchweek 9,Sat,Home,L,0,2,Hellas Verona,...,5,0,1.0,1.8,14.2,4.0,17.84,0.8,0.2,0.2
4,2020-12-13,15:00,Serie A,Matchweek 11,Sun,Home,W,3,0,Fiorentina,...,6,1,0.8,1.4,15.6,4.2,17.16,0.8,0.2,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2135,2022-05-01,12:30,Serie A,Matchweek 35,Sun,Away,L,1,2,Juventus,...,6,0,0.4,1.8,9.4,2.2,17.32,1.0,0.0,0.0
2136,2022-05-05,18:00,Serie A,Matchweek 20,Thu,Away,L,1,2,Salernitana,...,3,0,0.6,1.8,8.6,2.6,18.92,1.2,0.0,0.0
2137,2022-05-08,15:00,Serie A,Matchweek 36,Sun,Home,W,4,3,Bologna,...,6,1,0.8,2.0,7.8,2.8,17.52,1.2,0.0,0.0
2138,2022-05-14,20:45,Serie A,Matchweek 37,Sat,Away,D,1,1,Roma,...,5,0,1.4,2.2,9.2,3.4,18.50,1.0,0.2,0.4


In [34]:
newResults, newAccuracy, newPrecision = makePredictions(dataRolling, predictors + newColumns)

In [35]:
mergedResults = mergeResultsOn(newResults, dataRolling, fields)

In [36]:
mergedResults

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
86,0,0,2023-01-04,Atalanta,Spezia,D
87,1,0,2023-01-09,Atalanta,Bologna,W
88,1,1,2023-01-15,Atalanta,Salernitana,W
89,0,0,2023-01-22,Atalanta,Juventus,D
90,1,0,2023-01-28,Atalanta,Sampdoria,W
...,...,...,...,...,...,...
2102,1,0,2023-05-08,Udinese,Sampdoria,W
2103,0,0,2023-05-14,Udinese,Fiorentina,L
2104,0,0,2023-05-21,Udinese,Lazio,L
2105,0,0,2023-05-27,Udinese,Salernitana,L


Let's check the accuracy and precision of the model after we apply rolling averages and make use of the additional predictors.

In [37]:
print(f"The accuracy of the new model is: {round(newAccuracy*100, 2)}%. The increase in accuracy from the previous model is {round((newAccuracy-accuracy)*100, 2)}%.")
print(f"The precision of the new model is: {round(newPrecision*100, 2)}%. The increase in precision from the previous model is {round((newPrecision-precision)*100, 2)}%.")

The accuracy of the new model is: 64.72%. The increase in accuracy from the previous model is 1.3%.
The precision of the new model is: 52.33%. The increase in precision from the previous model is 3.19%.


Finally, we will look at how our model performs when looking at both home and away matches.

First normalise the names so they are consistent.

In [38]:
set(mergedResults["Team"].tolist())

{'Atalanta',
 'Bologna',
 'Cremonese',
 'Empoli',
 'Fiorentina',
 'Hellas Verona',
 'Internazionale',
 'Juventus',
 'Lazio',
 'Lecce',
 'Milan',
 'Monza',
 'Napoli',
 'Roma',
 'Salernitana',
 'Sampdoria',
 'Sassuolo',
 'Spezia',
 'Torino',
 'Udinese'}

In [39]:
set(mergedResults["Opponent"].tolist())

{'Atalanta',
 'Bologna',
 'Cremonese',
 'Empoli',
 'Fiorentina',
 'Hellas Verona',
 'Inter',
 'Juventus',
 'Lazio',
 'Lecce',
 'Milan',
 'Monza',
 'Napoli',
 'Roma',
 'Salernitana',
 'Sampdoria',
 'Sassuolo',
 'Spezia',
 'Torino',
 'Udinese'}

Observe there is an inconsistency between names for 'Team' and names for 'Opponent', i.e., Internazionale in 'Team' but Inter in 'Opponent'. Thus, replace all Internazionale instances to Inter.

In [40]:
mergedResults["Team"] = mergedResults["Team"].replace("Internazionale", "Inter")

In [41]:
set(mergedResults["Team"].tolist()) == set(mergedResults["Opponent"].tolist())

True

Now we merge the mergedResults on itself to display home team and away team prediction side by side.

In [42]:
newMergedResults = mergedResults.merge(mergedResults, left_on=["Date", "Team"], right_on=["Date", "Opponent"])

In [43]:
newMergedResults

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y
0,0,0,2023-01-04,Atalanta,Spezia,D,0,0,Spezia,Atalanta,D
1,1,0,2023-01-09,Atalanta,Bologna,W,0,0,Bologna,Atalanta,L
2,1,1,2023-01-15,Atalanta,Salernitana,W,0,0,Salernitana,Atalanta,L
3,0,0,2023-01-22,Atalanta,Juventus,D,0,0,Juventus,Atalanta,D
4,1,0,2023-01-28,Atalanta,Sampdoria,W,0,0,Sampdoria,Atalanta,L
...,...,...,...,...,...,...,...,...,...,...,...
457,1,0,2023-05-08,Udinese,Sampdoria,W,0,0,Sampdoria,Udinese,L
458,0,0,2023-05-14,Udinese,Fiorentina,L,1,1,Fiorentina,Udinese,W
459,0,0,2023-05-21,Udinese,Lazio,L,1,0,Lazio,Udinese,W
460,0,0,2023-05-27,Udinese,Salernitana,L,1,0,Salernitana,Udinese,W


Measure confidence of our model, i.e., when one team was predicted to win and the other team was predicted to lose.

In [44]:
# this gives us the rows where the first team wins and the second team draws/loses
newMergedResults[(newMergedResults["predicted_x"] == 1) & (newMergedResults["predicted_y"] == 0)]

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y
2,1,1,2023-01-15,Atalanta,Salernitana,W,0,0,Salernitana,Atalanta,L
5,0,1,2023-02-04,Atalanta,Sassuolo,L,1,0,Sassuolo,Atalanta,W
30,1,1,2023-02-18,Bologna,Sampdoria,W,0,0,Sampdoria,Bologna,L
32,0,1,2023-03-06,Bologna,Torino,L,1,0,Torino,Bologna,W
40,0,1,2023-05-04,Bologna,Empoli,L,1,0,Empoli,Bologna,W
...,...,...,...,...,...,...,...,...,...,...,...
384,0,1,2023-04-22,Sassuolo,Salernitana,L,1,0,Salernitana,Sassuolo,W
395,0,1,2023-01-22,Spezia,Roma,L,1,0,Roma,Spezia,W
438,0,1,2023-06-03,Torino,Inter,L,1,0,Inter,Torino,W
445,0,1,2023-02-12,Udinese,Sassuolo,D,0,0,Sassuolo,Udinese,D


In [45]:
# we get actual counts 0s and 1s for those rows of confidence of our model
newMergedResults[(newMergedResults["predicted_x"] == 1) & (newMergedResults["predicted_y"] == 0)]["actual_x"].value_counts()

1    41
0    35
Name: actual_x, dtype: int64

In [46]:
print(f"The accuracy of our confidence is thus {round((41/(35+41))*100, 2)}%.")

The accuracy of our confidence is thus 53.95%.
