# The baseline model
We start by creating a baseline model that our new and improved model can be compared against. First, we import the necessary packages and fix the random seed for reproducibility.

In [21]:
import numpy as np
import pandas as pd
from utils import DataAggregator

In [22]:
# Seed NumPy's global RNG so the random predictions drawn with
# np.random.choice further down are repeatable across runs.
np.random.seed(42)

In [23]:
# Build the project's data helper (from utils) and load the "E0" division.
# NOTE(review): "E0" matches the Div value shown in the preview rows;
# presumably the English top division — confirm against utils.DataAggregator.
data_aggregator = DataAggregator()
data = data_aggregator.get_data(["E0"])
# Preview the first rows; the FTR column is the one the cells below
# use as the actual match result.
data.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,E0,13/08/05,Aston Villa,Bolton,2,2,D,2,2,D,...,16,7,8,0,2,0,0,2.3,3.25,3.0
1,E0,13/08/05,Everton,Man United,0,2,A,0,1,A,...,14,8,6,3,1,0,0,5.0,3.4,1.72
2,E0,13/08/05,Fulham,Birmingham,0,0,D,0,0,D,...,13,6,6,1,2,0,0,2.37,3.25,2.87
3,E0,13/08/05,Man City,West Brom,0,0,D,0,0,D,...,11,3,6,2,3,0,0,1.72,3.4,5.0
4,E0,13/08/05,Middlesbrough,Liverpool,0,0,D,0,0,D,...,11,5,0,2,3,1,0,2.87,3.2,2.4


In [24]:
# Share of each full-time result, looked up by label ("H", "D", "A").
# value_counts() orders its result by frequency, not by label, so unpacking
# it positionally only matches home/draw/away when the outcomes happen to
# rank in exactly that order. Reindexing by label removes that dependency
# and yields 0.0 for an outcome that never occurs in the data.
outcome_shares = data["FTR"].value_counts(normalize=True)
home_percentage, draw_percentage, away_percentage = outcome_shares.reindex(
    ["H", "D", "A"], fill_value=0.0
)

In [25]:
# Report the share of each outcome, one line per outcome.
print(
    f"The number of games won by the home team is {home_percentage:.2%}",
    f"The number of games ending in a draw is {draw_percentage:.2%}",
    f"The number of games won by the away team is {away_percentage:.2%}",
    sep="\n",
)

The number of games won by the home team is 45.95%
The number of games ending in a draw is 29.49%
The number of games won by the away team is 24.56%


Now we can implement the baseline model. Based on the probabilities computed above, this model randomly picks the outcome of each game. This can be done using NumPy's `np.random.choice(array, p=probabilities)`.

In [26]:
# Baseline prediction: draw one random outcome per game, weighted by the
# observed outcome shares.
possible_outcomes = ["H", "D", "A"]
# Probabilities listed in the same order as possible_outcomes.
outcome_probabilities = [home_percentage, draw_percentage, away_percentage]
# Draw all predictions in a single vectorized call instead of looping over
# rows with iterrows() and writing cell by cell with .at, which is slow and
# would write to every row sharing a label if the index had duplicates.
data["prediction"] = np.random.choice(
    possible_outcomes, size=len(data), p=outcome_probabilities
)

In [27]:
# Score the "prediction" column against the actual "FTR" column.
# NOTE(review): calculate_accuracy is defined in utils and not visible here;
# `won` is printed below as a € amount, so it is presumably the betting
# profit/loss of following the predictions — confirm.
accuracy, won = data_aggregator.calculate_accuracy(data, "FTR", "prediction")

In [28]:
# Report accuracy both as a percentage and as a count of games.
total_games = len(data)
correct_games = accuracy * total_games
print(
    f"The accuracy of the model is {accuracy:.2%}\n"
    f"The accuracy of the model is {correct_games:.0f} out of {total_games} games.\n"
)

The accuracy of the model is 35.83%
The accuracy of the model is 3024 out of 8440 games.



In [29]:
# NOTE(review): `won` comes from a single seeded random run, so this is the
# realized result of that run rather than a statistical expectation —
# confirm the intended wording against utils.calculate_accuracy.
print(f"With this model, the expected return on value would be {won:.2f}€")

With this model, the expected return on value would be -4895.83€


For further comparison, let's save the metrics of the model.

In [30]:
# Record the baseline's accuracy and return under the "baseline" label so
# later models can be compared against it.
# NOTE(review): where save_metrics persists the values is defined in utils
# and not visible here.
data_aggregator.save_metrics("baseline", accuracy, won)