Importing Libraries

In [1]:
import pandas as pd  # Import Pandas library for data manipulation and analysis
import glob  # Import glob module for file path pattern matching
import math  # Import math module for mathematical functions
import os  # Import os module for operating system-related functions
import numpy as np  # Import NumPy library for numerical operations
import warnings  # Import warnings module to handle warnings
import chardet  # Import chardet module for character encoding detection
from sklearn.model_selection import train_test_split  # Import train_test_split function for data splitting
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier for classification
from sklearn.metrics import accuracy_score  # Import accuracy_score for model evaluation
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for classification
from sklearn.ensemble import GradientBoostingClassifier  # Import GradientBoostingClassifier for classification
import joblib  # Import joblib for model persistence
import requests  # Import requests library for making HTTP requests
from io import StringIO  # Import StringIO for string-based I/O operations
warnings.filterwarnings("ignore")

### Declaring Functions

In [2]:
def download_csv_to_df(url, timeout=30):
    """Download a CSV file over HTTP and load it into a Pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of the CSV file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV, or None when the request fails or the server
        answers with a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "return None" contract as
        # a bad HTTP status, so callers only need to check for None.
        print(f"Failed to download CSV from {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download CSV from {url}")
        return None

    # Convert the CSV content to a Pandas DataFrame
    return pd.read_csv(StringIO(response.text))

In [3]:
def _standing_snapshot(table, team):
    """Return (position, points, goal difference) of `team` in `table`.

    Raises IndexError when `team` has no row in `table`.
    """
    row = table.loc[table['Team'] == team]
    position = row['Position'].values[0]
    points = row['Points'].values[0]
    goal_difference = row['GF'].values[0] - row['GC'].values[0]
    return position, points, goal_difference


def matches_simulator(df, k_factor=20):
    """Replay the matches in `df` and enrich it, in place, with standings and ELO data.

    For every match the pre-match league position, points and goal
    difference of both teams are written to the columns POS_HT, POS_AT,
    P_HT, P_AT, G_HT and G_AT. Afterwards the ELO ratings are computed for
    the whole frame (columns ELO_Home and ELO_Away) and the latest rating of
    each team is stored in the 'ELO' column of the standings table.

    The function reads and mutates the notebook-level `classification`
    DataFrame, which must exist before the call with the columns
    'Team', 'Points', 'Position', 'GF' and 'GC'.

    Parameters
    ----------
    df : pandas.DataFrame
        Fixtures with the columns HomeTeam, AwayTeam, FTHG, FTAG and FTR.
        Modified in place.
    k_factor : float, optional
        K-factor passed to ELO() (default 20).

    Returns
    -------
    None
    """
    enriched_any = False

    # Iterate through each match in the dataframe 'df'
    for i, match in df.iterrows():
        try:
            # Keep the standings as they were before this match
            classification_before = classification.copy()

            # Update the standings after the match
            update_classification(match)

            # Recalculate team rankings up to that point
            classification['Position'] = classification['Points'].rank(method='min', ascending=False)

            # Pre-match attributes of both teams; a team missing from the
            # standings raises IndexError and the match is skipped.
            pos_ht, p_ht, g_ht = _standing_snapshot(classification_before, match['HomeTeam'])
            pos_at, p_at, g_at = _standing_snapshot(classification_before, match['AwayTeam'])
        except IndexError:
            continue

        # Add the new attributes to the 'df' dataframe
        df.loc[i, 'POS_HT'] = pos_ht
        df.loc[i, 'POS_AT'] = pos_at
        df.loc[i, 'P_HT'] = p_ht
        df.loc[i, 'P_AT'] = p_at
        df.loc[i, 'G_HT'] = g_ht
        df.loc[i, 'G_AT'] = g_at
        enriched_any = True

    # The ELO pass always walks the complete frame, so its result does not
    # depend on the current match. Running it once here yields the same
    # final state as re-running it for every match, without the quadratic cost.
    if enriched_any:
        # Add columns for ELO Home and ELO Away ratings
        df['ELO_Home'] = None
        df['ELO_Away'] = None

        # Update ELO scores in the dataset
        rated = ELO(df, k_factor)

        # Get the last ELO values for Home and Away teams
        last_elo_home = rated.groupby('HomeTeam')['ELO_Home'].last()
        last_elo_away = rated.groupby('AwayTeam')['ELO_Away'].last()

        # Update the standings with the latest ELO values, preferring the
        # rating from the team's last home match when there is one.
        classification['ELO'] = classification['Team'].map(last_elo_home).fillna(
            classification['Team'].map(last_elo_away)
        )

In [4]:
def update_classification(match, standings=None):
    """Apply the result of one match to the league standings, in place.

    Parameters
    ----------
    match : mapping or pandas.Series
        Must provide 'HomeTeam', 'AwayTeam', 'FTHG' (home goals) and
        'FTAG' (away goals).
    standings : pandas.DataFrame, optional
        Table with the columns 'Team', 'Points', 'GF' and 'GC'. When omitted
        the notebook-level `classification` table is updated, which is what
        existing callers rely on.

    Returns
    -------
    None

    Notes
    -----
    A match with a missing score is ignored: without this guard it would be
    counted as an away win and would turn the goal columns into NaN.
    """
    if standings is None:
        standings = classification

    # Get the home and away teams and the final score of the match
    home_team = match['HomeTeam']
    away_team = match['AwayTeam']
    fthg = match['FTHG']  # Home team goals
    ftag = match['FTAG']  # Away team goals

    if pd.isna(fthg) or pd.isna(ftag):
        return

    is_home = standings['Team'] == home_team
    is_away = standings['Team'] == away_team

    # Update goals for and against each team
    standings.loc[is_home, 'GF'] += fthg
    standings.loc[is_home, 'GC'] += ftag
    standings.loc[is_away, 'GF'] += ftag
    standings.loc[is_away, 'GC'] += fthg

    # Update points for each team based on the match result
    if fthg > ftag:  # Home team wins
        standings.loc[is_home, 'Points'] += 3
    elif fthg == ftag:  # Draw
        standings.loc[is_home, 'Points'] += 1
        standings.loc[is_away, 'Points'] += 1
    else:  # Away team wins
        standings.loc[is_away, 'Points'] += 3

In [5]:
def ELO(df, k_factor):
    """Walk the matches of `df` in row order and record both teams' ELO ratings.

    Every team starts at 1500. For each match the rating of both teams is
    adjusted by `k_factor * (actual - expected)` and the *post-match*
    ratings are written to the 'ELO_Home' and 'ELO_Away' cells of that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'HomeTeam', 'AwayTeam' and 'FTR' ('H', 'A' or
        anything else, which is treated as a draw). Modified in place.
    k_factor : float
        Maximum rating change per match.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    def win_probability(own_rating, opponent_rating):
        # Expected score of the team rated `own_rating`
        return 1 / (1 + math.pow(10, (opponent_rating - own_rating) / 400))

    def adjusted(rating, expected, actual, k):
        # New rating after comparing the actual score with the expected one
        return rating + k * (actual - expected)

    # (home score, away score) per full-time result; any other value is a draw
    scores_by_result = {'H': (1, 0), 'A': (0, 1)}

    ratings = {}  # current rating of every team seen so far

    for idx, fixture in df.iterrows():
        home = fixture['HomeTeam']
        away = fixture['AwayTeam']

        # Teams enter the table with a rating of 1500
        ratings.setdefault(home, 1500)
        ratings.setdefault(away, 1500)

        before_home = ratings[home]
        before_away = ratings[away]

        score_home, score_away = scores_by_result.get(fixture['FTR'], (0.5, 0.5))

        after_home = adjusted(
            before_home, win_probability(before_home, before_away), score_home, k_factor
        )
        after_away = adjusted(
            before_away, win_probability(before_away, before_home), score_away, k_factor
        )

        ratings[home] = after_home
        ratings[away] = after_away

        # Store the post-match ratings on the row of this match
        df.at[idx, 'ELO_Home'] = after_home
        df.at[idx, 'ELO_Away'] = after_away

    return df

### Match Simulator
A simple demonstration of how the match simulator works for a single season.

In [8]:
# URL of the CSV file. Each CSV file contains one single season
url = "https://www.football-data.co.uk/mmz4281/2324/SP1.csv"

# Download the CSV file from the given URL and store it as a DataFrame
df = download_csv_to_df(url)

# download_csv_to_df returns None on failure; stop here with a clear message
# instead of an opaque "NoneType is not subscriptable" error below.
if df is None:
    raise RuntimeError(f"Could not download {url}")

# Select specific columns from the DataFrame for analysis.
# .copy() makes the selection an independent frame, so the columns that
# matches_simulator adds later are not written to a slice of the raw download.
df = df[['Div',  # Division (e.g., 'SP1' for Spanish La Liga)
         'Date',  # Date of the match
         'HomeTeam',  # Name of the home team
         'AwayTeam',  # Name of the away team
         'FTHG',  # Full-Time Home Team Goals
         'FTAG',  # Full-Time Away Team Goals
         'FTR'    # Full-Time Result (e.g., 'H' for Home Win, 'A' for Away Win, 'D' for Draw)
        ]].copy()

# Display the last 10 rows of the DataFrame
df.tail(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
69,SP1,29/09/2023,Barcelona,Sevilla,1,0,H
70,SP1,30/09/2023,Getafe,Villarreal,0,0,D
71,SP1,30/09/2023,Vallecano,Mallorca,2,2,D
72,SP1,30/09/2023,Girona,Real Madrid,0,3,A
73,SP1,30/09/2023,Sociedad,Ath Bilbao,3,0,H
74,SP1,01/10/2023,Almeria,Granada,3,3,D
75,SP1,01/10/2023,Alaves,Osasuna,0,2,A
76,SP1,01/10/2023,Ath Madrid,Cadiz,3,2,H
77,SP1,01/10/2023,Betis,Valencia,3,0,H
78,SP1,02/10/2023,Las Palmas,Celta,2,1,H


In [10]:
# Get the unique team names. Home teams come first (same order as before);
# teams that so far only appear as the away side are appended, so that their
# matches are not silently skipped by the simulator.
teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Create an initial standings table with columns for team statistics
classification = pd.DataFrame({'Team': teams, 'Points': 0, 'Position': 0, 'GF': 0, 'GC': 0})

# Call the match simulator to enrich the df (it modifies df and classification in place)
matches_simulator(df)

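As shown below, the fixtures DataFrame is now enriched with additional columns: league position (`POS_HT`, `POS_AT`), points (`P_HT`, `P_AT`), goal difference (`G_HT`, `G_AT`) and ELO ratings (`ELO_Home`, `ELO_Away`).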

In [11]:
# Show the last 10 fixtures again, now with the columns added by matches_simulator
df.tail(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,POS_HT,POS_AT,P_HT,P_AT,G_HT,G_AT,ELO_Home,ELO_Away
69,SP1,29/09/2023,Barcelona,Sevilla,1,0,H,3.0,12.0,17.0,7.0,10.0,2.0,1554.056753,1482.445459
70,SP1,30/09/2023,Getafe,Villarreal,0,0,D,11.0,12.0,8.0,7.0,-3.0,-3.0,1492.126758,1482.911256
71,SP1,30/09/2023,Vallecano,Mallorca,2,2,D,7.0,16.0,11.0,6.0,-2.0,-3.0,1507.425909,1484.546197
72,SP1,30/09/2023,Girona,Real Madrid,0,3,A,2.0,3.0,19.0,18.0,10.0,7.0,1543.803056,1554.602525
73,SP1,30/09/2023,Sociedad,Ath Bilbao,3,0,H,6.0,4.0,12.0,14.0,3.0,7.0,1530.219113,1517.751134
74,SP1,01/10/2023,Almeria,Granada,3,3,D,20.0,19.0,2.0,4.0,-10.0,-8.0,1453.60041,1463.197388
75,SP1,01/10/2023,Alaves,Osasuna,0,2,A,13.0,13.0,7.0,7.0,-4.0,-3.0,1470.830359,1490.669121
76,SP1,01/10/2023,Ath Madrid,Cadiz,3,2,H,6.0,10.0,13.0,9.0,10.0,-2.0,1538.951372,1490.139344
77,SP1,01/10/2023,Betis,Valencia,3,0,H,10.0,8.0,9.0,10.0,-5.0,2.0,1509.383004,1487.935855
78,SP1,02/10/2023,Las Palmas,Celta,2,1,H,17.0,17.0,5.0,5.0,-4.0,-4.0,1482.680924,1462.724065


Here you can see the final classification calculated by the match simulator

In [12]:
# Standings table ordered by points, highest first
classification.sort_values(by='Points', ascending=False)

Unnamed: 0,Team,Points,Position,GF,GC,ELO
19,Real Madrid,21,1.0,16,6,1544.321616
14,Barcelona,20,2.0,19,8,1554.056753
13,Girona,19,3.0,18,11,1543.803056
9,Ath Madrid,16,4.0,18,7,1538.951372
2,Sociedad,15,5.0,16,10,1530.219113
4,Ath Bilbao,14,6.0,13,9,1527.981122
15,Betis,12,7.0,10,12,1509.383004
18,Vallecano,12,7.0,9,11,1507.425909
11,Valencia,10,9.0,9,10,1497.891659
12,Osasuna,10,9.0,9,10,1480.664197


### Download, simulate and enrich several seasons

Once we have validated that the simulator works fine, we can extract data from some old seasons and enrich it with the simulator's data output.
Be aware that processing all the DataFrames may take several minutes.

In [None]:
# URLs of the CSV files (one file per league and season)
urls = [
    "https://www.football-data.co.uk/mmz4281/2223/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/2223/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/2122/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/2021/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/1920/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/1920/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/1819/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/1819/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/1718/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/1718/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/1617/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/1617/SP2.csv",
    "https://www.football-data.co.uk/mmz4281/2223/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2223/E1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2122/E1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2021/E1.csv",
    "https://www.football-data.co.uk/mmz4281/1920/E0.csv",
    "https://www.football-data.co.uk/mmz4281/1920/E1.csv",
    "https://www.football-data.co.uk/mmz4281/1819/E0.csv",
    "https://www.football-data.co.uk/mmz4281/1819/E1.csv",
    "https://www.football-data.co.uk/mmz4281/1718/E0.csv",
    "https://www.football-data.co.uk/mmz4281/1718/E1.csv",
    "https://www.football-data.co.uk/mmz4281/1617/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2223/I1.csv",
    "https://www.football-data.co.uk/mmz4281/2223/F1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/I1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/F1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/I1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/F1.csv",
    "https://www.football-data.co.uk/mmz4281/1920/I1.csv",
    "https://www.football-data.co.uk/mmz4281/1920/F1.csv",
]

# List to store the DataFrames
dataframes = []

# Download the CSVs; files that fail to download (None) are left out
for url in urls:
    df = download_csv_to_df(url)
    if df is not None:
        dataframes.append(df)

# Now, 'dataframes' contains a list of Pandas DataFrames with the downloaded CSV data

dataframes_list = []
print("Processed Dataframes:")

# Loop through each dataframe in the list 'dataframes'
for i, df in enumerate(dataframes):
    print(f"DF: {i} / {len(dataframes)} (PROGRESS {i / len(dataframes) * 100:.1f}%)")

    # Select specific columns from the dataframe. .copy() gives the simulator
    # an independent frame to add columns to, instead of a slice of the download.
    df = df[['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']].copy()

    # Get the unique team names: home teams first, then any team that only
    # appears as the away side, so that no match is skipped for a missing team.
    teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

    # Create an initial standings table with columns for team statistics.
    # matches_simulator reads and updates this notebook-level table.
    classification = pd.DataFrame({'Team': teams, 'Points': 0, 'Position': 0, 'GF': 0, 'GC': 0})

    # Replay every match of the season and enrich 'df' in place
    matches_simulator(df)

    # Append the updated 'df' to a list called 'dataframes_list'
    dataframes_list.append(df.iloc[30:])  # avoid the first 30 matches because their ELO points and position are not reliable


At the end of the process, we have 13755 enriched records ready to feed our machine learning model.

In [22]:
# Stack the per-season frames into one DataFrame with a fresh 0..n-1 index
df_concat = pd.concat(dataframes_list, ignore_index=True)

# Number of enriched matches available for training
len(df_concat)

13755

### Model training

In [33]:
# Select the features and the target variable
# NOTE(review): ELO() writes ELO_Home / ELO_Away *after* applying the result of
# the same match, so these two features already carry information about FTR
# (target leakage) and the reported accuracies are likely optimistic.
# Consider using the pre-match ratings instead — TODO confirm intent.
features = ['POS_HT', 'POS_AT', 'P_HT', 'P_AT', 'G_HT', 'G_AT', 'ELO_Home', 'ELO_Away']
target = 'FTR'

# Split the dataset into a training set (80%) and a testing set (20%);
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(df_concat[features], df_concat[target], test_size=0.2, random_state=42)

Classification model as Random Forest

In [34]:
# Create the Random Forest model (100 trees, fixed seed for reproducibility)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate the model's accuracy (share of test matches predicted correctly)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 70.88%


Gradient Boost Algorithm

In [35]:
# Create the Gradient Boosting model. The seed is fixed, like in the
# train/test split and the Random Forest, so that a re-run of the notebook
# reproduces the same model.
model = GradientBoostingClassifier(random_state=42)

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate the model's accuracy (share of test matches predicted correctly)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 69.57%


Logistic Regression Algorithm

In [36]:
# Create the Multinomial Logistic Regression model
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument; with the lbfgs solver, multinomial is reportedly already the
# default for multiclass targets — confirm against the installed version.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate the model's accuracy (share of test matches predicted correctly)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 74.63%


Finally, we have:
- Random Forest Model accuracy: 70.88%
- Gradient Boosting Model accuracy: 69.57%
- Logistic Regression Model accuracy: 74.63%

We therefore choose the Logistic Regression model as our final model because it has the highest accuracy.

Saving both the final model and the concatenation of all DataFrames

In [39]:
# We save the final model in .pkl format.
# `model` is whatever the most recently executed training cell assigned; when
# the notebook is run top to bottom that is the Logistic Regression model.
joblib.dump(model, "Logistic Regression.pkl")

# Save the enriched DataFrame to a CSV file
df_concat.to_csv('enriched_df.csv', index=False)  # set index=False to omit row numbers

In [18]:
# Make predictions on the test dataset with the current `model`
predictions = model.predict(X_test)
# Display the predicted full-time results ('H', 'D' or 'A')
predictions

array(['H', 'D', 'H', ..., 'H', 'H', 'D'], dtype=object)