### Football Analyser

The goal is to predict the outcome of a match between 2 teams from the Top Five Leagues (2024/25 Edition)

We will try to do so using Machine Learning with Decision Trees.

In [17]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the latest version directly from Kaggle
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "hubertsidorowicz/football-players-stats-2024-2025",
    "players_data_light-2024_2025.csv"  # Specify the exact file name
)


  df = kagglehub.load_dataset(


### Just getting to know the data

In [18]:
df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att (GK),Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist
0,1,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,24.0,2000.0,3,1,...,,,,,,,,,,
1,2,Max Aarons,eng ENG,"DF,MF",Valencia,es La Liga,24.0,2000.0,4,1,...,,,,,,,,,,
2,3,Rodrigo Abajas,es ESP,DF,Valencia,es La Liga,21.0,2003.0,1,1,...,,,,,,,,,,
3,4,James Abankwah,ie IRL,"DF,MF",Udinese,it Serie A,20.0,2004.0,6,0,...,,,,,,,,,,
4,5,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,18.0,2006.0,1,0,...,,,,,,,,,,


In [19]:
players_by_gls = df.sort_values(by="Gls", ascending=False)
players_by_gls.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att (GK),Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist
1691,1692,Kylian Mbappé,fr FRA,FW,Real Madrid,es La Liga,25.0,1998.0,34,34,...,,,,,,,,,,
2304,2305,Mohamed Salah,eg EGY,FW,Liverpool,eng Premier League,32.0,1992.0,38,38,...,,,,,,,,,,
1483,1484,Robert Lewandowski,pl POL,FW,Barcelona,es La Liga,35.0,1988.0,34,32,...,,,,,,,,,,
1317,1318,Harry Kane,eng ENG,FW,Bayern Munich,de Bundesliga,31.0,1993.0,31,28,...,,,,,,,,,,
2201,2202,Mateo Retegui,it ITA,FW,Atalanta,it Serie A,25.0,1999.0,36,32,...,,,,,,,,,,


In [20]:
liverpool_players = df[df["Squad"] == "Liverpool"]
liverpool_players.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att (GK),Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist
83,84,Trent Alexander-Arnold,eng ENG,DF,Liverpool,eng Premier League,25.0,1998.0,33,28,...,,,,,,,,,,
87,88,Alisson,br BRA,GK,Liverpool,eng Premier League,31.0,1992.0,28,28,...,877.0,128.0,19.7,26.6,255.0,11.0,4.3,49.0,1.76,16.0
378,379,Conor Bradley,nir NIR,DF,Liverpool,eng Premier League,21.0,2003.0,19,7,...,,,,,,,,,,
547,548,Federico Chiesa,it ITA,FW,Liverpool,eng Premier League,26.0,1997.0,6,1,...,,,,,,,,,,
658,659,Jayden Danns,eng ENG,MF,Liverpool,eng Premier League,18.0,2006.0,1,0,...,,,,,,,,,,


In [21]:
psg_players = df[df["Squad"] == "Paris S-G"]
psg_players.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att (GK),Thr,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist
165,166,Marco Asensio,es ESP,"FW,MF",Paris S-G,fr Ligue 1,28.0,1996.0,12,8,...,,,,,,,,,,
238,239,Bradley Barcola,fr FRA,FW,Paris S-G,fr Ligue 1,21.0,2002.0,34,27,...,,,,,,,,,,
305,306,Lucas Beraldo,br BRA,DF,Paris S-G,fr Ligue 1,20.0,2003.0,25,22,...,,,,,,,,,,
697,698,Ousmane Dembélé,fr FRA,FW,Paris S-G,fr Ligue 1,27.0,1997.0,29,20,...,,,,,,,,,,
765,766,Gianluigi Donnarumma,it ITA,GK,Paris S-G,fr Ligue 1,25.0,1999.0,24,24,...,543.0,90.0,13.6,25.1,199.0,10.0,5.0,19.0,0.82,14.4


### Actual Project

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [23]:
def calculate_team_strength(df):
    # Group by team (Squad) and calculate mean stats
    team_stats = df.groupby('Squad').agg({
        'Gls': 'mean',  # Goals
        'Ast': 'mean',  # Assists
        'xG': 'mean',   # Expected Goals
        'xAG': 'mean',  # Expected Assisted Goals
        'SoT%': 'mean', # Shots on Target %
        'Cmp%': 'mean', # Pass Completion %
        'Tkl': 'mean',  # Tackles
        'Int': 'mean',  # Interceptions
    }).fillna(0)
    
    return team_stats

# Calculate team statistics
team_stats = calculate_team_strength(df)
team_stats.head()

Unnamed: 0_level_0,Gls,Ast,xG,xAG,SoT%,Cmp%,Tkl,Int
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alavés,1.310345,0.689655,1.496552,0.87931,24.092,68.413793,25.413793,10.655172
Angers,1.033333,0.566667,1.213333,0.79,24.079167,74.796667,20.8,11.2
Arsenal,2.68,2.2,2.492,1.864,31.778261,82.308696,23.92,9.0
Aston Villa,2.0,1.607143,2.057143,1.5,29.792,80.571429,22.75,8.571429
Atalanta,2.375,1.6875,2.0875,1.475,37.952,81.175,17.84375,9.25


In [24]:
t_one = team_stats.loc['Paris S-G'].values.reshape(1, -1)
t_two = team_stats.loc['Angers'].values.reshape(1, -1)
print(t_one, t_two)

[[ 3.17857143  2.46428571  3.23571429  2.55357143 38.67391304 88.97857143
  22.10714286  9.46428571]] [[ 1.03333333  0.56666667  1.21333333  0.79       24.07916667 74.79666667
  20.8        11.2       ]]


In [25]:
def predict_match(team1, team2, team_stats, model):
    # Get stats for both teams
    team1_stats = team_stats.loc[team1].values.reshape(1, -1)
    team2_stats = team_stats.loc[team2].values.reshape(1, -1)
    
    # Calculate the difference in stats (team1 - team2)
    match_features = team1_stats - team2_stats
    
    # Predict the outcome
    # 0: Team2 wins, 1: Draw, 2: Team1 wins
    prediction = model.predict(match_features)[0]
    probabilities = model.predict_proba(match_features)[0]
    
    return prediction, probabilities

# Create synthetic match data and train the model
matches = []
teams = list(team_stats.index)
np.random.seed(42)

for _ in range(1000):
    team1, team2 = np.random.choice(teams, 2, replace=False)
    # Generate synthetic result based on team stats
    team1_strength = team_stats.loc[team1].mean()
    team2_strength = team_stats.loc[team2].mean()
    
    # Calculate win probability based on relative team strengths
    prob = (team1_strength - team2_strength + 1) / 2
    # Ensure probabilities are non-negative and sum to 1
    prob = max(0, min(1, prob))  # Clamp between 0 and 1
    draw_prob = 0.2
    team2_win_prob = max(0, 1 - prob - draw_prob)  # Ensure non-negative
    
    # Normalize probabilities to sum to 1
    total = prob + draw_prob + team2_win_prob
    prob = prob / total
    draw_prob = draw_prob / total
    team2_win_prob = team2_win_prob / total
    
    result = np.random.choice([2, 1, 0], p=[prob, draw_prob, team2_win_prob])
    
    matches.append([team1, team2, result])

matches_df = pd.DataFrame(matches, columns=['team1', 'team2', 'result'])

# Prepare features for the model
X = []
y = matches_df['result']

for _, match in matches_df.iterrows():
    team1_stats = team_stats.loc[match['team1']].values
    team2_stats = team_stats.loc[match['team2']].values
    X.append(team1_stats - team2_stats)

X = np.array(X)

# Train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
def predict_match_outcome(team1, team2):
    prediction, probabilities = predict_match(team1, team2, team_stats, model)
    
    print(f"\nMatch Prediction: {team1} vs {team2}")
    print("-" * 50)
    print(f"Win probability for {team1}: {probabilities[2]:.2%}")
    print(f"Draw probability: {probabilities[1]:.2%}")
    print(f"Win probability for {team2}: {probabilities[0]:.2%}")
    
    result_map = {0: f"{team2} wins", 1: "Draw", 2: f"{team1} wins"}
    print(f"\nPredicted outcome: {result_map[prediction]}")

# Show available teams
print("Available teams:")
print("-" * 50)
print("\n".join(teams[:10]))  # Show first 10 teams
print("\nUse predict_match_outcome(team1, team2) to predict a match")

Available teams:
--------------------------------------------------
Alavés
Angers
Arsenal
Aston Villa
Atalanta
Athletic Club
Atlético Madrid
Augsburg
Auxerre
Barcelona

Use predict_match_outcome(team1, team2) to predict a match


In [27]:
predict_match_outcome("Paris S-G", "Barcelona")


Match Prediction: Paris S-G vs Barcelona
--------------------------------------------------
Win probability for Paris S-G: 78.00%
Draw probability: 16.00%
Win probability for Barcelona: 6.00%

Predicted outcome: Paris S-G wins


In [28]:
predict_match_outcome("Real Madrid", "Bayern Munich")


Match Prediction: Real Madrid vs Bayern Munich
--------------------------------------------------
Win probability for Real Madrid: 33.00%
Draw probability: 20.17%
Win probability for Bayern Munich: 46.83%

Predicted outcome: Bayern Munich wins
