# Data Analysis

In [20]:
import pandas as pd
import plotly.express as px

In [21]:
a_data=pd.read_csv("SerieA_2015_2026_Data.csv")
team_occurence=a_data['squad'].value_counts()
print(team_occurence)

squad
Juventus         11
Napoli           11
Roma             11
Inter            11
Fiorentina       11
Milan            11
Lazio            11
Udinese          11
Bologna          11
Atalanta         11
Torino           11
Genoa            10
Sassuolo         10
Cagliari          9
Hellas Verona     9
Sampdoria         8
Empoli            7
Lecce             5
Parma             5
Chievo            4
Monza             3
Frosinone         3
Crotone           3
SPAL              3
Spezia            3
Salernitana       3
Palermo           2
Cremonese         2
Benevento         2
Como              2
Venezia           2
Carpi             1
Pescara           1
Brescia           1
Pisa              1
Name: count, dtype: int64


In [22]:
a_data['squad'].groupby(a_data['rk']).value_counts().head(10)

rk  squad   
1   Juventus    5
    Inter       3
    Napoli      2
    Milan       1
2   Inter       3
    Milan       3
    Napoli      3
    Lazio       1
    Roma        1
3   Atalanta    4
Name: count, dtype: int64

In [23]:
ranking_counts = a_data['squad'].groupby(a_data['rk']).value_counts().reset_index(name='count')
ranking_counts = ranking_counts[ranking_counts['rk'].isin([1,2,3,4])]

fig = px.bar(ranking_counts, x='squad', y='count', color='rk',
             barmode='group',
             title='Top 4 Finishes by Team and Rank (Last 10 Years)',
             labels={'squad': 'Team', 'count': 'Finish Count', 'rk': 'Rank'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [24]:
league_winners_a = a_data[a_data["rk"] == 1][["season", "squad", "pts", "pts/mp", "gd"]].sort_values("season")
print(league_winners_a)

     season      squad  pts  pts/mp  gd
0      2015   Juventus   91    2.39  55
20     2016   Juventus   91    2.39  50
40     2017   Juventus   95    2.50  62
60     2018   Juventus   90    2.37  40
80     2019   Juventus   83    2.18  33
100    2020      Inter   91    2.39  54
120    2021      Milan   86    2.26  38
140    2022     Napoli   90    2.37  49
160    2023      Inter   94    2.47  67
180    2024     Napoli   82    2.16  32
200    2025      Inter   33    2.20  20


In [25]:
league_winners_a_last_10 = a_data[(a_data["rk"] == 1) & (a_data["season"] < 2024)]
avg_pts_mp_winners = league_winners_a_last_10["pts/mp"].mean().__round__(2)
print(avg_pts_mp_winners)

2.37


In [26]:
a_data['league_winner'] = (a_data['rk'] == 1).astype(int)

champions=a_data[a_data['league_winner'] == 1]
print(champions)

     rk      squad  mp   w   d  l  gf  ga  gd  pts  ...  \
0     1   Juventus  38  29   4  5  75  20  55   91  ...   
20    1   Juventus  38  29   4  5  77  27  50   91  ...   
40    1   Juventus  38  30   5  3  86  24  62   95  ...   
60    1   Juventus  38  28   6  4  70  30  40   90  ...   
80    1   Juventus  38  26   5  7  76  43  33   83  ...   
100   1      Inter  38  28   7  3  89  35  54   91  ...   
120   1      Milan  38  26   8  4  69  31  38   86  ...   
140   1     Napoli  38  28   6  4  77  28  49   90  ...   
160   1      Inter  38  29   7  2  89  22  67   94  ...   
180   1     Napoli  38  24  10  4  59  27  32   82  ...   
200   1      Inter  15  11   0  4  34  14  20   33  ...   

                      top_team_scorer         goalkeeper  \
0                   Paulo Dybala - 19   Gianluigi Buffon   
20               Gonzalo Higuaín - 24   Gianluigi Buffon   
40                  Paulo Dybala - 22   Gianluigi Buffon   
60             Cristiano Ronaldo - 21  Wojciech Szc

In [27]:
mean_pts_mp_champions = champions["pts/mp"].mean().__round__(2)

print(mean_pts_mp_champions)

2.33


# Machine Learning-Random Forests

In [28]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler     
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, log_loss  
from sklearn.model_selection import cross_val_score  
import pandas as pd                                  
import numpy as np  

In [29]:
a_data["games_left"] = 38 - a_data["mp"]
a_data["projected_pts"] = a_data["pts"] + a_data["games_left"] * a_data["pts/mp"]
a_data["is_champion"] = (a_data["rk"] == 1).astype(int)

In [30]:
features = ["pts/mp", "gd", "w", "l", "projected_pts"]

df_train = a_data[a_data["season"] < 2025]

df_test = a_data[a_data["season"] == 2025].copy()

X_train = df_train[features]
y_train = df_train["is_champion"]
X_test = df_test[features]

In [31]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [32]:
train_preds = model.predict(X_train)
train_probs = model.predict_proba(X_train)[:, 1]

In [33]:
accuracy = accuracy_score(y_train, train_preds)

roc_auc = roc_auc_score(y_train, train_probs)

logloss = log_loss(y_train, train_probs)

print(accuracy)
print(roc_auc)
print(logloss)

print("\nClassification Report:")
print(classification_report(y_train, train_preds))

print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

1.0
1.0
0.014837595967170927

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       190
           1       1.00      1.00      1.00        10

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[190   0]
 [  0  10]]


In [34]:
df_test["win_probability"] = model.predict_proba(X_test)[:, 1]

In [35]:
df_test.head()

Unnamed: 0,rk,squad,mp,w,d,l,gf,ga,gd,pts,...,xg,xga,xgd,xgd/90,last_5,league_winner,games_left,projected_pts,is_champion,win_probability
200,1,Inter,15,11,0,4,34,14,20,33,...,27.3,13.4,13.9,0.93,W L W W W,1,23,83.6,1,0.565
201,2,Milan,15,9,5,1,24,13,11,32,...,23.3,16.9,6.4,0.43,D W W W D,0,23,80.99,0,0.115
202,3,Napoli,15,10,1,4,22,13,9,31,...,21.3,16.1,5.2,0.35,L W W W L,0,23,78.61,0,0.045
203,4,Roma,15,10,0,5,16,8,8,30,...,18.1,16.7,1.4,0.09,W W L L W,0,23,76.0,0,0.01
204,5,Juventus,15,7,5,3,19,14,5,26,...,20.6,13.3,7.3,0.49,D D W L W,0,23,65.79,0,0.035


In [36]:
df_test["win_probability (%)"] = df_test["win_probability"] / df_test["win_probability"].sum() * 100
df_test["win_probability (%)"] = df_test["win_probability (%)"].round(3)

In [37]:
df_test = df_test.sort_values("projected_pts", ascending=False)
print("\nChampionship Win Probabilities (2025):")
df_test[["squad", "pts", "projected_pts", "pts/mp", "gd", "games_left", "win_probability (%)"]]


Championship Win Probabilities (2025):


Unnamed: 0,squad,pts,projected_pts,pts/mp,gd,games_left,win_probability (%)
200,Inter,33,83.6,2.2,20,23,63.842
201,Milan,32,80.99,2.13,11,23,12.994
202,Napoli,31,78.61,2.07,9,23,5.085
203,Roma,30,76.0,2.0,8,23,1.13
204,Juventus,26,65.79,1.73,5,23,3.955
205,Bologna,25,63.41,1.67,10,23,3.39
206,Como,24,60.8,1.6,7,23,3.955
207,Lazio,22,55.81,1.47,6,23,1.13
208,Sassuolo,21,53.2,1.4,2,23,0.0
209,Udinese,21,53.2,1.4,-6,23,0.0


In [38]:
import plotly.express as px

df_test_sorted = df_test.sort_values("projected_pts", ascending=False)

fig = px.bar(
    df_test_sorted,
    x="squad",
    y="win_probability (%)",
    color_discrete_sequence=["#1f77b4"],
    title="Championship Win Probabilities (2025 Season)",
    labels={"squad": "Team", "win_probability (%)": "Win Probability (%)"},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Team",
    yaxis_title="Win Probability (%)",
    xaxis_tickangle=-45,
    bargap=0.3,
    title_x=0.5
)

fig.show()