# Data Analysis

In [2]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px

In [3]:
la_liga_data=pd.read_csv("ll_data_2015_2026.csv")
team_occurence=la_liga_data['squad'].value_counts()
print(team_occurence)

squad
Barcelona          11
Real Madrid        11
Atlético Madrid    11
Villarreal         11
Athletic Club      11
Celta Vigo         11
Sevilla            11
Real Sociedad      11
Betis              11
Valencia           11
Getafe             10
Alavés              9
Espanyol            9
Osasuna             8
Rayo Vallecano      7
Levante             7
Granada             6
Mallorca            6
Eibar               6
Girona              6
Valladolid          5
Las Palmas          5
Leganés             5
Elche               4
Cádiz               4
La Coruña           3
Málaga              3
Sporting Gijón      2
Huesca              2
Almería             2
Oviedo              1
Name: count, dtype: int64


In [4]:
la_liga_data['squad'].groupby(la_liga_data['rk']).value_counts().head(10)

rk  squad          
1   Barcelona          5
    Real Madrid        5
    Atlético Madrid    1
2   Barcelona          5
    Real Madrid        4
    Atlético Madrid    2
3   Atlético Madrid    6
    Real Madrid        2
    Barcelona          1
    Girona             1
Name: count, dtype: int64

In [5]:
ranking_counts = la_liga_data['squad'].groupby(la_liga_data['rk']).value_counts().reset_index(name='count')
ranking_counts = ranking_counts[ranking_counts['rk'].isin([1,2,3,4])]

fig = px.bar(ranking_counts, x='squad', y='count', color='rk',
             barmode='group',
             title='Top 4 Finishes by Team and Rank (Last 10 Years)',
             labels={'squad': 'Team', 'count': 'Finish Count', 'rk': 'Rank'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [40]:
league_winners_laliga = la_liga_data[la_liga_data["rk"] == 1][["season", "squad", "pts", "pts/mp", "gd"]].sort_values("season")
print(league_winners_laliga)

     season            squad  pts  pts/mp  gd
0      2015        Barcelona   91    2.39  83
20     2016      Real Madrid   93    2.45  65
40     2017        Barcelona   93    2.45  70
60     2018        Barcelona   87    2.29  54
80     2019      Real Madrid   87    2.29  45
100    2020  Atlético Madrid   86    2.26  42
120    2021      Real Madrid   86    2.26  49
140    2022        Barcelona   88    2.32  50
160    2023      Real Madrid   95    2.50  61
180    2024        Barcelona   88    2.32  63
200    2025      Real Madrid   27    2.70  12


In [41]:
league_winners_laliga_last_10 = la_liga_data[(la_liga_data["rk"] == 1) & (la_liga_data["season"] < 2024)]
avg_pts_mp_winners = league_winners_laliga_last_10["pts/mp"].mean().__round__(2)
print(avg_pts_mp_winners)

2.36


In [42]:
la_liga_data['league_winner'] = (la_liga_data['rk'] == 1).astype(int)

champions=la_liga_data[la_liga_data['league_winner'] == 1]
print(champions)

     rk            squad  mp   w  d  l   gf  ga  gd  pts  ...  \
0     1        Barcelona  38  29  4  5  112  29  83   91  ...   
20    1      Real Madrid  38  29  6  3  106  41  65   93  ...   
40    1        Barcelona  38  28  9  1   99  29  70   93  ...   
60    1        Barcelona  38  26  9  3   90  36  54   87  ...   
80    1      Real Madrid  38  26  9  3   70  25  45   87  ...   
100   1  Atlético Madrid  38  26  8  4   67  25  42   86  ...   
120   1      Real Madrid  38  26  8  4   80  31  49   86  ...   
140   1        Barcelona  38  28  4  6   70  20  50   88  ...   
160   1      Real Madrid  38  29  8  1   87  26  61   95  ...   
180   1        Barcelona  38  28  4  6  102  39  63   88  ...   
200   1      Real Madrid  10   9  0  1   22  10  12   27  ...   

             top_team_scorer             goalkeeper  \
0           Luis Suárez - 40          Claudio Bravo   
20    Cristiano Ronaldo - 25           Keylor Navas   
40         Lionel Messi - 34  Marc-André ter Stegen   

In [43]:
mean_pts_mp_champions = champions["pts/mp"].mean().__round__(2)

print(mean_pts_mp_champions)

2.38


# Machine Learning-Random Forests

In [44]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler     
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, log_loss  
from sklearn.model_selection import cross_val_score  
import pandas as pd                                  
import numpy as np  

In [45]:
la_liga_data["games_left"] = 38 - la_liga_data["mp"]
la_liga_data["projected_pts"] = la_liga_data["pts"] + la_liga_data["games_left"] * la_liga_data["pts/mp"]
la_liga_data["is_champion"] = (la_liga_data["rk"] == 1).astype(int)

In [46]:
features = ["pts/mp", "gd", "w", "l", "projected_pts"]

df_train = la_liga_data[la_liga_data["season"] < 2025]

df_test = la_liga_data[la_liga_data["season"] == 2025].copy()

X_train = df_train[features]
y_train = df_train["is_champion"]
X_test = df_test[features]

In [47]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [48]:
train_preds = model.predict(X_train)
train_probs = model.predict_proba(X_train)[:, 1]

In [49]:
accuracy = accuracy_score(y_train, train_preds)

roc_auc = roc_auc_score(y_train, train_probs)

logloss = log_loss(y_train, train_probs)

print(accuracy)
print(roc_auc)
print(logloss)

print("\nClassification Report:")
print(classification_report(y_train, train_preds))

print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

1.0
1.0
0.01232026571936536

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       190
           1       1.00      1.00      1.00        10

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[190   0]
 [  0  10]]


In [50]:
df_test["win_probability"] = model.predict_proba(X_test)[:, 1]

In [51]:
df_test.head()

Unnamed: 0,rk,squad,mp,w,d,l,gf,ga,gd,pts,...,xg,xga,xgd,xgd/90,last_5,league_winner,games_left,projected_pts,is_champion,win_probability
200,1,Real Madrid,10,9,0,1,22,10,12,27,...,23.5,11.3,12.2,1.22,W L W W W,1,28,102.6,1,0.69
201,2,Barcelona,10,7,1,2,25,12,13,22,...,20.7,14.4,6.3,0.63,W W L W L,0,28,83.6,0,0.095
202,3,Villarreal,10,6,2,2,18,10,8,20,...,16.1,12.5,3.6,0.36,W W L D W,0,28,76.0,0,0.075
203,4,Atlético Madrid,10,5,4,1,18,10,8,19,...,16.4,10.8,5.7,0.57,W W D W W,0,28,72.2,0,0.075
204,5,Espanyol,10,5,3,2,14,11,3,18,...,19.0,11.8,7.2,0.72,D D L W W,0,28,68.4,0,0.075


In [52]:
df_test["win_probability (%)"] = df_test["win_probability"] / df_test["win_probability"].sum() * 100
df_test["win_probability (%)"] = df_test["win_probability (%)"].round(3)

In [53]:
df_test = df_test.sort_values("projected_pts", ascending=False)
print("\nChampionship Win Probabilities (2025):")
df_test[["squad", "pts", "projected_pts", "pts/mp", "gd", "games_left", "win_probability (%)"]]


Championship Win Probabilities (2025):


Unnamed: 0,squad,pts,projected_pts,pts/mp,gd,games_left,win_probability (%)
200,Real Madrid,27,102.6,2.7,12,28,54.118
201,Barcelona,22,83.6,2.2,13,28,7.451
202,Villarreal,20,76.0,2.0,8,28,5.882
203,Atlético Madrid,19,72.2,1.9,8,28,5.882
204,Espanyol,18,68.4,1.8,3,28,5.882
205,Betis,16,60.8,1.6,3,28,5.882
206,Rayo Vallecano,14,53.2,1.4,2,28,0.784
207,Elche,14,53.2,1.4,1,28,5.882
208,Athletic Club,14,53.2,1.4,-1,28,0.784
209,Getafe,14,53.2,1.4,-2,28,0.784


In [54]:
import plotly.express as px

df_test_sorted = df_test.sort_values("projected_pts", ascending=False)

fig = px.bar(
    df_test_sorted,
    x="squad",
    y="win_probability (%)",
    color_discrete_sequence=["#1f77b4"],
    title="Championship Win Probabilities (2025 Season)",
    labels={"squad": "Team", "win_probability (%)": "Win Probability (%)"},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Team",
    yaxis_title="Win Probability (%)",
    xaxis_tickangle=-45,
    bargap=0.3,
    title_x=0.5
)

fig.show()