# Data Analysis

In [39]:
import pandas as pd
import plotly.express as px

In [40]:
bundes_data=pd.read_csv("BLiga_2015_2026_Data.csv")
team_occurence=bundes_data['squad'].value_counts()
print(team_occurence)

squad
Bayern Munich     11
Dortmund          11
Leverkusen        11
Gladbach          11
Mainz 05          11
Wolfsburg         11
Augsburg          11
Hoffenheim        11
Eint Frankfurt    11
Werder Bremen     10
RB Leipzig        10
Freiburg          10
Köln               9
Stuttgart          9
Hertha BSC         8
Union Berlin       7
Schalke 04         7
Hamburger SV       4
Bochum             4
Darmstadt 98       3
Heidenheim         3
Hannover 96        3
Ingolstadt 04      2
Düsseldorf         2
St. Pauli          2
Arminia            2
Nürnberg           1
Paderborn 07       1
Greuther Fürth     1
Holstein Kiel      1
Name: count, dtype: int64


In [41]:
bundes_data['squad'].groupby(bundes_data['rk']).value_counts().head(10)

rk  squad        
1   Bayern Munich    10
    Leverkusen        1
2   Dortmund          5
    RB Leipzig        3
    Leverkusen        1
    Schalke 04        1
    Stuttgart         1
3   Dortmund          3
    RB Leipzig        3
    Leverkusen        2
Name: count, dtype: int64

In [42]:
ranking_counts = bundes_data['squad'].groupby(bundes_data['rk']).value_counts().reset_index(name='count')
ranking_counts = ranking_counts[ranking_counts['rk'].isin([1,2,3,4])]

fig = px.bar(ranking_counts, x='squad', y='count', color='rk',
             barmode='group',
             title='Top 4 Finishes by Team and Rank (Last 10 Years)',
             labels={'squad': 'Team', 'count': 'Finish Count', 'rk': 'Rank'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [43]:
league_winners_bundes = bundes_data[bundes_data["rk"] == 1][["season", "squad", "pts", "pts/mp", "gd"]].sort_values("season")
print(league_winners_bundes)

     season           squad  pts  pts/mp  gd
0      2015   Bayern Munich   88    2.59  63
18     2016   Bayern Munich   82    2.41  67
36     2017   Bayern Munich   84    2.47  64
54     2018   Bayern Munich   78    2.29  56
72     2019   Bayern Munich   82    2.41  68
90     2020   Bayern Munich   78    2.29  55
108    2021   Bayern Munich   77    2.26  60
126    2022   Bayern Munich   71    2.09  54
144    2023      Leverkusen   90    2.65  65
162    2024   Bayern Munich   82    2.41  67
180    2025   Bayern Munich   38    2.71  40


In [44]:
league_winners_bundes_last_10 = bundes_data[(bundes_data["rk"] == 1) & (bundes_data["season"] < 2024)]
avg_pts_mp_winners = league_winners_bundes_last_10["pts/mp"].mean().__round__(2)
print(avg_pts_mp_winners)

2.38


In [45]:
bundes_data['league_winner'] = (bundes_data['rk'] == 1).astype(int)

champions=bundes_data[bundes_data['league_winner'] == 1]
print(champions)

     rk           squad  mp   w  d  l   gf  ga  gd  pts  ...  \
0     1   Bayern Munich  34  28  4  2   80  17  63   88  ...   
18    1   Bayern Munich  34  25  7  2   89  22  67   82  ...   
36    1   Bayern Munich  34  27  3  4   92  28  64   84  ...   
54    1   Bayern Munich  34  24  6  4   88  32  56   78  ...   
72    1   Bayern Munich  34  26  4  4  100  32  68   82  ...   
90    1   Bayern Munich  34  24  6  4   99  44  55   78  ...   
108   1   Bayern Munich  34  24  5  5   97  37  60   77  ...   
126   1   Bayern Munich  34  21  8  5   92  38  54   71  ...   
144   1      Leverkusen  34  28  6  0   89  24  65   90  ...   
162   1   Bayern Munich  34  25  7  2   99  32  67   82  ...   
180   1   Bayern Munich  14  12  2  0   51  11  40   38  ...   

             top_team_scorer      goalkeeper  \
0    Robert Lewandowski - 30    Manuel Neuer   
18   Robert Lewandowski - 30    Manuel Neuer   
36   Robert Lewandowski - 29    Sven Ulreich   
54   Robert Lewandowski - 22    Manuel 

In [46]:
mean_pts_mp_champions = champions["pts/mp"].mean().__round__(2)

print(mean_pts_mp_champions)

2.42


# Machine Learning-Random Forests

In [47]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler     
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, log_loss  
from sklearn.model_selection import cross_val_score  
import pandas as pd                                  
import numpy as np  

In [48]:
bundes_data["games_left"] = 38 - bundes_data["mp"]
bundes_data["projected_pts"] = bundes_data["pts"] + bundes_data["games_left"] * bundes_data["pts/mp"]
bundes_data["is_champion"] = (bundes_data["rk"] == 1).astype(int)

In [49]:
features = ["pts/mp", "gd", "w", "l", "projected_pts"]

df_train = bundes_data[bundes_data["season"] < 2025]

df_test = bundes_data[bundes_data["season"] == 2025].copy()

X_train = df_train[features]
y_train = df_train["is_champion"]
X_test = df_test[features]

In [50]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [51]:
train_preds = model.predict(X_train)
train_probs = model.predict_proba(X_train)[:, 1]

In [52]:
accuracy = accuracy_score(y_train, train_preds)

roc_auc = roc_auc_score(y_train, train_probs)

logloss = log_loss(y_train, train_probs)

print(accuracy)
print(roc_auc)
print(logloss)

print("\nClassification Report:")
print(classification_report(y_train, train_preds))

print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

1.0
1.0
0.005427852582274104

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       170
           1       1.00      1.00      1.00        10

    accuracy                           1.00       180
   macro avg       1.00      1.00      1.00       180
weighted avg       1.00      1.00      1.00       180

Confusion Matrix:
[[170   0]
 [  0  10]]


In [53]:
df_test["win_probability"] = model.predict_proba(X_test)[:, 1]

In [54]:
df_test.head()

Unnamed: 0,rk,squad,mp,w,d,l,gf,ga,gd,pts,...,xg,xga,xgd,xgd/90,last_5,league_winner,games_left,projected_pts,is_champion,win_probability
180,1,Bayern Munich,14,12,2,0,51,11,40,38,...,39.4,9.4,30.0,2.14,D W W W D,1,24,103.04,1,0.295
181,2,RB Leipzig,14,9,2,3,29,16,13,29,...,26.1,18.0,8.1,0.58,L W D W L,0,24,78.68,0,0.12
182,3,Dortmund,14,8,5,1,24,12,12,29,...,21.6,14.4,7.2,0.52,D D W W D,0,24,78.68,0,0.12
183,4,Leverkusen,14,8,2,4,30,19,11,26,...,24.3,16.0,8.3,0.59,W W L L W,0,24,70.64,0,0.005
184,5,Hoffenheim,14,8,2,4,29,20,9,26,...,22.4,21.4,1.0,0.07,W D W L W,0,24,70.64,0,0.005


In [55]:
df_test["win_probability (%)"] = df_test["win_probability"] / df_test["win_probability"].sum() * 100
df_test["win_probability (%)"] = df_test["win_probability (%)"].round(3)

In [56]:
df_test = df_test.sort_values("projected_pts", ascending=False)
print("\nChampionship Win Probabilities (2025):")
df_test[["squad", "pts", "projected_pts", "pts/mp", "gd", "games_left", "win_probability (%)"]]


Championship Win Probabilities (2025):


Unnamed: 0,squad,pts,projected_pts,pts/mp,gd,games_left,win_probability (%)
180,Bayern Munich,38,103.04,2.71,40,24,50.0
181,RB Leipzig,29,78.68,2.07,13,24,20.339
182,Dortmund,29,78.68,2.07,12,24,20.339
183,Leverkusen,26,70.64,1.86,11,24,0.847
184,Hoffenheim,26,70.64,1.86,9,24,0.847
185,Stuttgart,25,67.96,1.79,3,24,3.39
186,Eint Frankfurt,24,65.04,1.71,0,24,0.847
187,Union Berlin,18,48.96,1.29,-4,24,0.0
188,Freiburg,17,46.04,1.21,-2,24,3.39
189,Köln,16,43.36,1.14,-1,24,0.0


In [57]:
import plotly.express as px

df_test_sorted = df_test.sort_values("projected_pts", ascending=False)


fig = px.bar(
    df_test_sorted,
    x="squad",
    y="win_probability (%)",
    color_discrete_sequence=["#1f77b4"],
    title="Championship Win Probabilities (2025 Season)",
    labels={"squad": "Team", "win_probability (%)": "Win Probability (%)"},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Team",
    yaxis_title="Win Probability (%)",
    xaxis_tickangle=-45,
    bargap=0.3,
    title_x=0.5
)

fig.show()