# Data Analysis

In [20]:
import pandas as pd
import plotly.express as px

In [21]:
pl_data=pd.read_csv("PL_2015_2026_Data.csv")
team_occurence=pl_data['squad'].value_counts()
print(team_occurence)

squad
Arsenal            11
Tottenham          11
Chelsea            11
Manchester City    11
Manchester Utd     11
West Ham           11
Crystal Palace     11
Liverpool          11
Everton            11
Newcastle Utd      10
Leicester City      9
Southampton         9
Bournemouth         9
Brighton            9
Burnley             8
Wolves              8
Aston Villa         8
Watford             6
Fulham              6
Brentford           5
Leeds United        4
West Brom           4
Nott'ham Forest     4
Sunderland          3
Stoke City          3
Swansea City        3
Sheffield Utd       3
Norwich City        3
Huddersfield        2
Hull City           1
Middlesbrough       1
Cardiff City        1
Luton Town          1
Ipswich Town        1
Name: count, dtype: int64


In [22]:
pl_data['squad'].groupby(pl_data['rk']).value_counts().head(10)

rk  squad          
1   Manchester City    6
    Liverpool          2
    Arsenal            1
    Chelsea            1
    Leicester City     1
2   Arsenal            4
    Liverpool          2
    Manchester City    2
    Manchester Utd     2
    Tottenham          1
Name: count, dtype: int64

In [23]:
ranking_counts = pl_data['squad'].groupby(pl_data['rk']).value_counts().reset_index(name='count')
ranking_counts = ranking_counts[ranking_counts['rk'].isin([1,2,3,4])]

fig = px.bar(ranking_counts, x='squad', y='count', color='rk',
             barmode='group',
             title='Top 4 Finishes by Team and Rank (Last 10 Years)',
             labels={'squad': 'Team', 'count': 'Finish Count', 'rk': 'Rank'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [24]:
league_winners_pl = pl_data[pl_data["rk"] == 1][["season", "squad", "pts", "pts/mp", "gd"]].sort_values("season")
print(league_winners_pl)

     season             squad  pts  pts/mp  gd
0      2015    Leicester City   81    2.13  32
20     2016           Chelsea   93    2.45  52
40     2017   Manchester City  100    2.63  79
60     2018   Manchester City   98    2.58  72
80     2019         Liverpool   99    2.61  52
100    2020   Manchester City   86    2.26  51
120    2021   Manchester City   93    2.45  73
140    2022   Manchester City   89    2.34  61
160    2023   Manchester City   91    2.39  62
180    2024         Liverpool   84    2.21  45
200    2025           Arsenal   36    2.25  20


In [25]:
league_winners_pl_last_10 = pl_data[(pl_data["rk"] == 1) & (pl_data["season"] < 2024)]
avg_pts_mp_winners = league_winners_pl_last_10["pts/mp"].mean().__round__(2)
print(avg_pts_mp_winners)

2.43


In [26]:
pl_data['league_winner'] = (pl_data['rk'] == 1).astype(int)

champions=pl_data[pl_data['league_winner'] == 1]
print(champions)

     rk             squad  mp   w   d  l   gf  ga  gd  pts  ...  \
0     1    Leicester City  38  23  12  3   68  36  32   81  ...   
20    1           Chelsea  38  30   3  5   85  33  52   93  ...   
40    1   Manchester City  38  32   4  2  106  27  79  100  ...   
60    1   Manchester City  38  32   2  4   95  23  72   98  ...   
80    1         Liverpool  38  32   3  3   85  33  52   99  ...   
100   1   Manchester City  38  27   5  6   83  32  51   86  ...   
120   1   Manchester City  38  29   6  3   99  26  73   93  ...   
140   1   Manchester City  38  28   5  5   94  33  61   89  ...   
160   1   Manchester City  38  28   7  3   96  34  62   91  ...   
180   1         Liverpool  38  25   9  4   86  41  45   84  ...   
200   1           Arsenal  16  11   3  2   30  10  20   36  ...   

                              top_team_scorer         goalkeeper  \
0                            Jamie Vardy - 24  Kasper Schmeichel   
20                           Diego Costa - 20   Thibaut Cou

In [27]:
mean_pts_mp_champions = champions["pts/mp"].mean().__round__(2)

print(mean_pts_mp_champions)

2.39


# Machine Learning-Random Forests

In [28]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import StandardScaler     
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, log_loss  
from sklearn.model_selection import cross_val_score  
import pandas as pd                                  
import numpy as np  

In [29]:
pl_data["games_left"] = 38 - pl_data["mp"]
pl_data["projected_pts"] = pl_data["pts"] + pl_data["games_left"] * pl_data["pts/mp"]
pl_data["is_champion"] = (pl_data["rk"] == 1).astype(int)

In [30]:
features = ["pts/mp", "gd", "w", "l", "projected_pts"]

df_train = pl_data[pl_data["season"] < 2025]

df_test = pl_data[pl_data["season"] == 2025].copy()

X_train = df_train[features]
y_train = df_train["is_champion"]
X_test = df_test[features]

In [31]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [32]:
train_preds = model.predict(X_train)
train_probs = model.predict_proba(X_train)[:, 1]

In [33]:
accuracy = accuracy_score(y_train, train_preds)

roc_auc = roc_auc_score(y_train, train_probs)

logloss = log_loss(y_train, train_probs)

print(accuracy)
print(roc_auc)
print(logloss)

print("\nClassification Report:")
print(classification_report(y_train, train_preds))

print("Confusion Matrix:")
print(confusion_matrix(y_train, train_preds))

1.0
1.0
0.022654274641461237

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       190
           1       1.00      1.00      1.00        10

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[190   0]
 [  0  10]]


In [34]:
df_test["win_probability"] = model.predict_proba(X_test)[:, 1]

In [35]:
df_test.head()

Unnamed: 0,rk,squad,mp,w,d,l,gf,ga,gd,pts,...,xg,xga,xgd,xgd/90,last_5,league_winner,games_left,projected_pts,is_champion,win_probability
200,1,Arsenal,16,11,3,2,30,10,20,36,...,26.5,9.6,16.9,1.06,W D W L W,1,22,85.5,1,0.565
201,2,Manchester City,16,11,1,4,38,16,22,34,...,30.3,17.6,12.7,0.79,L W W W W,0,22,80.86,0,0.395
202,3,Aston Villa,16,10,3,3,25,17,8,33,...,16.7,21.4,-4.7,-0.29,W W W W W,0,22,78.32,0,0.205
203,4,Chelsea,16,8,4,4,27,15,12,28,...,26.9,19.7,7.2,0.45,W D L D W,0,22,66.5,0,0.03
204,5,Crystal Palace,16,7,5,4,20,15,5,26,...,26.2,18.5,7.7,0.48,W L W W L,0,22,61.86,0,0.03


In [36]:
df_test["win_probability (%)"] = df_test["win_probability"] / df_test["win_probability"].sum() * 100
df_test["win_probability (%)"] = df_test["win_probability (%)"].round(3)

In [37]:
df_test = df_test.sort_values("projected_pts", ascending=False)
print("\nChampionship Win Probabilities (2025):")
df_test[["squad", "pts", "projected_pts", "pts/mp", "gd", "games_left", "win_probability (%)"]]


Championship Win Probabilities (2025):


Unnamed: 0,squad,pts,projected_pts,pts/mp,gd,games_left,win_probability (%)
200,Arsenal,36,85.5,2.25,20,22,42.966
201,Manchester City,34,80.86,2.13,22,22,30.038
202,Aston Villa,33,78.32,2.06,8,22,15.589
203,Chelsea,28,66.5,1.75,12,22,2.281
204,Crystal Palace,26,61.86,1.63,5,22,2.281
205,Manchester Utd,26,61.86,1.63,4,22,2.281
206,Liverpool,26,61.86,1.63,2,22,0.38
207,Sunderland,26,61.86,1.63,2,22,2.281
208,Everton,24,57.0,1.5,-1,22,0.38
209,Brighton,23,54.68,1.44,2,22,0.38


In [38]:
import plotly.express as px

df_test_sorted = df_test.sort_values("projected_pts", ascending=False)


fig = px.bar(
    df_test_sorted,
    x="squad",
    y="win_probability (%)",
    color_discrete_sequence=["#1f77b4"],
    title="Championship Win Probabilities (2025 Season)",
    labels={"squad": "Team", "win_probability (%)": "Win Probability (%)"},
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Team",
    yaxis_title="Win Probability (%)",
    xaxis_tickangle=-45,
    bargap=0.3,
    title_x=0.5
)

fig.show()