In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score,classification_report
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
df = pd.read_csv("data/premier_league_standings_2022_2025.csv")
print(" Dataset loaded successfully!")
print(f"Total records: {len(df)}")
print(f"Teams: {df['Team'].nunique()}, Seasons: {df['Season'].nunique()}")

 Dataset loaded successfully!
Total records: 60
Teams: 24, Seasons: 3


In [3]:
df.head()

Unnamed: 0,Team,Matches,Points,GoalDiff,GoalsFor,GoalsAgainst,Season
0,Man City,38,89,61,94,33,2022/23
1,Arsenal,38,84,45,88,43,2022/23
2,Man United,38,75,15,58,43,2022/23
3,Newcastle,38,71,35,68,33,2022/23
4,Liverpool,38,67,28,75,47,2022/23


In [4]:
# Create target variable (Next Season Points)
df = df.sort_values(["Team", "Season"])
df["PointsNext"] = df.groupby("Team")["Points"].shift(-1)  # target: next season points

# Remove rows without target (last season for each team)
train_df = df[df["PointsNext"].notnull()].copy()

print(f"\nTraining data: {len(train_df)} records (after removing last season data)")


Training data: 36 records (after removing last season data)


In [5]:
#  Feature Engineering & Encoding

# Encode Teams
le = LabelEncoder()
train_df["TeamCode"] = le.fit_transform(train_df["Team"])

# Create additional features for better prediction
train_df["AttackRate"] = train_df["GoalsFor"] / train_df["Matches"]  # Goals per match
train_df["DefenseRate"] = train_df["GoalsAgainst"] / train_df["Matches"]  # Conceded per match
train_df["PointsPerMatch"] = train_df["Points"] / train_df["Matches"]  # Points per match
train_df["GoalRatio"] = train_df["GoalsFor"] / (train_df["GoalsAgainst"] + 1)  # Attack/Defense ratio

In [6]:
#  Features & Target Selection
features = [
    "Points", "GoalDiff", "GoalsFor", "GoalsAgainst", 
    "TeamCode", "AttackRate", "DefenseRate", "PointsPerMatch", "GoalRatio"
]
target = "PointsNext"

X = train_df[features]
y = train_df[target]

print(f"\nFeatures used: {features}")
print(f"Target: {target}")


Features used: ['Points', 'GoalDiff', 'GoalsFor', 'GoalsAgainst', 'TeamCode', 'AttackRate', 'DefenseRate', 'PointsPerMatch', 'GoalRatio']
Target: PointsNext


In [7]:
#  Split data for proper evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=18
)

print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 25, Test size: 11


In [8]:
#  Train Random Forest
print("\n Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=300, 
    max_depth=10,
    min_samples_split=3,
    random_state=18
)
rf_model.fit(X_train, y_train)

# Predictions
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

# Metrics
rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_train_pred))
rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_pred))
rf_r2 = r2_score(y_test, rf_test_pred)


 Training Random Forest...


In [9]:
#  Train XGBoost
print(" Training XGBoost...")
xgb_model = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
    objective='reg:squarederror'
)
xgb_model.fit(X_train, y_train)

# Predictions
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

# Metrics
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_pred))
xgb_r2 = r2_score(y_test, xgb_test_pred)

 Training XGBoost...


In [10]:
#  Model Comparison
print("\n" + "="*50)
print(" MODEL PERFORMANCE COMPARISON")
print("="*50)
print(f"Random Forest:")
print(f"  Train RMSE: {rf_train_rmse:.2f}")
print(f"  Test RMSE:  {rf_test_rmse:.2f}")
print(f"  RÂ² Score:   {rf_r2:.3f}")
print(f"\nXGBoost:")
print(f"  Train RMSE: {xgb_train_rmse:.2f}")
print(f"  Test RMSE:  {xgb_test_rmse:.2f}")
print(f"  RÂ² Score:   {xgb_r2:.3f}")

# Choose the best model based on test performance
best_model = rf_model if rf_test_rmse < xgb_test_rmse else xgb_model
best_name = 'Random Forest' if best_model == rf_model else 'XGBoost'
best_rmse = rf_test_rmse if best_model == rf_model else xgb_test_rmse
best_r2 = rf_r2 if best_model == rf_model else xgb_r2

print(f"\n Best model: {best_name}")
print(f"   Test RMSE: {best_rmse:.2f}")
print(f"   RÂ² Score: {best_r2:.3f}")


 MODEL PERFORMANCE COMPARISON
Random Forest:
  Train RMSE: 6.60
  Test RMSE:  16.61
  RÂ² Score:   0.304

XGBoost:
  Train RMSE: 0.00
  Test RMSE:  18.67
  RÂ² Score:   0.121

 Best model: Random Forest
   Test RMSE: 16.61
   RÂ² Score: 0.304


In [43]:
# -------------------
#  Feature Importance
# -------------------
print(f"\n {best_name} - Top Feature Importances:")
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    for idx, row in feature_importance.head(5).iterrows():
        print(f"  {row['feature']}: {row['importance']:.3f}")



 Random Forest - Top Feature Importances:
  GoalsFor: 0.156
  AttackRate: 0.147
  Points: 0.134
  PointsPerMatch: 0.127
  GoalRatio: 0.116


In [11]:
#  Predict Next Season (2025/26)
print("\n" + "="*50)
print(" PREDICTING 2025/26 SEASON")
print("="*50)

# Get last season data (2024/25) for all teams
last_season = df[df["Season"] == "2024/25"].copy()

# Encode teams (handle new teams that might not be in training data)
known_teams = set(le.classes_)
prediction_teams = []

for team in last_season["Team"]:
    if team in known_teams:
        prediction_teams.append(team)

last_season_filtered = last_season[last_season["Team"].isin(prediction_teams)].copy()
last_season_filtered["TeamCode"] = le.transform(last_season_filtered["Team"])

# Create same features as training
last_season_filtered["AttackRate"] = last_season_filtered["GoalsFor"] / last_season_filtered["Matches"]
last_season_filtered["DefenseRate"] = last_season_filtered["GoalsAgainst"] / last_season_filtered["Matches"]
last_season_filtered["PointsPerMatch"] = last_season_filtered["Points"] / last_season_filtered["Matches"]
last_season_filtered["GoalRatio"] = last_season_filtered["GoalsFor"] / (last_season_filtered["GoalsAgainst"] + 1)

X_pred = last_season_filtered[features]
predictions = best_model.predict(X_pred)
last_season_filtered["PredictedPoints"] = predictions


 PREDICTING 2025/26 SEASON


In [12]:
#  Results
# Sort by predicted points
results = last_season_filtered.sort_values("PredictedPoints", ascending=False)

print(" PREDICTED PREMIER LEAGUE TABLE 2025/26 ")
print("-" * 55)
print(f"{'Pos':<4} {'Team':<15} {'Current':<8} {'Predicted':<10} {'Change':<8}")
print("-" * 55)

for i, (_, row) in enumerate(results.iterrows(), 1):
    current_points = row["Points"]
    predicted_points = row["PredictedPoints"]
    change = predicted_points - current_points
    change_str = f"{change:+.0f}"
    
    print(f"{i:<4} {row['Team']:<15} {current_points:<8.0f} {predicted_points:<10.1f} {change_str:<8}")

 PREDICTED PREMIER LEAGUE TABLE 2025/26 
-------------------------------------------------------
Pos  Team            Current  Predicted  Change  
-------------------------------------------------------
1    Liverpool       84       80.9       -3      
2    Man City        71       66.1       -5      
3    Arsenal         74       63.9       -10     
4    Chelsea         69       61.0       -8      
5    Newcastle       66       60.0       -6      
6    Nottingham      65       57.8       -7      
7    Everton         48       55.9       +8      
8    Aston Villa     66       55.9       -10     
9    West Ham        43       53.5       +11     
10   Man United      42       53.0       +11     
11   Crystal Palace  53       52.9       -0      
12   Brighton Hove   61       52.2       -9      
13   Tottenham       38       52.2       +14     
14   Fulham          54       50.4       -4      
15   Bournemouth     56       48.2       -8      
16   Brentford       56       45.7       -10   

In [13]:
# Top 3 teams
print(f"\n PREDICTED TOP 3 FOR 2025/26:")
top3 = results.head(3)
for i, (_, row) in enumerate(top3.iterrows(), 1):
    medal = "ðŸ¥‡" if i == 1 else "ðŸ¥ˆ" if i == 2 else "ðŸ¥‰"
    print(f"{medal} {i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


 PREDICTED TOP 3 FOR 2025/26:
ðŸ¥‡ 1. Liverpool       - 80.9 points
ðŸ¥ˆ 2. Man City        - 66.1 points
ðŸ¥‰ 3. Arsenal         - 63.9 points


In [19]:
# Bottom 3 teams (relegation zone)
print(f"\n  PREDICTED RELEGATION ZONE:")
bottom3 = results.tail(3)
for i, (_, row) in enumerate(reversed(list(bottom3.iterrows())), 18):
    print(f"{i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


  PREDICTED RELEGATION ZONE:
18. Southampton     - 21.0 points
19. Leicester City  - 34.7 points
20. Wolverhampton   - 36.1 points


In [14]:
print("\n" + "="*50)
print(" ANALYSIS COMPLETE!")
print(f" Best Model: {best_name} (RMSE: {best_rmse:.2f})")
print(f" Predictions for {len(prediction_teams)} teams")
print("="*50)


 ANALYSIS COMPLETE!
 Best Model: Random Forest (RMSE: 16.61)
 Predictions for 19 teams


### Using Grid Search CV for best model and params

In [15]:
model_params = {
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [5, 10, None],
            'criterion': ['squared_error', 'friedman_mse']
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10, None]
        }
    },
    'xgboost': {
        'model': XGBRegressor(objective='reg:squarederror'),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 10],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    }
}


In [16]:
scores = []
best_model = None
best_score = -float("inf")

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=3, scoring='r2', return_train_score=False)
    clf.fit(X_train, y_train)   # y_train should be points/target values
    
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
    if clf.best_score_ > best_score:
        best_score = clf.best_score_
        best_model = clf.best_estimator_

scores_df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
print("\nModel Comparison:")
print(scores_df)

print(f"\n Best Model Selected: {best_model}")



Model Comparison:
           model  best_score  \
0  decision_tree   -0.357529   
1  random_forest   -0.146102   
2        xgboost    0.132044   

                                         best_params  
0  {'criterion': 'squared_error', 'max_depth': None}  
1           {'max_depth': None, 'n_estimators': 200}  
2  {'learning_rate': 0.01, 'max_depth': 10, 'n_es...  

 Best Model Selected: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
             max_leaves=None, min_child_weight=None, missing=n

In [17]:
print(" PREDICTING 2025/26 SEASON")
last_season = df[df["Season"] == "2024/25"].copy()

 PREDICTING 2025/26 SEASON


In [18]:
# Encode teams (handle new teams that might not be in training data)
known_teams = set(le.classes_)
last_season_filtered = last_season[last_season["Team"].isin(known_teams)].copy()
last_season_filtered["TeamCode"] = le.transform(last_season_filtered["Team"])

In [19]:
# Create same features as training
last_season_filtered["AttackRate"] = last_season_filtered["GoalsFor"] / last_season_filtered["Matches"]
last_season_filtered["DefenseRate"] = last_season_filtered["GoalsAgainst"] / last_season_filtered["Matches"]
last_season_filtered["PointsPerMatch"] = last_season_filtered["Points"] / last_season_filtered["Matches"]
last_season_filtered["GoalRatio"] = last_season_filtered["GoalsFor"] / (last_season_filtered["GoalsAgainst"] + 1)


In [20]:
X_pred = last_season_filtered[features]
predictions = best_model.predict(X_pred)
last_season_filtered["PredictedPoints"] = predictions

In [21]:
results_1 = last_season_filtered.sort_values("PredictedPoints", ascending=False)

print(" PREDICTED PREMIER LEAGUE TABLE 2025/26 ")
print("-" * 55)
print(f"{'Pos':<4} {'Team':<15} {'Current':<8} {'Predicted':<10} {'Change':<8}")
print("-" * 55)
for i, (_, row) in enumerate(results_1.iterrows(), 1):
    current_points = row["Points"]
    predicted_points = row["PredictedPoints"]
    change = predicted_points - current_points
    change_str = f"{change:+.0f}"
    print(f"{i:<4} {row['Team']:<15} {current_points:<8.0f} {predicted_points:<10.1f} {change_str:<8}")


 PREDICTED PREMIER LEAGUE TABLE 2025/26 
-------------------------------------------------------
Pos  Team            Current  Predicted  Change  
-------------------------------------------------------
1    Liverpool       84       71.4       -13     
2    West Ham        43       58.6       +16     
3    Arsenal         74       58.3       -16     
4    Chelsea         69       58.3       -11     
5    Nottingham      65       58.3       -7      
6    Newcastle       66       58.3       -8      
7    Aston Villa     66       58.3       -8      
8    Brighton Hove   61       56.3       -5      
9    Tottenham       38       55.6       +18     
10   Man City        71       55.5       -15     
11   Everton         48       55.3       +7      
12   Man United      42       55.0       +13     
13   Wolverhampton   42       54.2       +12     
14   Crystal Palace  53       52.7       -0      
15   Fulham          54       51.2       -3      
16   Brentford       56       48.7       -7    

In [22]:
# Top 3 teams
print(f"\n PREDICTED TOP 3 FOR 2025/26:")
top3 = results_1.head(3)
for i, (_, row) in enumerate(top3.iterrows(), 1):
    medal = "ðŸ¥‡" if i == 1 else "ðŸ¥ˆ" if i == 2 else "ðŸ¥‰"
    print(f"{medal} {i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


 PREDICTED TOP 3 FOR 2025/26:
ðŸ¥‡ 1. Liverpool       - 71.4 points
ðŸ¥ˆ 2. West Ham        - 58.6 points
ðŸ¥‰ 3. Arsenal         - 58.3 points


In [23]:
# Bottom 3 teams (relegation zone)
print(f"\n  PREDICTED RELEGATION ZONE:")
bottom3 = results_1.tail(3)
for i, (_, row) in enumerate(reversed(list(bottom3.iterrows())), 18):
    print(f"{i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


  PREDICTED RELEGATION ZONE:
18. Leicester City  - 28.6 points
19. Southampton     - 28.6 points
20. Bournemouth     - 48.7 points
