In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [30]:
df = pd.read_csv("data/premier_league_standings_2022_2025.csv")
print("📊 Dataset loaded successfully!")
print(f"Total records: {len(df)}")
print(f"Teams: {df['Team'].nunique()}, Seasons: {df['Season'].nunique()}")

📊 Dataset loaded successfully!
Total records: 60
Teams: 24, Seasons: 3


In [31]:
# Create target variable (Next Season Points)

df = df.sort_values(["Team", "Season"])
df["PointsNext"] = df.groupby("Team")["Points"].shift(-1)  # target: next season points

# Remove rows without target (last season for each team)
train_df = df[df["PointsNext"].notnull()].copy()

print(f"\nTraining data: {len(train_df)} records (after removing last season data)")


Training data: 36 records (after removing last season data)


In [59]:
# -------------------
#  Feature Engineering & Encoding
# -------------------
# Encode Teams
le = LabelEncoder()
train_df["TeamCode"] = le.fit_transform(train_df["Team"])

# Create additional features for better prediction
train_df["AttackRate"] = train_df["GoalsFor"] / train_df["Matches"]  # Goals per match
train_df["DefenseRate"] = train_df["GoalsAgainst"] / train_df["Matches"]  # Conceded per match
train_df["PointsPerMatch"] = train_df["Points"] / train_df["Matches"]  # Points per match
train_df["GoalRatio"] = train_df["GoalsFor"] / (train_df["GoalsAgainst"] + 1)  # Attack/Defense ratio

In [58]:
# -------------------
#  Features & Target Selection
# -------------------
features = [
    "Points", "GoalDiff", "GoalsFor", "GoalsAgainst", 
    "TeamCode", "AttackRate", "DefenseRate", "PointsPerMatch", "GoalRatio"
]
target = "PointsNext"

X = train_df[features]
y = train_df[target]

print(f"\nFeatures used: {features}")
print(f"Target: {target}")


Features used: ['Points', 'GoalDiff', 'GoalsFor', 'GoalsAgainst', 'TeamCode', 'AttackRate', 'DefenseRate', 'PointsPerMatch', 'GoalRatio']
Target: PointsNext


In [57]:
# -------------------
#  Split data for proper evaluation
# -------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 25, Test size: 11


In [56]:
# -------------------
#  Train Random Forest
# -------------------
print("\n Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=300, 
    max_depth=10,
    min_samples_split=3,
    random_state=42
)
rf_model.fit(X_train, y_train)

# Predictions
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

# Metrics
rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_train_pred))
rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_test_pred))
rf_r2 = r2_score(y_test, rf_test_pred)


 Training Random Forest...


In [55]:
# -------------------
#  Train XGBoost
# -------------------
print(" Training XGBoost...")
xgb_model = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
    objective='reg:squarederror'
)
xgb_model.fit(X_train, y_train)

# Predictions
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

# Metrics
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
xgb_test_rmse = np.sqrt(mean_squared_error(y_test, xgb_test_pred))
xgb_r2 = r2_score(y_test, xgb_test_pred)

 Training XGBoost...


In [54]:
# -------------------
#  Model Comparison
# -------------------
print("\n" + "="*50)
print(" MODEL PERFORMANCE COMPARISON")
print("="*50)
print(f"Random Forest:")
print(f"  Train RMSE: {rf_train_rmse:.2f}")
print(f"  Test RMSE:  {rf_test_rmse:.2f}")
print(f"  R² Score:   {rf_r2:.3f}")
print(f"\nXGBoost:")
print(f"  Train RMSE: {xgb_train_rmse:.2f}")
print(f"  Test RMSE:  {xgb_test_rmse:.2f}")
print(f"  R² Score:   {xgb_r2:.3f}")

# Choose the best model based on test performance
best_model = rf_model if rf_test_rmse < xgb_test_rmse else xgb_model
best_name = 'Random Forest' if best_model == rf_model else 'XGBoost'
best_rmse = rf_test_rmse if best_model == rf_model else xgb_test_rmse
best_r2 = rf_r2 if best_model == rf_model else xgb_r2

print(f"\n Best model: {best_name}")
print(f"   Test RMSE: {best_rmse:.2f}")
print(f"   R² Score: {best_r2:.3f}")


 MODEL PERFORMANCE COMPARISON
Random Forest:
  Train RMSE: 7.52
  Test RMSE:  12.84
  R² Score:   0.167

XGBoost:
  Train RMSE: 0.00
  Test RMSE:  12.97
  R² Score:   0.150

 Best model: Random Forest
   Test RMSE: 12.84
   R² Score: 0.167


In [52]:
# -------------------
#  Feature Importance
# -------------------
print(f"\n {best_name} - Top Feature Importances:")
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    for idx, row in feature_importance.head(5).iterrows():
        print(f"  {row['feature']}: {row['importance']:.3f}")



 Random Forest - Top Feature Importances:
  GoalsFor: 0.156
  AttackRate: 0.147
  Points: 0.134
  PointsPerMatch: 0.127
  GoalRatio: 0.116


In [51]:
# -------------------
#  Predict Next Season (2025/26)
# -------------------
print("\n" + "="*50)
print(" PREDICTING 2025/26 SEASON")
print("="*50)

# Get last season data (2024/25) for all teams
last_season = df[df["Season"] == "2024/25"].copy()

# Encode teams (handle new teams that might not be in training data)
known_teams = set(le.classes_)
prediction_teams = []

for team in last_season["Team"]:
    if team in known_teams:
        prediction_teams.append(team)

last_season_filtered = last_season[last_season["Team"].isin(prediction_teams)].copy()
last_season_filtered["TeamCode"] = le.transform(last_season_filtered["Team"])

# Create same features as training
last_season_filtered["AttackRate"] = last_season_filtered["GoalsFor"] / last_season_filtered["Matches"]
last_season_filtered["DefenseRate"] = last_season_filtered["GoalsAgainst"] / last_season_filtered["Matches"]
last_season_filtered["PointsPerMatch"] = last_season_filtered["Points"] / last_season_filtered["Matches"]
last_season_filtered["GoalRatio"] = last_season_filtered["GoalsFor"] / (last_season_filtered["GoalsAgainst"] + 1)

X_pred = last_season_filtered[features]
predictions = best_model.predict(X_pred)
last_season_filtered["PredictedPoints"] = predictions


 PREDICTING 2025/26 SEASON


In [48]:
# -------------------
#  Results
# -------------------
# Sort by predicted points
results = last_season_filtered.sort_values("PredictedPoints", ascending=False)

print(" PREDICTED PREMIER LEAGUE TABLE 2025/26 ")
print("-" * 55)
print(f"{'Pos':<4} {'Team':<15} {'Current':<8} {'Predicted':<10} {'Change':<8}")
print("-" * 55)

for i, (_, row) in enumerate(results.iterrows(), 1):
    current_points = row["Points"]
    predicted_points = row["PredictedPoints"]
    change = predicted_points - current_points
    change_str = f"{change:+.0f}"
    
    print(f"{i:<4} {row['Team']:<15} {current_points:<8.0f} {predicted_points:<10.1f} {change_str:<8}")

 PREDICTED PREMIER LEAGUE TABLE 2025/26 
-------------------------------------------------------
Pos  Team            Current  Predicted  Change  
-------------------------------------------------------
1    Liverpool       84       81.8       -2      
2    Man City        71       74.3       +3      
3    Arsenal         74       69.0       -5      
4    Newcastle       66       64.4       -2      
5    Chelsea         69       63.4       -6      
6    Nottingham      65       60.4       -5      
7    Aston Villa     66       60.0       -6      
8    Brighton Hove   61       56.4       -5      
9    Everton         48       54.2       +6      
10   Man United      42       53.1       +11     
11   Crystal Palace  53       51.4       -2      
12   West Ham        43       49.2       +6      
13   Fulham          54       48.4       -6      
14   Tottenham       38       48.3       +10     
15   Bournemouth     56       47.9       -8      
16   Brentford       56       46.1       -10   

In [47]:
# Top 3 teams
print(f"\n PREDICTED TOP 3 FOR 2025/26:")
top3 = results.head(3)
for i, (_, row) in enumerate(top3.iterrows(), 1):
    medal = "🥇" if i == 1 else "🥈" if i == 2 else "🥉"
    print(f"{medal} {i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


 PREDICTED TOP 3 FOR 2025/26:
🥇 1. Liverpool       - 81.8 points
🥈 2. Man City        - 74.3 points
🥉 3. Arsenal         - 69.0 points


In [60]:
# Bottom 3 teams (relegation zone)
print(f"\n  PREDICTED RELEGATION ZONE:")
bottom3 = results.tail(3)
for i, (_, row) in enumerate(reversed(list(bottom3.iterrows())), 18):
    print(f"{i}. {row['Team']:<15} - {row['PredictedPoints']:.1f} points")


  PREDICTED RELEGATION ZONE:
18. Southampton     - 8.4 points
19. Leicester City  - 17.8 points
20. Wolverhampton   - 38.2 points


In [61]:
print("\n" + "="*50)
print(" ANALYSIS COMPLETE!")
print(f" Best Model: {best_name} (RMSE: {best_rmse:.2f})")
print(f" Predictions for {len(prediction_teams)} teams")
print("="*50)


 ANALYSIS COMPLETE!
 Best Model: Random Forest (RMSE: 12.84)
 Predictions for 19 teams
