In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Folder that holds the workbook, given relative to this notebook
path = "../"

#Loading the "Season Summary" sheet of the workbook into a pandas dataframe
Season_Summary = pd.read_excel(
    path + "Premier League Data Analysis.xlsx",
    sheet_name="Season Summary",
)

In [10]:
#As the goal is to predict if the team is relegated or not, we treat 'Relegated After Promotion' and 'Relegated' as equal as in both cases the team ends up relegated
#Working on an explicit copy keeps the raw Season_Summary frame untouched, so re-running this cell never changes the loaded data
df = Season_Summary.copy()
df['End_Status'] = df['End_Status'].replace('Relegated After Promotion', 'Relegated')

#Removing any rows with missing values (a NaN in ANY column drops the row, not only NaNs in feature columns)
#The trailing .copy() makes df an independent frame so the column assignment below cannot raise SettingWithCopyWarning
df = df.dropna().copy()

#Encoding the target variable 'End_Status' (classes are numbered in sorted order of their names)
label_encoder = LabelEncoder()
df['End_Status_Encoded'] = label_encoder.fit_transform(df['End_Status'])

#Dropping columns we do not want to use as features; every remaining column becomes a model input
#NOTE(review): features are chosen by exclusion, so any column later added to the sheet (e.g. a final rank) would silently become a feature - confirm the sheet layout before re-running
features = df.drop(columns=['End_Status', 'End_Status_Encoded', 'Season', 'Team', 'Total_Matches'])

#Defining the target variable (the encoded 'End_Status' column)
target = df['End_Status_Encoded']

#Train-test split (70% train / 30% test, fixed seed for reproducibility)
#NOTE(review): the classes are imbalanced and the split is not stratified; consider stratify=target if the class ratio should be preserved in both parts
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

#Building and training the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

#Making predictions and evaluating the model on the held-out test set
y_pred = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

#Printing classification report (rows are labelled with the original class names)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

#Printing Confusion Matrix (rows = actual class, columns = predicted class, in encoded-label order)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

#Printing Feature importance, most important first
feature_importances = pd.DataFrame(model.feature_importances_, index=features.columns, columns=['Importance']).sort_values('Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)


Accuracy: 0.95

Classification Report:
              precision    recall  f1-score   support

   Relegated       0.75      0.88      0.81        24
    Survived       0.98      0.96      0.97       171

    accuracy                           0.95       195
   macro avg       0.87      0.92      0.89       195
weighted avg       0.95      0.95      0.95       195


Confusion Matrix:
[[ 21   3]
 [  7 164]]

Feature Importances:
                     Importance
Total_Points           0.288080
Goal_Difference        0.171720
Total_Wins             0.160745
Total_Losses           0.150889
Total_Goals_Against    0.111244
Total_Goals_For        0.061803
Total_Draws            0.055518


In [18]:
#Now using the trained model to make predictions on the 2024-2025 Partial Data
df_2024_2025 = pd.read_excel(path + "Premier League Data Analysis.xlsx", sheet_name= "Partial Data")

#Using the same features as in training: the same non-feature columns are dropped
features_2024_2025 = df_2024_2025.drop(columns=['End_Status', 'Season', 'Team', 'Total_Matches'])

#Making predictions with the previously trained model
#NOTE(review): Total_Matches is dropped and this sheet holds a partial season, so cumulative totals here may not be on the same scale as the rows the model was trained on - confirm before trusting the predictions
predictions = model.predict(features_2024_2025)

#Decoding the predictions back to the original strings, viewing them and saving them
df_2024_2025['Predicted_End_Status'] = label_encoder.inverse_transform(predictions)
print(df_2024_2025[['Team', 'Predicted_End_Status']])  # Assuming 'Team' is a column with team names

#path already ends with a separator, so the file name is appended directly (same convention as the read_excel calls)
output_file_path = path + "predicted_2024_2025_season_outcomes.xlsx"
df_2024_2025.to_excel(output_file_path, index=False)
print(f"Predictions saved to {output_file_path}")

       Season            Team  Total_Matches  Total_Wins  Total_Draws  \
0   2024-2025         Arsenal             23          13            8   
1   2024-2025     Aston Villa             23          10            7   
2   2024-2025     Bournemouth             23          11            7   
3   2024-2025       Brentford             23           9            4   
4   2024-2025        Brighton             23           8           10   
5   2024-2025         Chelsea             23          11            7   
6   2024-2025  Crystal Palace             23           6            9   
7   2024-2025         Everton             22           5            8   
8   2024-2025          Fulham             23           8            9   
9   2024-2025         Ipswich             23           3            7   
10  2024-2025       Leicester             23           4            5   
11  2024-2025       Liverpool             22          16            5   
12  2024-2025        Man City             23       

In [32]:
#Predicting the probabilities for each class (one row per row of features_2024_2025, one column per entry of model.classes_)
probabilities = model.predict_proba(features_2024_2025)

#Labelling the probability columns with the decoded class names instead of relying on hard-coded positions 0/1,
#and keeping the index of features_2024_2025 so every probability stays attached to its own team
class_probabilities = pd.DataFrame(
    probabilities,
    index=features_2024_2025.index,
    columns=label_encoder.inverse_transform(model.classes_),
)

#Adding probabilities to the DataFrame for reference.
#Assigning Series (aligned on the index) rather than raw arrays keeps this cell correct when it is re-run:
#df_2024_2025 is re-sorted below, so a positional assignment would pair probabilities with the wrong teams on a second run
df_2024_2025['Relegation_Prob'] = class_probabilities['Relegated']  #Probabilities for being relegated
df_2024_2025['Survival_Prob'] = class_probabilities['Survived']     #Probabilities for surviving

#Sorting teams by relegation probability, most at risk first
df_2024_2025 = df_2024_2025.sort_values(by='Relegation_Prob', ascending=False)

#Manually flagging the teams with the highest relegation probability as 'Relegated'
n_relegated = 3  #Number of teams flagged as relegated
df_2024_2025['Final_Prediction'] = 'Survived'  #Setting by default to 'Survived'
df_2024_2025.iloc[:n_relegated, df_2024_2025.columns.get_loc('Final_Prediction')] = 'Relegated'  #First rows after sorting = highest relegation probability

#Viewing the updated predictions
print(df_2024_2025[['Team', 'Final_Prediction', 'Relegation_Prob']])



              Team Final_Prediction  Relegation_Prob
7          Everton        Relegated             0.48
12        Man City        Relegated             0.14
19          Wolves        Relegated             0.08
5          Chelsea         Survived             0.04
4         Brighton         Survived             0.04
13      Man United         Survived             0.04
3        Brentford         Survived             0.04
6   Crystal Palace         Survived             0.03
1      Aston Villa         Survived             0.03
0          Arsenal         Survived             0.03
2      Bournemouth         Survived             0.02
9          Ipswich         Survived             0.02
10       Leicester         Survived             0.02
8           Fulham         Survived             0.01
17       Tottenham         Survived             0.00
15   Nott'm Forest         Survived             0.00
16     Southampton         Survived             0.00
14       Newcastle         Survived           

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr

# Load the Season Summary sheet
# NOTE(review): this cell rebinds features, model, X_train, X_test, y_train, y_test and y_pred, which the
# relegation-classifier cells also use; re-run those cells before predicting with the classifier again.
file_path = path + "Premier League Data Analysis.xlsx"
season_summary_df = pd.read_excel(file_path, sheet_name="Season Summary")

# Select relevant features and target variable (Rank)
features = ['Goal_Difference', 'ShotOnTarget/Shots', 'Avg_Yellow_Cards/Game', 'Avg_Red_Cards/Game', 'Home_Win_Leading_HT(%)', 'Away_Win_Leading_HT(%)', 'Home_Win_Draw_HT(%)',
           'Away_Win_Draw_HT(%)', 'Home_Win_Losing_HT(%)', 'Away_Win_Losing_HT(%)', 'Manager Changes', 'Caretaker Days']

# Step 1: Replace 'NAN' strings with actual NaN values.
# replace() returns a new frame, so nothing is written into a slice of season_summary_df
# (the in-place version raised SettingWithCopyWarning).
X = season_summary_df[features].replace('NAN', np.nan)
y = season_summary_df['Rank']

# Step 2: Drop rows with NaN values
X_clean = X.dropna()
y_clean = y.loc[X_clean.index]  # Label-based lookup keeps the target aligned with the rows kept in X_clean

# Step 3: Train-test split (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Step 4: Initialize the model (Gradient Boosting Regressor)
model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Step 5: Train the model
model.fit(X_train, y_train)

# Step 6: Make predictions (continuous values, not rounded to whole ranks)
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
# 1. Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# 2. Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# 3. Spearman Rank Correlation (only the coefficient is kept; the p-value is discarded)
spearman_corr, _ = spearmanr(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Spearman Rank Correlation: {spearman_corr}")

# Optional: Predict and view rankings for the test set
# Team names are looked up by the test rows' index labels so they line up with y_test
predicted_ranks = pd.DataFrame({
    'Team': season_summary_df.loc[X_test.index, 'Team'],
    'Actual Rank': y_test,
    'Predicted Rank': y_pred
})

print(predicted_ranks)


Mean Absolute Error (MAE): 1.6697360191766117
Mean Squared Error (MSE): 4.879934236923073
Spearman Rank Correlation: 0.9277586731087921
            Team  Actual Rank  Predicted Rank
561  Southampton           12       13.539138
617    Liverpool            3        1.949575
325  Aston Villa            6        5.375939
589  Bournemouth           15       18.395955
441    Tottenham            5        6.979167
..           ...          ...             ...
362        Wigan           16       15.429452
536    Newcastle           14       14.082154
465  Bournemouth            8       12.680399
478   Sunderland           20       17.910934
270      Everton            6        6.142499

[97 rows x 3 columns]


  X.replace('NAN', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('NAN', np.nan, inplace=True)
