In [1]:
import joblib
model = joblib.load("/content/match_predictor.pkl")

print("Expected number of features:", model.n_features_in_)
print(model.feature_names_in_)

Expected number of features: 18
['B365A' 'B365H' 'HomeTeam_Overall_Rating' 'B365D' 'Referee' 'RefereeCode'
 'AwayTeam_Overall_Rating' 'AwayTeam_Form_Score' 'HomeTeam_Form_Score'
 'AwayTeamCode' 'AwayTeam' 'HomeTeamCode' 'HomeTeam' 'time' 'H2H_AwayForm'
 'hour' 'H2H_HomeForm' 'B365_result']


**Random Forest**

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Load your data
df = pd.read_csv("/content/EPL_processed.csv")

important_features = [
    'B365H', 'B365A', 'B365D',
    'HomeTeam_Overall_Rating', 'AwayTeam_Overall_Rating',
    'Referee', 'AwayTeam_Form_Score', 'HomeTeam_Form_Score','AwayTeam','HomeTeam','time'
]

# Step 2: Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Step 3: Drop rows with missing dates
df = df.dropna(subset=['Date'])

# Step 4: Encode categorical columns
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

# Split data based on date (before/after 2025-08-15)
train_df = df[df['Date'] < '2024-08-15']
test_df = df[df['Date'] >= '2024-08-15']

# Compute class weights
classes = np.unique(train_df['Result'])
y=train_df['Result']
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['Result'])
class_weights = dict(zip(classes, weights))

# Train model
model = rfc_best = RandomForestClassifier(n_estimators=250, max_depth=72, random_state=42,class_weight={0: 0.78, 1: 2.56, 2: 1})
model.fit(train_df[important_features], train_df['Result'])

y_preds = model.predict(test_df[important_features])
x_preds = model.predict(train_df[important_features])
print("Test Accuracy:", accuracy_score(test_df['Result'], y_preds))
print("Test Precision:", precision_score(test_df['Result'], y_preds, average='weighted'))
print("Classification Report for Test Data:")
print(classification_report(test_df['Result'], y_preds))
print("Confusion Matrix for Test Data:")
print(confusion_matrix(test_df['Result'], y_preds))

# Save new lightweight model
joblib.dump(model, "match_predictors.pkl")

Test Accuracy: 0.5235849056603774
Test Precision: 0.49704078548550673
Classification Report for Test Data:
              precision    recall  f1-score   support

           0       0.32      0.12      0.17       103
           1       0.52      0.83      0.64       168
           2       0.58      0.46      0.51       153

    accuracy                           0.52       424
   macro avg       0.48      0.47      0.44       424
weighted avg       0.50      0.52      0.48       424

Confusion Matrix for Test Data:
[[ 12  58  33]
 [ 11 140  17]
 [ 14  69  70]]


['match_predictors.pkl']

# **Daignosis**
Checking the value counts of your predictions (y_pred) and comparing with Test_df.
y=train_df['Result]

In [None]:
print(y_preds)

unique, counts = np.unique(y_preds, return_counts=True)
value_counts = dict(zip(unique, counts))
print("Value counts for y_pred:", value_counts)

print(test_df['Result'].value_counts(normalize=True))
y.value_counts(normalize=True)

[0 1 1 1 1 1 2 0 2 1 2 1 1 1 1 1 1 1 1 0 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1
 1 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 2 1 2 2 1 0 1 1 1 1 2 2 2 1 1 1 1 1 2 1
 2 1 0 2 1 1 1 2 0 2 2 0 1 1 1 2 1 1 1 2 2 1 1 0 1 1 1 1 2 1 2 0 1 1 1 1 1
 2 1 1 0 1 2 2 1 1 1 2 1 1 2 2 1 1 1 0 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2
 2 2 1 2 1 2 1 0 1 1 1 2 1 1 1 1 2 1 2 1 1 2 1 2 1 2 1 1 1 1 1 2 2 1 1 2 2
 1 0 2 1 2 1 1 2 1 1 2 1 1 1 0 1 1 2 1 1 1 1 2 2 0 1 1 1 1 1 2 1 2 1 2 1 0
 1 1 2 1 2 1 1 0 2 2 1 1 1 2 0 2 1 1 0 0 2 1 1 0 1 0 1 1 1 1 2 1 1 1 2 0 1
 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 0 2 1 1 2 0 1
 0 2 1 1 1 1 1 1 1 0 1 1 1 1 0 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 2 2 2 1 2 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 2 2 2 2 1 1 1 2 1
 2 1 1 1 0 0 1 1 0 2 1 1 0 2 1 1 1 1 1 1 2 1 1 0 1 1 1 1 1 1 1 1 0 2 1 1 1
 0 1 2 1 1 1 2 1 2 1 2 2 1 1 1 1 1]
Value counts for y_pred: {np.int64(0): np.int64(37), np.int64(1): np.int64(277), np.int64(2): np.int64(110)}
Result
1    0.396226
2    0.36

Unnamed: 0_level_0,proportion
Result,Unnamed: 1_level_1
1,0.464069
2,0.308225
0,0.227706


**LogisticRegression**

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

# Assuming 'df' is your DataFrame and 'top_features' are already defined
# Split dataset based on date
train_df = df[df['Date'] < '2024-08-15']
test_df = df[df['Date'] >= '2024-08-15']

# Define features and target
X_train = train_df.drop(['Result', 'Date'], axis=1)
y_train = train_df['Result']
X_test = test_df.drop(['Result', 'Date'], axis=1)
y_test = test_df['Result']

# Train Logistic Regression model with L1 regularization
# Note: 'liblinear' and 'saga' solvers support L1 penalty
# Removing multi_class='ovr' as it is the default for 'liblinear' and caused an error
log_reg = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42, penalty='l1',class_weight=class_weights)
log_reg.fit(X_train, y_train)

# Predictions and evaluation for L1
y_pred = log_reg.predict(X_test)
log_acc = accuracy_score(y_test, y_pred)
log_prec = precision_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

print("\n *** Logistic Regression (L1 Regularization) Evaluation *** ")
print(f"Accuracy: {log_acc:.3f}")
print(f"Precision: {log_prec:.3f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

joblib.dump(model, "match_predictors_log.pkl")


 *** Logistic Regression (L1 Regularization) Evaluation *** 
Accuracy: 0.531
Precision: 0.514

Confusion Matrix:
 [[ 19  56  28]
 [ 18 131  19]
 [ 21  57  75]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.18      0.24       103
           1       0.54      0.78      0.64       168
           2       0.61      0.49      0.55       153

    accuracy                           0.53       424
   macro avg       0.49      0.48      0.47       424
weighted avg       0.51      0.53      0.51       424



['match_predictors_log.pkl']

**SMOTE for Draw prediction only**

In [None]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Assuming X_train and y_train are already defined
# Find the count of the majority class
majority_class_count = y_train.value_counts().max() - 100

# Define the sampling strategy to oversample only the minority class (Draws = 0)
# and bring it up to the count of the majority class
sampling_strategy = {0: majority_class_count,2:majority_class_count}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

print("✅ Class distribution after SMOTE (oversampling only draws):")
print(y_train_resampled.value_counts())

✅ Class distribution after SMOTE (oversampling only draws):
Result
1    647
2    547
0    547
Name: count, dtype: int64


***XGBoost***

In [None]:
# --- Import libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


# --- Convert Date column and split ---
df['Date'] = pd.to_datetime(df['Date'])
train_df = df[df['Date'] < '2024-08-15']
test_df = df[df['Date'] >= '2024-08-15']

target = 'Result'

X_train = train_df[important_features]
y_train = train_df[target]
X_test = test_df[important_features]
y_test = test_df[target]

# --- Apply SMOTE to oversample minority class (Draws = 0) ---
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("✅ Class distribution after SMOTE:")
print(y_train_res.value_counts())

# --- Train XGBoost model ---
model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss',
    scale_pos_weight = class_weights
)

model.fit(X_train_res, y_train_res)

# --- Predictions ---
y_pred = model.predict(X_test)

# --- Evaluation ---
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro')

print("\n✅ XGBoost Model with SMOTE Results:")
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- Confusion Matrix ---
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# --- Value counts of predictions ---
print("\nPredicted Result Counts:")
print(pd.Series(y_pred).value_counts())


✅ Class distribution after SMOTE:
Result
1    536
2    536
0    536
Name: count, dtype: int64


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ XGBoost Model with SMOTE Results:
Accuracy: 0.483
Precision: 0.457

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.23      0.24       103
           1       0.54      0.73      0.62       168
           2       0.58      0.39      0.46       153

    accuracy                           0.48       424
   macro avg       0.46      0.45      0.44       424
weighted avg       0.48      0.48      0.47       424


Confusion Matrix:
[[ 24  51  28]
 [ 32 122  14]
 [ 40  54  59]]

Predicted Result Counts:
1    227
2    101
0     96
Name: count, dtype: int64


**Model Creation**

In [20]:
import joblib
model = joblib.load("/content/match_predictors_log.pkl")

print("Expected number of features:", model.n_features_in_)
print(model.feature_names_in_)

Expected number of features: 11
['B365H' 'B365A' 'B365D' 'HomeTeam_Overall_Rating'
 'AwayTeam_Overall_Rating' 'Referee' 'AwayTeam_Form_Score'
 'HomeTeam_Form_Score' 'AwayTeam' 'HomeTeam' 'time']


In [17]:
X=df[important_features]
y=df['Result']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

majority_class_count = y_train.value_counts().max() - 100

# Define the sampling strategy to oversample only the minority class (Draws = 0)
# and bring it up to the count of the majority class
sampling_strategy = {0: majority_class_count,2:majority_class_count}

sm = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

lr =LogisticRegression(solver='liblinear', max_iter=1000, random_state=42, penalty='l1',class_weight=class_weights)
lr.fit(X_train_resampled, y_train_resampled)

y_pred = lr.predict(X_test)
log_acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {log_acc:.3f}")

print(y_pred)
y_test.head(50)

Accuracy: 0.484
[0 2 1 1 0 0 0 1 0 0 2 0 0 2 2 0 1 0 2 2 0 0 0 0 0 2 2 2 1 0 0 0 0 0 2 0 2
 1 0 1 1 0 0 2 0 2 0 1 2 2 0 2 2 2 0 0 0 2 0 1 1 0 1 0 2 0 2 1 2 0 0 2 1 1
 1 0 2 2 2 0 0 1 2 2 1 2 1 2 0 2 2 1 1 0 0 0 0 0 0 2 1 1 2 2 1 1 2 1 2 0 0
 0 0 2 2 1 2 1 1 2 1 0 0 0 1 0 2 1 2 2 2 2 1 0 0 1 0 1 2 2 0 0 1 1 2 0 0 2
 2 0 0 0 1 1 2 2 2 1 2 0 0 2 2 1 0 0 1 0 0 0 0 2 2 1 2 1 0 1 0 1 0 1 2 2 0
 2 2 0 1 2 0 2 0 1 1 0 0 1 0 1 2 0 2 2 0 2 0 0 0 1 1 2 2 0 1 0 2 2 2 0 1 1
 0 2 1 2 1 0 0 0 0 1 0 1 2 0 2 2 0 2 0 0 1 0 0 0 1 0 2 2 1 0 0 2 1 1 2 1 2
 0 2 1 0 2 2 2 0 0 0 1 2 1 1 2 2 1 2 0 0 1 0 1 1 1 0 0 0 1 1 0 2 2 1 0 0 2
 0 1 0 0 2 0 2 0 0 0 2 1 0 0 2 1 0 1 0 2]


Unnamed: 0,Result
1239,1
813,2
1174,2
534,1
514,2
1527,1
522,1
274,1
494,2
65,0
