Importing Libraries

In [1]:
import pandas as pd
import glob
import math
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the enriched CSV into a DataFrame; the relative path is resolved
# against the notebook's working directory.
enriched_csv = "enriched_df.csv"
df = pd.read_csv(enriched_csv)

# Show the last ten rows as a quick sanity check of the load.
df.tail(10)

### Model training

In [33]:
# Label column and the predictor columns fed to the models
target = 'FTR'
features = [
    'POS_HT', 'POS_AT',
    'P_HT', 'P_AT',
    'G_HT', 'G_AT',
    'ELO_Home', 'ELO_Away',
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

Classification model: Random Forest

In [34]:
# Build a seeded 100-tree Random Forest and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and compare the predictions with the true labels
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Report the fraction of correct predictions as a percentage
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 70.88%


Gradient Boosting Algorithm

In [35]:
# Create the Gradient Boosting model.
# The seed is fixed to the same value used for the train/test split and the
# Random Forest so that repeated runs build the same trees and report the same
# accuracy; without it the score can drift between runs.
model = GradientBoostingClassifier(random_state=42)

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate the model's accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 69.57%


Logistic Regression Algorithm

In [36]:
# Create the multinomial Logistic Regression model.
# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default (`'auto'`,
# scikit-learn >= 0.22) already fits a multinomial model when the target has
# more than two classes, as FTR does here (H / D / A).
model = LogisticRegression(solver='lbfgs')

# Train the model on the training split
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate the model's accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 74.63%


Finally, we have:
- Random Forest Model accuracy: 70.88%
- Gradient Boosting Model accuracy: 69.57%
- Logistic Regression Model accuracy: 74.63%

So we chose the Logistic Regression model as our final model because it has the best accuracy on the test set.

Saving both the final model and the concatenation of all dataframes

In [39]:
# Save the final model in .pkl format.
# `model` is reassigned by every training cell above, so check that it really
# is the Logistic Regression estimator before writing it under that filename;
# otherwise running cells out of order would silently save a different model.
assert isinstance(model, LogisticRegression), (
    f"expected a LogisticRegression in `model`, got {type(model).__name__}; "
    "re-run the Logistic Regression cell before saving"
)
joblib.dump(model, "Logistic Regression.pkl")

In [18]:
# Make predictions on the test dataset with whichever estimator `model`
# currently refers to (the name is reassigned by each training cell above).
predictions = model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels
predictions

array(['H', 'D', 'H', ..., 'H', 'H', 'D'], dtype=object)

## Predictive part

In [None]:
# Label column and predictor columns (same names as in the training section)
target = 'FTR'
features = [
    'POS_HT', 'POS_AT',
    'P_HT', 'P_AT',
    'G_HT', 'G_AT',
    'ELO_Home', 'ELO_Away',
]

# Rebuild the train/test partition with the same test size and seed as in the
# training section, so an unchanged `df` yields the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

# Load the model back from the saved file.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
model = joblib.load("Logistic Regression.pkl")

# Predict the held-out rows and measure the fraction predicted correctly
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy: {:.2f}%".format(accuracy * 100))

Model accuracy: 74.63%


In [None]:
# Build a results table: the test-set feature columns plus the predicted label
# (`FTR_pred`) and the true label (`FTR`).
# `assign` returns a new DataFrame instead of writing into `X_test`, so:
#   - `X_test` keeps only the feature columns and can still be passed to
#     `model.predict` if an earlier cell is re-run;
#   - no chained-assignment (SettingWithCopy) write happens on a split slice;
#   - `fixtures` is an independent frame rather than an alias of `X_test`.
# `predictions` is positional (same row order as `X_test`); `y_test` is aligned
# on the index.
fixtures = X_test.assign(FTR_pred=predictions, FTR=y_test)

# Preview the first ten rows
fixtures.head(10)

Unnamed: 0,POS_HT,POS_AT,P_HT,P_AT,G_HT,G_AT,ELO_Home,ELO_Away,FTR_pred,FTR
12119,18.0,8.0,2.0,6.0,-6.0,2.0,1472.735208,1518.627474,A,A
7584,14.0,21.0,4.0,1.0,2.0,-2.0,1489.728774,1491.133988,A,A
2688,5.0,13.0,57.0,42.0,10.0,-13.0,1531.381391,1463.883925,H,H
399,20.0,20.0,4.0,4.0,-5.0,-7.0,1465.556351,1463.321724,D,D
102,6.0,1.0,23.0,34.0,3.0,27.0,1515.686009,1593.056126,A,A
8726,2.0,9.0,52.0,39.0,22.0,11.0,1567.219564,1520.845393,D,A
1634,19.0,7.0,7.0,14.0,-11.0,-4.0,1474.221146,1494.869309,H,H
3011,22.0,20.0,28.0,34.0,-9.0,-12.0,1449.494995,1486.5805,A,A
12878,20.0,13.0,4.0,14.0,-14.0,0.0,1442.036406,1526.388612,A,A
4863,14.0,9.0,22.0,29.0,-6.0,1.0,1463.677085,1529.0938,A,A


In [None]:
# Row-wise flag: True where the predicted label equals the true label
is_correct = fixtures['FTR_pred'].eq(fixtures['FTR'])

# Count the hits and the total number of evaluated rows
total_predictions = len(fixtures)
correct_predictions = is_correct.sum()

# Share of correct predictions, expressed as a percentage
accuracy = correct_predictions / total_predictions * 100

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 74.63%
