In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv("data/pl-data-cleaned.csv", index_col=0)

In [3]:
data.head()

Unnamed: 0,season,match_date,home_team,home_team_score,away_team,away_team_score,home_position,away_position,home_team_avg_goals,away_team_avg_goals,home_team_avg_conceded,away_team_avg_conceded,Over2.5
0,2023/2024,2023-08-11,Burnley,0,Manchester City,3,19,1,0.0,0.0,0.0,0.0,1
1,2023/2024,2023-08-12,Newcastle,5,Aston Villa,1,7,4,0.0,0.0,0.0,0.0,1
2,2023/2024,2023-08-12,Bournemouth,1,West Ham,1,12,9,0.0,0.0,0.0,0.0,0
3,2023/2024,2023-08-12,Arsenal,2,Nottingham,1,2,17,0.0,0.0,0.0,0.0,1
4,2023/2024,2023-08-12,Everton,0,Fulham,1,15,13,0.0,0.0,0.0,0.0,0


In [4]:
data.shape

(450, 13)

In [5]:
data.drop(columns=["season", "match_date", "home_team_score", "away_team_score"], inplace=True)

In [6]:
data.head()

Unnamed: 0,home_team,away_team,home_position,away_position,home_team_avg_goals,away_team_avg_goals,home_team_avg_conceded,away_team_avg_conceded,Over2.5
0,Burnley,Manchester City,19,1,0.0,0.0,0.0,0.0,1
1,Newcastle,Aston Villa,7,4,0.0,0.0,0.0,0.0,1
2,Bournemouth,West Ham,12,9,0.0,0.0,0.0,0.0,0
3,Arsenal,Nottingham,2,17,0.0,0.0,0.0,0.0,1
4,Everton,Fulham,15,13,0.0,0.0,0.0,0.0,0


In [7]:
# # One-Hot Encoding the home_team and away_team
# df_encoded = pd.get_dummies(data, columns=['home_team', 'away_team'], prefix=['home', 'away'])

# # Display the encoded DataFrame
# print(df_encoded)

In [8]:
# # Get the list of encoded team columns
# encoded_home_teams = [col for col in df_encoded.columns if col.startswith('home_')]
# encoded_away_teams = [col for col in df_encoded.columns if col.startswith('away_')]

# print("Encoded Home Teams:", encoded_home_teams)
# print("Encoded Away Teams:", encoded_away_teams)

In [7]:
team_names = data["home_team"].unique()
team_names.sort()
print(team_names)

['Arsenal' 'Aston Villa' 'Bournemouth' 'Brentford' 'Brighton' 'Burnley'
 'Chelsea' 'Crystal Palace' 'Everton' 'Fulham' 'Ipswich' 'Leicester'
 'Liverpool' 'Luton' 'Manchester City' 'Manchester Utd' 'Newcastle'
 'Nottingham' 'Sheffield Utd' 'Southampton' 'Tottenham' 'West Ham'
 'Wolves']


In [8]:
team_mapping = {}
for i, team in enumerate(team_names):
    team_mapping[team] = i

In [9]:
print(team_mapping)

{'Arsenal': 0, 'Aston Villa': 1, 'Bournemouth': 2, 'Brentford': 3, 'Brighton': 4, 'Burnley': 5, 'Chelsea': 6, 'Crystal Palace': 7, 'Everton': 8, 'Fulham': 9, 'Ipswich': 10, 'Leicester': 11, 'Liverpool': 12, 'Luton': 13, 'Manchester City': 14, 'Manchester Utd': 15, 'Newcastle': 16, 'Nottingham': 17, 'Sheffield Utd': 18, 'Southampton': 19, 'Tottenham': 20, 'West Ham': 21, 'Wolves': 22}


In [10]:
data["home_team"] = data["home_team"].map(team_mapping)
data["away_team"] = data["away_team"].map(team_mapping)

In [11]:
data.head()

Unnamed: 0,home_team,away_team,home_position,away_position,home_team_avg_goals,away_team_avg_goals,home_team_avg_conceded,away_team_avg_conceded,Over2.5
0,5,14,19,1,0.0,0.0,0.0,0.0,1
1,16,1,7,4,0.0,0.0,0.0,0.0,1
2,2,21,12,9,0.0,0.0,0.0,0.0,0
3,0,17,2,17,0.0,0.0,0.0,0.0,1
4,8,9,15,13,0.0,0.0,0.0,0.0,0


In [30]:
data.to_csv("data/model-ready-data.csv")

In [12]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Train accuracy: {}".format(accuracy_score(y_train, model.predict(X_train))))

Test Accuracy: 0.5555555555555556
Train accuracy: 0.6416666666666667


In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [25]:
dtree = DecisionTreeClassifier(criterion="entropy", random_state=42)
dtree.fit(X_train, y_train)

In [26]:
print("Test Accuracy:", accuracy_score(y_test, dtree.predict(X_test)))
print("Train accuracy: {}".format(accuracy_score(y_train, dtree.predict(X_train))))

Test Accuracy: 0.5111111111111111
Train accuracy: 1.0


In [27]:
rforest = RandomForestClassifier(criterion="entropy", random_state=42)
rforest.fit(X_train, y_train)

In [28]:
print("Test Accuracy:", accuracy_score(y_test, rforest.predict(X_test)))
print("Train accuracy: {}".format(accuracy_score(y_train, rforest.predict(X_train))))

Test Accuracy: 0.5111111111111111
Train accuracy: 1.0


In [29]:
from sklearn.svm import SVC

In [30]:
svector = SVC(kernel="linear")
svector.fit(X_train, y_train)

In [31]:
print("Test Accuracy:", accuracy_score(y_test, svector.predict(X_test)))
print("Train accuracy: {}".format(accuracy_score(y_train, svector.predict(X_train))))

Test Accuracy: 0.5888888888888889
Train accuracy: 0.6416666666666667


In [24]:
from sklearn.model_selection import GridSearchCV

In [42]:
grid_svm = SVC()

In [43]:
grid_params = {
    "kernel": ["linear", "rbf", "poly", "sigmoid"],
    "gamma": ["scale", "auto"],
    "C": [0.1, 0.5, 0.7, 1, 5, 10],
    "random_state": [42, 23, 123, 0]
}

In [44]:
grid_model = GridSearchCV(
    estimator=grid_svm,
    param_grid=grid_params,
    cv=10,
    n_jobs=-1
)

In [45]:
grid_model.fit(X_train, y_train)

In [46]:
accuracy_score(y_test, grid_model.predict(X_test))

0.5888888888888889

In [47]:
grid_model.best_estimator_

In [48]:
grid_model.best_params_

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'random_state': 42}

In [27]:
import pickle

In [51]:
with open("models/svm_predictor.pkl", "wb") as file:
    pickle.dump(grid_model, file)

In [17]:
from xgboost import XGBClassifier

In [18]:
classifier = XGBClassifier(n_estimators=20, random_state=42)

In [19]:
classifier.fit(X_train, y_train)

In [20]:
accuracy_score(y_test, classifier.predict(X_test))

0.5444444444444444

In [21]:
accuracy_score(y_train, classifier.predict(X_train))

0.9777777777777777

In [22]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
}

In [25]:
grid_search = GridSearchCV(
    estimator=xgb_clf, 
    param_grid=param_grid, 
    scoring='accuracy', 
    cv=10, 
    verbose=1, 
    n_jobs=-1
)

In [26]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 81 candidates, totalling 810 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [77]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best Score: 0.6416666666666666


In [28]:
with open("models/xgb_predictor.pkl", "wb") as file:
    pickle.dump(grid_search, file)