In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data
# The path is written as a raw string: it is a Windows path full of backslashes,
# and in a normal string literal sequences such as "\d", "\p" and "\F" are
# invalid escape sequences (a SyntaxWarning on recent Python versions).
# NOTE(review): this is an absolute, machine-specific location — consider a
# configurable data directory so the notebook runs on other machines.
data = pd.read_csv(r'D:\dev\project\Football-Match-Prediction\data\processed\df_merged.csv')

In [25]:
# Display summary statistics of the freshly loaded frame
# (this cell sits before the cell that drops rows with missing values).
data.describe()

In [32]:
# Remove, in place, every row (axis=0) that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

In [33]:
# Parse 'Time' into datetimes so the split below compares timestamps, not text
data['Time'] = pd.to_datetime(data['Time'])

# Chronological hold-out: rows dated up to and including the cutoff go to the
# training set, everything strictly later goes to the validation set.
# The cutoff is a partial date string; pandas is expected to read it as the
# first instant of August 2023 when comparing against the datetime column.
split_cutoff = '2023-08'
in_train_period = data['Time'] <= split_cutoff
in_valid_period = data['Time'] > split_cutoff
train = data.loc[in_train_period]
valid = data.loc[in_valid_period]

In [34]:
from sklearn.preprocessing import StandardScaler


# Integer class codes for the home team's result: L -> 0, D -> 1, W -> 2
result_codes = {'L': 0, 'D': 1, 'W': 2}

# Columns removed from the feature matrices: the timestamp, the target itself,
# and the two *_GF columns (presumably goals scored, i.e. only known after the
# match — confirm against the dataset description).
non_feature_cols = ['Time', 'HomeTeam_Result', 'HomeTeam_GF', 'AwayTeam_GF']

# Apply the same target encoding and column removal to both splits
y_train, y_valid = (split['HomeTeam_Result'].map(result_codes) for split in (train, valid))
X_train, X_valid = (split.drop(columns=non_feature_cols) for split in (train, valid))

# No feature scaling is applied in this cell; a StandardScaler step that used to
# follow here is disabled.

In [35]:
# Preview the first rows of the validation feature matrix
X_valid.head()

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# Candidate classifiers compared on the chronological train/validation split.
# The stochastic estimators get a fixed seed so the printed scores are
# reproducible across kernel restarts.
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC()
}

# In notebook order this cell runs BEFORE the one-hot encoding cell, so on a
# fresh "Restart & Run All" X_train/X_valid still contain the raw
# 'HomeTeam'/'AwayTeam' columns (presumably team-name strings, which the
# estimators cannot fit on). Encode them into local copies here; if those
# columns are already gone (the encoding cell was executed first), the frames
# are used unchanged. X_train/X_valid themselves are never modified.
raw_team_cols = [col for col in ('HomeTeam', 'AwayTeam') if col in X_train.columns]
if raw_team_cols:
    X_train_enc = pd.get_dummies(X_train, columns=raw_team_cols)
    # Give the validation frame exactly the training columns: teams absent from
    # validation become all-zero columns, teams unseen in training are dropped.
    X_valid_enc = pd.get_dummies(X_valid, columns=raw_team_cols).reindex(
        columns=X_train_enc.columns, fill_value=0
    )
else:
    X_train_enc, X_valid_enc = X_train, X_valid

for name, model in models.items():
    if name == 'SVC':
        # Only the SVC is trained on min-max scaled features; the scaler is fit
        # on the training split and merely applied to the validation split.
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train_enc)
        X_valid_scaled = scaler.transform(X_valid_enc)
        X_fit, X_eval = X_train_scaled, X_valid_scaled
    else:
        X_fit, X_eval = X_train_enc, X_valid_enc

    model.fit(X_fit, y_train)
    print(name)
    print('Training score:', model.score(X_fit, y_train))
    print('Validation score:', model.score(X_eval, y_valid))
    print()

In [37]:
# One-hot encode the team names.
# This cell overwrites X_train/X_valid, so it is written to be safe to re-run:
# only team columns that are still present get encoded. Running it a second
# time is then a no-op rather than a KeyError on the already-removed columns.
team_name_cols = ['HomeTeam', 'AwayTeam']
train_cols_to_encode = [col for col in team_name_cols if col in X_train.columns]
valid_cols_to_encode = [col for col in team_name_cols if col in X_valid.columns]
if train_cols_to_encode:
    X_train = pd.get_dummies(X_train, columns=train_cols_to_encode)
if valid_cols_to_encode:
    X_valid = pd.get_dummies(X_valid, columns=valid_cols_to_encode)
# Align the columns of X_valid to match X_train: indicator columns missing from
# the validation split are added filled with 0, and columns that exist only in
# the validation split are dropped.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)

In [38]:
# Fit a random forest on the training features and predict the validation split.
# random_state fixes the forest's internal randomness so that y_pred — and the
# confusion matrix / report computed from it — is the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

In [39]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Class codes and display names, in the order of the target encoding used for
# y_train/y_valid: 0 = L (home loss), 1 = D (draw), 2 = W (home win).
class_codes = [0, 1, 2]
class_names = ['L', 'D', 'W']

# Passing `labels` pins the row/column order of the matrix, so the tick labels
# below are guaranteed to line up even if a class is missing from the split.
conf_mat = confusion_matrix(y_valid, y_pred, labels=class_codes)
ax = sns.heatmap(conf_mat, annot=True, fmt='d',
                 xticklabels=class_names, yticklabels=class_names)
# Rows of the matrix are the true classes, columns are the predicted classes.
ax.set(xlabel='Predicted result', ylabel='Actual result',
       title='Validation confusion matrix')
print(classification_report(y_valid, y_pred, labels=class_codes, target_names=class_names))