In [108]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [82]:
# Read the engineered 2021-2022 table, index it by the ('date', 'time') columns
# and sort rows by that index, all in one chained expression.
df = (
    pd.read_csv("../data/03_feature_engineered/2021-2022_engineered.csv")
    .set_index(['date', 'time'])
    .sort_index()
)

In [83]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_2021_2022 = df.copy(deep=True)

In [84]:
# Drop columns about in-game stats (goals, possession, xG, shots, shots on target).
# NOTE(review): presumably removed because they are only known after the match
# has been played — confirm.
in_game_stat_columns = [
    'home_goals', 'away_goals',
    'home_poss', 'away_poss',
    'home_xg', 'away_xg',
    'home_sh', 'away_sh',
    'home_shot_on_target', 'away_shot_on_target',
]

# Build the reduced frame from the untouched `df` rather than dropping in place:
# an in-place drop raises KeyError when this cell is run a second time, because
# the columns are already gone. Deriving from `df` makes the cell idempotent.
df_2021_2022 = df.drop(columns=in_game_stat_columns)

In [None]:
# Preview only the first rows; display.max_columns is set to None, so rendering
# the whole frame produces a very wide, very long output.
df_2021_2022.head()

In [86]:
# Distinct values of the 'home_team' column, in order of first appearance.
team_names = df['home_team'].unique()

# Points table: one entry per team, every team starting at 0 points.
team_points = dict.fromkeys(team_names, 0)

In [None]:
# Keep only the rows whose 'round' value is exactly 'Matchweek 1'.
is_first_matchweek = df_2021_2022['round'].eq('Matchweek 1')
mw1 = df_2021_2022.loc[is_first_matchweek]
mw1

In [88]:

# Reset the table before tallying. Without this, every re-run of the cell adds
# the Matchweek 1 points on top of the previous run's totals.
team_points = dict.fromkeys(team_points, 0)

# Award points for each Matchweek 1 fixture:
#   result == 1  -> 3 points to the home team
#   result == -1 -> 3 points to the away team
#   anything else is treated as a draw -> 1 point each
for match in mw1.itertuples():
    if match.result == 1:
        team_points[match.home_team] += 3
    elif match.result == -1:
        team_points[match.away_team] += 3
    else:
        team_points[match.home_team] += 1
        team_points[match.away_team] += 1

In [90]:
# Discard every row that has a missing value in any column (dropna defaults,
# spelled out explicitly).
df_clean = df_2021_2022.dropna(axis=0, how='any')

In [92]:
# Columns that must not be fed to the model: match identifiers/descriptors and
# the target column itself.
not_features = ['round', 'home_team', 'away_team', 'venue', 'result']

# Every remaining column is a feature; the original column order is preserved.
all_columns = df_2021_2022.columns
features = all_columns[~all_columns.isin(not_features)].tolist()

In [93]:
# Feature matrix and target vector, both taken from the NaN-free frame so they
# share the same index.
X = df_clean.loc[:, features]
y = df_clean.loc[:, 'result']

In [95]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows dated on or after it for testing.
cutoff_date = '2022-02-12'

feature_dates = X.index.get_level_values('date')
target_dates = y.index.get_level_values('date')

X_train = X[feature_dates < cutoff_date]
X_test = X[feature_dates >= cutoff_date]
y_train = y[target_dates < cutoff_date]
y_test = y[target_dates >= cutoff_date]

In [96]:
# Ensure X and y are aligned
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Equal lengths are necessary but not sufficient for alignment ...
assert len(X_train) == len(y_train), "Training data misalignment!"
assert len(X_test) == len(y_test), "Test data misalignment!"

# ... so also require that features and targets carry identical row labels in
# the same order.
assert X_train.index.equals(y_train.index), "Training data misalignment!"
assert X_test.index.equals(y_test.index), "Test data misalignment!"

X_train shape: (214, 61)
y_train shape: (214,)
X_test shape: (156, 61)
y_test shape: (156,)


In [None]:
# Splitter for cross-validation: 4 time-ordered folds.
tcsv = TimeSeriesSplit(n_splits=4)

# Search space: 3 values for each of 4 hyperparameters (3**4 = 81 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the fixed seed keeps the forest reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy on the time-series
# folds, using all available cores and reporting progress.
search_settings = {
    "cv": tcsv,
    "scoring": 'accuracy',
    "n_jobs": -1,
    "verbose": 1,
}
grid_search = GridSearchCV(rf_model, param_grid, **search_settings)

In [123]:
# Run the search on the training split only; the test split is not touched here.
print("Starting hyperparameter tuning with Time Series Cross-Validation...")
grid_search = grid_search.fit(X_train, y_train)  # fit() hands back the same search object
print("Tuning complete.")

Starting hyperparameter tuning with Time Series Cross-Validation...
Fitting 4 folds for each of 81 candidates, totalling 324 fits
Tuning complete.


In [None]:

print(f"Best Hyperparameters: {grid_search.best_params_}")

# Estimator built with the best hyperparameter combination found by the search.
best_rf_model = grid_search.best_estimator_

# Score the tuned model once on the held-out test split.
final_predictions = best_rf_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_predictions)
final_f1 = f1_score(y_test, final_predictions, average='macro')

print("\nFinal Model Performance:")
print(f"Accuracy: {final_accuracy:.2%}")
print(f"Macro F1-Score: {final_f1:.2%}")

# Pin the confusion-matrix label order to the classes the model was trained on
# (scikit-learn stores `classes_` sorted ascending). Without `labels=`, the
# matrix only covers labels that happen to occur in y_test or the predictions,
# so a missing class would change its shape and break the hard-coded names.
# NOTE(review): the names below assume the sorted classes are
# away win (-1) < draw < home win (1) — confirm the draw encoding.
class_labels = list(best_rf_model.classes_)
assert len(class_labels) == 3, f"Expected 3 result classes, got {class_labels}"

cm = confusion_matrix(y_test, final_predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=['Actual: Away Win', 'Actual: Draw', 'Actual: Home Win'],
    columns=['Pred: Away', 'Pred: Draw', 'Pred: Home']
)
cm_df


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}

Final Model Performance:
Accuracy: 53.21%
Macro F1-Score: 46.77%


Unnamed: 0,Pred: Away,Pred: Draw,Pred: Home
Actual: Away Win,25,13,19
Actual: Draw,8,7,12
Actual: Home Win,11,10,51
