In [14]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import xgboost as xgb

In [22]:
df_leagues = pd.read_csv('../data/csv/Leagues.csv', index_col=0)

df_leagues.drop(['date', 'league', 'matchweek', 'home_team_id', 'away_team_id', 'home_team', 'away_team', 'home_manager', 'away_manager'], axis=1, inplace=True)
print(f'Nb NaN: {df_leagues.isna().sum().sum()}')

df_leagues['home_team_form'] = df_leagues['home_team_form'].fillna('')
df_leagues['home_team_home_form'] = df_leagues['home_team_home_form'].fillna('')
df_leagues['home_team_average_goals_form'] = df_leagues['home_team_average_goals_form'].fillna(0)
df_leagues['home_team_average_xg_form'] = df_leagues['home_team_average_xg_form'].fillna(0)
df_leagues['home_team_average_goals_form_against'] = df_leagues['home_team_average_goals_form_against'].fillna(0)
df_leagues['home_team_average_xg_form_against'] = df_leagues['home_team_average_xg_form_against'].fillna(0)
df_leagues['away_team_form'] = df_leagues['away_team_form'].fillna('')
df_leagues['away_team_away_form'] = df_leagues['away_team_away_form'].fillna('')
df_leagues['away_team_average_goals_form'] = df_leagues['home_team_average_goals_form'].fillna(0)
df_leagues['away_team_average_xg_form'] = df_leagues['home_team_average_xg_form'].fillna(0)
df_leagues['away_team_average_goals_form_against'] = df_leagues['home_team_average_goals_form_against'].fillna(0)
df_leagues['away_team_average_xg_form_against'] = df_leagues['home_team_average_xg_form_against'].fillna(0)

print(f'Nb NaN: {df_leagues.isna().sum().sum()}')
df_leagues.head()

Nb NaN: 1276
Nb NaN: 0


Unnamed: 0,home_formation,home_goals,home_xg,away_xg,away_goals,away_formation,home_team_form,home_team_league_pos,home_team_points_diff,home_team_home_form,...,away_team_form,away_team_league_pos,away_team_points_diff,away_team_away_form,away_team_away_league_pos,away_team_away_points_diff,away_team_average_goals_form,away_team_average_xg_form,away_team_average_goals_form_against,away_team_average_xg_form_against
0,4-2-3-1,1,2.4,0.4,0,4-2-3-1,,1.0,0,,...,,1.0,0,,1.0,0,0.0,0.0,0.0,0.0
1,4-2-3-1,1,0.8,1.7,3,4-2-3-1,DDLWW,14.0,6,LDW,...,WDLDW,7.0,-2,WDD,2.0,2,1.0,0.633333,1.0,1.533333
2,4-2-3-1,1,1.0,1.2,1,4-3-3,,1.0,0,,...,,1.0,0,,1.0,0,0.0,0.0,0.0,0.0
3,4-2-3-1,2,1.3,0.6,1,4-2-3-1,WWLLD,10.0,-12,DWL,...,LDWLW,8.0,1,LLL,19.0,0,1.666667,1.833333,1.666667,1.3
4,4-2-3-1,1,1.7,1.0,0,3-1-4-2,DWDLD,9.0,-2,DDL,...,DDDLL,18.0,-1,LDL,17.0,-2,0.666667,1.033333,1.0,1.066667


In [23]:
def get_train_test_split(df_leagues: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series):
    df_train, df_test = train_test_split(df_leagues, test_size=0.2, random_state=42)
    #df_train, df_val = train_test_split(df_full_train, test_size=len(df_leagues) * 0.2 / len(df_full_train), random_state=42)

    y_train = df_train['home_goals']
    #y_val = df_val['home_goals']
    y_test = df_test['home_goals']

    del df_train['home_goals']
    #del df_val['home_goals']
    del df_test['home_goals']
    
    return df_train, df_test, y_train, y_test



In [24]:
df_train, df_test, y_train, y_test = get_train_test_split(df_leagues)
df_train.head()

Unnamed: 0,home_formation,home_xg,away_xg,away_goals,away_formation,home_team_form,home_team_league_pos,home_team_points_diff,home_team_home_form,home_team_home_league_pos,...,away_team_form,away_team_league_pos,away_team_points_diff,away_team_away_form,away_team_away_league_pos,away_team_away_points_diff,away_team_average_goals_form,away_team_average_xg_form,away_team_average_goals_form_against,away_team_average_xg_form_against
2,4-2-3-1,0.8,1.3,1,4-2-3-1,LWWDW,11.0,-10,DLW,13.0,...,LWLWL,14.0,0,LWW,9.0,3,2.0,2.133333,2.666667,1.533333
51,4-2-3-1,0.9,0.7,0,4-2-3-1,LDLLL,20.0,-16,DDL,20.0,...,WWLDW,5.0,-7,WLD,7.0,-1,0.0,1.133333,1.0,1.333333
227,4-2-3-1,1.2,0.9,2,3-4-3,LWWLW,3.0,6,WWL,1.0,...,DWWLW,5.0,0,DDW,6.0,0,2.333333,1.166667,1.666667,1.633333
119,4-2-3-1,2.4,1.4,0,3-4-3,LDWLW,13.0,-11,WDL,13.0,...,LWLDL,16.0,5,WWD,16.0,4,2.333333,2.333333,2.666667,1.5
372,4-2-3-1,1.5,1.5,1,4-4-2,DDDWL,8.0,-4,DDD,7.0,...,WWLLW,10.0,-5,LWL,15.0,-6,1.333333,1.733333,1.333333,1.366667


In [25]:
dict_vectorizer = DictVectorizer(sparse=False)

train_serie_dict = df_train.to_dict(orient='records')
test_serie_dict = df_test.to_dict(orient='records')
X_train = dict_vectorizer.fit_transform(train_serie_dict)
X_test = dict_vectorizer.transform(test_serie_dict)

In [88]:
df_results = pd.DataFrame(columns={'max_depth': [], 'rmse': [], 'mae': [], 'r2': []})

def train_decision_tree_model(max_depth: int) -> DecisionTreeRegressor:
    decision_tree_model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    decision_tree_model.fit(X_train, y_train)
    
    y_home_test_pred = decision_tree_model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_home_test_pred)
    mae = mean_absolute_error(y_test, y_home_test_pred)
    r2 = r2_score(y_test, y_home_test_pred)
    
    return rmse, mae, r2

for max_depth in tqdm(range(1, 10)):
    rmse, mae, r2 = train_decision_tree_model(max_depth)
    
    df_temp = pd.DataFrame([{'max_depth': max_depth, 'rmse': rmse, 'mae': mae, 'r2': r2}])
    df_results = pd.concat([df_results, df_temp], ignore_index=True)

df_results

  0%|          | 0/9 [00:00<?, ?it/s]

  df_results = pd.concat([df_results, df_temp], ignore_index=True)


Unnamed: 0,max_depth,rmse,mae,r2
0,1,1.121526,0.907857,0.25653
1,2,1.026887,0.835718,0.37671
2,3,1.011334,0.80579,0.395447
3,4,1.021003,0.817499,0.383832
4,5,1.014446,0.810292,0.391721
5,6,1.047284,0.819271,0.351703
6,7,1.05674,0.812144,0.339943
7,8,1.11643,0.860832,0.263271
8,9,1.158487,0.894672,0.206719


In [83]:

df_leagues_copy = df_leagues.copy()
y = df_leagues_copy['home_goals']
del df_leagues_copy['home_goals']

serie_dict = df_leagues_copy.copy().to_dict(orient='records')
X = dict_vectorizer.fit_transform(serie_dict)

param_grid = {
    'max_depth': range(1, 10),
    'min_samples_split': range(1, 10, 2),
    'min_samples_leaf': range(1, 8, 2),
    'max_features': [None, 'sqrt', 'log2']
}

scoring = ['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring=scoring, refit='r2')
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score (R2):", grid_search.best_score_)

Best parameters: {'max_depth': 3, 'max_features': None, 'min_samples_leaf': 7, 'min_samples_split': 3}
Best cross-validation score (R2): 0.30782822340412086


540 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/base.py", line 666, in _validate

In [90]:
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

y_home_test_pred = random_forest_model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_home_test_pred)
mae = mean_absolute_error(y_test, y_home_test_pred)
r2 = r2_score(y_test, y_home_test_pred)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')

RMSE: 0.9713584008269773
MAE: 0.7804285714285714
R²: 0.44229591598247875


In [None]:
param_grid = {
    'n_estimators': range(10, 301, 50),
    'max_depth': range(1, 10, 2),
    'min_samples_split': range(1, 11, 2)
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {grid_search.best_params_}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Fitting 3 folds for each of 270 candidates, totalling 810 fits
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=60; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=60; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=110; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=210; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=210; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_

162 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/florian/Workspace/Learning/ml_zoomcamp/.venv/lib/python3.13/site-packages/sklearn/base.py", line 666, in _validate_

Meilleurs paramètres : {'max_depth': 6, 'min_samples_split': 7, 'n_estimators': 260}


In [95]:
rmse = root_mean_squared_error(y_test, y_pred_best)
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')

RMSE: 0.9650309018667516
MAE: 0.7741115094448412
R²: 0.44953810042198683


In [29]:
df_results = pd.DataFrame(columns={'max_depth': [], 'min_samples_leaf': [], 'accuracy': []})
for max_depth in tqdm(range(1, 10)):
    decision_tree_classifier = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    decision_tree_classifier.fit(X_train, y_train)

    y_pred = decision_tree_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    df_temp = pd.DataFrame([{'max_depth': max_depth, 'accuracy': accuracy}])
    df_results = pd.concat([df_results, df_temp], ignore_index=True)

df_results.sort_values(by='accuracy', ascending=False)


decision_tree_classifier = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_classifier.fit(X_train, y_train)

y_pred = decision_tree_classifier.predict(X_test)

alter_true = [1 if y_test.iloc[i] >= y_pred[i] else 0 for i in range(len(y_test))]
alter_accuracy = sum(alter_true) / len(y_test)
alter_accuracy

  0%|          | 0/9 [00:00<?, ?it/s]

  df_results = pd.concat([df_results, df_temp], ignore_index=True)


0.65

In [None]:
y_train_one_goal = (y_train >= 1).astype(int)
y_train_two_goal = (y_train >= 2).astype(int)

y_test_one_goal = (y_test >= 1).astype(int)
y_test_two_goal = (y_test >= 2).astype(int)

df_results = pd.DataFrame(columns=['max_depth', 'accuracy_one_goal', 'accuracy_two_goal', 'mean_accuracy'])

for max_depth in tqdm(range(1, 10)):
    # Modèle pour 1 but ou plus
    random_classifier_one = RandomForestClassifier(max_depth=max_depth, random_state=42)
    random_classifier_one.fit(X_train, y_train_one_goal)
    y_pred_one_goal = random_classifier_one.predict(X_test)
    accuracy_one_goal = accuracy_score(y_test_one_goal, y_pred_one_goal)

    # Modèle pour 2 buts ou plus
    random_classifier_two = RandomForestClassifier(max_depth=max_depth, random_state=42)
    random_classifier_two.fit(X_train, y_train_two_goal)
    y_pred_two_goal = random_classifier_two.predict(X_test)
    accuracy_two_goal = accuracy_score(y_test_two_goal, y_pred_two_goal)
    
    mean_accuracy = (accuracy_one_goal + accuracy_two_goal) / 2
    
    # Ajouter les résultats de précision pour chaque modèle
    df_temp = pd.DataFrame([{
        'max_depth': max_depth, 
        'accuracy_one_goal': accuracy_one_goal, 
        'accuracy_two_goal': accuracy_two_goal,
        'mean_accuracy': mean_accuracy
    }])
    
    df_results = pd.concat([df_results, df_temp], ignore_index=True)

# Trier et afficher les résultats
df_results = df_results.sort_values(by='mean_accuracy', ascending=False)
df_results

  0%|          | 0/19 [00:00<?, ?it/s]

  df_results = pd.concat([df_results, df_temp], ignore_index=True)


Unnamed: 0,max_depth,accuracy_one_goal,accuracy_two_goal,mean_accuracy
17,18,0.739286,0.767857,0.753571
16,17,0.742857,0.764286,0.753571
6,7,0.746429,0.760714,0.753571
18,19,0.742857,0.760714,0.751786
14,15,0.746429,0.757143,0.751786
8,9,0.746429,0.75,0.748214
12,13,0.746429,0.75,0.748214
11,12,0.746429,0.75,0.748214
9,10,0.746429,0.75,0.748214
10,11,0.746429,0.746429,0.746429


In [52]:
print(y_train_one_goal.value_counts())
print(y_train_two_goal.value_counts())

home_goals
1    885
0    231
Name: count, dtype: int64
home_goals
0    596
1    520
Name: count, dtype: int64
