In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_leagues = pd.read_csv('../data/csv/Leagues.csv', index_col=0)

df_leagues.drop(['date', 'league', 'matchweek', 'home_team_id', 'away_team_id', 'home_team', 'away_team', 'home_manager', 'away_manager'], axis=1, inplace=True)
print(f'Nb NaN: {df_leagues.isna().sum().sum()}')

df_leagues['home_team_form'] = df_leagues['home_team_form'].fillna('')
df_leagues['home_team_home_form'] = df_leagues['home_team_home_form'].fillna('')
df_leagues['home_team_average_goals_form'] = df_leagues['home_team_average_goals_form'].fillna(0)
df_leagues['home_team_average_xg_form'] = df_leagues['home_team_average_xg_form'].fillna(0)
df_leagues['home_team_average_goals_form_against'] = df_leagues['home_team_average_goals_form_against'].fillna(0)
df_leagues['home_team_average_xg_form_against'] = df_leagues['home_team_average_xg_form_against'].fillna(0)
df_leagues['away_team_form'] = df_leagues['away_team_form'].fillna('')
df_leagues['away_team_away_form'] = df_leagues['away_team_away_form'].fillna('')
df_leagues['away_team_average_goals_form'] = df_leagues['home_team_average_goals_form'].fillna(0)
df_leagues['away_team_average_xg_form'] = df_leagues['home_team_average_xg_form'].fillna(0)
df_leagues['away_team_average_goals_form_against'] = df_leagues['home_team_average_goals_form_against'].fillna(0)
df_leagues['away_team_average_xg_form_against'] = df_leagues['home_team_average_xg_form_against'].fillna(0)

print(f'Nb NaN: {df_leagues.isna().sum().sum()}')
df_leagues.head()

Nb NaN: 2960
Nb NaN: 0


Unnamed: 0,home_formation,home_goals,home_xg,away_xg,away_goals,away_formation,home_team_form,home_team_league_pos,home_team_points_diff,home_team_home_form,...,away_team_form,away_team_league_pos,away_team_points_diff,away_team_away_form,away_team_away_league_pos,away_team_away_points_diff,away_team_average_goals_form,away_team_average_xg_form,away_team_average_goals_form_against,away_team_average_xg_form_against
0,4-2-3-1,1,2.4,0.4,0,4-2-3-1,,1.0,0,,...,,1.0,0,,1.0,0,0.0,0.0,0.0,0.0
1,4-2-3-1,1,0.8,1.7,3,4-2-3-1,DDLWW,14.0,6,LDW,...,WDLDW,7.0,-2,WDD,2.0,2,1.0,0.633333,1.0,1.533333
2,4-2-3-1,1,1.0,1.2,1,4-3-3,,1.0,0,,...,,1.0,0,,1.0,0,0.0,0.0,0.0,0.0
3,4-2-3-1,2,1.3,0.6,1,4-2-3-1,WWLLD,10.0,-12,DWL,...,LDWLW,8.0,1,LLL,19.0,0,1.666667,1.833333,1.666667,1.3
4,4-2-3-1,1,1.7,1.0,0,3-1-4-2,DWDLD,9.0,-2,DDL,...,DDDLL,18.0,-1,LDL,17.0,-2,0.666667,1.033333,1.0,1.066667


In [160]:
def get_train_test_split(df_leagues: pd.DataFrame, goal_type:str = 'home') -> (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series):
    df_train, df_test = train_test_split(df_leagues, test_size=0.2, random_state=42)

    y_train = df_train[f'{goal_type}_goals']
    y_test = df_test[f'{goal_type}_goals']

    del df_train[f'{goal_type}_goals']
    del df_test[f'{goal_type}_goals']
    
    return df_train, df_test, y_train, y_test



In [161]:
df_train, df_test, y_train, y_test = get_train_test_split(df_leagues)
df_train.head()

Unnamed: 0,home_formation,home_xg,away_xg,away_goals,away_formation,home_team_form,home_team_league_pos,home_team_points_diff,home_team_home_form,home_team_home_league_pos,...,away_team_form,away_team_league_pos,away_team_points_diff,away_team_away_form,away_team_away_league_pos,away_team_away_points_diff,away_team_average_goals_form,away_team_average_xg_form,away_team_average_goals_form_against,away_team_average_xg_form_against
57,4-2-3-1,2.0,2.4,2,5-3-2,LDDDL,20.0,-16,LDD,20.0,...,WWDLW,14.0,4,WDL,11.0,6,0.666667,0.933333,1.666667,1.333333
303,4-4-2,2.9,0.2,1,3-5-2,DWDWW,6.0,-8,WWW,4.0,...,DLWWW,2.0,3,DLW,7.0,1,3.0,2.666667,0.666667,0.833333
165,4-4-2,0.5,0.8,0,4-4-2,DWDWL,8.0,-5,LWW,6.0,...,DLLDL,18.0,-6,LLL,13.0,4,1.0,1.033333,0.333333,0.433333
100,3-4-3,1.0,1.9,2,3-5-2,WWDLW,10.0,7,WWL,14.0,...,LLWLL,16.0,-7,LLL,16.0,-9,2.666667,1.766667,1.666667,1.9
255,3-5-2,2.5,1.1,0,3-3-3-1,WLWDD,15.0,0,DWW,10.0,...,LLDLD,17.0,-2,DLD,15.0,5,2.0,1.9,0.666667,1.2


In [None]:
def get_X_train_test(df_train:pd.DataFrame, df_test:pd.DataFrame):
    dict_vectorizer = DictVectorizer(sparse=False)

    train_serie_dict = df_train.to_dict(orient='records')
    test_serie_dict = df_test.to_dict(orient='records')

    X_train = dict_vectorizer.fit_transform(train_serie_dict)
    X_test = dict_vectorizer.transform(test_serie_dict)
    
    return X_train, X_test

In [6]:
df_results = pd.DataFrame(columns={'max_depth': [], 'rmse': [], 'mae': [], 'r2': []})

def train_decision_tree_model(max_depth: int) -> DecisionTreeRegressor:
    decision_tree_model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    decision_tree_model.fit(X_train, y_train)
    
    y_home_test_pred = decision_tree_model.predict(X_test)
    
    rmse = root_mean_squared_error(y_test, y_home_test_pred)
    mae = mean_absolute_error(y_test, y_home_test_pred)
    r2 = r2_score(y_test, y_home_test_pred)
    
    return rmse, mae, r2

for max_depth in tqdm(range(1, 10)):
    rmse, mae, r2 = train_decision_tree_model(max_depth)
    
    df_temp = pd.DataFrame([{'max_depth': max_depth, 'rmse': rmse, 'mae': mae, 'r2': r2}])
    df_results = pd.concat([df_results, df_temp], ignore_index=True)

df_results

  df_results = pd.concat([df_results, df_temp], ignore_index=True)
100%|██████████| 9/9 [00:00<00:00, 43.70it/s]


Unnamed: 0,max_depth,rmse,mae,r2
0,1,1.160539,0.915236,0.27838
1,2,1.093608,0.878318,0.359215
2,3,1.083805,0.864697,0.370651
3,4,1.086357,0.856927,0.367685
4,5,1.122468,0.857173,0.324949
5,6,1.167929,0.86484,0.269161
6,7,1.158093,0.863705,0.281419
7,8,1.180695,0.868354,0.253097
8,9,1.197485,0.878613,0.231703


In [7]:

df_leagues_copy = df_leagues.copy()
y = df_leagues_copy['home_goals']
del df_leagues_copy['home_goals']

serie_dict = df_leagues_copy.copy().to_dict(orient='records')
X = dict_vectorizer.fit_transform(serie_dict)

param_grid = {
    'max_depth': range(1, 10),
    'min_samples_split': range(1, 10, 2),
    'min_samples_leaf': range(1, 8, 2),
    'max_features': [None, 'sqrt', 'log2']
}

scoring = ['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid, cv=5, scoring=scoring, refit='r2')
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score (R2):", grid_search.best_score_)

Best parameters: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 7, 'min_samples_split': 3}
Best cross-validation score (R2): 0.339525586610933


540 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/base.py", l

In [8]:
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)

y_home_test_pred = random_forest_model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_home_test_pred)
mae = mean_absolute_error(y_test, y_home_test_pred)
r2 = r2_score(y_test, y_home_test_pred)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')

RMSE: 1.0673339414763057
MAE: 0.8248659517426273
R²: 0.3896352863250785


In [9]:
param_grid = {
    'n_estimators': range(10, 301, 50),
    'max_depth': range(1, 10, 2),
    'min_samples_split': range(1, 11, 2)
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {grid_search.best_params_}")

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=110; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=60; total time=   0.0s
[CV] END ..max_depth=1, min_samples_split=1, n_estimators=10; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=160; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=210; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=210; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_estimators=210; total time=   0.0s
[CV] END .max_depth=1, min_samples_split=1, n_

90 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/florianbaudin/.local/share/virtualenvs/env-y2vBCjKW/lib/python3.13/site-packages/sklearn/base.py", line

Meilleurs paramètres : {'max_depth': 7, 'min_samples_split': 9, 'n_estimators': 260}


In [10]:
rmse = root_mean_squared_error(y_test, y_pred_best)
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')

RMSE: 1.0552176474869035
MAE: 0.8244992243727136
R²: 0.4034142588704742


In [84]:
y_train_one_goal = (y_train >= 1).astype(int)
y_train_two_goal = (y_train >= 2).astype(int)

y_test_one_goal = (y_test >= 1).astype(int)
y_test_two_goal = (y_test >= 2).astype(int)


df_results = pd.DataFrame(columns={'accuracy': [], 'accuracy_one': [], 'accuracy_two': []})

decision_tree_classifier = DecisionTreeClassifier(max_depth=3, random_state=42)
decision_tree_classifier.fit(X_train, y_train_one_goal)
y_pred = decision_tree_classifier.predict_proba(X_test)[:, 1]

decision_tree_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)
decision_tree_classifier.fit(X_train, y_train_two_goal)
y_pred = decision_tree_classifier.predict_proba(X_test)[:, 1]

accuracy_one = accuracy_score(y_test_one_goal, y_pred >= 0.1 )
accuracy_two = accuracy_score(y_test_two_goal, y_pred >= 0.6 )

df_results = pd.DataFrame([{'accuracy_one': accuracy_one, 'accuracy_two': accuracy_two, 'accuracy': ((accuracy_one + accuracy_two) / 2)}])
#df_results = pd.concat([df_results, df_temp], ignore_index=True)

df_results = df_results.sort_values(by='accuracy_two', ascending=False)
df_results

Unnamed: 0,accuracy_one,accuracy_two,accuracy
0,0.753351,0.743968,0.74866


In [85]:
y_train_one_goal = (y_train >= 1).astype(int)
y_train_two_goal = (y_train >= 2).astype(int)

y_test_one_goal = (y_test >= 1).astype(int)
y_test_two_goal = (y_test >= 2).astype(int)

df_results = pd.DataFrame(columns=['max_depth', 'min_samples_leaf', 'accuracy_one_goal', 'accuracy_two_goal', 'mean_accuracy'])

y_pred_one_goal = []
y_pred_two_goal = []

#for min_samples_leaf in tqdm(range(1, 51)):
    
# Modèle pour 1 but ou plus    
random_classifier_one = RandomForestClassifier(max_depth=23, random_state=42)
random_classifier_one.fit(X_train, y_train_one_goal)
y_pred_one_goal = random_classifier_one.predict_proba(X_test)[:, 1] >= 0.63
accuracy_one_goal = accuracy_score(y_test_one_goal, y_pred_one_goal)

# Modèle pour 2 buts ou plus
random_classifier_two = RandomForestClassifier(max_depth=28, random_state=42)
random_classifier_two.fit(X_train, y_train_two_goal)
y_pred_two_goal = random_classifier_two.predict_proba(X_test)[:, 1] >= 0.52
accuracy_two_goal = accuracy_score(y_test_two_goal, y_pred_two_goal)

mean_accuracy = (accuracy_one_goal + accuracy_two_goal) / 2
mean_accuracy
    
#    # Ajouter les résultats de précision pour chaque modèle
#    df_temp = pd.DataFrame([{
#        'max_depth': max_depth,
#        'min_samples_leaf': min_samples_leaf,
#        'accuracy_one_goal': accuracy_one_goal, 
#        'accuracy_two_goal': accuracy_two_goal,
#        'mean_accuracy': mean_accuracy
#    }])
#    
#    df_results = pd.concat([df_results, df_temp], ignore_index=True)
#
## Trier et afficher les résultats
#df_results = df_results.sort_values(by='mean_accuracy', ascending=False)
#df_results

0.7660857908847185

In [92]:
dict_vectorizer = DictVectorizer(sparse=False)

train_serie_dict = df_train.to_dict(orient='records')
test_serie_dict = df_test.to_dict(orient='records')

X_train = dict_vectorizer.fit_transform(train_serie_dict)
X_test = dict_vectorizer.transform(test_serie_dict)

feature_importances = random_classifier_one.feature_importances_
print(len(feature_importances))
print(len(list(dict_vectorizer.feature_names_)))

feature_importances_df = pd.DataFrame({
    'Feature': list(dict_vectorizer.feature_names_),
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
feature_importances_df.head(10)

815
815


Unnamed: 0,Feature,Importance
814,home_xg,0.107197
66,away_team_away_league_pos,0.042493
810,home_team_home_league_pos,0.035764
407,away_xg,0.027103
24,away_team_average_xg_form,0.026748
813,home_team_points_diff,0.026255
405,away_team_league_pos,0.025895
430,home_team_average_xg_form,0.025795
67,away_team_away_points_diff,0.024743
812,home_team_league_pos,0.022977


In [56]:
def value_count(arr):
    values, counts = np.unique(arr, return_counts=True)
    for value, count in zip(values, counts):
        print(f"Valeur: {value}, Count: {count}")

print(y_test_one_goal.value_counts())
print(value_count(y_pred_one_goal))
print(y_test_two_goal.value_counts())
print(value_count(y_pred_two_goal))

home_goals
1    564
0    182
Name: count, dtype: int64
Valeur: False, Count: 79
Valeur: True, Count: 667
None
home_goals
0    414
1    332
Name: count, dtype: int64
Valeur: False, Count: 472
Valeur: True, Count: 274
None


In [119]:
dtrain_one_goal = xgb.DMatrix(X_train, y_train_one_goal)
dtest_one_goal = xgb.DMatrix(X_test, y_test_one_goal)

dtrain_two_goal = xgb.DMatrix(X_train, y_train_two_goal)
dtest_two_goal = xgb.DMatrix(X_test, y_test_two_goal)

params_one_goal = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 28,
    "eta": 0.1,
    #"subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

params_two_goal = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 28,
    "eta": 0.1,
    #"subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

model_one_goal = xgb.train(params_one_goal, dtrain_one_goal, num_boost_round=100)
model_two_goal = xgb.train(params_two_goal, dtrain_two_goal, num_boost_round=100)

y_pred_one_goal = (model_one_goal.predict(dtest_one_goal) >= 0.5).astype(int)
y_pred_two_goal = (model_two_goal.predict(dtest_two_goal) >= 0.5).astype(int)

accuracy_one_goal = accuracy_score(y_test_one_goal, y_pred_one_goal)
accuracy_two_goal = accuracy_score(y_test_two_goal, y_pred_two_goal)

print(f"Accuracy for One Goal: {accuracy_one_goal:.2f}")
print(f"Accuracy for Two Goals: {accuracy_two_goal:.2f}")

Accuracy for One Goal: 0.78
Accuracy for Two Goals: 0.73


In [None]:
df_train, df_test, y_train, y_test = get_train_test_split(df_leagues, 'home')
X_train, X_test = get_X_train_test(df_train, df_test)

y_train_one_goal = (y_train == 1).astype(int)
y_train_two_goal = (y_train == 2).astype(int)
y_train_three_goal = (y_train == 3).astype(int)

y_test_one_goal = (y_test == 1).astype(int)
y_test_two_goal = (y_test == 2).astype(int)
y_test_three_goal = (y_test == 3).astype(int)

model = xgb.XGBClassifier(random_state=42)

param_grid = {
    "max_depth": range(1,35, 5),
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [50, 100, 200],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

grid_search = GridSearchCV(
    estimator = model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

def get_model(grid_search: GridSearchCV, y_train: [int]):
    grid_search.fit(X_train, y_train)

    print('---------------------------------------------------------------------------')
    print("Meilleurs hyperparamètres :", grid_search.best_params_)
    print("Meilleur score :", grid_search.best_score_)

    return grid_search.best_estimator_

model_one_goal = get_model(grid_search=grid_search, y_train=y_train_one_goal)
model_two_goal = get_model(grid_search=grid_search, y_train=y_train_two_goal)
model_three_goal = get_model(grid_search=grid_search, y_train=y_train_three_goal)

y_pred_one_goal = model_one_goal.predict(X_test)
y_pred_two_goal = model_two_goal.predict(X_test)
y_pred_three_goal = model_three_goal.predict(X_test)

accuracy_one = accuracy_score(y_test_one_goal, y_pred_one_goal)
accuracy_two = accuracy_score(y_test_two_goal, y_pred_two_goal)
accuracy_three = accuracy_score(y_test_three_goal, y_pred_three_goal)

print(f'Accuracy: 1 Goals -> {accuracy_one} | 2 Goals -> {accuracy_two} | 3 Goals -> {accuracy_three}')

Fitting 5 folds for each of 252 candidates, totalling 1260 fits
---------------------------------------------------------------------------
Meilleurs hyperparamètres : {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 21, 'n_estimators': 100, 'subsample': 0.8}
Meilleur score : 0.6761632547525098
Fitting 5 folds for each of 252 candidates, totalling 1260 fits




---------------------------------------------------------------------------
Meilleurs hyperparamètres : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 21, 'n_estimators': 100, 'subsample': 1.0}
Meilleur score : 0.7546097377266646
Accuracy: 1+ Goals -> 0.6809651474530831 | 2+ Goals -> 0.7654155495978552


In [159]:
def find_best_threshold(model:xgb.XGBClassifier, y_test:[int]):
    df_result = pd.DataFrame(columns={'threshold': [], 'accuracy': []})
    for threshold in np.linspace(0,1, 20):
        y_proba = (model.predict_proba(X_test)[:,1] >= threshold).astype(int)
        threshold_accuracy = accuracy_score(y_test, y_proba)
        
        df_temp = pd.DataFrame([{'threshold': threshold, 'accuracy': threshold_accuracy}])
        if df_result.empty:
            df_result = df_temp
        else:
            df_result = pd.concat([df_result, df_temp])
        
    df_result = df_result.sort_values('accuracy', ascending=False)
    print(df_result.head(3))
    
    return df_result['threshold'].iloc[1]

best_threshold_one_goal = find_best_threshold(model_one_goal, y_test_one_goal)
best_threshold_two_goal = find_best_threshold(model_two_goal, y_test_two_goal)

y_proba_one_goal = (model_one_goal.predict_proba(X_test)[:,1] >= best_threshold_one_goal).astype(int)
one_goal_accuracy = accuracy_score(y_test_one_goal, y_proba_one_goal)

y_proba_two_goal = (model_two_goal.predict_proba(X_test)[:,1] >= best_threshold_two_goal).astype(int)
two_goal_accuracy = accuracy_score(y_test_two_goal, y_proba_two_goal)

print(f'1+ Accuracy: {one_goal_accuracy} - 2+ Accuracy: {two_goal_accuracy}')

   threshold  accuracy
0   1.000000  0.689008
0   0.578947  0.689008
0   0.947368  0.689008
   threshold  accuracy
0   0.526316  0.765416
0   0.578947  0.765416
0   0.947368  0.765416
1+ Accuracy: 0.6890080428954424 - 2+ Accuracy: 0.7654155495978552


In [158]:
print('1+ Accuracy: 0.7841823056300268 - 2+ Accuracy: 0.760053619302949')
print('1  Accuracy: 0.6890080428954424 - 2  Accuracy: 0.7654155495978552')

1+ Accuracy: 0.7841823056300268 - 2+ Accuracy: 0.760053619302949
1  Accuracy: 0.6890080428954424 - 2  Accuracy: 0.7654155495978552
