In [116]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
from feature_selection.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,
                                    
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

In [117]:
# Keyword arguments shared by every pycaret ``setup`` call in this notebook
# (they are also handed to FeatureSelection through its ``setup_kwargs`` argument).
setup_kwargs = {
    # --- general preprocessing / runtime ---
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,  # fixed seed for the experiment
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": False,
    "multicollinearity_threshold": 0.8,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    # "ignore_features": ignore_features,  (left disabled by the author)
    # --- cross-validation ---
    "fold_strategy": "timeseries",
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    # --- feature selection (disabled) ---
    "feature_selection": False,
    # Author's note: this threshold can be used to keep roughly the 95 % most
    # relevant features. TODO confirm how that relates to the 0.5 set here.
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    # --- dimensionality reduction / feature engineering (all disabled) ---
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    "feature_interaction": False,
    # Author's note (presumably about feature interaction): makes everything
    # very slow; enable only to hunt for potentially interesting features.
    "interaction_threshold": 0.01,
    # --- class balance, logging and verbosity ---
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

In [118]:
from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [119]:
# Name of the target column used for feature selection, training and scoring.
target: str = "goals_1.5"
# NOTE(review): no cell visible in this notebook reads `test_date`; the
# create_dataset call passes its own literal ("20-October-2021") instead.
# Confirm which date is intended.
test_date: str = "15-Oct-2021"

In [120]:
# Build the project's Features object; later cells pass it to create_dataset
# and add_columns_table and read its `.odds` table.
# Judging by this cell's recorded output, construction lists the future-match
# Excel files it visits and reports the empty ones it skips.
feats = Features()

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


In [121]:
# Build the dataset object whose `train_data` / `test_data` frames are used below.
# NOTE(review): the split date is a literal here; the `test_date` variable
# defined in an earlier cell ("15-Oct-2021") is not used. Confirm which date
# is intended.
ds = create_dataset(
    target=target,
    test_date="20-October-2021",
    features=feats,
    odds_features=True,
    include_std=True,
    ignore_features=IGNORE_FEATURES,
)

In [123]:

# Load the raw referee statistics and pass them through the project helper
# (per the original note, it adds the unique match id to the referee table).
# NOTE(review): `add_columns_table` is neither imported nor defined in any cell
# visible here, so this cell depends on hidden kernel state. Confirm its module.
raw_referee_table = pd.read_csv("referee_table.csv")
referee_table = add_columns_table(feat=feats, df=raw_referee_table)

# Tag every referee column with a "_ref" suffix.
referee_table.columns = [name + "_ref" for name in referee_table.columns]

# Columns left out of the join (the original note calls them repeated columns).
referee_cols = referee_table.columns.tolist()
for repeated_col in (
    "competition_ref",
    "referee_ref",
    "season_ref",
    "raw_match_id_ref",
    "date_ref",
):
    referee_cols.remove(repeated_col)

# Inner-join the referee features onto the train / test frames by index;
# rows without a match on both sides are dropped.
referee_features = referee_table[referee_cols]
index_join = dict(left_index=True, right_index=True, how="inner")
referee_train = pd.merge(ds.train_data, referee_features, **index_join)
referee_test = pd.merge(ds.test_data, referee_features, **index_join)

In [124]:
# Remove three columns from the training frame and make sure every "ratio"
# column is numeric.
#
# The drop reassigns instead of mutating in place and tolerates columns that
# are already gone, so re-running this cell no longer raises a KeyError.
# NOTE(review): `referee_test` does not receive the same treatment in any cell
# visible here. Confirm that the fitted pipeline handles the difference.
referee_train = referee_train.drop(
    columns=["hour_rank", "hour_before_16", "is_weekend"],
    errors="ignore",
)
ratio_columns = [col for col in referee_train.columns if "ratio" in col]
referee_train = referee_train.astype(dict.fromkeys(ratio_columns, float))

In [13]:
# Per-metric values handed to FeatureSelection as `filter_metrics`
# (presumably minimum scores a candidate must reach; TODO confirm semantics).
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours (124, 125), so its recorded output may come from an earlier run.
metric_param = dict(
    Accuracy=0.1,
    AUC=0.1,
    Recall=0.1,
    Precision=0.1,
    F1=0.1,
    Kappa=-1.0,
    MCC=-1.0,
)

# Feature selection runs on complete rows only, with logistic regression ("lr")
# and random forest ("rf") as the included estimators.
complete_rows = referee_train.dropna()
feat_sel = FeatureSelection(
    target=target,
    dataset=complete_rows,
    target_features=210,
    filter_metrics=metric_param,
    include=["lr", "rf"],
    setup_kwargs=setup_kwargs,
    optimize=True,
)
selected_features = feat_sel.repeat_pipeline()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6073,0.6306,0.6417,0.6218,0.6316,0.2114,0.2115,0.2365


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5498,0.5691,0.5535,0.5734,0.5633,0.099,0.0991,0.3867


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6199,0.6506,0.6417,0.6366,0.6391,0.2377,0.2377,0.2334


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6199,0.6506,0.6417,0.6366,0.6391,0.2377,0.2377,0.2334


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.4755,0.5,0.0,0.0,0.0,0.0,0.0,0.25


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.4755,0.5,0.0,0.0,0.0,0.0,0.0,0.25


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6199,0.6506,0.6417,0.6366,0.6391,0.2377,0.2377,0.2334


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.561,0.5988,0.5481,0.5874,0.5671,0.1229,0.1232,0.3104


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.561,0.5988,0.5481,0.5874,0.5671,0.1229,0.1232,0.3104


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.561,0.5988,0.5481,0.5874,0.5671,0.1229,0.1232,0.3104


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.561,0.5988,0.5481,0.5874,0.5671,0.1229,0.1232,0.3104


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.561,0.5988,0.5481,0.5874,0.5671,0.1229,0.1232,0.3104


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.568,0.604,0.5802,0.5897,0.5849,0.1347,0.1347,0.3444


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6129,0.6496,0.6203,0.6339,0.627,0.2248,0.2248,0.2328


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5835,0.6074,0.5909,0.6055,0.5981,0.1659,0.166,0.3312


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5835,0.6074,0.5909,0.6055,0.5981,0.1659,0.166,0.3312


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5919,0.6216,0.5802,0.6182,0.5986,0.1843,0.1847,0.287


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5919,0.6216,0.5802,0.6182,0.5986,0.1843,0.1847,0.287


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5919,0.6216,0.5802,0.6182,0.5986,0.1843,0.1847,0.287


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.5975,0.652,0.6176,0.616,0.6168,0.1929,0.1929,0.2347


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6059,0.6564,0.6417,0.6202,0.6307,0.2085,0.2086,0.2321


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.5245,0.5,1.0,0.5245,0.6881,0.0,0.0,0.25


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.5975,0.652,0.6176,0.616,0.6168,0.1929,0.1929,0.2347


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6059,0.6564,0.6417,0.6202,0.6307,0.2085,0.2086,0.2321


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5849,0.6255,0.5829,0.6089,0.5956,0.1695,0.1697,0.284


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6157,0.6616,0.6444,0.6309,0.6376,0.2287,0.2288,0.23


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5905,0.6359,0.5963,0.6126,0.6043,0.1801,0.1801,0.2612


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5905,0.6359,0.5963,0.6126,0.6043,0.1801,0.1801,0.2612


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5877,0.6267,0.5856,0.6117,0.5984,0.1751,0.1753,0.2807


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5877,0.6267,0.5856,0.6117,0.5984,0.1751,0.1753,0.2807


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5905,0.6359,0.5963,0.6126,0.6043,0.1801,0.1801,0.2612


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6269,0.6663,0.6444,0.6444,0.6444,0.2521,0.2521,0.2291


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6269,0.6663,0.6444,0.6444,0.6444,0.2521,0.2521,0.2291


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6031,0.6586,0.6337,0.6188,0.6262,0.2033,0.2033,0.2326


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6269,0.6663,0.6444,0.6444,0.6444,0.2521,0.2521,0.2291


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Random Forest Classifier,0.6031,0.6586,0.6337,0.6188,0.6262,0.2033,0.2033,0.2326


In [125]:
# Columns used for modelling: the selected features, the NEW_FEATURES list and
# the target, restricted to columns that actually exist in referee_train.
# dict.fromkeys de-duplicates while keeping first-seen order; iterating a set
# of strings (as before) can give a different column order in every kernel
# session, which makes the experiment harder to reproduce.
candidate_columns = dict.fromkeys(selected_features + NEW_FEATURES + [target])
new_features = [col for col in candidate_columns if col in referee_train.columns]
referee_train_sub = referee_train[new_features]

# Initialise the pycaret experiment on complete rows only.
_ = setup(data=referee_train_sub.dropna(), target=target, **setup_kwargs)
# Transformed training matrix as seen by pycaret, kept for inspection.
x_train = get_config('X_train')



In [126]:
# Estimators left out of the comparison.
excluded_estimators = ["qda", "knn", "nb"]

# Cross-validate the remaining estimators and keep the five best by AUC.
top_models = compare_models(
    n_select=5,
    sort="AUC",
    exclude=excluded_estimators,
    verbose=True,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6786,0.6756,0.73,0.8307,0.7769,0.2107,0.2163,0.9067
et,Extra Trees Classifier,0.7498,0.6739,0.9319,0.7835,0.8511,0.0999,0.1148,0.39
catboost,CatBoost Classifier,0.7423,0.6724,0.9221,0.7815,0.846,0.0887,0.1021,3.2533
lda,Linear Discriminant Analysis,0.666,0.6705,0.7004,0.8378,0.7628,0.2137,0.2236,0.31
rf,Random Forest Classifier,0.7454,0.6702,0.9081,0.7912,0.8455,0.1409,0.153,0.4133
lightgbm,Light Gradient Boosting Machine,0.7461,0.6686,0.9229,0.7844,0.848,0.1056,0.1198,0.9833
gbc,Gradient Boosting Classifier,0.736,0.6608,0.8934,0.79,0.8385,0.1303,0.1401,1.8567
xgboost,Extreme Gradient Boosting,0.7398,0.6577,0.8991,0.7906,0.8414,0.1337,0.1429,1.0967
ada,Ada Boost Classifier,0.6755,0.6062,0.7802,0.7935,0.7865,0.1102,0.1104,0.6
dt,Decision Tree Classifier,0.6377,0.532,0.7299,0.7837,0.7557,0.0579,0.0588,0.3433


In [127]:
# Tune the best-ranked model from the comparison, optimising for precision.
# (The author left `search_library="optuna"` disabled.)
best_candidate = top_models[0]
tuned = tune_model(best_candidate, optimize="Precision")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6767,0.6884,0.7188,0.84,0.7747,0.2148,0.2232
1,0.6938,0.6922,0.7345,0.8433,0.7851,0.261,0.2686
2,0.656,0.6855,0.6995,0.8256,0.7573,0.1801,0.1875
Mean,0.6755,0.6887,0.7176,0.8363,0.7724,0.2186,0.2265
SD,0.0155,0.0028,0.0143,0.0077,0.0115,0.0331,0.0332


In [128]:
# Calibrate the predicted probabilities of the tuned model (pycaret's
# calibrate_model); the table recorded below is its cross-validation report.
cali = calibrate_model(tuned)

# Finalize the calibrated model; per pycaret's documentation this refits it on
# the full dataset given to setup, hold-out included.
final = finalize_model(cali)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6957,0.69,0.7482,0.8407,0.7917,0.2339,0.2394
1,0.6975,0.6933,0.7444,0.8403,0.7895,0.2595,0.2656
2,0.6597,0.6883,0.7069,0.8247,0.7613,0.1812,0.1878
Mean,0.6843,0.6905,0.7332,0.8352,0.7808,0.2248,0.2309
SD,0.0174,0.0021,0.0186,0.0074,0.0139,0.0326,0.0323


In [129]:
# Open pycaret's interactive evaluation widget for the calibrated model
# (`cali`, i.e. the model before finalize_model was applied).
evaluate_model(cali)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [130]:
# Search for a probability threshold for the finalized model, optimising precision.
# NOTE(review): `thresholded` is not used by any cell visible here; the
# prediction cell passes a fixed probability_threshold=0.8 instead. Confirm
# whether the optimised value was meant to be used.
thresholded = optimize_threshold(final, optimize="precision", grid_interval=0.05)

In [151]:
# Score referee_test with the finalized model, using a fixed 0.8 probability
# cut-off for the positive label.
predictions = predict_model(final, data=referee_test, probability_threshold=0.8)

# (number of rows predicted positive, total number of scored rows)
n_predicted_positive = int((predictions["Label"] == 1).sum())
(n_predicted_positive, len(predictions))

(33, 684)

In [None]:
# TODO: break the metrics down by league and by week
# TODO: add the number of bets
# Always put the total column first, followed by the broken-down metrics

In [152]:
# Metrics computed on the test predictions, in display order.
metrics_list = ["Accuracy", "AUC", "Recall", "Precision", "F1", "Kappa", "MCC"]

# NOTE(review): every metric, AUC included, is computed from the hard `Label`
# column rather than from scores. Confirm that is intended for AUC.
y_true = predictions[target]
y_pred = predictions["Label"]
metrics_dict = {
    name: check_metric(actual=y_true, prediction=y_pred, metric=name)
    for name in metrics_list
}

In [153]:
# One-row summary table of the test metrics, labelled "new_feat_model".
# Building the frame straight from the dict keeps the metric columns numeric;
# the previous cell-by-cell `.loc` fill into an empty frame left them as
# object dtype.
new_model_metrics_df = pd.DataFrame([metrics_dict], index=["new_feat_model"])
new_model_metrics_df

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
new_feat_model,0.2544,0.5219,0.0575,0.9394,0.1084,0.0192,0.0834


In [144]:
# Odds rows that belong to the matches predicted positive.
predicted_positive_ids = predictions[predictions["Label"] == 1].index
x = feats.odds[feats.odds["match_id"].isin(predicted_positive_ids)]

# Keep only the "over" side of the goals market at the 2.5 cutoff.
ix = (x["type"] == "over") & (x["market"] == "goals") & (x["cutoff"] == 2.5)

# Share of those rows whose outcome is <= 2, and share whose outcome is < 2.
over_outcomes = x[ix]["outcome"]
(over_outcomes <= 2).mean(), (over_outcomes < 2).mean()

(0.2631578947368421, 0.09210526315789473)