# Imports 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
from ml_bets.supplementary.functions import create_dataset, add_targets, add_columns_table, train_test_split, compute_seasons 
from feature_selection.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric,
    create_model,
    finalize_model,
    optimize_threshold,
                                    
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

# Parameters 

In [2]:
# Target column to model and the date handed to the train/test split helpers.
user_target = 'goals_2.5'
user_date = "20-October-2021"
# The goal cutoff is whatever follows the last underscore of the target name.
cutoff = user_target.rsplit("_", 1)[-1]

# Per-metric values passed to FeatureSelection as `filter_metrics`.
# TODO: placeholder values -- these parameters still need to be rewritten.
metric_param = {
    "Accuracy": 0.1,
    "AUC": 0.1,
    "Recall": 0.1,
    "Precision": 0.1,
    "F1": 0.1,
    "Kappa": -1.0,
    "MCC": -1.0,
}
# Metrics reported against the test split (same names, same order as above).
metrics_list = list(metric_param)

# Columns that are not model features.
ignore_features = ['month', 'date', 'competition', 'month_start_or_end', 'season']
# Cross-validation strategy forwarded to pycaret's setup.
fold_strategy = "timeseries"

### Pycaret setup kwargs 

In [87]:
# Keyword arguments forwarded to pycaret's `setup` for the standard model.
setup_kwargs = {
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": False,
    "multicollinearity_threshold": 0.99,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    # `ignore_features` is intentionally not passed here (it was commented out);
    # the non-feature columns are dropped from the data before calling setup.
    "fold_strategy": fold_strategy,
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    "feature_selection": False,
    # Author's note: the threshold can be used to keep only the most relevant features.
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    # Author's note: feature interaction makes everything very slow; enable it only
    # to look for potentially interesting features.
    "feature_interaction": False,
    "interaction_threshold": 0.01,
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

# Dataset creation (standard)

In [83]:
# Build the base dataset and attach the target columns for the configured cutoff.
df = add_targets(df=create_dataset(), cutoff=cutoff)
# Train-test split: `df` is the table to evaluate, `user_date` the separation date.
train, test = train_test_split(df=df, sep=user_date)

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls
/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Run this cell if target is boolean

In [84]:
# Cast the target column to int in both splits (needed when the target is boolean).
train, test = (split.astype({user_target: int}) for split in (train, test))

## Pycaret evaluation 

In [85]:
# Dtypes whose columns are declared to pycaret as numeric features.
numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]
# The target and the non-feature columns must not be listed as numeric features:
# the target is never a feature, and the `ignore_features` columns (the same names
# the setup cell drops from `train`) are not in the data handed to `setup`.
# PyCaret 2.x rejects forced-type columns that are the target or absent from the data.
not_features = set(ignore_features) | {user_target}
num_cols = [
    col
    for col in train.select_dtypes(include=numerics).columns
    if col not in not_features
]
setup_kwargs['numeric_features'] = num_cols

In [88]:
# Drop the non-feature columns, then initialise the pycaret experiment on the rest.
standard_setup_data = train.drop(
    columns=['month', 'date', 'competition', 'month_start_or_end', 'season']
)
_ = setup(data=standard_setup_data, target=user_target, **setup_kwargs)
# Keep a handle on pycaret's `X_train` config entry for inspection.
x_train = get_config('X_train')

In [89]:
# Compare the candidate classifiers and keep the five best, ranked by AUC.
skipped_models = ["qda", "knn", "nb", 'catboost']
top_models = compare_models(n_select=5, sort='AUC', exclude=skipped_models, verbose=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6007,0.6449,0.6258,0.6218,0.6233,0.1957,0.196,0.1933
lda,Linear Discriminant Analysis,0.5977,0.6342,0.6198,0.6196,0.6191,0.1897,0.1901,0.08
rf,Random Forest Classifier,0.594,0.6308,0.6388,0.6117,0.6233,0.1787,0.1798,0.1867
gbc,Gradient Boosting Classifier,0.5841,0.6253,0.6164,0.6035,0.6088,0.161,0.1617,1.28
lightgbm,Light Gradient Boosting Machine,0.5946,0.6245,0.6264,0.614,0.6195,0.1825,0.1829,0.43
et,Extra Trees Classifier,0.5915,0.6167,0.6371,0.608,0.6218,0.1754,0.1759,0.17
xgboost,Extreme Gradient Boosting,0.594,0.6163,0.6228,0.613,0.6175,0.1823,0.1826,0.7267
ada,Ada Boost Classifier,0.5761,0.5968,0.598,0.5972,0.5972,0.1471,0.1473,0.3067
dt,Decision Tree Classifier,0.5397,0.5369,0.5753,0.5616,0.568,0.074,0.0741,0.11
dummy,Dummy Classifier,0.4726,0.5,0.0,0.0,0.0,0.0,0.0,0.07


In [99]:
# Open pycaret's interactive evaluation widget for the first (top-ranked) model.
evaluate_model(top_models[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [98]:
# Quick look at which competitions are present in the training split.
train["competition"].unique()

array(['german_bundesliga', 'spanish_la_liga', 'english_premier_league',
       'italian_serie_a', 'french_ligue_1', 'us_major_league_soccer',
       'mexican_primera'], dtype=object)

## Finalize model

In [94]:
# Finalize the preferred model. top_models[0], the best model by the AUC ranking
# requested in compare_models, is used by default; change the index to pick another.
final = finalize_model(top_models[0])

## Metrics against test

In [95]:
# Score the test split with the finalized model; the resulting frame's 'Label'
# column is what the metrics cell below compares against the target.
predictions = predict_model(final, data=test)

In [96]:
# Evaluate every metric in `metrics_list` on the test predictions.
# NOTE(review): all metrics, including AUC, are computed from the hard 'Label'
# predictions rather than from scores -- confirm this is intended.
metrics_dict = {}
for metric_name in metrics_list:
    metrics_dict[metric_name] = check_metric(
        actual=predictions[user_target],
        prediction=predictions['Label'],
        metric=metric_name,
    )

In [97]:
# One-row summary of the test metrics for the standard model.
# The frame is built directly from the metrics dict (one column per metric, in
# dict order) so the columns get a numeric dtype, instead of creating an empty
# frame and filling it cell by cell.
metrics_df = pd.DataFrame(metrics_dict, index=['standard_model'])
metrics_df

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
standard_model,0.6121,0.6159,0.5597,0.6611,0.6062,0.2292,0.2323


# Dataset creation (with new columns)

### Preparation 

In [23]:
features = Features()  # Features(output=features_path)

# Rows must have at least this many non-NA values to be kept.
min_non_na = 107
prob_col = f"prob_under_goals_{cutoff}"

# Build the examples, discard rows without the under-goals probability for the
# configured cutoff, then discard rows that are too sparse.
examples = features.create()
has_prob = ~examples[prob_col].isna()
examples = examples[has_prob].dropna(axis=0, thresh=min_non_na)

# Initialize the PipelineDatasets class; it exposes the train/test splits.
pipe_ds = PipelineDatasets(
    features=features,
    target=user_target,
    examples=examples,
    test_size=user_date,
)
# Apply the same sparsity filter to each split.
n_train = pipe_ds.train_data.dropna(axis=0, thresh=min_non_na)
n_test = pipe_ds.test_data.dropna(axis=0, thresh=min_non_na)

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Run this cell if target is boolean

In [28]:
# Drop test rows whose target is missing so the int cast in the next cell can succeed.
# Uses `user_target` (currently 'goals_2.5') instead of a hard-coded column name so
# this cell stays in sync with the rest of the notebook.
n_test = n_test[~n_test[user_target].isna()]

In [29]:
# Cast the target column to int in both splits (needed when the target is boolean).
n_train, n_test = (part.astype({user_target: int}) for part in (n_train, n_test))

## Referee table: merging dataframes

In [30]:
# Location of the referee table inside the project tree, relative to the working directory.
current_path = os.getcwd()
path = Path(current_path).parent / 'data/features/referee_table.csv'
# Read the copy in the working directory when it exists (the original behaviour);
# otherwise fall back to the project location computed above, which was previously
# built but never used.
local_copy = Path("referee_table.csv")
raw_referee_table = pd.read_csv(local_copy if local_copy.exists() else path)

In [31]:
# Enrich the raw referee table using the Features object.
# NOTE(review): the merges below join on the index, so this step presumably leaves
# the match id as the index -- confirm against add_columns_table.
referee_table = add_columns_table(feat=features, df=raw_referee_table) # adds a unique match id to the referee table (per the original note)

In [63]:
# Referee columns to bring in, minus those flagged as repeated in the match data.
referee_cols = referee_table.columns.tolist()
for repeated in ('competition', 'referee', 'date'):
    referee_cols.remove(repeated)
referee_features = referee_table[referee_cols]

# Inner-join on the index: only rows present in both tables are kept.
referee_train = n_train.merge(  # referee data + old train data
    referee_features, left_index=True, right_index=True, how='inner'
)
referee_test = n_test.merge(  # referee data + old test data
    referee_features, left_index=True, right_index=True, how='inner'
)

In [64]:
# Remove the columns that should not reach feature selection.
non_feature_cols = ["date", "month", "competition", "hour_rank", "hour_before_16"]
referee_train = referee_train.drop(columns=non_feature_cols)
# Also remove every column whose name contains "_std".
std_cols = [name for name in referee_train.columns if "_std" in name]
referee_train = referee_train.drop(columns=std_cols)

In [40]:
"competition" in referee_test.columns

True

## Selection of features

In [65]:
# Run the project's feature-selection pipeline on the merged training data.
# NOTE(review): the exact meaning of `target_features` and `include` is defined by
# FeatureSelection; "lr" and "et" match the two model types shown in the output.
feat_sel = FeatureSelection(
    target=user_target,
    dataset=referee_train,
    target_features=100,
    filter_metrics=metric_param,
    include=["lr", "et"],
)
# Features retained by the pipeline; used below to subset train and test.
selected_features = feat_sel.repeat_pipeline()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Extra Trees Classifier,0.5877,0.6357,0.6657,0.5656,0.6116,0.1783,0.181,0.2368


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5534,0.5982,0.5478,0.5417,0.5447,0.1065,0.1066,0.3149


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Extra Trees Classifier,0.5767,0.6248,0.6348,0.558,0.594,0.1557,0.1571,0.2388


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5945,0.6295,0.5871,0.5838,0.5854,0.1887,0.1887,0.2647


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5932,0.6428,0.6264,0.5762,0.6003,0.1875,0.1882,0.2445


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Extra Trees Classifier,0.5863,0.6227,0.6124,0.5707,0.5908,0.1736,0.174,0.2405


In [67]:
# Number of features retained by the selection step.
len(selected_features)

52

### Parameters 

In [46]:
# Non-feature columns for the merged (referee) dataset. The `ignore_features`
# argument is commented out in the setup kwargs below, so this list is informational.
ignore_features = [
    'month',
    'date',
    'competition',
    'season_x',
    'month_start_or_end',
    'raw_match_id',
    'season_y',
]

### Pycaret setup kwargs 

In [68]:
# Keyword arguments forwarded to pycaret's `setup` for the model with the new
# (referee) features. Unlike the standard-model kwargs earlier in the notebook,
# `remove_multicollinearity` is enabled here.
setup_kwargs = {
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": True,
    "multicollinearity_threshold": 0.99,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    # `ignore_features` is intentionally not passed here (it was commented out);
    # the data is already restricted to the selected features before setup.
    "fold_strategy": fold_strategy,
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    "feature_selection": False,
    # Author's note: the threshold can be used to keep only the most relevant features.
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    # Author's note: feature interaction makes everything very slow; enable it only
    # to look for potentially interesting features.
    "feature_interaction": False,
    "interaction_threshold": 0.01,
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

## Pycaret evaluation

In [100]:
# Keep only the selected features plus the target.
referee_train = referee_train[selected_features + [user_target]]
# Dtypes whose columns are declared to pycaret as numeric features.
numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]
# Every numeric column except the target is declared numeric. Filtering on
# `user_target` (instead of list.remove on a hard-coded "goals_2.5") keeps this cell
# in sync with the parameters and does not raise when the target column is not of a
# numeric dtype (e.g. a boolean target that was not cast).
num_cols = [
    col
    for col in referee_train.select_dtypes(include=numerics).columns
    if col != user_target
]
setup_kwargs['numeric_features'] = num_cols

In [101]:
# Initialise the pycaret experiment on the selected-feature training data.
_ = setup(data=referee_train, target=user_target, **setup_kwargs)
# Keep a handle on pycaret's `X_train` config entry for inspection.
x_train = get_config('X_train')

In [102]:
# Compare the candidate classifiers and keep the five best, ranked by AUC.
skipped_models = ["qda", "knn", "nb", 'catboost']
top_models = compare_models(n_select=5, sort='AUC', exclude=skipped_models, verbose=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.6254,0.6799,0.6166,0.662,0.6379,0.251,0.2523,0.2533
lr,Logistic Regression,0.626,0.6791,0.6177,0.6623,0.6387,0.2521,0.2534,0.7533
et,Extra Trees Classifier,0.6211,0.6607,0.6548,0.6439,0.6488,0.2377,0.2382,0.3567
rf,Random Forest Classifier,0.6155,0.6586,0.6259,0.6456,0.6354,0.2291,0.2295,0.3667
gbc,Gradient Boosting Classifier,0.6069,0.6457,0.6283,0.6351,0.6314,0.2107,0.2109,0.5867
lightgbm,Light Gradient Boosting Machine,0.5977,0.6399,0.6156,0.6267,0.621,0.1925,0.1926,0.3433
ada,Ada Boost Classifier,0.602,0.6347,0.634,0.6283,0.6308,0.1995,0.1997,0.3267
xgboost,Extreme Gradient Boosting,0.594,0.6311,0.6225,0.6221,0.6218,0.184,0.1843,0.5533
dt,Decision Tree Classifier,0.5564,0.5555,0.5659,0.5889,0.5771,0.1106,0.1107,0.2667
dummy,Dummy Classifier,0.4646,0.5,0.0,0.0,0.0,0.0,0.0,0.2633


In [103]:
# Open pycaret's interactive evaluation widget for the second-ranked model.
# NOTE(review): this inspects top_models[1] while the finalize cell uses
# top_models[0] -- confirm which model is meant to be kept.
evaluate_model(top_models[1])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Finalize model

In [73]:
# Finalize the preferred model. top_models[0], the best model by the AUC ranking
# requested in compare_models, is used by default; change the index to pick another.
final = finalize_model(top_models[0])

## Metrics against test

In [74]:
# Score the test split, restricted to the selected features plus the target,
# with the finalized model.
predictions = predict_model(final, data=referee_test[selected_features + [user_target]])

In [75]:
# Evaluate every metric in `metrics_list` on the test predictions.
# NOTE(review): all metrics, including AUC, are computed from the hard 'Label'
# predictions rather than from scores -- confirm this is intended.
metrics_dict = {}
for metric_name in metrics_list:
    metrics_dict[metric_name] = check_metric(
        actual=predictions[user_target],
        prediction=predictions['Label'],
        metric=metric_name,
    )

In [76]:
# One-row summary of the test metrics for the model trained with the new features.
# The frame is built directly from the metrics dict (one column per metric, in
# dict order) so the columns get a numeric dtype, instead of creating an empty
# frame and filling it cell by cell.
new_model_metrics_df = pd.DataFrame(metrics_dict, index=['new_feat_model'])
new_model_metrics_df

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
new_feat_model,0.6092,0.6092,0.6098,0.6394,0.6243,0.2177,0.2179


# Performance

In [77]:
# Stack the two one-row summaries to compare the standard model with the
# new-feature model, metric by metric.
pd.concat([metrics_df, new_model_metrics_df])

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
standard_model,0.6121,0.6159,0.5597,0.6611,0.6062,0.2292,0.2323
new_feat_model,0.6092,0.6092,0.6098,0.6394,0.6243,0.2177,0.2179
