In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
from feature_selection.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [2]:
target = "goals_2.5"
test_date = "1-Dec-2021"

In [3]:
feats = Features()

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


In [6]:
def patched_create_dataset(
    test_date: str,
    target: str,
    columns=None,
    odds_features: bool = True,
    summary: bool = True,
    odds_rankings: bool = True,
    include_std: bool = True,
    features: Features = None,
    ignore_features=None,
    drop_future_matches: bool = True,
    test_weeks: int = 4,
):
    features = features or Features()
    examples = features.create(
        columns=columns,
        odds_features=odds_features,
        odds_rankings=odds_rankings,
        referee_features=True,
        include_std=include_std,
        summary=summary,
    )
    examples = examples[[x for x in examples.columns if "possession" not in x]]
    if ignore_features is not None:
        examples.drop(columns=ignore_features, inplace=True)
    pds = PipelineDatasets(
        examples=examples,
        features=features,
        target=target,
        drop_future_matches=drop_future_matches,
        test_size=test_date,
        test_weeks=test_weeks,
    )
    return pds

In [5]:
ds = patched_create_dataset(target=target,
                    test_date=test_date,
                    features=feats,
                    odds_features=True,
                    include_std=True,
                    ignore_features=IGNORE_FEATURES+["referee", "hour_rank", "hour_before_16", "is_weekend"],
                    drop_future_matches=False,
                    
                   )

In [6]:
train_data = ds.train_data.copy()


In [7]:
setup_kwargs = dict(
        preprocess=True,
        #custom_pipeline=loaded,
        train_size=0.75,
        session_id=123,
        normalize=True,
       # normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=3,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="kernel",
        pca_components=50,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )

## Feature selection (without triplet)

In [8]:
metric_param = {
        "Accuracy": -0.1,
        "AUC": -0.1,
        "Recall": -0.1,
        "Precision": -0.1,
        "F1": -0.1,
        "Kappa": -1.0,
        "MCC": -1.0,
    }
feat_sel = FeatureSelection(target=target,
                            dataset=train_data.dropna(),#[list(set(new_subset+new_feat+[target]))],
                            target_features=500,
                            filter_metrics=metric_param,
                            include=["lr"],
                            setup_kwargs=setup_kwargs,
                            optimize=True,
                            opt_list=["AUC"],
                           )
selected_features = feat_sel.repeat_pipeline()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5158,0.5292,0.4973,0.5257,0.5111,0.0323,0.0323


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5103,0.5309,0.4946,0.5199,0.5069,0.0212,0.0212


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5282,0.5543,0.5351,0.5366,0.5359,0.0561,0.0561


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5296,0.557,0.5297,0.5385,0.5341,0.0591,0.0591


In [9]:
sorted(selected_features)

['DEF_mean_a',
 'MED_diff',
 'accurate_cross_nocorner_std_diff',
 'accurate_cross_std_h',
 'accurate_fwd_zone_pass_mean_diff',
 'accurate_fwd_zone_pass_std_a',
 'accurate_goal_kicks_std_h',
 'accurate_keeper_sweeper_mean_a',
 'accurate_keeper_sweeper_mean_h',
 'accurate_keeper_sweeper_std_a',
 'accurate_launches_mean_a',
 'accurate_layoffs_mean_a',
 'accurate_layoffs_std_a',
 'accurate_pass_std_diff',
 'accurate_through_ball_mean_diff',
 'accurate_throws_std_diff',
 'accurate_throws_std_h',
 'att_assist_openplay_mean_a',
 'att_corner_ratio_mean_a',
 'att_freekick_goal_std_diff',
 'att_freekick_miss_mean_h',
 'att_freekick_miss_std_a',
 'att_freekick_total_mean_a',
 'att_freekick_total_mean_h',
 'att_goal_high_centre_mean_diff',
 'att_goal_high_centre_std_diff',
 'att_goal_high_left_mean_diff',
 'att_goal_high_left_std_h',
 'att_goal_high_right_std_a',
 'att_goal_high_right_std_diff',
 'att_goal_low_centre_mean_a',
 'att_goal_low_centre_std_a',
 'att_goal_low_centre_std_h',
 'att_goal_l

#### new imports 

In [8]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
#from featsel.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [9]:
from ml_bets.research.tips import calibrate_tips, tips_from_model, combine_tips, predict_dataset, compose_tips, get_tip_probs

In [10]:
ix = np.logical_and(feats.matches["date"].dt.month > 10,
                    feats.matches["competition"].isin({"mexican_primera", 'us_major_league_soccer'}))
index = feats.matches[~ix].index

In [11]:
def setup_dataset(test_date):
    ds = create_dataset(target=target,
                        test_date=test_date,
                        features=feats,
                        odds_features=True,
                        include_std=True,
                        test_weeks=4,
                        ignore_features=IGNORE_FEATURES,
                        drop_future_matches=False,
                       )
    train_data = ds.train_data.copy()#[ds.train_data.index.map(lambda x: "us_major_league_soccer" not in x and "mexican" not in x)]
    train_data.drop(columns=["hour_rank", "hour_before_16", "is_weekend"], inplace=True)
    train_data = train_data[train_data.index.isin(index)][list(set(selected_features)) + [target]].copy()#.reset_index(drop=True)
    test_data = ds.test_set[ds.test_set.index.isin(index)][list(set(selected_features)) + [target]].copy()#.reset_index(drop=True)
    val_data = ds.val_set[ds.val_set.index.isin(index)].copy()
    setup_kwargs = dict(
        preprocess=True,
        test_data=test_data[train_data.columns.tolist()],#.dropna(),
        #numeric_features=[x for x in train_data.columns.tolist() if x != target],
        #custom_pipeline=loaded,
        #train_size=0.75,
        session_id=123,
        normalize=True,
        normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=True,
        pca_method="kernel",
        pca_components=50,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )
    _ = setup(data=train_data, target=target, **setup_kwargs)
    return train_data, test_data, val_data, ds

In [37]:
train_data, test_data, val_data, ds = setup_dataset("4-Dec-2021")

In [38]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_ensemble():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "et", "rf", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="MCC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    blend = blend_models(cali)
    opti = tune_model(blend, optimize="Precision", choose_better=True, n_iter=50, search_library="optuna")
    return opti, cali, tuned_models

In [39]:
top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5831,0.6161,0.5937,0.603,0.5971,0.1652,0.1658,0.63
svm,SVM - Linear Kernel,0.578,0.0,0.5975,0.6037,0.5845,0.1544,0.1656,0.205
ridge,Ridge Classifier,0.5825,0.0,0.5937,0.6018,0.5966,0.1639,0.1645,0.2
lda,Linear Discriminant Analysis,0.5819,0.6107,0.5962,0.6003,0.5972,0.1624,0.1629,0.21


In [41]:
evaluate_model(top_models[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Feature selection (adding triplet)

In [10]:
common_feat = list(set(["_".join(x.split("_")[:-1]) for x in selected_features]))
new_subset = [x for x in train_data.columns for c in common_feat if c in x]

len(selected_features), len(new_subset), sorted(new_subset)

(181,
 583,
 ['ATT_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'DEF_mean_a',
  'DEF_mean_diff',
  'DEF_mean_h',
  'MED_a',
  'MED_diff',
  'MED_h',
  'MED_mean_a',
  'MED_mean_diff',
  'MED_mean_h',
  'MED_std_a',
  'MED_std_diff',
  'MED_std_h',
  'accurate_cross_nocorner_std_a',
  'accurate_cross_nocorner_std_diff',
  'accurate_cross_nocorner_std_h',
  'accurate_cross_std_a',
  'accurate_cross_std_diff',
  'accurate_cross_std_h',
  'accurate_fwd_zone_pass_mean_a',
  'accurate_fwd_zone_pass_mean_diff',
  'accurate_fwd_zone_pass_mean_h',
  'accurate_fwd_zone_pass_std_a',
  'accurate_fwd_zone_pass_std_diff',
  'accurate_fwd_zone_pass_std_h',
  'accurate_goal_kicks_std_a',
  'accurate_goal_kicks_std_diff',
  'accurate_goal_kicks_std_h',
  'accurate_keeper_sweeper_mean_a',
  'accurate_keeper_sweeper_mean_diff',
  'accurate_keeper_sweeper_mean_h',
  'accurate_keeper_sweeper_std_a',
  'accurate_keeper_sweeper_std_diff',
  'accurate_keeper_sweeper_std_h',
  '

In [12]:
metric_param = {
        "Accuracy": -0.1,
        "AUC": -0.1,
        "Recall": -0.1,
        "Precision": -0.1,
        "F1": -0.1,
        "Kappa": -1.0,
        "MCC": -1.0,
    }
feat_sel = FeatureSelection(target=target,
                            dataset=train_data.dropna()[list(set(new_subset+[target]))],
                            target_features=250,
                            filter_metrics=metric_param,
                            include=["lr"],
                            setup_kwargs=setup_kwargs,
                            optimize=True,
                            opt_list=["AUC"],
                           )
selected_features = feat_sel.repeat_pipeline()



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5199,0.5415,0.5514,0.5271,0.539,0.0388,0.0388


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5227,0.5462,0.5162,0.532,0.524,0.0456,0.0456


In [13]:
common_feat = list(set(["_".join(x.split("_")[:-1]) for x in selected_features]))
new_subset = [x for x in train_data.columns for c in common_feat if c in x]

len(selected_features), len(new_subset), sorted(new_subset)

(145,
 276,
 ['ATT_h_mul_shots_mul_goals_mean_diff',
  'ATT_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'accurate_cross_nocorner_std_a',
  'accurate_cross_nocorner_std_diff',
  'accurate_cross_nocorner_std_h',
  'accurate_cross_std_a',
  'accurate_cross_std_diff',
  'accurate_cross_std_h',
  'accurate_goal_kicks_std_a',
  'accurate_goal_kicks_std_diff',
  'accurate_goal_kicks_std_h',
  'accurate_keeper_sweeper_mean_a',
  'accurate_keeper_sweeper_mean_diff',
  'accurate_keeper_sweeper_mean_h',
  'accurate_keeper_sweeper_std_a',
  'accurate_keeper_sweeper_std_diff',
  'accurate_keeper_sweeper_std_h',
  'accurate_launches_mean_a',
  'accurate_launches_mean_diff',
  'accurate_launches_mean_h',
  'accurate_layoffs_mean_a',
  'accurate_layoffs_mean_diff',
  'accurate_layoffs_mean_h',
  'accurate_layoffs_std_a',
  'accurate_layoffs_std_diff',
  'accurate_layoffs_std_h',
  'accurate_through_ball_mean_a',
  'accurate_thro

#### new imports 

In [12]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
#from featsel.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [13]:
from ml_bets.research.tips import calibrate_tips, tips_from_model, combine_tips, predict_dataset, compose_tips, get_tip_probs

In [14]:
ix = np.logical_and(feats.matches["date"].dt.month > 10,
                    feats.matches["competition"].isin({"mexican_primera", 'us_major_league_soccer'}))
index = feats.matches[~ix].index

In [15]:
test_feats = ['acc_cross_nocorner_pct_std_a',
 'acc_cross_nocorner_pct_std_diff',
 'acc_cross_nocorner_pct_std_h',
 'accurate_flick_on_mean_h',
 'accurate_flick_on_mean_a',
 'accurate_freekick_cross_mean_a',
 'accurate_freekick_cross_mean_diff',
 'accurate_freekick_cross_mean_h',
 'accurate_goal_kicks_mean_diff',
 'accurate_goal_kicks_std_a',
 'accurate_goal_kicks_std_diff',
 'accurate_goal_kicks_std_h',
 'accurate_keeper_sweeper_mean_a',
 'accurate_keeper_sweeper_std_diff',
 'accurate_keeper_sweeper_std_h',
 'accurate_pass_std_a',
 'accurate_pass_std_h',
 'accurate_through_ball_mean_diff',
 'att_assist_openplay_std_a',
 'att_assist_openplay_std_h',
 'att_cmiss_left_mean_a',
 'att_cmiss_left_mean_h',
 'att_cmiss_left_std_a',
 'att_cmiss_left_std_diff',
 'att_cmiss_left_std_h',
 'att_goal_high_centre_mean_diff',
 'att_goal_high_centre_std_diff',
 'att_goal_high_right_std_a',
 'att_goal_low_centre_mean_diff',
 'att_goal_low_centre_mean_h',
 'att_goal_low_centre_std_a',
 'att_goal_low_centre_std_h',
 'att_hd_goal_mean_a',
 'att_hd_goal_mean_h',
 'att_hd_target_mean_a',
 'att_hd_target_mean_h',
 'att_ibox_own_goal_mean_diff',
 'att_ibox_own_goal_std_h',
 'att_ibox_own_goal_std_a',
 'att_miss_high_mean_a',
 'att_miss_high_mean_h',
 'att_miss_high_right_mean_diff',
 'att_obxd_right_mean_diff',
 'att_obxd_right_std_diff',
 'att_one_on_one_mean_h',
 'att_one_on_one_mean_a',
 'att_one_on_one_std_diff',
 'att_one_on_one_std_h',
 'att_post_high_std_a',
 'att_post_high_std_h',
 'att_post_right_mean_a',
 'att_post_right_mean_h',
 'attempts_ibox_std_a',
 'attempts_ibox_std_h',
 'backward_pass_mean_diff',
 'backward_pass_mean_h',
 'big_chance_created_mean_h',
 'big_chance_created_std_diff',
 'clean_sheet_std_diff',
 'contentious_decision_mean_a',
 'contentious_decision_mean_diff',
 'contentious_decision_std_a',
 'contentious_decision_std_h',
 'duel_won_pct_mean_diff',
 'effective_clearance_mean_diff',
 'error_lead_to_goal_mean_diff',
 'error_lead_to_goal_std_diff',
 'first_yellow_card_1t_mean_h',
 'first_yellow_card_1t_mean_a',
 'foul_throw_in_std_diff',
 'fouled_final_third_mean_diff',
 'goal_assist_deadball_mean_diff',
 'goal_assist_openplay_std_diff',
 'goal_assist_openplay_std_h',
 'goal_assist_setplay_std_a',
 'goal_assist_setplay_std_diff',
 'goal_assist_setplay_std_h',
 'goal_assist_std_diff',
 'goals_mean_diff',
 'goals_openplay_std_h',
 'goals_openplay_std_a',
 'good_high_claim_mean_diff',
 'high_to_low_goals_mean_a',
 'high_to_low_goals_mean_diff',
 'high_to_low_goals_std_a',
 'high_to_low_goals_std_h',
 'imp_prob_under_goals_0.5_h',
 'imp_prob_under_goals_2.5_diff',
 'interception_mean_a',
 'interception_mean_diff',
 'interception_mean_h',
 'interceptions_in_box_std_a',
 'interceptions_in_box_std_diff',
 'interceptions_in_box_std_h',
 'last_man_tackle_mean_a',
 'last_man_tackle_mean_diff',
 'left_div_right_foot_goals_std_diff',
 'leftside_pass_mean_a',
 'leftside_pass_mean_h',
 'leftside_pass_std_a',
 'long_pass_own_to_opp_mean_diff',
 'long_pass_own_to_opp_mean_h',
 'no_foot_goals_ratio_std_a',
 'no_foot_goals_ratio_std_diff',
 'no_foot_goals_ratio_std_h',
 'odd_ratio_under_corners_10.5',
 'odd_ratio_under_corners_8.5',
 'odds_home_under_both_score_h',
 'odds_away_under_both_score_a',
 'odds_away_under_goals_4.5_a',
 'odds_home_under_goals_4.5_h',
 'odds_away_over_goals_4.5_a',
 'odds_home_over_goals_4.5_h',
 'odds_home_under_goals_0.5_diff',
 'odds_home_under_goals_0.5_h',
 'own_goals_std_h',
 'own_goals_std_a',
 'pen_goals_conceded_mean_diff',
 'penalty_faced_std_a',
 'penalty_won_std_a',
 'penalty_faced_std_h',
 'penalty_won_std_h',
 'poss_won_att_3rd_std_a',
 'poss_won_att_3rd_std_diff',
 'poss_won_att_3rd_std_h',
 'post_scoring_att_std_a',
 'post_scoring_att_std_h',
 'prob_squared_under_goals_2.5',
 'pts_dropped_winning_pos_mean_a',
 'pts_dropped_winning_pos_std_diff',
 'pts_dropped_winning_pos_std_h',
 'ratio_over_goals_2.5_a',
 'ratio_over_goals_2.5_h',
 'ratio_under_goals_2.5_a',
 'ratio_under_goals_2.5_diff',
 'raw_prob_over_goals_1.5',
 'raw_prob_over_goals_2.5',
 'raw_prob_over_goals_3.5',
 'raw_prob_under_goals_1.5',
 'raw_prob_under_goals_2.5',
 'raw_prob_under_goals_3.5',
 'raw_prob_under_goals_4.5',
 'red_card_1t_mean_a',
 'red_card_1t_mean_diff',
 'red_card_1t_mean_h',
 'red_card_2t_mean_a',
 'red_card_2t_mean_diff',
 'red_card_mean_h',
 'red_card_mean_a',
 'red_card_std_diff',
 'right_to_left_goals_mean_diff',
 'right_to_left_goals_std_diff',
 'rightside_pass_div_leftside_pass_mean_a',
 'rightside_pass_div_leftside_pass_mean_h',
 'second_yellow_mean_h',
 'shots_mul_goals_std_h',
 'second_yellow_mean_a',
 'shots_mul_goals_std_a',
 'successful_final_third_passes_mean_a',
 'successful_final_third_passes_mean_diff',
 'successful_final_third_passes_std_a',
 'successful_final_third_passes_std_diff',
 'successful_put_through_std_a',
 'successful_put_through_std_diff',
 'successful_put_through_std_h',
 'total_clearance_mean_a',
 'total_clearance_mean_h',
 'total_fastbreak_std_diff',
 'total_high_claim_mean_a',
 'total_high_claim_mean_diff',
 'total_high_claim_mean_h',
 'total_keeper_sweeper_mean_a',
 'total_keeper_sweeper_mean_h',
 'total_launches_std_h',
 'total_launches_std_a',
 'total_red_card_mean_h',
 'total_red_card_mean_a',
 'total_red_card_std_diff',
 'total_throws_std_a',
 'total_throws_std_diff',
 'total_win_pct_over_goals_1.5_a',
 'total_win_pct_over_goals_1.5_h',
 'total_win_pct_over_goals_2.5_a',
 'total_win_pct_over_goals_2.5_h',
 'total_win_pct_over_goals_3.5_a',
 'total_win_pct_over_goals_3.5_h',
 'total_win_pct_under_goals_1.5_a',
 'total_win_pct_under_goals_1.5_h',
 'total_win_pct_under_goals_2.5_a',
 'total_win_pct_under_goals_2.5_h',
 'total_win_pct_under_goals_3.5_a',
 'total_yel_card_std_a',
 'total_yel_card_std_diff',
 'total_yel_card_std_h',
 'win_pct_away_over_goals_2.5_a',
 'win_pct_away_over_goals_2.5_diff',
 'win_pct_away_under_goals_2.5_a',
 'win_pct_away_under_goals_2.5_diff',
 'win_pct_home_over_goals_2.5_a',
 'win_pct_home_over_goals_2.5_diff',
 'win_pct_home_over_goals_2.5_h',
 'win_pct_home_under_goals_2.5_diff',
 'win_pct_home_under_goals_2.5_h',
 'winner_mean_diff']

In [16]:
def setup_dataset(test_date):
    ds = create_dataset(target=target,
                        test_date=test_date,
                        features=feats,
                        odds_features=True,
                        include_std=True,
                        test_weeks=6,
                        ignore_features=IGNORE_FEATURES,
                        drop_future_matches=False,
                       )
    train_data = ds.train_data.copy()#[ds.train_data.index.map(lambda x: "us_major_league_soccer" not in x and "mexican" not in x)]
    train_data.drop(columns=["hour_rank", "hour_before_16", "is_weekend"], inplace=True)
    train_data = train_data[train_data.index.isin(index)][list(set(test_feats)) + [target]].copy()#.reset_index(drop=True)
    test_data = ds.test_set[ds.test_set.index.isin(index)][list(set(test_feats)) + [target]].copy()#.reset_index(drop=True)
    val_data = ds.val_set[ds.val_set.index.isin(index)].copy()
    setup_kwargs = dict(
        preprocess=True,
        test_data=test_data[train_data.columns.tolist()],#.dropna(),
        #numeric_features=[x for x in train_data.columns.tolist() if x != target],
        #custom_pipeline=loaded,
        #train_size=0.75,
        session_id=123,
        normalize=True,
        normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="linear",
        pca_components=30,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )
    _ = setup(data=train_data, target=target, **setup_kwargs)
    return train_data, test_data, val_data, ds

In [17]:
train_data, test_data, val_data, ds = setup_dataset("4-Dec-2021")

In [18]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_ensemble():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "et", "rf", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="MCC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    blend = blend_models(cali)
    opti = tune_model(blend, optimize="Precision", choose_better=True, n_iter=50, search_library="optuna")
    return opti, cali, tuned_models

In [19]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_linear_models():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="MCC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    return tuned_models, cali

In [20]:
tuned_linear, cali_linear = train_linear_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6414,0.6893,0.6716,0.6507,0.661,0.2806,0.2807
1,0.5997,0.6586,0.6321,0.6124,0.6221,0.1969,0.197
2,0.6216,0.6679,0.6123,0.6442,0.6278,0.2435,0.2439
3,0.6126,0.6475,0.6015,0.6345,0.6175,0.2257,0.226
Mean,0.6188,0.6658,0.6294,0.6354,0.6321,0.2367,0.2369
SD,0.0152,0.0154,0.0267,0.0145,0.0171,0.0303,0.0303


In [53]:
top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6166,0.6734,0.5836,0.65,0.6081,0.2351,0.24,0.3
lda,Linear Discriminant Analysis,0.6166,0.6705,0.5892,0.6475,0.6115,0.2347,0.2387,0.3025
ridge,Ridge Classifier,0.6166,0.0,0.5892,0.6476,0.6115,0.2347,0.2387,0.285
svm,SVM - Linear Kernel,0.5841,0.0,0.5441,0.6157,0.5734,0.1708,0.1739,0.305


In [43]:
tuned_linear[3]

SGDClassifier(alpha=0.10126692364605047, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001022673152073415,
              fit_intercept=False, l1_ratio=0.2203376725525369,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l1', power_t=0.5,
              random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
evaluate_model(tuned_linear[3])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [51]:
evaluate_model(cali_linear[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [55]:
market="goals"
cutoff = 2.5
tips = tips_from_model(model=cali_linear[0],
                test_data=test_data,
                features=feats,
                market=market,
                cutoff=cutoff,
                      )

In [56]:
def show_results(df, groupby=["bet_type", "match_week"]):
    ix = np.logical_and(df["win"]>=0, True)# df["label"]>=1)
    #ix = np.logical_and(df["consensus"] >=0.1, ix)
    ix2 = np.logical_and(df["odds"]>=1.5, df["odds"]<2.)
    ix = np.logical_and(ix, ix2)
    #ix = np.logical_and(ix, ~df["validation"])
    #ix = np.logical_and(ix, df["exp_payoff_prec"] >0.88)
    #ix = np.logical_and(ix, df["exp_payoff"] <1.2)
    #ix = np.logical_and(ix, df["exp_payoff"] >0.8)
    ix = np.logical_and(ix, df["confidence"] >=0.60)
    #ix = np.logical_and(ix, df["confidence"] <=0.75)
    x = df[ix].groupby(groupby)[["win", "profit", "exp_payoff", "exp_payoff_prec", "odds", "confidence"]].mean()
    x["count"] = df[ix].groupby(groupby)[["win"]].count()
    x["buenas"] = df[ix].groupby(groupby)[["win"]].sum().astype(int)
    x["model"] = df["model"].iloc[0]
    return x
show_results(tips)

Unnamed: 0_level_0,Unnamed: 1_level_0,win,profit,exp_payoff,exp_payoff_prec,odds,confidence,count,buenas,model
bet_type,match_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
goals_over_2.5,2021_19,0.818182,0.434545,1.193888,,1.743636,0.683591,11,9,model
goals_over_2.5,2021_20,0.7,0.144,1.087417,,1.641,0.66427,10,7,model
goals_over_2.5,2021_21,0.8,0.359,1.130386,,1.685,0.6716,10,8,model
goals_over_2.5,2021_22,0.571429,0.04,1.130597,,1.708571,0.6628,7,4,model
goals_over_2.5,2021_23,0.5,-0.14,1.216788,,1.815,0.66775,4,2,model
goals_over_2.5,2021_24,0.625,0.05875,1.168317,,1.73,0.674975,8,5,model
goals_over_2.5,2021_25,0.0,-1.0,1.20024,,1.8,0.6668,1,0,model
goals_under_2.5,2021_19,0.75,0.255,1.034973,,1.63,0.63415,4,3,model
goals_under_2.5,2021_20,0.8,0.372,1.14258,,1.706,0.67048,5,4,model
goals_under_2.5,2021_21,0.333333,-0.363333,1.200528,,1.746667,0.686033,3,1,model
