In [1]:
from itertools import product
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import param
from pycaret.classification import (
    compare_models,
    create_model,
    get_config,
    predict_model,
    setup,
    tune_model,
)
from pycaret.utils import check_metric




class FeatureSelection(param.Parameterized):

    # Class attributes
    model_class_to_name = {
        "RidgeClassifier": "ridge",
        "LogisticRegression": "lr",
        "LinearDiscriminantAnalysis": "lda",
        "GradientBoostingClassifier": "gbc",
        "QuadraticDiscriminantAnalysis": "qda",
        "LGBMClassifier": "lightgbm",
        "AdaBoostClassifier": "ada",
        "RandomForestClassifier": "rf",
        "ExtraTreesClassifier": "et",
        "GaussianNB": "nb",
        "DecisionTreeClassifier": "dt",
        "KNeighborsClassifier": "knn",
        "SGDClassifier": "svm",
        "CatBoostClassifier": "catboost",
        "SVC": "rbfsvm",
        "GaussianProcessClassifier": "gpc",
        "MLPClassifier": "mlp",
        "XGBClassifier": "xgboost",
    }

    metrics_list = ["Accuracy", "AUC", "Recall", "Precision", "F1", "Kappa", "MCC"]

    # Private class attributes
    _filter_metric = {
        "Accuracy": 0.5,
        "AUC": 0.5,
        "Recall": 0.6,
        "Precision": 0.6,
        "F1": 0.6,
        "Kappa": 0.1,
        "MCC": 0.1,
    }

    _setup_kwargs = dict(
        preprocess=True,
        train_size=0.75,
        # test_data=test_data,
        session_id=123,
        normalize=True,
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.4,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        ignore_features=None,
        fold_strategy="timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.4,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="kernel",
        pca_components=30,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.01,
        trigonometry_features=False,
        remove_outliers=False,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.01,
        fix_imbalance=False,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
        html=False,
    )

    _numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]

    # Init values
    ## Feature selection parameters
    target = param.String("goal_2.5")
    number_features = param.Number(
        0.5,
        bounds=(0, 1),
        inclusive_bounds=(False, False),
        doc="Number of features (percentage) selected each iteration. Only the first nth "
        "features will be kept for the next iteration.",
    )
    target_features = param.Number(
        0.3,
        bounds=(0, None),
        inclusive_bounds=(False, True),
        doc="Final total number of features. The goal of the package is to reduce "
        "the incoming columns of the dataset to this 'target_features' number.",
    )
    ## Metric parameters
    filter_metrics = param.Dict(_filter_metric)
    ## Model setup and model optimization parameters
    numerics = param.List(_numerics)
    ignore_features = param.List(default=[], allow_None=True)
    setup_kwargs = param.Dict(_setup_kwargs)
    include = param.List(default=None, item_type=str, allow_None=True)
    exclude = param.List(["qda", "knn", "nb"], item_type=str)
    sort = param.String("AUC")
    number_models = param.Integer(10, bounds=(2, 13))
    top_models = param.List(default=None, allow_None=True)
    optimize = param.Boolean(False)
    opt_list = param.List(["Accuracy", "Precision", "Recall", "F1", "AUC"], item_type=str)
    ## Class selectors
    dataset = param.ClassSelector(class_=pd.DataFrame)
    dict_models = param.ClassSelector(class_=dict)
    tune_dict_models = param.ClassSelector(class_=dict)
    x_train = param.ClassSelector(class_=pd.DataFrame)
    x_df = param.DataFrame(pd.DataFrame())
    model_df = param.ClassSelector(class_=pd.DataFrame)
    model_tuned_df = param.ClassSelector(class_=pd.DataFrame)
    features_df = param.ClassSelector(class_=pd.DataFrame)

    def __init__(self, dataset: pd.DataFrame, **kwargs):
        # Copy of the incoming dataset
        dataset = dataset.copy()
        # Compute the upper bound of number_features, target_features, number_models
        total_features = dataset.shape[1]
        self.param.target_features.bounds = (0, total_features)
        if "include" in kwargs:
            self.param.number_models.default = len(kwargs["include"])
            self.param.number_models.bounds = (0, len(kwargs["include"]))
        # Call super
        super(FeatureSelection, self).__init__(dataset=dataset, **kwargs)
        # Get the features of the dataframe
        self.feature_list = self.dataset.columns.tolist()
        self.feature_list.remove(self.target)  # target column should not be counted
        # Compute target features
        self.target_features = self.calculate_number_features(
            number_features=self.target_features, features=self.feature_list
        )
        # Get the evaluator and the arguments. Depends on the "include" parameter
        self._training_function, self._args = self._decide_model_eval()
        # Get all the columns whose type is numeric
        self.numeric_features = self._compute_numeric_features(df=self.dataset[self.feature_list])

    def _compute_numeric_features(self, df: pd.DataFrame):
        """Return those columns from the given dataset whose data type is numeric."""
        return df.select_dtypes(include=self.numerics).columns.tolist()

    def _decide_model_eval(self):
        """
        Define the pycaret model evaluator depending on the number of included models.

        If the 'include' list parameter equals 1, the method will return
        the 'create_models' pycaret object.
        If 'include' parameter list is greatear than 1, the method will
        return the 'compare_model' pycaret object and its arguments.
        If 'include' parameter equals None, the method will return the
        'compare_models' pycaret object, where all possible models are
        considered for evaluation, except those included within the 'exclude'
        list.
        """
        args = {"n_select": self.number_models, "sort": self.sort, "verbose": False}
        training_function = compare_models
        if not self.include:
            args["exclude"] = self.exclude
        elif len(self.include) == 1:
            training_function = lambda *rgs, **kwargs: [create_model(*rgs, **kwargs)]
            args = {"estimator": self.include[0], "verbose": False}
        else:
            args["include"] = self.include
        return training_function, args

    @staticmethod
    def calculate_number_features(
        number_features: Union[int, float], features: Union[pd.DataFrame, List]
    ) -> int:
        n_features = (
            int(number_features)
            if (number_features >= 1)
            else int(number_features * len(features))
        )
        return n_features

    def train_model(self):
        """Preprocess the data and select self.number_models top models."""
        # Selected dataset
        selected_cols = self.feature_list + [self.target]
        train_data = self.dataset[selected_cols] if self.x_df.empty else self.x_df[selected_cols]
        # Numeric features
        self.setup_kwargs["numeric_features"] = [
            c for c in self.numeric_features if c in self.feature_list
        ]
        # Ignore features
        self.setup_kwargs["ignore_features"] = [
            c for c in self.ignore_features if c in self.feature_list
        ]
        # Initialize pycaret setup
        setup(data=train_data, target=self.target, **self.setup_kwargs)
        # Get train dataset and preprocessed dataframe
        self.x_train = get_config("X_train")
        if self.x_df.empty:  # TODO change x_df by dataset and add flag?
            self.x_df = pd.concat([get_config("X"), get_config("y")], axis=1)
            self.setup_kwargs["preprocess"] = False  # Turn off preprocessing
        # Compare models
        self.top_models = self._training_function(**self._args)

    def create_dict_models(self):
        """Create a dictionary whose values are pycaret standard models."""
        self.dict_models = {
            str(top_model).split("(")[0]: top_model for top_model in self.top_models
        }
        # Remove bad catboost key
        oldkey = [key for key in self.dict_models.keys() if key.startswith("<catboost")]
        if oldkey:
            self.dict_models["CatBoostClassifier"] = self.dict_models.pop(oldkey[0])
        # Remap
        self.dict_models = {
            self.model_class_to_name[key]: self.dict_models[key] for key in self.dict_models.keys()
        }

    def create_dict_tuned_models(self):
        """Create a dictionary whose keys and values are pycaret tuned models."""
        self.tune_dict_models = {}
        for (model_str, py_model), optimize in product(self.dict_models.items(), self.opt_list):
            self.tune_dict_models[f"{model_str}_tune_{optimize}"] = tune_model(
                py_model,
                optimize=optimize,
                verbose=False,
                n_iter=30,
                choose_better=True,
            )

    def get_metrics_df(self, test_predicted, model, dataframe):
        """Compute different metric values for the given model."""
        value_dct = dict()
        for metric in self.metrics_list:
            try:
                value_dct[metric] = check_metric(
                    actual=test_predicted[self.target],
                    prediction=test_predicted["Label"],
                    metric=metric,
                )
            except AttributeError:
                value_dct[metric] = np.nan
        for key, val in value_dct.items():
            dataframe.loc[model, key] = val
        return dataframe

    def remove_bad_models(self, dataframe: pd.DataFrame):
        """Filter and remove the models whose metrics do not satisfy the given conditions."""
        remove_dict = dict()
        models = dataframe.index.tolist()
        for model, (metric, cond) in product(models, self.filter_metrics.items()):
            if dataframe.loc[model, metric] < cond:
                remove_dict[model] = metric
        remove_models = list(set(remove_dict.keys()))
        dataframe.drop(labels=remove_models, axis="index", inplace=True)
        return dataframe

    def filter_best_features(self, key_model: str, models_dict: Dict):
        """Compute the most relevant features used by the given model."""
        py_model = models_dict[key_model]
        cond = any([key_model.startswith(name) for name in ["lr", "lda", "ridge", "svm"]])
        score_metric = abs(py_model.coef_[0]) if cond else py_model.feature_importances_
        metrics_dict = {
            "model_id": key_model,
            "model": key_model.split("_")[0],
            "feature": self.x_train.columns,
            "score": score_metric,
        }
        df = pd.DataFrame(metrics_dict).sort_values(by="score", ascending=False)
        top_n_features = self.calculate_number_features(
            number_features=self.number_features,
            features=df,
        )
        return df.iloc[:top_n_features]

    def extract_features(self, dataframe: pd.DataFrame, dict_models: Dict):
        """Update self.features_df with the most relevant features used by the given model."""
        models = dataframe.index.tolist()
        for model in models:  # model extracted from dataframe
            # Check
            if (
                model not in dict_models.keys()
            ):  # check no errors have been produced during operations
                raise KeyError(f"The selected model: {model} is not listed in dict_models.keys()")
            df_conc = self.filter_best_features(key_model=model, models_dict=dict_models)
            self.features_df = pd.concat([self.features_df, df_conc])

    def compute_metrics_df(self):
        """Update self.features_df with the most relevant features used by the standard models."""
        self.model_df = pd.DataFrame(
            data=[], index=self.dict_models.keys(), columns=self.metrics_list
        )
        for model, py_model in self.dict_models.items():
            predict = predict_model(py_model)
            self.model_df = self.get_metrics_df(
                test_predicted=predict,
                model=model,
                dataframe=self.model_df,
            )
        self.model_df = self.remove_bad_models(dataframe=self.model_df)
        self.extract_features(dataframe=self.model_df, dict_models=self.dict_models)

    def filter_tuned_duplicate(self):
        """Remove tuned models with identical metrics."""

        def drop(x):
            x.drop_duplicates(subset=self.metrics_list, keep="first", inplace=True)
            return x

        df = self.model_tuned_df.groupby("model").apply(drop)
        df = (
            df.drop(columns="model").reset_index(level="model")
            if isinstance(df.index, pd.MultiIndex)
            else df
        )
        return df

    def tune_df(self):
        """Update self.features_df with the most relevant features used by tuned models."""
        # Tune dataframe
        self.model_tuned_df = pd.DataFrame(
            data=[],
            index=self.tune_dict_models.keys(),
            columns=["model"] + self.metrics_list,
        )
        # Model entry
        for prim_model in self.dict_models.keys():
            ix = [ind.startswith(prim_model) for ind in self.model_tuned_df.index]
            self.model_tuned_df.loc[ix, "model"] = prim_model
        # Fill dataframe
        for model, py_model in self.tune_dict_models.items():
            predict = predict_model(py_model)
            self.model_tuned_df = self.get_metrics_df(
                test_predicted=predict, model=model, dataframe=self.model_tuned_df
            )
        # Remove duplicate and filter
        self.model_tuned_df = self.filter_tuned_duplicate()
        self.model_tuned_df = self.remove_bad_models(dataframe=self.model_tuned_df)
        # Get features
        self.extract_features(dataframe=self.model_tuned_df, dict_models=self.tune_dict_models)

    def run_feature_extraction(self):
        """Update self.features_df with the most relevant features used by each model."""
        # Initialize feature dataframe and train model
        self.features_df = pd.DataFrame(data=[], columns=["model_id", "model", "feature", "score"])
        self.train_model()
        # Run standard models
        self.create_dict_models()
        self.compute_metrics_df()
        # Run tuned models
        if self.optimize:
            self.create_dict_tuned_models()
            if not bool(self.tune_dict_models):
                raise ValueError("The tune dictionary is empty!")
            self.tune_df()
        # Return the list containing the features and their score
        self.features_df.index.name = "index_rem"
        return self.features_df.reset_index().drop(columns="index_rem")

    def remove_zeros(self):
        """Remove non-relevant features (those with a zero score)."""
        ix = self.features_df["score"] <= 0
        self.features_df.drop(index=self.features_df.loc[ix].index, inplace=True)
        self.features_df.reset_index(drop=True, inplace=True)

    @staticmethod
    def normalize(dataframe):
        """Normalize the pycaret score of each feature."""

        def norm(x):
            x["normal"] = x["score"] / x["score"].max()
            return x

        return (
            dataframe.groupby("model_id")
            .apply(norm)
            .sort_values(["model_id", "normal"], ascending=[True, False])
        )

    @staticmethod
    def feature_score(dataframe):
        """Assign the score to the selected features."""
        group = dataframe.groupby("feature")
        sorted_data = group.agg(
            counts=pd.NamedAgg(column="normal", aggfunc="count"),
            normal_sum=pd.NamedAgg(column="normal", aggfunc="sum"),
        ).sort_values(by=["counts", "normal_sum"], ascending=False)
        sorted_data["final_score"] = sorted_data["normal_sum"] / sorted_data["counts"]
        return sorted_data.sort_values("final_score", ascending=False)

    def create_feature_list(self):
        """Run all necessary methods to extract the list of relevant features."""
        # Call creation features dataframe
        self.features_df = self.run_feature_extraction()
        # Remove zeros and normalize
        self.remove_zeros()
        self.features_df = self.normalize(dataframe=self.features_df)
        # Get score
        scoreboard = self.feature_score(dataframe=self.features_df)
        top_n_features = self.calculate_number_features(
            number_features=self.number_features,
            features=scoreboard,
        )
        filtered = scoreboard.iloc[:top_n_features]
        self.feature_list = filtered.index.tolist()

    def repeat_pipeline(self):
        """Iterate over the process to create the feature list and repeat it self.repeat times."""
        while len(self.feature_list) > self.target_features:
            # Call iteration
            self.create_feature_list()
            if len(self.feature_list) <= 1:
                break
        return self.feature_list


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [3]:
target = "goals_2.5"
test_date = "1-Dec-2021"

In [4]:
feats = Features()

c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\ESP1C.xls
c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\ING1C.xls
c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\ITA1C.xls
Excel file c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\MEX1C.xls is empty. Skipping
c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\FRA1C.xls
Excel file c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\MLS1C.xls is empty. Skipping
c:\users\usuario\desktop\data science\ml_bets-master\data\future_matches\ALE1C.xls


In [5]:
def patched_create_dataset(
    test_date: str,
    target: str,
    columns=None,
    odds_features: bool = True,
    summary: bool = True,
    odds_rankings: bool = True,
    include_std: bool = True,
    features: Features = None,
    ignore_features=None,
    drop_future_matches: bool = True,
    test_weeks: int = 4,
):
    features = features or Features()
    examples = features.create(
        columns=columns,
        odds_features=odds_features,
        odds_rankings=odds_rankings,
        referee_features=True,
        include_std=include_std,
        summary=summary,
    )
    examples = examples[[x for x in examples.columns if "possession" not in x]]
    if ignore_features is not None:
        examples.drop(columns=ignore_features, inplace=True)
    pds = PipelineDatasets(
        examples=examples,
        features=features,
        target=target,
        drop_future_matches=drop_future_matches,
        test_size=test_date,
        test_weeks=test_weeks,
    )
    return pds

In [6]:
ds = patched_create_dataset(target=target,
                    test_date=test_date,
                    features=feats,
                    odds_features=True,
                    include_std=True,
                    ignore_features=IGNORE_FEATURES+["referee", "hour_rank", "hour_before_16", "is_weekend"],
                    drop_future_matches=False,
                    
                   )

In [7]:
train_data = ds.train_data.copy()


In [8]:
setup_kwargs = dict(
        preprocess=True,
        #custom_pipeline=loaded,
        train_size=0.75,
        session_id=123,
        normalize=True,
       # normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=3,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="kernel",
        pca_components=50,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )

## Feature selection (without triplet)

In [8]:
metric_param = {
        "Accuracy": 0.1,
        "AUC": 0.1,
        "Recall": 0.1,
        "Precision": 0.1,
        "F1": 0.1,
        "Kappa": -1.0,
        "MCC": -1.0,
    }
feat_sel = FeatureSelection(target=target,
                            dataset=train_data.dropna(),#[list(set(new_subset+new_feat+[target]))],
                            target_features=500,
                            filter_metrics=metric_param,
                            include=["lr"],
                            setup_kwargs=setup_kwargs,
                            optimize=True,
                            opt_list=["AUC" , "Accuracy" , "Precision" , "Recall"],
                            number_features = 0.56
                           )
selected_features = feat_sel.repeat_pipeline()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5186,0.5299,0.5,0.5286,0.5139,0.0378,0.0378


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5131,0.54,0.5027,0.5225,0.5124,0.0265,0.0265


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5296,0.5489,0.5297,0.5385,0.5341,0.0591,0.0591


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5034,0.5386,0.4811,0.513,0.4965,0.0077,0.0077


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5213,0.5423,0.5081,0.5311,0.5193,0.0431,0.0431


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5296,0.5489,0.5297,0.5385,0.5341,0.0591,0.0591


In [9]:
len(selected_features)

355

In [15]:
sorted(selected_features)

['DEF_mean_a',
 'MED_mean_diff',
 'acc_corners_intobox_pct_std_a',
 'accurate_back_zone_pass_std_a',
 'accurate_chipped_pass_std_diff',
 'accurate_cross_nocorner_std_diff',
 'accurate_cross_std_a',
 'accurate_cross_std_h',
 'accurate_fwd_zone_pass_mean_diff',
 'accurate_fwd_zone_pass_std_a',
 'accurate_fwd_zone_pass_std_h',
 'accurate_goal_kicks_std_h',
 'accurate_keeper_sweeper_mean_a',
 'accurate_keeper_sweeper_mean_diff',
 'accurate_keeper_sweeper_mean_h',
 'accurate_keeper_sweeper_std_a',
 'accurate_keeper_sweeper_std_diff',
 'accurate_keeper_sweeper_std_h',
 'accurate_keeper_throws_std_diff',
 'accurate_launches_mean_a',
 'accurate_launches_std_a',
 'accurate_layoffs_mean_a',
 'accurate_layoffs_std_a',
 'accurate_layoffs_std_diff',
 'accurate_pass_std_a',
 'accurate_pass_std_diff',
 'accurate_pull_back_mean_diff',
 'accurate_through_ball_mean_diff',
 'accurate_throws_std_a',
 'accurate_throws_std_diff',
 'accurate_throws_std_h',
 'att_assist_openplay_mean_a',
 'att_bx_right_mean_h

In [35]:
selected_features_save = ['DEF_mean_a',
 'MED_mean_diff',
 'acc_corners_intobox_pct_std_a',
 'accurate_back_zone_pass_std_a',
 'accurate_chipped_pass_std_diff',
 'accurate_cross_nocorner_std_diff',
 'accurate_cross_std_a',
 'accurate_cross_std_h',
 'accurate_fwd_zone_pass_mean_diff',
 'accurate_fwd_zone_pass_std_a',
 'accurate_fwd_zone_pass_std_h',
 'accurate_goal_kicks_std_h',
 'accurate_keeper_sweeper_mean_a',
 'accurate_keeper_sweeper_mean_diff',
 'accurate_keeper_sweeper_mean_h',
 'accurate_keeper_sweeper_std_a',
 'accurate_keeper_sweeper_std_diff',
 'accurate_keeper_sweeper_std_h',
 'accurate_keeper_throws_std_diff',
 'accurate_launches_mean_a',
 'accurate_launches_std_a',
 'accurate_layoffs_mean_a',
 'accurate_layoffs_std_a',
 'accurate_layoffs_std_diff',
 'accurate_pass_std_a',
 'accurate_pass_std_diff',
 'accurate_pull_back_mean_diff',
 'accurate_through_ball_mean_diff',
 'accurate_throws_std_a',
 'accurate_throws_std_diff',
 'accurate_throws_std_h',
 'att_assist_openplay_mean_a',
 'att_bx_right_mean_h',
 'att_bx_right_std_diff',
 'att_cmiss_high_mean_h',
 'att_cmiss_left_std_a',
 'att_cmiss_left_std_h',
 'att_corner_mean_a',
 'att_corner_ratio_mean_a',
 'att_fastbreak_std_h',
 'att_freekick_goal_std_diff',
 'att_freekick_miss_mean_h',
 'att_freekick_miss_std_a',
 'att_freekick_miss_std_diff',
 'att_freekick_miss_std_h',
 'att_freekick_target_std_diff',
 'att_freekick_total_mean_a',
 'att_freekick_total_mean_diff',
 'att_freekick_total_mean_h',
 'att_freekick_total_std_diff',
 'att_goal_high_centre_mean_diff',
 'att_goal_high_centre_std_diff',
 'att_goal_high_left_mean_a',
 'att_goal_high_left_mean_diff',
 'att_goal_high_left_std_h',
 'att_goal_high_right_std_a',
 'att_goal_high_right_std_diff',
 'att_goal_low_centre_mean_a',
 'att_goal_low_centre_mean_h',
 'att_goal_low_centre_std_a',
 'att_goal_low_centre_std_h',
 'att_goal_low_right_mean_h',
 'att_hd_goal_mean_a',
 'att_hd_goal_mean_h',
 'att_hd_post_std_h',
 'att_hd_target_mean_a',
 'att_hd_total_mean_diff',
 'att_hd_total_std_h',
 'att_ibox_blocked_mean_a',
 'att_ibox_blocked_std_diff',
 'att_ibox_blocked_std_h',
 'att_ibox_own_goal_mean_h',
 'att_ibox_own_goal_std_diff',
 'att_lf_goal_mean_diff',
 'att_lf_goal_mean_h',
 'att_lf_goal_std_h',
 'att_lg_centre_mean_diff',
 'att_lg_centre_mean_h',
 'att_lg_centre_std_a',
 'att_miss_high_left_std_a',
 'att_miss_high_left_std_h',
 'att_miss_right_std_diff',
 'att_miss_right_std_h',
 'att_obox_blocked_mean_h',
 'att_obox_blocked_std_a',
 'att_obox_goal_mean_h',
 'att_obox_goal_std_a',
 'att_obox_goal_std_diff',
 'att_obox_miss_mean_a',
 'att_obox_miss_mean_diff',
 'att_obox_miss_std_a',
 'att_obox_post_mean_diff',
 'att_obx_centre_std_a',
 'att_obx_right_mean_diff',
 'att_obxd_right_mean_diff',
 'att_obxd_right_std_diff',
 'att_one_on_one_std_diff',
 'att_openplay_mean_h',
 'att_openplay_std_h',
 'att_pen_goal_mean_a',
 'att_pen_goal_mean_h',
 'att_pen_goal_std_a',
 'att_post_left_std_a',
 'att_post_left_std_diff',
 'att_post_left_std_h',
 'att_rf_goal_mean_h',
 'att_rf_target_std_a',
 'att_rf_total_mean_a',
 'att_rf_total_std_diff',
 'att_setpiece_std_a',
 'att_sv_low_centre_mean_h',
 'att_sv_low_centre_std_diff',
 'att_sv_low_right_std_h',
 'attempts_conceded_ibox_mean_diff',
 'attempts_conceded_ibox_std_diff',
 'attempts_ibox_std_a',
 'attempts_ibox_std_h',
 'attempts_obox_mean_h',
 'backward_pass_mean_a',
 'backward_pass_std_h',
 'ball_recovery_mean_diff',
 'bc_miss_div_created_std_h',
 'bc_miss_div_scored_std_h',
 'bc_scored_div_created_mean_a',
 'bc_scored_div_created_mean_diff',
 'bc_scored_div_created_std_h',
 'big_chance_created_mean_h',
 'big_chance_created_std_diff',
 'big_chance_missed_std_diff',
 'big_chance_scored_mean_diff',
 'blocked_cross_mean_a',
 'blocked_cross_std_a',
 'challenge_lost_std_a',
 'challenge_lost_std_diff',
 'clean_sheet_std_diff',
 'contentious_decision_mean_diff',
 'contentious_decision_std_a',
 'corner_taken_std_diff',
 'corners_ratio_mean_h',
 'crosses_18yard_mean_a',
 'dispossessed_std_h',
 'diving_save_mean_h',
 'diving_save_std_diff',
 'draw_streak_h',
 'duel_won_std_a',
 'effective_clearance_mean_a',
 'effective_clearance_mean_diff',
 'effective_head_clearance_mean_h',
 'error_lead_to_goal_mean_diff',
 'first_yellow_card_1t_std_diff',
 'first_yellow_card_std_a',
 'freekick_cross_std_a',
 'fwd_pass_std_a',
 'goal_assist_deadball_mean_diff',
 'goal_assist_deadball_std_h',
 'goal_assist_intent_norm_mean_h',
 'goal_assist_intent_norm_std_diff',
 'goal_assist_intentional_mean_diff',
 'goal_assist_intentional_mean_h',
 'goal_assist_mean_a',
 'goal_assist_mean_h',
 'goal_assist_openplay_mean_h',
 'goal_assist_openplay_std_a',
 'goal_assist_openplay_std_diff',
 'goal_assist_openplay_std_h',
 'goal_assist_setplay_mean_a',
 'goal_assist_std_diff',
 'goal_assist_std_h',
 'goal_fastbreak_mean_diff',
 'goal_fastbreak_mean_h',
 'goal_kicks_div_long_passes_std_diff',
 'goal_kicks_per_shot_mean_a',
 'goals_2t_mean_diff',
 'goals_2t_mean_h',
 'goals_2t_pct_std_h',
 'goals_conceded_ibox_mean_h',
 'goals_conceded_mean_a',
 'goals_mean_a',
 'goals_mean_diff',
 'goals_openplay_mean_a',
 'goals_openplay_std_h',
 'goals_std_h',
 'good_high_claim_mean_a',
 'good_high_claim_mean_diff',
 'good_high_claim_mean_h',
 'hit_woodwork_mean_a',
 'interceptions_in_box_mean_diff',
 'interceptions_in_box_std_a',
 'interceptions_in_box_std_diff',
 'interceptions_in_box_std_h',
 'last_man_tackle_mean_diff',
 'last_man_tackle_std_a',
 'left_div_right_foot_goals_mean_a',
 'leftside_pass_std_diff',
 'lineup_mean_h',
 'long_pass_own_to_opp_success_std_diff',
 'lost_corners_mean_a',
 'lost_corners_std_diff',
 'no_foot_goals_ratio_mean_h',
 'no_foot_goals_ratio_std_a',
 'no_foot_goals_ratio_std_h',
 'odd_ratio_over_cards_5.5',
 'odds_away_over_cards_5.5_a',
 'odds_away_over_corners_9.5_diff',
 'odds_away_under_cards_3.5_h',
 'odds_away_under_corners_6.5_a',
 'odds_away_under_corners_8.5_a',
 'odds_home_over_goals_1.5_a',
 'odds_home_under_cards_4.5_h',
 'odds_home_under_goals_0.5_diff',
 'ontarget_att_assist_std_a',
 'ontarget_att_assist_std_diff',
 'open_play_pass_std_a',
 'outfielder_block_mean_h',
 'own_goals_std_diff',
 'own_goals_std_h',
 'passes_left_div_blocked_pass_std_h',
 'passes_right_mean_diff',
 'passes_right_std_a',
 'passes_right_std_diff',
 'passes_right_std_h',
 'pen_area_entries_mean_a',
 'pen_goals_conceded_std_diff',
 'penalty_conceded_mean_a',
 'penalty_faced_mean_diff',
 'penalty_faced_std_diff',
 'penalty_save_mean_diff',
 'penalty_save_mean_h',
 'penalty_won_mean_a',
 'penalty_won_std_a',
 'points_diff',
 'points_std_a',
 'points_std_h',
 'poss_won_att_3rd_std_a',
 'poss_won_att_3rd_std_diff',
 'poss_won_att_3rd_std_h',
 'poss_won_mid_3rd_std_a',
 'post_scoring_att_std_diff',
 'post_scoring_att_std_h',
 'prob_squared_over_both_score',
 'prob_squared_over_goals_2.5',
 'prob_squared_under_goals_2.5',
 'prob_squared_under_goals_4.5',
 'pts_dropped_winning_pos_mean_a',
 'punches_mean_a',
 'ranking_h',
 'ranking_mean_diff',
 'ratio_over_cards_4.5_a',
 'ratio_over_corners_8.5_diff',
 'ratio_over_corners_9.5_a',
 'ratio_under_corners_10.5_diff',
 'ratio_under_corners_10.5_h',
 'ratio_under_corners_11.5_h',
 'ratio_under_corners_8.5_h',
 'ratio_under_corners_9.5_diff',
 'ratio_under_goals_4.5_h',
 'raw_prob_over_goals_1.5',
 'raw_prob_over_goals_2.5',
 'raw_prob_under_corners_8.5',
 'raw_prob_under_goals_4.5',
 'red_card_1t_mean_a',
 'red_card_2t_mean_a',
 'red_card_away_ref',
 'red_card_home_ref',
 'red_card_std_a',
 'right_to_left_goals_mean_diff',
 'right_to_left_goals_std_diff',
 'rightside_pass_mean_a',
 'rightside_pass_std_diff',
 'saves_std_h',
 'second_yellow_mean_diff',
 'shots_div_passes_right_std_h',
 'shots_mul_goals_mean_a',
 'shots_mul_goals_std_h',
 'shots_std_diff',
 'subs_made_mean_h',
 'successful_fifty_fifty_std_a',
 'successful_final_third_passes_std_h',
 'successful_put_through_std_a',
 'successful_put_through_std_diff',
 'successful_put_through_std_h',
 'total_back_zone_pass_mean_a',
 'total_bets_over_cards_4.5_a',
 'total_bets_over_cards_4.5_diff',
 'total_bets_over_cards_5.5_diff',
 'total_bets_under_cards_4.5_h',
 'total_bets_under_corners_6.5_diff',
 'total_bets_under_corners_9.5_h',
 'total_bets_under_goals_2.5_a',
 'total_chipped_pass_std_a',
 'total_clearance_mean_h',
 'total_cross_nocorner_std_h',
 'total_expulsions_ref',
 'total_final_third_passes_std_diff',
 'total_fwd_zone_pass_mean_diff_mul_goal_kicks_per_shot_mean_diff',
 'total_fwd_zone_pass_std_a',
 'total_fwd_zone_pass_std_diff',
 'total_high_claim_mean_diff',
 'total_high_claim_mean_h',
 'total_high_claim_std_a',
 'total_high_claim_std_diff',
 'total_high_claim_std_h',
 'total_keeper_sweeper_mean_a',
 'total_keeper_sweeper_mean_diff',
 'total_keeper_sweeper_mean_h',
 'total_keeper_sweeper_std_diff',
 'total_launches_mean_a',
 'total_launches_std_h',
 'total_layoffs_mean_diff',
 'total_layoffs_std_a',
 'total_layoffs_std_diff',
 'total_layoffs_std_h',
 'total_penalty_match_ref',
 'total_red_card_1t_ref',
 'total_red_card_2t_ref',
 'total_red_card_match_ref',
 'total_second_yel_card_home_ref',
 'total_through_ball_mean_a',
 'total_through_ball_std_a',
 'total_throws_mean_h',
 'total_throws_std_a',
 'total_throws_std_diff',
 'total_throws_std_h',
 'total_win_pct_over_corners_7.5_diff',
 'total_win_pct_over_corners_9.5_a',
 'total_win_pct_over_goals_2.5_h',
 'total_win_pct_under_cards_3.5_diff',
 'total_win_pct_under_goals_1.5_a',
 'total_win_pct_under_goals_3.5_h',
 'total_yel_card_std_a',
 'win_pct_away_over_cards_6.5_a',
 'win_pct_away_over_corners_10.5_diff',
 'win_pct_away_over_corners_8.5_a',
 'win_pct_away_over_goals_0.5_diff',
 'win_pct_away_over_goals_2.5_a',
 'win_pct_away_over_goals_2.5_diff',
 'win_pct_away_under_both_score_h',
 'win_pct_away_under_goals_2.5_a',
 'win_pct_away_under_goals_2.5_diff',
 'win_pct_home_over_cards_6.5_a',
 'win_pct_home_over_goals_2.5_h',
 'win_pct_home_under_cards_4.5_diff',
 'win_pct_home_under_corners_11.5_h',
 'win_pct_home_under_corners_8.5_a',
 'win_pct_home_under_corners_9.5_h',
 'win_pct_home_under_goals_2.5_diff',
 'win_pct_home_under_goals_2.5_h',
 'winner_mean_diff',
 'winner_std_diff',
 'winner_std_h',
 'won_contest_mean_a',
 'won_corners_ratio_mean_a',
 'won_corners_ratio_mean_h',
 'won_corners_ratio_std_h',
 'won_tackle_std_a']

#### new imports 

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
#from featsel.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [11]:
from ml_bets.research.tips import calibrate_tips, tips_from_model, combine_tips, predict_dataset, compose_tips, get_tip_probs

In [12]:
ix = np.logical_and(feats.matches["date"].dt.month > 10,
                    feats.matches["competition"].isin({"mexican_primera", 'us_major_league_soccer'}))
index = feats.matches[~ix].index

In [16]:
def setup_dataset(test_date):
    ds = create_dataset(target=target,
                        test_date=test_date,
                        features=feats,
                        odds_features=True,
                        include_std=True,
                        test_weeks=4,
                        ignore_features=IGNORE_FEATURES,
                        drop_future_matches=False,
                       )
    train_data = ds.train_data.copy()#[ds.train_data.index.map(lambda x: "us_major_league_soccer" not in x and "mexican" not in x)]
    train_data.drop(columns=["hour_rank", "hour_before_16", "is_weekend"], inplace=True)
    train_data = train_data[train_data.index.isin(index)][list(set(selected_features)) + [target]].copy()#.reset_index(drop=True)
    test_data = ds.test_set[ds.test_set.index.isin(index)][list(set(selected_features)) + [target]].copy()#.reset_index(drop=True)
    val_data = ds.val_set[ds.val_set.index.isin(index)].copy()
    setup_kwargs = dict(
        preprocess=True,
        test_data=test_data[train_data.columns.tolist()],#.dropna(),
        #numeric_features=[x for x in train_data.columns.tolist() if x != target],
        #custom_pipeline=loaded,
        #train_size=0.75,
        session_id=123,
        normalize=True,
        normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="kernel",
        pca_components=50,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )
    _ = setup(data=train_data, target=target, **setup_kwargs)
    return train_data, test_data, val_data, ds

In [17]:
train_data, test_data, val_data, ds = setup_dataset("4-Dec-2021")

In [22]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_ensemble():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "et", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="AUC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    blend = blend_models(cali)
    opti = tune_model(blend, optimize="Precision", choose_better=True, n_iter=50, search_library="optuna")
    return opti, cali, tuned_models, blend

In [23]:
opti_model, cali_model, tune_model, blend_model = train_ensemble()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6208,0.0,0.6404,0.6357,0.638,0.2399,0.2399
1,0.6049,0.0,0.6232,0.6216,0.6224,0.2081,0.2081
2,0.6422,0.0,0.6404,0.6633,0.6516,0.2841,0.2843
3,0.5997,0.0,0.5246,0.6435,0.578,0.2048,0.2087
Mean,0.6169,0.0,0.6071,0.641,0.6225,0.2342,0.2353
SD,0.0165,0.0,0.0482,0.015,0.0277,0.0319,0.0311


TypeError: calibrate_model() got an unexpected keyword argument 'calibrate_fold'

In [39]:
top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5831,0.6161,0.5937,0.603,0.5971,0.1652,0.1658,0.63
svm,SVM - Linear Kernel,0.578,0.0,0.5975,0.6037,0.5845,0.1544,0.1656,0.205
ridge,Ridge Classifier,0.5825,0.0,0.5937,0.6018,0.5966,0.1639,0.1645,0.2
lda,Linear Discriminant Analysis,0.5819,0.6107,0.5962,0.6003,0.5972,0.1624,0.1629,0.21


In [41]:
evaluate_model(top_models[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Feature selection (adding triplet)

In [10]:
common_feat = list(set(["_".join(x.split("_")[:-1]) for x in selected_features]))
new_subset = [x for x in train_data.columns for c in common_feat if c in x]

len(selected_features), len(new_subset), sorted(new_subset)

(181,
 583,
 ['ATT_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'DEF_mean_a',
  'DEF_mean_diff',
  'DEF_mean_h',
  'MED_a',
  'MED_diff',
  'MED_h',
  'MED_mean_a',
  'MED_mean_diff',
  'MED_mean_h',
  'MED_std_a',
  'MED_std_diff',
  'MED_std_h',
  'accurate_cross_nocorner_std_a',
  'accurate_cross_nocorner_std_diff',
  'accurate_cross_nocorner_std_h',
  'accurate_cross_std_a',
  'accurate_cross_std_diff',
  'accurate_cross_std_h',
  'accurate_fwd_zone_pass_mean_a',
  'accurate_fwd_zone_pass_mean_diff',
  'accurate_fwd_zone_pass_mean_h',
  'accurate_fwd_zone_pass_std_a',
  'accurate_fwd_zone_pass_std_diff',
  'accurate_fwd_zone_pass_std_h',
  'accurate_goal_kicks_std_a',
  'accurate_goal_kicks_std_diff',
  'accurate_goal_kicks_std_h',
  'accurate_keeper_sweeper_mean_a',
  'accurate_keeper_sweeper_mean_diff',
  'accurate_keeper_sweeper_mean_h',
  'accurate_keeper_sweeper_std_a',
  'accurate_keeper_sweeper_std_diff',
  'accurate_keeper_sweeper_std_h',
  '

In [9]:
"""Monkey patch feature_selection module."""
from typing import List, Tuple

# from feature_selection import FeatureSelection


def get_root_features(feature_list: List) -> Tuple[List, List]:
    """Get the features related to teams, i.e., those ended with "_a", "_h", "_diff"."""
    # Conditions
    cond1 = lambda x: x.endswith("_h")
    cond2 = lambda x: x.endswith("_a")
    cond3 = lambda x: x.endswith("_diff")
    # Features ended with _h, _a, _diff
    sublist = [c for c in feature_list if (cond1(c) or cond2(c) or cond3(c))]
    return sublist, list(set(["_".join(c.split("_")[:-1]) for c in sublist]))


def append_list(root_features: List, og_columns: List):
    """Append suffixes to the incoming feature labels."""
    # Create empty list
    dressed_list = list()
    # Add  suffixes to each root label
    # dressed_list = [c for root in root_features for c in df_columns if c.startswith(root)]
    for c in root_features:
        dressed_list.append(f"{c}_h")
        dressed_list.append(f"{c}_a")
        dressed_list.append(f"{c}_diff")
    dressed_list = [c for c in dressed_list if c in og_columns]
    # return list(set(dressed_list))
    return dressed_list


def massage_feat_list(feature_list: List, og_columns: List):
    """
    Add team features to the incoming list.

    Given a feature list, this function adds the full trio of team
    features (*_h, *_a, *_diff) to those already present that are
    related to team statistics.
    """
    feature_list = feature_list.copy()
    # Get list of match features and root
    sublist, root = get_root_features(feature_list)
    # Get features not related to teams
    no_team = list(set(feature_list) - set(sublist))
    # Add suffixes to roots
    dressed = append_list(root_features=root, og_columns=og_columns.copy())
    # return list(set(no_team + dressed))
    return no_team + dressed


def finalize_list(feature_list: List, df_columns: List):
    """
    Complete the feature_list returned by the selection module.

    Once the number of features has been reduced to a subset of
    relevant features, this function completes the list by adding
    team features to those already present.

    If a team feature is present, but the set only contains the one
    related to the home or away team, the function adds the corresponding
    feature related to the difference between teams, i.e., team_feat_diff.
    """
    feature_list = feature_list.copy()
    # Elements with _h, _a, _diff
    sublist, roots = get_root_features(feature_list)
    # List to be used as building block
    final_list = list(set(feature_list) - set(sublist))
    # Check for team features
    for root in roots:
        triplet = [f"{root}_h", f"{root}_a", f"{root}_diff"]
        subset = [c for tri in triplet for c in sublist if c == tri]
        if len(subset) < 2 and not subset[0].endswith("_diff"):
            subset.append(f"{root}_diff")
        final_list = final_list + subset
    final_list = [c for c in final_list if c in df_columns]
    return final_list


def patch_repeat_pipeline(self):
    """Iterate over the process to create the feature list and repeat it self.repeat times (PATCH)."""
    i = 0
    while len(self.feature_list) > self.target_features:
        i += 1
        print(f"{i} iteration")
        # Call iteration
        self.create_feature_list()
        self.feature_list = massage_feat_list(
            feature_list=self.feature_list, og_columns=self.x_df.columns
        )
        if len(self.feature_list) <= 1:
            break
    return finalize_list(feature_list=self.feature_list, df_columns=self.x_df.columns)

def patch_train_model(self):
        """Preprocess the data and select self.number_models top models."""
        # Selected dataset
        selected_cols = self.feature_list + [self.target]
        train_data = self.dataset[selected_cols] if self.x_df.empty else self.x_df[selected_cols]
        # Numeric features
        self.setup_kwargs["numeric_features"] = [
            c for c in self.numeric_features if c in self.feature_list
        ]
        # Ignore features
        self.setup_kwargs["ignore_features"] = [
            c for c in self.ignore_features if c in self.feature_list
        ]
        self.setup_kwargs["feature_interaction"]: False
        # Initialize pycaret setup
        setup(data=train_data, target=self.target, **self.setup_kwargs)
        # Get train dataset and preprocessed dataframe
        self.x_train = get_config("X_train")
        if self.x_df.empty:  # TODO change x_df by dataset and add flag?
            self.x_df = pd.concat([get_config("X"), get_config("y")], axis=1)
            self.setup_kwargs["preprocess"] = False  # Turn off preprocessing
        # Compare models
        self.top_models = self._training_function(**self._args)

In [10]:
def patch_train_model(self):
        """Preprocess the data and select self.number_models top models."""
        # Selected dataset
        selected_cols = self.feature_list + [self.target]
        train_data = self.dataset[selected_cols] if self.x_df.empty else self.x_df[selected_cols]
        # Numeric features
        self.setup_kwargs["numeric_features"] = [
            c for c in self.numeric_features if c in self.feature_list
        ]
        # Ignore features
        self.setup_kwargs["ignore_features"] = [
            c for c in self.ignore_features if c in self.feature_list
        ]
        self.setup_kwargs["feature_interaction"]: False
        # Initialize pycaret setup
        setup(data=train_data, target=self.target, **self.setup_kwargs)
        # Get train dataset and preprocessed dataframe
        self.x_train = get_config("X_train")
        if self.x_df.empty:  # TODO change x_df by dataset and add flag?
            self.x_df = pd.concat([get_config("X"), get_config("y")], axis=1)
            self.setup_kwargs["preprocess"] = False  # Turn off preprocessing
        # Compare models
        self.top_models = self._training_function(**self._args)

In [11]:
FeatureSelection.train_model = patch_train_model



In [12]:
FeatureSelection.repeat_pipeline = patch_repeat_pipeline



In [13]:
FeatureSelection.repeat_pipeline

<function __main__.patch_repeat_pipeline(self)>

In [29]:
FeatureSelection.filter_tuned_duplicate??

In [13]:
metric_param = {
        "Accuracy": 0.1,
        "AUC": 0.1,
        "Recall": 0.1,
        "Precision": 0.1,
        "F1": 0.1,
        "Kappa": -1.0,
        "MCC": -1.0,
    }
feat_sel = FeatureSelection(target=target,
                            dataset=train_data.dropna(),#[list(set(new_subset+new_feat+[target]))],
                            target_features=0.56,
                            filter_metrics=metric_param,
                            include=["lr"],
                            setup_kwargs=setup_kwargs,
                            optimize=True,
                            opt_list=["AUC" , "Accuracy" , "Precision" , "Recall"],
                            number_features = 0.56
                           )
triplet_selected_features = feat_sel.repeat_pipeline()



1 iteration


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5186,0.5299,0.5,0.5286,0.5139,0.0378,0.0378


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5089,0.5322,0.4892,0.5186,0.5035,0.0186,0.0186


In [14]:
len(triplet_selected_features)

1445

In [15]:
feat_sel.x_df

Unnamed: 0_level_0,win_pct_h,draw_pct_h,loss_pct_h,win_streak_h,loss_streak_h,draw_streak_h,points_h,ranking_h,ATT_h,MED_h,...,total_expulsions_ref,total_tackle_ref,total_fk_foul_ref,total_penalty_match_ref,total_hand_ball_ref,total_fouls_home_ref,total_fouls_away_ref,total_fouls_match_ref,fk_per_tackle_ref,goals_2.5
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1069833_german_bundesliga,-1.643901,1.533277,0.214646,-0.714075,-0.783440,1.612666,0.639803,-0.179871,0.315282,-0.131555,...,0.985318,1.101278,-1.029729,-0.793801,0.917055,-0.732395,-0.728244,-0.827393,-1.218083,1
1069835_german_bundesliga,-0.664834,-1.429944,1.660861,1.123439,-0.783440,-0.599329,0.372292,0.631282,-0.405111,-0.514660,...,-0.369170,0.356790,-0.346304,-1.454542,0.156403,0.124371,-0.627903,-0.305101,-0.380686,0
1069836_german_bundesliga,-0.664834,-0.199662,0.974545,-0.714075,1.710316,-0.599329,0.372292,0.959403,-1.008551,-0.857581,...,0.542920,0.774597,1.110759,-1.454542,-0.907535,1.374438,0.197598,0.864320,0.337828,0
1069834_german_bundesliga,-0.664834,2.212205,-1.666619,-0.714075,-0.783440,1.869795,0.058238,-0.025558,-0.717569,-1.320451,...,1.406218,0.391996,1.565295,2.265159,1.301284,1.168461,1.635305,1.640432,0.764458,1
1069842_german_bundesliga,-0.664834,0.759340,0.214646,-0.714075,-0.783440,1.612666,0.721268,0.389111,-0.717569,-0.514660,...,-0.296511,0.965913,-0.156935,-0.105425,-0.590373,0.562788,-0.929066,-0.234115,-0.369954,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2219403_spanish_la_liga,-1.643901,0.759340,0.974545,-0.714075,1.710316,-0.599329,-0.245043,0.959403,-0.238578,0.080462,...,0.372171,0.721725,0.121370,1.021130,-0.051437,-0.058292,0.272404,0.093577,-0.371811,0
2229046_italian_serie_a,0.203837,-1.429944,0.974545,-0.714075,0.985387,-0.599329,-0.119351,0.744759,-0.405111,0.306435,...,1.682687,-2.489470,0.417330,0.763942,0.568077,0.583474,0.272404,0.460710,1.857669,1
2210408_english_premier_league,-1.643901,1.533277,0.214646,-0.714075,0.985387,-0.599329,-1.194145,0.389111,0.315282,-0.131555,...,0.372171,1.580043,-0.084230,-0.105425,0.685931,0.208313,-0.127036,0.017975,-1.040655,0
2229048_italian_serie_a,0.203837,0.759340,-0.670644,-0.714075,0.985387,-0.599329,-0.245043,1.160365,-0.405111,-0.514660,...,-1.639614,1.974827,0.809434,-2.368850,-0.590373,0.275842,0.869680,0.638675,-0.802420,0


In [19]:
feat_sel.dataset.to_csv('Vicente_Dataset.csv')

In [21]:
feat_sel.x_df.to_csv('Vicente_x_df.csv')

In [45]:
evaluate_model(feat_sel.top_models[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [18]:
error = ['win_pct_home_over_corners_11.5_h', 'ranking_a', 'odds_home_over_corners_6.5_diff', 'total_bets_under_corners_6.5_h', 'win_pct_away_over_cards_6.5_diff', 'ratio_under_cards_2.5_a', 'total_clearance_mean_a', 'red_card_1t_mean_h', 'total_bets_over_cards_5.5_a', 'head_clearance_std_diff', 'att_obx_left_mean_a', 'MED_mean_h', 'pts_dropped_winning_pos_mean_h', 'win_pct_home_under_corners_11.5_diff', 'fifty_fifty_std_h', 'total_bets_under_cards_4.5_a', 'red_card_2t_std_h', 'total_bets_over_cards_4.5_h', 'ranking_mean_h', 'penalty_save_std_h', 'red_card_std_h', 'att_cmiss_high_left_mean_h', 'att_cmiss_high_left_mean_a', 'att_post_right_mean_a', 'total_bets_over_corners_9.5_h', 'win_pct_home_under_corners_11.5_a', 'head_clearance_mean_h', 'effective_head_clearance_mean_a', 'odds_home_under_goals_0.5_a', 'ATT_h_mul_shots_mul_goals_mean_a', 'total_bets_under_cards_4.5_diff', 'red_card_2t_std_a', 'accurate_keeper_throws_mean_a', 'ratio_over_cards_2.5_a', 'ranking_diff', 'att_lg_centre_std_h', 'corners_std_diff', 'effective_clearance_mean_h', 'att_obx_left_std_h', 'att_freekick_goal_std_h', 'ratio_over_cards_2.5_h', 'odds_away_over_corners_9.5_a', 'att_goal_high_centre_std_h', 'total_red_card_std_a', 'clearance_off_line_mean_h', 'head_clearance_mean_diff', 'odds_away_over_both_score_a', 'own_goals_mean_h', 'ratio_over_cards_6.5_h', 'att_obxd_right_std_h', 'att_lg_centre_mean_a', 'pts_dropped_winning_pos_mean_diff', 'att_freekick_goal_std_a', 'fifty_fifty_std_a', 'total_fwd_zone_pass_mean_diff_mul_goal_kicks_per_shot_mean_a', 'ATT_h_mul_shots_mul_goals_mean_h', 'total_bets_over_corners_9.5_diff', 'shots_mul_goals_mean_diff', 'DEF_mean_diff', 'win_pct_home_over_cards_6.5_diff', 'total_bets_over_cards_2.5_h', 'total_bets_under_cards_2.5_a', 'own_goals_std_a', 'ratio_under_corners_6.5_a', 'last_man_tackle_std_h', 'att_goal_high_centre_std_a', 'att_ibox_own_goal_mean_a', 'ratio_under_cards_2.5_h', 'att_ibox_own_goal_std_h', 'penalty_save_mean_a', 'att_post_right_std_h', 'att_hd_post_std_a', 'att_obxd_right_mean_a', 'ratio_under_corners_6.5_h', 'total_bets_under_goals_2.5_diff', 'ratio_over_cards_6.5_a', 'last_man_tackle_mean_a', 'att_obx_right_mean_h', 'total_bets_under_cards_2.5_diff', 'att_pen_target_mean_a', 'total_bets_under_corners_6.5_a', 'total_clearance_mean_diff', 'att_cmiss_high_right_std_a', 'total_bets_under_corners_9.5_a', 'goal_fastbreak_mean_a', 'odds_home_under_goals_0.5_h', 'att_obx_right_mean_a', 'att_cmiss_high_right_mean_h', 'total_fwd_zone_pass_mean_diff_mul_goal_kicks_per_shot_mean_h'] 

In [12]:
[c for c in feat_sel.dataset.columns if 'att_lg_centre_std_h' in c]

['att_lg_centre_std_h']

In [22]:
np.array(error)[~np.array([c in feat_sel.dataset.columns for c in error])]


array(['ATT_h_mul_shots_mul_goals_mean_a',
       'total_fwd_zone_pass_mean_diff_mul_goal_kicks_per_shot_mean_a',
       'ATT_h_mul_shots_mul_goals_mean_h',
       'total_fwd_zone_pass_mean_diff_mul_goal_kicks_per_shot_mean_h'],
      dtype='<U60')

In [20]:
np.array([c in feat_sel.dataset.columns for c in error])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False])

In [13]:
common_feat = list(set(["_".join(x.split("_")[:-1]) for x in selected_features]))
new_subset = [x for x in train_data.columns for c in common_feat if c in x]

len(selected_features), len(new_subset), sorted(new_subset)

(145,
 276,
 ['ATT_h_mul_shots_mul_goals_mean_diff',
  'ATT_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'DEF_h_mul_shots_mul_goals_mean_diff',
  'accurate_cross_nocorner_std_a',
  'accurate_cross_nocorner_std_diff',
  'accurate_cross_nocorner_std_h',
  'accurate_cross_std_a',
  'accurate_cross_std_diff',
  'accurate_cross_std_h',
  'accurate_goal_kicks_std_a',
  'accurate_goal_kicks_std_diff',
  'accurate_goal_kicks_std_h',
  'accurate_keeper_sweeper_mean_a',
  'accurate_keeper_sweeper_mean_diff',
  'accurate_keeper_sweeper_mean_h',
  'accurate_keeper_sweeper_std_a',
  'accurate_keeper_sweeper_std_diff',
  'accurate_keeper_sweeper_std_h',
  'accurate_launches_mean_a',
  'accurate_launches_mean_diff',
  'accurate_launches_mean_h',
  'accurate_layoffs_mean_a',
  'accurate_layoffs_mean_diff',
  'accurate_layoffs_mean_h',
  'accurate_layoffs_std_a',
  'accurate_layoffs_std_diff',
  'accurate_layoffs_std_h',
  'accurate_through_ball_mean_a',
  'accurate_thro

#### new imports 

In [12]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
#from featsel.feature_selection import FeatureSelection

from pycaret.utils import check_metric
from pycaret.classification import (add_metric, calibrate_model, optimize_threshold,
    create_model,
    finalize_model,
    optimize_threshold,              
    save_model,
    compare_models, 
    evaluate_model,
    get_config,
    setup,
    tune_model,
    predict_model,
)

from ml_bets.supplementary.functions import IGNORE_FEATURES
from ml_bets.research.datasets import create_dataset
from ml_bets.features.names.goals import GOALS_FEATURES, NEW_FEATURES

In [13]:
from ml_bets.research.tips import calibrate_tips, tips_from_model, combine_tips, predict_dataset, compose_tips, get_tip_probs

In [14]:
ix = np.logical_and(feats.matches["date"].dt.month > 10,
                    feats.matches["competition"].isin({"mexican_primera", 'us_major_league_soccer'}))
index = feats.matches[~ix].index

In [15]:
test_feats = ['acc_cross_nocorner_pct_std_a',
 'acc_cross_nocorner_pct_std_diff',
 'acc_cross_nocorner_pct_std_h',
 'accurate_flick_on_mean_h',
 'accurate_flick_on_mean_a',
 'accurate_freekick_cross_mean_a',
 'accurate_freekick_cross_mean_diff',
 'accurate_freekick_cross_mean_h',
 'accurate_goal_kicks_mean_diff',
 'accurate_goal_kicks_std_a',
 'accurate_goal_kicks_std_diff',
 'accurate_goal_kicks_std_h',
 'accurate_keeper_sweeper_mean_a',
 'accurate_keeper_sweeper_std_diff',
 'accurate_keeper_sweeper_std_h',
 'accurate_pass_std_a',
 'accurate_pass_std_h',
 'accurate_through_ball_mean_diff',
 'att_assist_openplay_std_a',
 'att_assist_openplay_std_h',
 'att_cmiss_left_mean_a',
 'att_cmiss_left_mean_h',
 'att_cmiss_left_std_a',
 'att_cmiss_left_std_diff',
 'att_cmiss_left_std_h',
 'att_goal_high_centre_mean_diff',
 'att_goal_high_centre_std_diff',
 'att_goal_high_right_std_a',
 'att_goal_low_centre_mean_diff',
 'att_goal_low_centre_mean_h',
 'att_goal_low_centre_std_a',
 'att_goal_low_centre_std_h',
 'att_hd_goal_mean_a',
 'att_hd_goal_mean_h',
 'att_hd_target_mean_a',
 'att_hd_target_mean_h',
 'att_ibox_own_goal_mean_diff',
 'att_ibox_own_goal_std_h',
 'att_ibox_own_goal_std_a',
 'att_miss_high_mean_a',
 'att_miss_high_mean_h',
 'att_miss_high_right_mean_diff',
 'att_obxd_right_mean_diff',
 'att_obxd_right_std_diff',
 'att_one_on_one_mean_h',
 'att_one_on_one_mean_a',
 'att_one_on_one_std_diff',
 'att_one_on_one_std_h',
 'att_post_high_std_a',
 'att_post_high_std_h',
 'att_post_right_mean_a',
 'att_post_right_mean_h',
 'attempts_ibox_std_a',
 'attempts_ibox_std_h',
 'backward_pass_mean_diff',
 'backward_pass_mean_h',
 'big_chance_created_mean_h',
 'big_chance_created_std_diff',
 'clean_sheet_std_diff',
 'contentious_decision_mean_a',
 'contentious_decision_mean_diff',
 'contentious_decision_std_a',
 'contentious_decision_std_h',
 'duel_won_pct_mean_diff',
 'effective_clearance_mean_diff',
 'error_lead_to_goal_mean_diff',
 'error_lead_to_goal_std_diff',
 'first_yellow_card_1t_mean_h',
 'first_yellow_card_1t_mean_a',
 'foul_throw_in_std_diff',
 'fouled_final_third_mean_diff',
 'goal_assist_deadball_mean_diff',
 'goal_assist_openplay_std_diff',
 'goal_assist_openplay_std_h',
 'goal_assist_setplay_std_a',
 'goal_assist_setplay_std_diff',
 'goal_assist_setplay_std_h',
 'goal_assist_std_diff',
 'goals_mean_diff',
 'goals_openplay_std_h',
 'goals_openplay_std_a',
 'good_high_claim_mean_diff',
 'high_to_low_goals_mean_a',
 'high_to_low_goals_mean_diff',
 'high_to_low_goals_std_a',
 'high_to_low_goals_std_h',
 'imp_prob_under_goals_0.5_h',
 'imp_prob_under_goals_2.5_diff',
 'interception_mean_a',
 'interception_mean_diff',
 'interception_mean_h',
 'interceptions_in_box_std_a',
 'interceptions_in_box_std_diff',
 'interceptions_in_box_std_h',
 'last_man_tackle_mean_a',
 'last_man_tackle_mean_diff',
 'left_div_right_foot_goals_std_diff',
 'leftside_pass_mean_a',
 'leftside_pass_mean_h',
 'leftside_pass_std_a',
 'long_pass_own_to_opp_mean_diff',
 'long_pass_own_to_opp_mean_h',
 'no_foot_goals_ratio_std_a',
 'no_foot_goals_ratio_std_diff',
 'no_foot_goals_ratio_std_h',
 'odd_ratio_under_corners_10.5',
 'odd_ratio_under_corners_8.5',
 'odds_home_under_both_score_h',
 'odds_away_under_both_score_a',
 'odds_away_under_goals_4.5_a',
 'odds_home_under_goals_4.5_h',
 'odds_away_over_goals_4.5_a',
 'odds_home_over_goals_4.5_h',
 'odds_home_under_goals_0.5_diff',
 'odds_home_under_goals_0.5_h',
 'own_goals_std_h',
 'own_goals_std_a',
 'pen_goals_conceded_mean_diff',
 'penalty_faced_std_a',
 'penalty_won_std_a',
 'penalty_faced_std_h',
 'penalty_won_std_h',
 'poss_won_att_3rd_std_a',
 'poss_won_att_3rd_std_diff',
 'poss_won_att_3rd_std_h',
 'post_scoring_att_std_a',
 'post_scoring_att_std_h',
 'prob_squared_under_goals_2.5',
 'pts_dropped_winning_pos_mean_a',
 'pts_dropped_winning_pos_std_diff',
 'pts_dropped_winning_pos_std_h',
 'ratio_over_goals_2.5_a',
 'ratio_over_goals_2.5_h',
 'ratio_under_goals_2.5_a',
 'ratio_under_goals_2.5_diff',
 'raw_prob_over_goals_1.5',
 'raw_prob_over_goals_2.5',
 'raw_prob_over_goals_3.5',
 'raw_prob_under_goals_1.5',
 'raw_prob_under_goals_2.5',
 'raw_prob_under_goals_3.5',
 'raw_prob_under_goals_4.5',
 'red_card_1t_mean_a',
 'red_card_1t_mean_diff',
 'red_card_1t_mean_h',
 'red_card_2t_mean_a',
 'red_card_2t_mean_diff',
 'red_card_mean_h',
 'red_card_mean_a',
 'red_card_std_diff',
 'right_to_left_goals_mean_diff',
 'right_to_left_goals_std_diff',
 'rightside_pass_div_leftside_pass_mean_a',
 'rightside_pass_div_leftside_pass_mean_h',
 'second_yellow_mean_h',
 'shots_mul_goals_std_h',
 'second_yellow_mean_a',
 'shots_mul_goals_std_a',
 'successful_final_third_passes_mean_a',
 'successful_final_third_passes_mean_diff',
 'successful_final_third_passes_std_a',
 'successful_final_third_passes_std_diff',
 'successful_put_through_std_a',
 'successful_put_through_std_diff',
 'successful_put_through_std_h',
 'total_clearance_mean_a',
 'total_clearance_mean_h',
 'total_fastbreak_std_diff',
 'total_high_claim_mean_a',
 'total_high_claim_mean_diff',
 'total_high_claim_mean_h',
 'total_keeper_sweeper_mean_a',
 'total_keeper_sweeper_mean_h',
 'total_launches_std_h',
 'total_launches_std_a',
 'total_red_card_mean_h',
 'total_red_card_mean_a',
 'total_red_card_std_diff',
 'total_throws_std_a',
 'total_throws_std_diff',
 'total_win_pct_over_goals_1.5_a',
 'total_win_pct_over_goals_1.5_h',
 'total_win_pct_over_goals_2.5_a',
 'total_win_pct_over_goals_2.5_h',
 'total_win_pct_over_goals_3.5_a',
 'total_win_pct_over_goals_3.5_h',
 'total_win_pct_under_goals_1.5_a',
 'total_win_pct_under_goals_1.5_h',
 'total_win_pct_under_goals_2.5_a',
 'total_win_pct_under_goals_2.5_h',
 'total_win_pct_under_goals_3.5_a',
 'total_yel_card_std_a',
 'total_yel_card_std_diff',
 'total_yel_card_std_h',
 'win_pct_away_over_goals_2.5_a',
 'win_pct_away_over_goals_2.5_diff',
 'win_pct_away_under_goals_2.5_a',
 'win_pct_away_under_goals_2.5_diff',
 'win_pct_home_over_goals_2.5_a',
 'win_pct_home_over_goals_2.5_diff',
 'win_pct_home_over_goals_2.5_h',
 'win_pct_home_under_goals_2.5_diff',
 'win_pct_home_under_goals_2.5_h',
 'winner_mean_diff']

In [16]:
def setup_dataset(test_date):
    ds = create_dataset(target=target,
                        test_date=test_date,
                        features=feats,
                        odds_features=True,
                        include_std=True,
                        test_weeks=6,
                        ignore_features=IGNORE_FEATURES,
                        drop_future_matches=False,
                       )
    train_data = ds.train_data.copy()#[ds.train_data.index.map(lambda x: "us_major_league_soccer" not in x and "mexican" not in x)]
    train_data.drop(columns=["hour_rank", "hour_before_16", "is_weekend"], inplace=True)
    train_data = train_data[train_data.index.isin(index)][list(set(test_feats)) + [target]].copy()#.reset_index(drop=True)
    test_data = ds.test_set[ds.test_set.index.isin(index)][list(set(test_feats)) + [target]].copy()#.reset_index(drop=True)
    val_data = ds.val_set[ds.val_set.index.isin(index)].copy()
    setup_kwargs = dict(
        preprocess=True,
        test_data=test_data[train_data.columns.tolist()],#.dropna(),
        #numeric_features=[x for x in train_data.columns.tolist() if x != target],
        #custom_pipeline=loaded,
        #train_size=0.75,
        session_id=123,
        normalize=True,
        normalize_method="robust",
        transformation=True,
        ignore_low_variance=True,
        remove_multicollinearity=False,
        multicollinearity_threshold=0.8,
        n_jobs=-1,
        use_gpu=False,
        profile=False,
        #ignore_features=ignore_features,
        fold_strategy="stratifiedkfold",#"timeseries",
        remove_perfect_collinearity=True,
        create_clusters=False,
        fold=4,
        feature_selection=False,
        # you can use this to keep the 95 % most relevant features (fat_sel_threshold)
        feature_selection_threshold=0.5,
        combine_rare_levels=False,
        rare_level_threshold=0.02,
        pca=False,
        pca_method="linear",
        pca_components=30,
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.05,
        trigonometry_features=False,
        remove_outliers=True,
        outliers_threshold=0.01,
        feature_ratio=False,
        feature_interaction=False,
        # Makes everything slow AF. use to find out possibly interesting features
        interaction_threshold=0.05,
        fix_imbalance=True,
        log_experiment=False,
        verbose=False,
        silent=True,
        experiment_name="lagstest",
    )
    _ = setup(data=train_data, target=target, **setup_kwargs)
    return train_data, test_data, val_data, ds

In [17]:
train_data, test_data, val_data, ds = setup_dataset("4-Dec-2021")

In [18]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_ensemble():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "et", "rf", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="MCC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    blend = blend_models(cali)
    opti = tune_model(blend, optimize="Precision", choose_better=True, n_iter=50, search_library="optuna")
    return opti, cali, tuned_models

In [19]:
from pycaret.classification import stack_models, ensemble_model, blend_models
def train_linear_models():
    top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )
    tuned_models = [tune_model(model, optimize="MCC", choose_better=True, n_iter=50, search_library="optuna") for model in top_models]
    cali = [calibrate_model(tuned, method="sigmoid", calibrate_fold=4) for tuned in tuned_models]
    return tuned_models, cali

In [20]:
tuned_linear, cali_linear = train_linear_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6414,0.6893,0.6716,0.6507,0.661,0.2806,0.2807
1,0.5997,0.6586,0.6321,0.6124,0.6221,0.1969,0.197
2,0.6216,0.6679,0.6123,0.6442,0.6278,0.2435,0.2439
3,0.6126,0.6475,0.6015,0.6345,0.6175,0.2257,0.226
Mean,0.6188,0.6658,0.6294,0.6354,0.6321,0.2367,0.2369
SD,0.0152,0.0154,0.0267,0.0145,0.0171,0.0303,0.0303


In [53]:
top_models = compare_models(
            n_select=8,
            sort='MCC',
            include=["lr", "lda", "ridge", "svm"],
            verbose=True,
        )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6166,0.6734,0.5836,0.65,0.6081,0.2351,0.24,0.3
lda,Linear Discriminant Analysis,0.6166,0.6705,0.5892,0.6475,0.6115,0.2347,0.2387,0.3025
ridge,Ridge Classifier,0.6166,0.0,0.5892,0.6476,0.6115,0.2347,0.2387,0.285
svm,SVM - Linear Kernel,0.5841,0.0,0.5441,0.6157,0.5734,0.1708,0.1739,0.305


In [43]:
tuned_linear[3]

SGDClassifier(alpha=0.10126692364605047, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.001022673152073415,
              fit_intercept=False, l1_ratio=0.2203376725525369,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l1', power_t=0.5,
              random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
evaluate_model(tuned_linear[3])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [51]:
evaluate_model(cali_linear[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [55]:
market="goals"
cutoff = 2.5
tips = tips_from_model(model=cali_linear[0],
                test_data=test_data,
                features=feats,
                market=market,
                cutoff=cutoff,
                      )

In [56]:
def show_results(df, groupby=["bet_type", "match_week"]):
    ix = np.logical_and(df["win"]>=0, True)# df["label"]>=1)
    #ix = np.logical_and(df["consensus"] >=0.1, ix)
    ix2 = np.logical_and(df["odds"]>=1.5, df["odds"]<2.)
    ix = np.logical_and(ix, ix2)
    #ix = np.logical_and(ix, ~df["validation"])
    #ix = np.logical_and(ix, df["exp_payoff_prec"] >0.88)
    #ix = np.logical_and(ix, df["exp_payoff"] <1.2)
    #ix = np.logical_and(ix, df["exp_payoff"] >0.8)
    ix = np.logical_and(ix, df["confidence"] >=0.60)
    #ix = np.logical_and(ix, df["confidence"] <=0.75)
    x = df[ix].groupby(groupby)[["win", "profit", "exp_payoff", "exp_payoff_prec", "odds", "confidence"]].mean()
    x["count"] = df[ix].groupby(groupby)[["win"]].count()
    x["buenas"] = df[ix].groupby(groupby)[["win"]].sum().astype(int)
    x["model"] = df["model"].iloc[0]
    return x
show_results(tips)

Unnamed: 0_level_0,Unnamed: 1_level_0,win,profit,exp_payoff,exp_payoff_prec,odds,confidence,count,buenas,model
bet_type,match_week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
goals_over_2.5,2021_19,0.818182,0.434545,1.193888,,1.743636,0.683591,11,9,model
goals_over_2.5,2021_20,0.7,0.144,1.087417,,1.641,0.66427,10,7,model
goals_over_2.5,2021_21,0.8,0.359,1.130386,,1.685,0.6716,10,8,model
goals_over_2.5,2021_22,0.571429,0.04,1.130597,,1.708571,0.6628,7,4,model
goals_over_2.5,2021_23,0.5,-0.14,1.216788,,1.815,0.66775,4,2,model
goals_over_2.5,2021_24,0.625,0.05875,1.168317,,1.73,0.674975,8,5,model
goals_over_2.5,2021_25,0.0,-1.0,1.20024,,1.8,0.6668,1,0,model
goals_under_2.5,2021_19,0.75,0.255,1.034973,,1.63,0.63415,4,3,model
goals_under_2.5,2021_20,0.8,0.372,1.14258,,1.706,0.67048,5,4,model
goals_under_2.5,2021_21,0.333333,-0.363333,1.200528,,1.746667,0.686033,3,1,model
