# Imports 

In [1]:
import feature_selection

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from pycaret.classification import (
    add_metric,
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    get_config,
    optimize_threshold,
    predict_model,
    save_model,
    setup,
    tune_model,
)
from pycaret.utils import check_metric

from feature_selection.feature_selection import FeatureSelection
from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
from ml_bets.supplementary.functions import create_dataset, add_targets, add_columns_table, train_test_split

# Parameters 

In [2]:
# Per-metric values handed to FeatureSelection(filter_metrics=...) further down.
# TODO: flagged by the author as placeholders that still need to be rewritten.
metric_param = dict(
    Accuracy=0.1,
    AUC=0.1,
    Recall=0.1,
    Precision=0.1,
    F1=0.1,
    Kappa=-1.0,
    MCC=-1.0,
)
# Metric names reported against the test set, in the same order as the keys above.
metrics_list = list(metric_param)

# Date separating train from test, and the target column to model.
user_date = "20-October-2021"
user_target = "goals_2.5"
# The cutoff is whatever follows the last underscore of the target name ("2.5" here).
cutoff = user_target.rsplit("_", 1)[-1]

# Columns PyCaret is told to ignore, and the cross-validation strategy.
ignore_features = ["month", "date", "competition", "month_start_or_end"]
fold_strategy = "timeseries"

### Pycaret setup kwargs 

In [29]:
# Keyword arguments forwarded verbatim to pycaret.classification.setup() in both
# evaluation sections of this notebook. `numeric_features` is added later, once the
# training frame is known.
setup_kwargs = {
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,  # fixed seed for reproducibility
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": True,
    "multicollinearity_threshold": 0.99,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    "ignore_features": ignore_features,
    "fold_strategy": fold_strategy,
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    "feature_selection": False,
    # NOTE(review): the original note said this can be used to keep the 95 % most
    # relevant features, which does not obviously match the value 0.5 — confirm.
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    # Author's note: interactions make everything very slow; enable only to look for
    # possibly interesting features.
    "feature_interaction": False,
    "interaction_threshold": 0.01,
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

# Dataset creation (standard)

In [32]:
# Build the baseline ("standard") dataset with the project helper. The log printed
# below lists the future-match Excel files reported while it runs.
df = create_dataset()


/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


In [33]:
# Presumably adds the target columns for the chosen cutoff (the part of `user_target`
# after the last "_") — confirm against add_targets.
# NOTE(review): `df` is overwritten, so re-running this cell feeds the already
# augmented frame back into add_targets — confirm the helper tolerates that.
df = add_targets(df=df, cutoff=cutoff)
# Train-test split. This is the project's train_test_split imported from
# ml_bets.supplementary.functions, not scikit-learn's.
train, test = train_test_split(df=df, sep=user_date)  # table to split and separation date

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Pycaret evaluation 

In [34]:
# Declare the numeric columns to PyCaret explicitly: every column of `train` whose
# dtype matches one of the names listed here.
numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]
num_cols = list(train.select_dtypes(include=numerics).columns)
setup_kwargs.update(numeric_features=num_cols)

In [35]:
# Alias for the ignored-feature list stored in the setup kwargs.
# NOTE(review): `ignored` is not referenced again in the visible notebook.
ignored = setup_kwargs["ignore_features"]

In [36]:
# Initialise the PyCaret experiment on the training frame; the return value is discarded.
_ = setup(data=train, target=user_target, **setup_kwargs)
# Fetch PyCaret's internal X_train for inspection.
x_train = get_config('X_train')

In [37]:
# Compare PyCaret's estimators (minus the excluded ones), rank them by AUC and keep
# the five best.
top_models = compare_models(
    sort="AUC",
    n_select=5,
    exclude=["qda", "knn", "nb", "catboost"],
    verbose=True,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6069,0.6458,0.6155,0.6312,0.623,0.2103,0.2105,0.73
rf,Random Forest Classifier,0.6038,0.6381,0.6319,0.6243,0.6268,0.2001,0.201,0.3833
lda,Linear Discriminant Analysis,0.6057,0.6327,0.6184,0.6297,0.6237,0.2076,0.2078,0.2767
et,Extra Trees Classifier,0.5927,0.6266,0.6299,0.612,0.62,0.1775,0.178,0.3733
gbc,Gradient Boosting Classifier,0.5872,0.626,0.6196,0.6063,0.6126,0.1688,0.169,1.3033
lightgbm,Light Gradient Boosting Machine,0.5884,0.6156,0.6325,0.6051,0.6183,0.1695,0.1697,0.5633
xgboost,Extreme Gradient Boosting,0.5755,0.6115,0.6312,0.591,0.6103,0.1425,0.143,0.8667
ada,Ada Boost Classifier,0.586,0.6037,0.604,0.6087,0.6062,0.1684,0.1686,0.4833
dt,Decision Tree Classifier,0.5404,0.5386,0.5682,0.5637,0.5658,0.0771,0.0771,0.3067
dummy,Dummy Classifier,0.4726,0.5,0.0,0.0,0.0,0.0,0.0,0.27


In [None]:
# Open PyCaret's evaluation view for the first (best-ranked) model of the comparison.
evaluate_model(top_models[0])

## Finalize model

In [39]:
# Finalise the first entry of `top_models` (the best-performing model by default);
# change the index to choose another candidate.
final = finalize_model(top_models[0])

## Metrics against test

In [54]:
# Score the held-out matches with the finalised model. The returned frame carries the
# input columns plus the `Label` and `Score` columns shown in the output below.
# (predict_model and check_metric are imported in the notebook's top import cell.)
predictions = predict_model(final, data=test)

In [46]:
predictions

Unnamed: 0_level_0,corners_mean_a,DEF_h_mul_shots_mul_goals_mean_diff,goal_kicks_per_shot_std_diff,blocked_pass_mean_diff,big_chance_created_mean_diff,DEF_h,long_passes_mean_diff,total_fwd_zone_pass_mean_a,leftside_pass_std_a,corners_mean_diff,...,raw_prob_under_goals_2.5,odd_ratio_under_goals_2.5,prob_squared_under_goals_2.5,prob_over_goals_2.5,raw_prob_over_goals_2.5,odd_ratio_over_goals_2.5,prob_squared_over_goals_2.5,goals_2.5,Label,Score
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2220903_mexican_primera,1.4,549.4,-0.276675,-3.4,0.2,67.0,-4.8,171.2,25.169426,2.2,...,0.598802,False,0.316774,0.437173,0.465116,False,0.191120,False,True,0.5459
2220904_mexican_primera,9.4,0.0,0.079119,-1.8,0.0,69.0,6.6,251.0,10.212737,-4.8,...,0.571429,False,0.291032,0.460526,0.487805,False,0.212084,True,False,0.6203
2220905_mexican_primera,3.6,-112.0,-0.257669,-2.2,-0.4,70.0,-19.6,269.8,21.130547,3.4,...,0.666667,False,0.390625,0.375000,0.400000,False,0.140625,False,False,0.9828
2199216_us_major_league_soccer,6.4,-52.8,-0.091061,0.6,-0.6,66.0,9.4,293.8,26.632687,-1.4,...,0.454545,False,0.183673,0.571429,0.606061,False,0.326531,False,True,0.7460
2199205_us_major_league_soccer,5.8,-737.0,0.865412,-0.4,-1.6,67.0,16.0,257.8,25.440126,-0.8,...,0.476190,False,0.200139,0.552632,0.588235,False,0.305402,True,False,0.5965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2229111_italian_serie_a,4.8,-1890.0,0.167492,-3.2,-0.6,75.0,-9.0,277.6,30.086542,-1.4,...,0.476190,False,0.204030,0.548303,0.578035,False,0.300636,True,True,0.5747
2219449_spanish_la_liga,2.8,900.0,-0.003927,-1.2,-0.8,75.0,-0.6,202.2,23.381617,0.6,...,0.598802,False,0.323164,0.431525,0.454545,False,0.186213,True,False,0.7909
2210446_english_premier_league,4.8,304.0,-0.117245,0.8,-0.6,76.0,2.4,198.8,25.019992,0.2,...,0.523560,False,0.250000,0.500000,0.523560,False,0.250000,True,False,0.7704
2229104_italian_serie_a,3.2,-172.8,0.385819,3.4,0.4,72.0,7.0,214.4,28.474550,0.0,...,0.500000,False,0.224377,0.526316,0.555556,False,0.277008,True,False,0.5567


In [50]:
predictions[user_target]

match_id
2220903_mexican_primera           False
2220904_mexican_primera            True
2220905_mexican_primera           False
2199216_us_major_league_soccer    False
2199205_us_major_league_soccer     True
                                  ...  
2229111_italian_serie_a            True
2219449_spanish_la_liga            True
2210446_english_premier_league     True
2229104_italian_serie_a            True
2210449_english_premier_league    False
Name: goals_2.5, Length: 660, dtype: bool

In [55]:
# Convert `Label` to booleans so it is comparable with the boolean target column.
# The original comparison against the string "True" implies the labels arrive as text.
# Going through astype(str) keeps the cell idempotent: re-running it on already
# converted booleans no longer turns every label into False.
predictions['Label'] = predictions['Label'].astype(str) == "True"

In [48]:
metrics_list

['Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC']

In [56]:
# Evaluate every metric in `metrics_list` on the hold-out predictions:
# true target values versus the predicted labels.
y_true = predictions[user_target]
y_pred = predictions['Label']
metrics_dict = {
    name: check_metric(actual=y_true, prediction=y_pred, metric=name)
    for name in metrics_list
}

In [57]:
# One-row summary table for the baseline model, one column per metric.
# Built directly from the metrics dict (scalar values plus an explicit index) instead
# of creating an all-NaN object-dtype frame and filling it cell by cell, so the
# columns keep a numeric dtype.
metrics_df = pd.DataFrame(metrics_dict, index=['standard_model'])
metrics_df

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
standard_model,0.6091,0.6118,0.571,0.6526,0.6091,0.2216,0.2236


# Dataset creation (with new columns)

### Preparation 

In [58]:
features = Features()  # alternative kept by the author: Features(output=features_path)
examples = features.create()
# Keep only the rows that have an under-goals probability for the chosen cutoff.
has_under_prob = examples[f"prob_under_goals_{cutoff}"].notna()
examples = examples[has_under_prob]
# Initialise the PipelineDatasets class, which exposes the train and test frames.
# NOTE(review): `test_size` receives the split date string here — confirm that
# PipelineDatasets interprets it as a date separator.
pipe_ds = PipelineDatasets(
    examples=examples,
    features=features,
    target=user_target,
    test_size=user_date,
)
n_train = pipe_ds.train_data
n_test = pipe_ds.test_data

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Referee table. Merging dataframe 

In [61]:
# Locate the referee table. A copy in the working directory is read first, exactly as
# before. If it is missing, fall back to the project data folder, whose path was
# previously computed but never used.
current_path = os.getcwd()
path = Path(current_path).parent / 'data/features/referee_table.csv'
local_copy = Path(current_path) / "referee_table.csv"
raw_referee_table = pd.read_csv(local_copy if local_copy.exists() else path)

In [33]:
features

Features(dataset=Dataset(examples=ExamplesDataset(dates=                                         date match_week  season  week
match_id                                                              
1009316_spanish_la_liga   2018-08-20 21:00:00    2018_03    2018     3
1009317_spanish_la_liga   2018-08-17 21:15:00    2018_03    2018     3
1009318_spanish_la_liga   2018-08-18 17:15:00    2018_03    2018     3
1009319_spanish_la_liga   2018-08-19 17:15:00    2018_03    2018     3
1009320_spanish_la_liga   2018-08-17 19:15:00    2018_03    2018     3
...                                       ...        ...     ...   ...
2215348_german_bundesliga 2022-01-08 14:30:00    2021_24    2021    24
2215351_german_bundesliga 2022-01-08 14:30:00    2021_24    2021    24
2215345_german_bundesliga 2022-01-08 17:30:00    2021_24    2021    24
2215349_german_bundesliga 2022-01-09 14:30:00    2021_24    2021    24
2215350_german_bundesliga 2022-01-09 16:30:00    2021_24    2021    24

[17312 rows x 4 colu

In [62]:
from ml_bets.supplementary.functions import compute_seasons, add_columns_table

In [63]:
# Add the unique match id to the raw referee table via the project helper.
referee_table = add_columns_table(feat=features, df=raw_referee_table)

In [65]:
# Referee columns to bring in: everything except the columns that would be repeated
# in the existing train/test frames.
referee_cols = referee_table.columns.tolist()
for repeated in ('competition', 'date'):
    referee_cols.remove(repeated)

# Inner-join the referee data onto the existing train and test frames by index.
referee_extra = referee_table[referee_cols]
join_opts = dict(left_index=True, right_index=True, how='inner')
referee_train = pd.merge(n_train, referee_extra, **join_opts)  # referee data + old train data
referee_test = pd.merge(n_test, referee_extra, **join_opts)  # referee data + old test data

## Selection of features

In [87]:
# Configure feature selection on the merged training frame, restricted to the
# "lr" (logistic regression) and "et" (extra trees) models.
feat_sel = FeatureSelection(
    dataset=referee_train,
    target=user_target,
    include=["lr", "et"],
    filter_metrics=metric_param,
    target_features=100,
)
# Disabled by the author: full repeated-pipeline run.
#selected_features = feat_sel.repeat_pipeline()

In [86]:
def create_feature_list(self, scores_only=True):
    """Run the feature-scoring steps and, optionally, select the top features.

    Notebook-level helper meant to be called with a ``FeatureSelection`` instance
    passed explicitly as ``self`` (e.g. ``create_feature_list(feat_sel)``), so the
    intermediate ``features_df`` can be inspected.

    Previously a bare ``return`` after the scoring step left the selection code
    unreachable. The early exit is now an explicit flag whose default keeps that
    behaviour.

    Args:
        self: Object providing ``run_feature_extraction``, ``remove_zeros``,
            ``normalize`` and ``feature_score``; when ``scores_only`` is False it
            must also provide ``calculate_number_features`` and ``number_features``.
        scores_only: If True (default), stop once ``self.features_df`` is set.
            If False, also compute ``self.feature_list`` from the top rows of
            ``self.features_df``.

    Returns:
        None. Results are stored on ``self`` (``features_df`` and, optionally,
        ``feature_list``).
    """
    # Build the features dataframe.
    extracted = self.run_feature_extraction()

    # Remove zeros and normalize.
    non_zero = self.remove_zeros(extracted)
    normalized = self.normalize(dataframe=non_zero)

    # Score the features.
    self.features_df = self.feature_score(dataframe=normalized)
    if scores_only:
        return

    # NOTE(review): `df` receives list(self.features_df), which for a DataFrame is its
    # column labels — kept as written; confirm this is what calculate_number_features
    # expects.
    top_n_features = self.calculate_number_features(
        number_features=self.number_features, df=list(self.features_df),
    )
    filtered = self.features_df.iloc[:top_n_features]
    self.feature_list = filtered.index.tolist()

In [88]:
# Run the notebook-level copy of create_feature_list on the selector. It stops after
# scoring, so only feat_sel.features_df is populated by this call.
create_feature_list(feat_sel)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Extra Trees Classifier,0.589,0.6243,0.6517,0.5686,0.6073,0.1805,0.1823,0.2383


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5685,0.5865,0.5506,0.5584,0.5545,0.1362,0.1362,0.3673


In [84]:
feat_sel.features_df

Unnamed: 0,model_id,model,feature,score
0,et,et,win_pct_home_under_goals_2.5_h,0.004005
1,et,et,win_pct_away_over_goals_2.5_a,0.00329
2,lr,lr,red_card_std_diff,1.83752
3,lr,lr,date_month_12,1.541038


In [89]:
feat_sel.features_df

Unnamed: 0_level_0,counts,normal_sum,final_score
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
red_card_std_diff,1,1.0,1.0
win_pct_home_under_goals_2.5_h,1,1.0,1.0
date_month_12,1,0.838651,0.838651
win_pct_away_over_goals_2.5_a,1,0.821322,0.821322


In [76]:
# Snapshot of the selector's current feature list as a set.
# NOTE(review): this reads feat_sel.feature_list before the create_feature_list() call
# in the next cell, so on a fresh top-to-bottom run the attribute may not be populated
# yet — confirm. `orig_features` is not referenced again in the visible notebook.
orig_features = set(feat_sel.feature_list)

In [77]:
# Run the class's own create_feature_list method (not the notebook-level copy);
# feat_sel.feature_list is read in the next cell.
feat_sel.create_feature_list()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Extra Trees Classifier,0.589,0.6243,0.6517,0.5686,0.6073,0.1805,0.1823,0.2383


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,brier
0,Logistic Regression,0.5685,0.5865,0.5506,0.5584,0.5545,0.1362,0.1362,0.3673


In [78]:
feat_sel.feature_list

['red_card_std_diff']

In [72]:
metric_param

{'Accuracy': 0.1,
 'AUC': 0.1,
 'Recall': 0.1,
 'Precision': 0.1,
 'F1': 0.1,
 'Kappa': -1.0,
 'MCC': -1.0}

In [70]:
# `selected_features` is required by the evaluation cells below, but its only
# assignment in this notebook (feat_sel.repeat_pipeline()) is commented out, so a
# fresh Restart & Run All stopped here with a NameError. Bind it to the list produced
# by feat_sel.create_feature_list() above, which matches the value this cell displayed.
# NOTE(review): switch to feat_sel.repeat_pipeline() if its result is meant to differ.
selected_features = list(feat_sel.feature_list)
selected_features

['red_card_std_diff']

## Pycaret evaluation

In [None]:
# Restrict the training frame to the selected features plus the target, then
# re-declare the numeric columns for PyCaret on that reduced frame.
keep_cols = selected_features + [user_target]
referee_train = referee_train[keep_cols]
numerics = ["int16", "int32", "int64", "float16", "float32", "float64", "int", "float"]
num_cols = list(referee_train.select_dtypes(include=numerics).columns)
setup_kwargs.update(numeric_features=num_cols)

In [None]:
# Initialise a new PyCaret experiment on the reduced referee training frame; the
# return value is discarded.
_ = setup(data=referee_train, target=user_target, **setup_kwargs)
# Fetch PyCaret's internal X_train for inspection.
x_train = get_config('X_train')

In [None]:
# Same comparison as for the baseline: rank the non-excluded estimators by AUC and
# keep the five best. Note that this rebinds `top_models`.
top_models = compare_models(
    sort="AUC",
    n_select=5,
    exclude=["qda", "knn", "nb", "catboost"],
    verbose=True,
)

In [None]:
# Open PyCaret's evaluation view for the first (best-ranked) model of the comparison.
evaluate_model(top_models[0])

## Finalize model

In [None]:
# Finalise the first entry of `top_models` (the best-performing model by default);
# change the index to choose another candidate. Note that this rebinds `final`.
final = finalize_model(top_models[0])

## Metrics against test

In [None]:
# Score the referee test frame, restricted to the selected features plus the target.
predictions = predict_model(final, data=referee_test[selected_features + [user_target]])
# Convert `Label` to booleans, as the baseline section does, so the metrics below
# compare like with like: the target column is boolean, while the baseline's comparison
# against the string "True" implies labels arrive as text. astype(str) keeps this
# safe to re-run on already converted values.
predictions['Label'] = predictions['Label'].astype(str) == "True"

In [None]:
# Evaluate every metric in `metrics_list` on the new model's hold-out predictions:
# true target values versus the predicted labels. Note that this rebinds `metrics_dict`.
y_true = predictions[user_target]
y_pred = predictions['Label']
metrics_dict = {
    name: check_metric(actual=y_true, prediction=y_pred, metric=name)
    for name in metrics_list
}

In [None]:
# One-row summary table for the model trained on the selected features, one column
# per metric. Built directly from the metrics dict (scalar values plus an explicit
# index) instead of creating an all-NaN object-dtype frame and filling it cell by
# cell, so the columns keep a numeric dtype.
new_model_metrics_df = pd.DataFrame(metrics_dict, index=['new_feat_model'])
new_model_metrics_df

In [None]:
# Stack both one-row metric tables to compare the baseline with the new-feature model.
pd.concat([metrics_df, new_model_metrics_df])