# Imports 

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from pycaret.classification import (
    add_metric,
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    get_config,
    optimize_threshold,
    predict_model,
    save_model,
    setup,
    tune_model,
)
from pycaret.utils import check_metric

from feature_selection.feature_selection import FeatureSelection
from ml_bets.constants import FEATURES_PATH
from ml_bets.features.features import Features
from ml_bets.modeling.match_model import PipelineDatasets, run_pycaret_setup
from ml_bets.supplementary.functions import (
    add_columns_table,
    add_targets,
    compute_seasons,
    create_dataset,
    train_test_split,
)

# Parameters 

In [2]:
# Per-metric values handed to FeatureSelection as `filter_metrics` further down.
# TODO: these values are placeholders and still need to be revisited (original author's note).
metric_param = dict(
    Accuracy=0.1,
    AUC=0.1,
    Recall=0.1,
    Precision=0.1,
    F1=0.1,
    Kappa=-1.0,
    MCC=-1.0,
)
# Metrics reported against the held-out test split.
metrics_list = [
    "Accuracy",
    "AUC",
    "Recall",
    "Precision",
    "F1",
    "Kappa",
    "MCC",
]
# Separator passed to the train/test split helpers.
user_date = "20-October-2021"
# Name of the target column; the text after the last underscore is the goals cutoff.
user_target = "goals_2.5"
cutoff = user_target.rpartition("_")[2]
# Columns PyCaret is told to ignore, and the cross-validation strategy it should use.
ignore_features = [
    "month",
    "date",
    "competition",
    "month_start_or_end",
    "season",
]
fold_strategy = "timeseries"

### Pycaret setup kwargs 

In [29]:
# Keyword arguments forwarded to `pycaret.classification.setup` for the baseline experiment.
# `ignore_features` and `fold_strategy` come from the parameters cell.
setup_kwargs = {
    # General experiment settings.
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": True,
    "multicollinearity_threshold": 0.99,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    "ignore_features": ignore_features,
    "fold_strategy": fold_strategy,
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    # Built-in feature selection is switched off; the threshold only matters when it is enabled.
    # NOTE(review): the original note mentions keeping the "95 %" most relevant features,
    # while the value here is 0.5 - confirm which is intended.
    "feature_selection": False,
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    # Feature interaction is disabled: per the original note it is very slow and only
    # worth enabling to look for possibly interesting features.
    "feature_interaction": False,
    "interaction_threshold": 0.01,
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

# Dataset creation (standard)

In [32]:
# Build the base dataset and attach the target columns for the configured cutoff.
df = add_targets(df=create_dataset(), cutoff=cutoff)
# Train/test split of that table, using `user_date` as the separation date.
train, test = train_test_split(df=df, sep=user_date)

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Run this cell if target is boolean

In [2]:
# Cast the target column to int in both splits (only needed when the target is boolean).
# The column name is held in `user_target` (parameters cell); the previous `target_user`
# was never defined, so this cell raised NameError.
train = train.astype({user_target: int})
test = test.astype({user_target: int})

Object `train.astype` not found.


## Pycaret evaluation 

In [34]:
# Dtype names treated as numeric when telling PyCaret which columns are numeric features.
numerics = [
    "int16",
    "int32",
    "int64",
    "float16",
    "float32",
    "float64",
    "int",
    "float",
]
# Every column of the training split with one of those dtypes is declared numeric.
num_cols = train.select_dtypes(include=numerics).columns.tolist()
setup_kwargs.update(numeric_features=num_cols)

In [36]:
# Initialise the PyCaret experiment on the training split; the return value is not needed.
_ = setup(
    data=train,
    target=user_target,
    **setup_kwargs,
)
# Keep a handle on the training matrix PyCaret stores under 'X_train'.
x_train = get_config("X_train")

In [37]:
# Compare the available classifiers (minus the excluded ones) and keep the five best by AUC.
top_models = compare_models(
    sort="AUC",
    n_select=5,
    exclude=["qda", "knn", "nb", "catboost"],
    verbose=True,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6069,0.6458,0.6155,0.6312,0.623,0.2103,0.2105,0.73
rf,Random Forest Classifier,0.6038,0.6381,0.6319,0.6243,0.6268,0.2001,0.201,0.3833
lda,Linear Discriminant Analysis,0.6057,0.6327,0.6184,0.6297,0.6237,0.2076,0.2078,0.2767
et,Extra Trees Classifier,0.5927,0.6266,0.6299,0.612,0.62,0.1775,0.178,0.3733
gbc,Gradient Boosting Classifier,0.5872,0.626,0.6196,0.6063,0.6126,0.1688,0.169,1.3033
lightgbm,Light Gradient Boosting Machine,0.5884,0.6156,0.6325,0.6051,0.6183,0.1695,0.1697,0.5633
xgboost,Extreme Gradient Boosting,0.5755,0.6115,0.6312,0.591,0.6103,0.1425,0.143,0.8667
ada,Ada Boost Classifier,0.586,0.6037,0.604,0.6087,0.6062,0.1684,0.1686,0.4833
dt,Decision Tree Classifier,0.5404,0.5386,0.5682,0.5637,0.5658,0.0771,0.0771,0.3067
dummy,Dummy Classifier,0.4726,0.5,0.0,0.0,0.0,0.0,0.0,0.27


In [None]:
# Inspect the first model returned by the comparison.
evaluate_model(estimator=top_models[0])

## Finalize model

In [39]:
# Finalize the preferred model; the first entry of `top_models` is used by default,
# change the index to pick another candidate.
final = finalize_model(estimator=top_models[0])

## Metrics against test

In [54]:
# Score the held-out test split with the finalized model.
predictions = predict_model(estimator=final, data=test)

In [56]:
# Evaluate every metric in `metrics_list` on the test predictions, keyed by metric name.
# NOTE(review): all metrics, including "AUC", are computed from the hard 'Label'
# predictions rather than from scores - confirm this is the intended AUC.
metrics_dict = dict(
    (
        name,
        check_metric(
            actual=predictions[user_target],
            prediction=predictions["Label"],
            metric=name,
        ),
    )
    for name in metrics_list
)

In [57]:
# One-row summary of the baseline model's test metrics, one column per metric.
# Built directly from the dict instead of filling an empty frame cell by cell, which
# also leaves the columns with a numeric dtype rather than object.
metrics_df = pd.DataFrame(metrics_dict, index=['standard_model'])
metrics_df

Unnamed: 0,Accuracy,AUC,Recall,Precision,F1,Kappa,MCC
standard_model,0.6091,0.6118,0.571,0.6526,0.6091,0.2216,0.2236


# Dataset creation (with new columns)

### Preparation 

In [58]:
# Feature builder with its default arguments.
# Alternative noted by the original author: Features(output=features_path).
features = Features()
examples = features.create()
# Keep only the rows that have a value in the "prob_under_goals_<cutoff>" column.
examples = examples[examples[f"prob_under_goals_{cutoff}"].notna()]
# Initialize the PipelineDatasets class; `user_date` is passed as its `test_size`.
pipe_ds = PipelineDatasets(
    features=features,
    target=user_target,
    examples=examples,
    test_size=user_date,
)
n_train = pipe_ds.train_data
n_test = pipe_ds.test_data

/home/guillem/ml_bets/data/future_matches/ESP1C.xls
/home/guillem/ml_bets/data/future_matches/ING1C.xls
/home/guillem/ml_bets/data/future_matches/ITA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MEX1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/FRA1C.xls
Excel file /home/guillem/ml_bets/data/future_matches/MLS1C.xls is empty. Skipping
/home/guillem/ml_bets/data/future_matches/ALE1C.xls


## Run this cell if target is boolean

In [2]:
# Cast the target column to int in both splits (only needed when the target is boolean).
# The column name is held in `user_target` (parameters cell); the previous `target_user`
# was never defined, so this cell raised NameError.
n_train = n_train.astype({user_target: int})
n_test = n_test.astype({user_target: int})

Object `train.astype` not found.


## Referee table: merging dataframes

In [61]:
# Location of the referee table: `data/features/referee_table.csv` under the parent
# of the current working directory.
current_path = os.getcwd()
path = Path(current_path).parent / 'data/features/referee_table.csv'
if not path.exists():
    # Fall back to a copy in the working directory, which is the file this cell
    # previously read unconditionally (the computed `path` was never used).
    path = Path("referee_table.csv")
raw_referee_table = pd.read_csv(path)

In [63]:
# Enrich the raw referee table via the project helper; per the original note this adds
# the unique match id to the referee table.
referee_table = add_columns_table(
    df=raw_referee_table,
    feat=features,
)

In [65]:
# Referee columns to bring over, minus the ones the original note marks as repeated.
referee_cols = referee_table.columns.tolist()
for repeated_col in ("competition", "date"):
    referee_cols.remove(repeated_col)
# Inner-join the referee columns onto the existing train and test frames by index.
referee_train = n_train.merge(
    referee_table[referee_cols],
    how="inner",
    left_index=True,
    right_index=True,
)
referee_test = n_test.merge(
    referee_table[referee_cols],
    how="inner",
    left_index=True,
    right_index=True,
)

## Selection of features

In [87]:
# Run the project's feature-selection pipeline on the referee-augmented training data,
# using the thresholds in `metric_param` and restricting it to the "lr" and "et" models.
feat_sel = FeatureSelection(
    target=user_target,
    dataset=referee_train,
    target_features=100,
    filter_metrics=metric_param,
    include=["lr", "et"],
)
selected_features = feat_sel.repeat_pipeline()
selected_features

### Parameters 

In [2]:
# Columns PyCaret should ignore for the referee-augmented experiment.
ignore_features = [
    "month",
    "date",
    "competition",
    "season_x",
    "month_start_or_end",
    "raw_match_id",
    "season_y",
]

### Pycaret setup kwargs 

In [29]:
# Keyword arguments forwarded to `pycaret.classification.setup` for the experiment with the
# extra referee columns. Rebuilt from scratch so it picks up the current `ignore_features`.
setup_kwargs = {
    # General experiment settings.
    "preprocess": True,
    "train_size": 0.75,
    "session_id": 123,
    "normalize": True,
    "transformation": True,
    "ignore_low_variance": True,
    "remove_multicollinearity": True,
    "multicollinearity_threshold": 0.99,
    "n_jobs": -1,
    "use_gpu": False,
    "profile": False,
    "ignore_features": ignore_features,
    "fold_strategy": fold_strategy,
    "remove_perfect_collinearity": True,
    "create_clusters": False,
    "fold": 3,
    # Built-in feature selection is switched off; the threshold only matters when it is enabled.
    # NOTE(review): the original note mentions keeping the "95 %" most relevant features,
    # while the value here is 0.5 - confirm which is intended.
    "feature_selection": False,
    "feature_selection_threshold": 0.5,
    "combine_rare_levels": False,
    "rare_level_threshold": 0.02,
    "pca": False,
    "pca_method": "kernel",
    "pca_components": 30,
    "polynomial_features": False,
    "polynomial_degree": 2,
    "polynomial_threshold": 0.01,
    "trigonometry_features": False,
    "remove_outliers": True,
    "outliers_threshold": 0.01,
    "feature_ratio": False,
    # Feature interaction is disabled: per the original note it is very slow and only
    # worth enabling to look for possibly interesting features.
    "feature_interaction": False,
    "interaction_threshold": 0.01,
    "fix_imbalance": True,
    "log_experiment": False,
    "verbose": False,
    "silent": True,
    "experiment_name": "lagstest",
}

## Pycaret evaluation

In [None]:
# Restrict the training frame to the selected features plus the target column.
referee_train = referee_train.loc[:, selected_features + [user_target]]
# Dtype names treated as numeric when telling PyCaret which columns are numeric features.
numerics = [
    "int16",
    "int32",
    "int64",
    "float16",
    "float32",
    "float64",
    "int",
    "float",
]
# Every remaining column with one of those dtypes is declared numeric.
num_cols = referee_train.select_dtypes(include=numerics).columns.tolist()
setup_kwargs.update(numeric_features=num_cols)

In [None]:
# Initialise the PyCaret experiment on the referee-augmented training data; the return
# value is not needed.
_ = setup(
    data=referee_train,
    target=user_target,
    **setup_kwargs,
)
# Keep a handle on the training matrix PyCaret stores under 'X_train'.
x_train = get_config("X_train")

In [None]:
# Compare the available classifiers (minus the excluded ones) and keep the five best by AUC.
top_models = compare_models(
    sort="AUC",
    n_select=5,
    exclude=["qda", "knn", "nb", "catboost"],
    verbose=True,
)

In [None]:
# Inspect the first model returned by the comparison.
evaluate_model(estimator=top_models[0])

## Finalize model

In [None]:
# Finalize the preferred model; the first entry of `top_models` is used by default,
# change the index to pick another candidate.
final = finalize_model(estimator=top_models[0])

## Metrics against test

In [None]:
# Score the referee-augmented test split, restricted to the selected features plus the target.
predictions = predict_model(
    estimator=final,
    data=referee_test[selected_features + [user_target]],
)

In [None]:
# Evaluate every metric in `metrics_list` on the test predictions, keyed by metric name.
# NOTE(review): all metrics, including "AUC", are computed from the hard 'Label'
# predictions rather than from scores - confirm this is the intended AUC.
metrics_dict = dict(
    (
        name,
        check_metric(
            actual=predictions[user_target],
            prediction=predictions["Label"],
            metric=name,
        ),
    )
    for name in metrics_list
)

In [None]:
# One-row summary of the new-features model's test metrics, one column per metric.
# Built directly from the dict instead of filling an empty frame cell by cell, which
# also leaves the columns with a numeric dtype rather than object.
new_model_metrics_df = pd.DataFrame(metrics_dict, index=['new_feat_model'])
new_model_metrics_df

# Performance

In [None]:
# Stack the two one-row metric tables to compare the baseline with the new-features model.
pd.concat(
    [
        metrics_df,
        new_model_metrics_df,
    ]
)