In [3]:
import datetime as dt
import pandas as pd

import modules.helper as helper
from importlib import reload
reload(helper)

periods = helper.get_periods(dt.datetime(2021, 7, 29), 14, 14)
folder = "03_2022_correct_EWMA_and_MA"
df = pd.DataFrame()
for period in periods:
    start, end = period
    tmp_date = start.strftime("%Y%m%d")
    tmp = pd.read_csv(f"./tests/sources/archive/{folder}/{tmp_date}-000000_bet.csv", index_col=0)
    df = df.append(tmp)
df.reset_index(inplace=True, drop=True)

num_features = [
    "lrc_proba_a",
    "lrc_proba_d",
    "lrc_proba_h",
    "rfc_proba_a",
    "rfc_proba_d",
    "rfc_proba_h",
    "mlp_proba_a",
    "mlp_proba_d",
    "mlp_proba_h",
    "knc_proba_a",
    "knc_proba_d",
    "knc_proba_h",
    "abc_proba_a",
    "abc_proba_d",
    "abc_proba_h",
    "Home Odds",
    "Deuce Odds",
    "Away Odds",
]

cat_features = ["lrc", "rfc", "mlp", "knc", "abc"]

df = df.filter(
    regex=(
        "^(lrc.*)|(rfc.*)|(mlp.*)|(knc.*)|(abc.*)|Result|.*Odds$"
    )
)


from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import (
    RBF,
    DotProduct,
    Matern,
    RationalQuadratic,
    WhiteKernel,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, StandardScaler
from sklearn.svm import SVC

from steps.step_05 import ModelMaker

sfm = SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=42))

lrc = {
    "name": "LogisticRegression",
    "classifier": LogisticRegression(random_state=42, max_iter=100000),
    "param_grid": [
        {
            "preprocessor__num__scaling": [None],
            "selector": [sfm],
            "selector__threshold": [f"{x/10}*median" for x in range(3, 6, 1)],
            "classifier__C": [x / 1000 for x in range(5, 16, 5)],
            "classifier__class_weight": [None],
        }
    ],
}

rfc = {
    "name": "RandomForestClassifier",
    "classifier": RandomForestClassifier(random_state=42),
    "param_grid": [
        {
            "preprocessor__num__binning": [None],
            "selector": [sfm],
            "selector__threshold": ["0.5*median", "1*median", "1.5*median"],
            "classifier__n_estimators": [50, 55, 60],
            "classifier__max_depth": [5, 7, 10],
            "classifier__min_samples_leaf": [5, 7, 10],
            "classifier__class_weight": [None],
        }
    ],
}

mlp = {
    "name": "MLPClassifier",
    "classifier": MLPClassifier(random_state=42, max_iter=10000),
    "param_grid": [
        {
            "preprocessor__num__scaling": [StandardScaler(), None],
            "selector": [sfm],
            "selector__threshold": [f"{x/10}*median" for x in range(1, 22, 5)],
            "classifier__hidden_layer_sizes": [[3, 3], [5, 5], [100]],
            "classifier__alpha": [2, 2.2, 2.4],
            "classifier__activation": ["tanh", "relu"],
        }
    ],
}

knc = {
    "name": "KNeighborsClassifier",
    "classifier": KNeighborsClassifier(),
    "param_grid": [
        {
            "preprocessor__num__scaling": [StandardScaler(), None],
            "preprocessor__num__binning": [None],
            "preprocessor__num__poly": [None],
            "selector": [sfm],
            "selector__threshold": [f"{x/10}*median" for x in range(1, 4, 1)],
            "classifier__n_neighbors": [200, 250, 300],
        }
    ],
}

abc = {
    "name": "AdaBoostClassifier",
    "classifier": AdaBoostClassifier(),
    "param_grid": [
        {
            "preprocessor__num__scaling": [StandardScaler(), None],
            "preprocessor__num__binning": [None],
            "preprocessor__num__poly": [None],
            "selector": [sfm],
            "selector__threshold": [f"{x/1000}*median" for x in range(25, 76, 25)],
            "classifier__n_estimators": [x for x in range(5, 15, 5)],
            "classifier__learning_rate": [x / 10 for x in range(16, 21, 1)],
        }
    ],
}

models = []
models.append(lrc)
models.append(rfc)
models.append(mlp)
models.append(knc)
models.append(abc)

from steps.step_05 import ModelMaker

ModelMaker(df, num_features, cat_features, models, True).do()

Start: 2022-04-25 17:01:48.997930

-----------------------------------------LogisticRegression-----------------------------------------
Fitting 3 folds for each of 9 candidates, totalling 27 fits

GridSearchCV:
Best score : -0.5906333333333333
Best params: {'classifier__C': 0.01, 'classifier__class_weight': None, 'preprocessor__num__scaling': None, 'selector': SelectFromModel(estimator=RandomForestClassifier(n_estimators=10,
                                                 random_state=42),
                threshold='0.4*median'), 'selector__threshold': '0.4*median'}

Score on full data after refit: -0.5845

              precision    recall  f1-score   support

           A       0.58      0.50      0.54       377
           D       0.00      0.00      0.00         0
           H       0.82      0.54      0.65       678

    accuracy                           0.52      1055
   macro avg       0.47      0.35      0.40      1055
weighted avg       0.74      0.52      0.61      1055

---

ValueError: Found array with 0 feature(s) (shape=(703, 0)) while a minimum of 1 is required.