In [None]:
import datetime as dt
import pickle

import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, StandardScaler
from sklearn.svm import SVC

import modules.engineer as eng
import modules.secretary as secr
import modules.features as feat
import modules.helper as helper

from steps.step_05 import ModelMaker

# Walk-forward backtest.
#
# For every (start, end) period returned by helper.get_periods:
#   1. fit the candidate models on all games dated before `start`,
#   2. predict the games dated in [start, end),
#   3. archive those games together with the predictions as a CSV file.

# Number of cross-validation folds used by every CalibratedClassifierCV wrapper.
CALIBRATION_CV = 3

MODEL_DIR = "./tests/sources/models"
ARCHIVE_DIR = "./tests/sources/archive"

# Column prefix used in the output -> model name (also the pickle file stem).
MODEL_NAMES = {
    "lrc": "LogisticRegression",
    "rfc": "RandomForestClassifier",
    "mlp": "MLPClassifier",
    "knc": "KNeighborsClassifier",
    "abc": "AdaBoostClassifier",
}

# Columns removed from the frame before it is handed to the models; they are
# set aside and re-attached to the archived output afterwards.
NON_FEATURE_COLUMNS = [
    "Home Team",
    "Away Team",
    "Home Coach",
    "Away Coach",
    "Home xG",
    "Away xG",
    "Home Points",
    "Away Points",
    "Home Possesion",
    "Away Possesion",
    "Home Goals",
    "Away Goals",
    "Kick Off",
    "Date",
    "Primary Key",
    "Home Shots",
    "Away Shots",
    "Home Shots on Target",
    "Away Shots on Target",
    "Home Fouls Committed",
    "Away Fouls Committed",
    "Home Corners",
    "Away Corners",
    "Home Yellow Cards",
    "Away Yellow Cards",
    "Home Red Cards",
    "Away Red Cards",
    "Result",
    "Notes",
]

# Any column whose name contains one of these patterns is dropped from the
# archived CSV.
ARCHIVE_DROP_PATTERNS = [
    "MA",
    "Days Since Last Game",
    "Coach Substituted Within Last 3 Games",
    "Promoted Last Year",
    "Kick Off Before 17:00",
    "Current Position Before Matchday",
]


def _build_model_specs(cv: int = CALIBRATION_CV) -> list:
    """Return fresh model specifications (name, classifier, param_grid).

    The specs are rebuilt on every call so that no estimator instance is
    shared between two training runs.

    NOTE(review): scikit-learn renamed CalibratedClassifierCV's
    ``base_estimator`` argument to ``estimator`` in newer releases; the
    keyword and the ``classifier__base_estimator__*`` grid keys below are kept
    as they were and must be changed together if the library is upgraded.
    """
    # A single selector instance is offered to all five grids.
    sfm = SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=42))

    lrc = {
        "name": "LogisticRegression",
        "classifier": CalibratedClassifierCV(
            base_estimator=LogisticRegression(random_state=42, max_iter=100000),
            cv=cv,
        ),
        "param_grid": [
            {
                "preprocessor__num__scaling": [None],
                "selector": [sfm],
                "selector__threshold": [f"{x/10}*median" for x in range(1, 6, 2)],
                "classifier__base_estimator__C": [10**-x for x in range(0, 7, 2)],
                "classifier__base_estimator__class_weight": [None],
            }
        ],
    }

    rfc = {
        "name": "RandomForestClassifier",
        "classifier": CalibratedClassifierCV(
            base_estimator=RandomForestClassifier(random_state=42), cv=cv
        ),
        "param_grid": [
            {
                "preprocessor__num__binning": [None],
                "selector": [sfm],
                "selector__threshold": [f"{x/10}*median" for x in range(5, 20, 7)],
                "classifier__base_estimator__n_estimators": list(range(50, 201, 75)),
                "classifier__base_estimator__max_depth": list(range(15, 86, 35)),
                "classifier__base_estimator__min_samples_leaf": list(range(5, 36, 15)),
                "classifier__base_estimator__class_weight": [None],
            }
        ],
    }

    mlp = {
        "name": "MLPClassifier",
        "classifier": CalibratedClassifierCV(
            base_estimator=MLPClassifier(random_state=42, max_iter=10000), cv=cv
        ),
        "param_grid": [
            {
                "preprocessor__num__scaling": [None],
                "selector": [sfm],
                "selector__threshold": [f"{x/10}*median" for x in range(5, 20, 7)],
                "classifier__base_estimator__hidden_layer_sizes": [
                    [10],
                    [10, 10],
                    [10, 10, 10],
                ],
                # Evaluates to 10, 0.1 and 0.001.
                "classifier__base_estimator__alpha": [
                    1 * 10**-x for x in range(-1, 4, 2)
                ],
            }
        ],
    }

    knc = {
        "name": "KNeighborsClassifier",
        "classifier": CalibratedClassifierCV(
            base_estimator=KNeighborsClassifier(), cv=cv
        ),
        "param_grid": [
            {
                "preprocessor__num__scaling": [StandardScaler(), None],
                "preprocessor__num__binning": [None],
                "preprocessor__num__poly": [None],
                "selector": [sfm],
                "selector__threshold": [f"{x/10}*median" for x in range(1, 4, 1)],
                "classifier__base_estimator__n_neighbors": [200, 250, 300],
            }
        ],
    }

    abc = {
        "name": "AdaBoostClassifier",
        "classifier": CalibratedClassifierCV(
            base_estimator=AdaBoostClassifier(), cv=cv
        ),
        "param_grid": [
            {
                "preprocessor__num__scaling": [StandardScaler(), None],
                "preprocessor__num__binning": [None],
                "preprocessor__num__poly": [None],
                "selector": [sfm],
                "selector__threshold": [
                    f"{x/1000}*median" for x in range(25, 76, 25)
                ],
                "classifier__base_estimator__n_estimators": list(range(5, 15, 5)),
                "classifier__base_estimator__learning_rate": [
                    x / 10 for x in range(16, 21, 1)
                ],
            }
        ],
    }

    return [lrc, rfc, mlp, knc, abc]


def _load_models() -> dict:
    """Load the pickled models from MODEL_DIR, keyed by their column prefix.

    SECURITY: pickle executes arbitrary code while loading. These files are
    presumably the ones written by ModelMaker during the same run (confirm in
    steps.step_05); never point MODEL_DIR at files from an untrusted source.
    """
    models = {}
    for key, name in MODEL_NAMES.items():
        # The context manager closes the handle even if unpickling fails.
        with open(f"{MODEL_DIR}/{name}.sav", "rb") as model_file:
            models[key] = pickle.load(model_file)
    return models


def _predict_all(models: dict, features: pd.DataFrame) -> pd.DataFrame:
    """Return the predictions and rounded probabilities of every model.

    Each model contributes a ``<key>`` column with the predicted label and
    three ``<key>_proba_*`` columns rounded to six decimals. Every model is
    given the same untouched ``features`` frame, so the output of one model
    never ends up in the input of another.

    ``features`` is expected to carry a default RangeIndex so that the
    returned frame lines up with it row by row.
    """
    parts = []
    for key, model in models.items():
        # Assumes predict_proba yields exactly three columns ordered
        # away / draw / home — TODO confirm against the models' classes_.
        proba_columns = [f"{key}_proba_a", f"{key}_proba_d", f"{key}_proba_h"]
        parts.append(pd.Series(model.predict(features), name=key))
        parts.append(
            pd.DataFrame(model.predict_proba(features), columns=proba_columns).round(6)
        )
    return pd.concat(parts, axis=1)


production = secr.load_production_update()

periods = helper.get_periods(dt.datetime(2021, 8, 3), 7, 53)
for start, end in periods:
    print(f"{start} - {end}".center(100, "-"))

    # Training data: everything strictly before the period.
    games_before = production.loc[production["Date"] < start]

    # Games to predict: start inclusive, end exclusive.
    games_after = production.loc[
        (production["Date"] >= start) & (production["Date"] < end)
    ]
    if games_after.empty:
        print("no games to predict - skip period!")
        continue

    data = eng.prepare_for_model(games_before)

    # Presumably fits the specs and persists the resulting models under
    # MODEL_DIR, where _load_models picks them up — confirm in steps.step_05.
    ModelMaker(
        data, feat.num_features(), feat.cat_features(), _build_model_specs(), True
    ).do()

    # make predictions

    # Non-in-place calls: games_after is a slice of `production`, and
    # modifying such a slice in place triggers SettingWithCopyWarning.
    save_me = games_after[NON_FEATURE_COLUMNS].reset_index(drop=True)
    features = games_after.drop(columns=NON_FEATURE_COLUMNS).reset_index(drop=True)

    predictions = _predict_all(_load_models(), features)

    # Same column order as before: features, predictions, set-aside columns.
    games_after = pd.concat([features, predictions, save_me], axis=1)

    for pattern in ARCHIVE_DROP_PATTERNS:
        games_after = games_after.drop(
            columns=list(games_after.filter(regex=f"^.*{pattern}.*$"))
        )

    stamp = start.strftime("%Y%m%d-%H%M%S")
    games_after.to_csv(path_or_buf=f"{ARCHIVE_DIR}/{stamp}_bet.csv")
