In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries imported above; consider
# narrowing the filter to the specific warning categories that are expected.
warnings.filterwarnings("ignore")

In [None]:
# all information needed to scrape data from football-data.co.uk

beginning_url = "https://www.football-data.co.uk/"
# Season codes, newest first (presumably "2223" stands for season 2022/23).
# Every season is listed exactly once; the list previously contained "1213"
# twice, which would presumably make that season be handled twice downstream.
years = [
    "2223",
    "2122",
    "2021",
    "1920",
    "1819",
    "1718",
    "1617",
    "1516",
    "1415",
    "1314",
    "1213",
    "1112",
]
# Per-league scraping metadata, keyed by the league tag. Each entry holds the
# folder name, the tag itself, the division code and the league's page URL.
Leagues = {
    tag: {
        "Foldername": f"{tag}_data",
        "Leaguetag": tag,
        "Leaguename": division_code,
        "Leagueurl": f"https://www.football-data.co.uk/{country}m.php",
    }
    for tag, division_code, country in (
        ("PL", "E0", "england"),
        ("BL", "D1", "germany"),
        ("PD", "SP1", "spain"),
        ("SA", "I1", "italy"),
    )
}


# Columns kept from the match data, in this order: league/kick-off/team
# columns, full-time and half-time goals and results, per-team match
# statistics, and the bookmaker odds (one H/D/A triple per bookmaker prefix).
considered_features = (
    ["league", "kick_off_time", "HomeTeam", "AwayTeam"]
    + [
        f"{period}_{suffix}"
        for period in ("full_time", "half_time")
        for suffix in ("goals_hometeam", "goals_awayteam", "result")
    ]
    + [
        f"{side}_{statistic}"
        for statistic in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ]
    + [
        f"{bookmaker}{outcome}"
        for bookmaker in (
            "B365",
            "BS",
            "BW",
            "GB",
            "IW",
            "LB",
            "PS",
            "SB",
            "SJ",
            "VC",
            "WH",
        )
        for outcome in ("H", "D", "A")
    ]
)

# Columns to be treated as categorical.
categorical_features = "league HomeTeam AwayTeam full_time_result half_time_result".split()

# Columns holding integer counts: goals per period and team, followed by the
# per-team match statistics (home column first, then away column).
integer_features = [
    f"{period}_goals_{side}"
    for period in ("full_time", "half_time")
    for side in ("hometeam", "awayteam")
] + [
    f"{side}_{statistic}"
    for statistic in (
        "shots",
        "shots_on_target",
        "corners",
        "fouls_done",
        "yellow_cards",
        "red_cards",
    )
    for side in ("hometeam", "awayteam")
]
# Bookmaker odds columns: for every bookmaker prefix one column each with the
# suffix H, D and A.
odd_features = [
    f"{bookmaker}{outcome}"
    for bookmaker in (
        "B365",
        "BS",
        "BW",
        "GB",
        "IW",
        "LB",
        "PS",
        "SB",
        "SJ",
        "VC",
        "WH",
    )
    for outcome in ("H", "D", "A")
]


# Columns whose values are not known on game day: goals, the half-time result,
# the per-team match statistics and the points columns.
not_known_on_game_day = [
    "full_time_goals_hometeam",
    "full_time_goals_awayteam",
    "half_time_goals_hometeam",
    "half_time_goals_awayteam",
    "half_time_result",
    *(
        f"{side}_{statistic}"
        for statistic in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ),
    "HomeTeam_points",
    "AwayTeam_points",
]
# Names of the bookmaker odds columns (H, D and A column per bookmaker prefix).
odds = [
    prefix + outcome
    for prefix in "B365 BS BW GB IW LB PS SB SJ VC WH".split()
    for outcome in "HDA"
]

In [None]:
def compute_consensus_odds(df, columns_with_odds):
    """Add the row-wise mean home, draw and away odds to the dataframe.

    Only the entries of ``columns_with_odds`` that exist in ``df`` are used.
    They are grouped by their last letter ("H", "D" or "A") and averaged per
    row. The dataframe is modified in place.

    Input:
        df: dataframe
        columns_with_odds: list of candidate odds column names
    Output:
        tuple ``(df, df.columns)``: the dataframe with the columns
        ``consensus_odds_home``, ``consensus_odds_draw`` and
        ``consensus_odds_away`` added, and its column index.
    """
    existing_columns = list(df.columns)
    usable_odds = [name for name in columns_with_odds if name in existing_columns]
    for suffix, outcome in (("H", "home"), ("D", "draw"), ("A", "away")):
        outcome_columns = [name for name in usable_odds if name.endswith(suffix)]
        df[f"consensus_odds_{outcome}"] = df[outcome_columns].mean(axis=1)
    return df, df.columns


def add_indexer_column(data: pd.DataFrame, number_of_folds: int):
    """Label every row with a fold number in a new "indexer" column.

    Rows are assigned in their current order to the folds 1..number_of_folds,
    each fold receiving int(len(data) / number_of_folds) rows. Rows left over
    after that are assigned to the last fold. The dataframe is modified in
    place.

    Input:
        data: dataframe
        number_of_folds: number of folds to create
    Output:
        data: the same dataframe with the "indexer" column added.
    """
    rows_per_fold = int(len(data) / number_of_folds)
    fold_labels = [
        fold for fold in range(1, number_of_folds + 1) for _ in range(rows_per_fold)
    ]
    unassigned_rows = len(data) - len(fold_labels)
    if unassigned_rows > 0:
        fold_labels += [number_of_folds] * unassigned_rows
    data["indexer"] = fold_labels
    return data


def create_test_train_data_per_iteration(i, data_dum, scaler):
    """Build the scaled train/test split for fold ``i``.

    Rows whose "indexer" value is smaller than ``i`` form the training set and
    rows whose "indexer" value equals ``i`` form the test set, so the test
    fold is never part of the training data. The scaler is fitted on the
    training features only and then applied to the test features.

    Input:
        i: fold label used as the test fold; folds with a smaller label are
            used for training (so ``i`` must be larger than the smallest label
            for the training set to be non-empty)
        data_dum: dataframe with the columns "indexer" and "full_time_result"
        scaler: object offering ``fit_transform`` and ``transform``
    Output:
        tuple (x_train_subset, y_train_subset, x_test_subset, y_test_subset);
        the x entries are whatever the scaler returns.
    """
    # Previously the filtered frame was discarded, so all rows (including the
    # test fold) ended up in the training data.
    train_subset = data_dum[data_dum["indexer"] < i]
    test_subset = data_dum[data_dum["indexer"] == i]

    x_train_subset = train_subset.drop(columns=["full_time_result"])
    y_train_subset = train_subset["full_time_result"]
    # The test features must come from the test rows so that they line up
    # with y_test_subset.
    x_test_subset = test_subset.drop(columns=["full_time_result"])
    y_test_subset = test_subset["full_time_result"]

    x_train_subset = scaler.fit_transform(x_train_subset)
    # Re-use the statistics learned on the training data instead of
    # re-fitting the scaler on the test data.
    x_test_subset = scaler.transform(x_test_subset)
    return x_train_subset, y_train_subset, x_test_subset, y_test_subset


def compute_MAE(y_pred, y_real):
    """Return the share of predictions that differ from the real values.

    The result is 1 minus the mean of the per-element hit indicator (1 where
    prediction and real value are equal, 0 otherwise), i.e. the
    misclassification rate.

    Input:
        y_pred: sequence of predicted values
        y_real: sequence of real values, same length as y_pred
    Output:
        float: fraction of positions where y_pred and y_real disagree.
    Raises:
        ValueError: if the two inputs differ in length.
    """
    if len(y_pred) != len(y_real):
        raise ValueError("Arrays must be of the same length")
    predicted, actual = np.array(y_pred), np.array(y_real)
    hits = [1 if guess == truth else 0 for guess, truth in zip(predicted, actual)]
    return 1 - np.mean(hits)

In [None]:
def data_robustness_check(data):
    """Remove every column that consists of NaN entries only.

    Input:
        data: dataframe
    Output:
        dataframe without the all-NaN columns.
    """
    return data.dropna(axis="columns", how="all")


def compute_percentages_out_of_consensus_odds(df):
    """Turn the consensus odds into percentages that sum to one per row.

    For each outcome the inverse of the consensus odds is taken; the three
    inverses are summed into ``consensus_sum_of_percentages`` and each inverse
    is then divided by that sum. The dataframe is modified in place.

    Input:
        df: dataframe with the columns consensus_odds_home,
            consensus_odds_draw and consensus_odds_away
    Output:
        df: dataframe with consensus_percentage_home/draw/away and
            consensus_sum_of_percentages added.
    """
    outcomes = ("home", "draw", "away")
    total_column = "consensus_sum_of_percentages"

    # Raw inverse odds per outcome.
    for outcome in outcomes:
        df[f"consensus_percentage_{outcome}"] = 1 / df[f"consensus_odds_{outcome}"]

    # Row-wise total of the raw inverses (kept as a column of its own).
    df[total_column] = (
        df["consensus_percentage_home"]
        + df["consensus_percentage_draw"]
        + df["consensus_percentage_away"]
    )

    # Rescale each inverse by the row total.
    for outcome in outcomes:
        share_column = f"consensus_percentage_{outcome}"
        df[share_column] = df[share_column] / df[total_column]
    return df


def rolling_forecast_origin_generator(data, min_train_size, horizon):
    """Yield (train, test) slices of ``data`` with a growing training part.

    The first training slice holds the first ``min_train_size`` elements; each
    following one is one element longer. The test slice is always the
    ``horizon`` elements directly after the training slice. Iteration stops
    once a full test slice no longer fits.

    Input:
        data: sliceable object supporting len()
        min_train_size: length of the first training slice
        horizon: length of every test slice
    Output:
        generator of (split_train, split_test) tuples.
    """
    last_train_end = len(data) - horizon
    for train_end in range(min_train_size, last_train_end + 1):
        yield data[:train_end], data[train_end : train_end + horizon]


def cross_validation_score(model, cv, metric, y):
    """Fit and score the model on every (train, test) pair of ``cv``.

    For each pair the model is fitted on the training frame (features: all
    columns except "full_time_result", target: "full_time_result"), predicts
    for the test frame and is scored with ``metric``.

    Input:
        model: object with ``fit(X, y=...)`` and ``predict(X)``
        cv: iterable of (train dataframe, test dataframe) pairs
        metric: callable accepting the keywords ``y_true`` and ``y_pred``
        y: not used by this function
    Output:
        numpy array with one score per pair.
    """
    target = "full_time_result"

    def _score_split(train_frame, test_frame):
        # Fit on the training part, then evaluate on the held-out part.
        model.fit(train_frame.drop(columns=[target]), y=train_frame[target])
        predictions = model.predict(test_frame.drop(columns=[target]))
        return metric(y_true=test_frame[target], y_pred=predictions)

    return np.array(
        [_score_split(train_frame, test_frame) for train_frame, test_frame in cv]
    )

In [None]:
# Load the match data with the engineered features.
# NOTE(review): absolute, machine-specific path; consider a configurable data
# directory so the notebook runs on other machines.
data = pd.read_csv(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/data_features_added.csv",
    index_col=False,
)
data = data.drop(columns="index")
number_of_folds = 40
# Order the matches by the "Date" column and use it as the index.
# NOTE(review): if "Date" is stored as text this is a string sort; confirm the
# format sorts chronologically.
data = data.sort_values(by="Date")
data = data.set_index("Date")
data, columns_with_odds = compute_consensus_odds(df=data, columns_with_odds=odds)
data = compute_percentages_out_of_consensus_odds(df=data)

# just for now, in the final one I want to have a loop doing all of this
league = "E0"
# .copy() so that the assignments below act on an independent frame and not
# on a slice of the full data.
data = data.loc[data["league"] == league].copy()

# turning the target variable into integers:
# "H" -> 2, "A" -> 1, every other value -> 0
data["full_time_result"] = np.where(
    data.full_time_result == "H",
    2,
    np.where(data.full_time_result == "A", 1, 0),
)
# drop all columns with information not present at game start
data = data.drop(columns=not_known_on_game_day)
# filling NAs
data = data.fillna(np.nan)
# Move the individual bookmaker odds into their own frame. The frame gets its
# own name so that `odds` remains the list of column names and this cell can
# be executed again; only odds columns that actually exist are selected.
odds_columns_present = [column for column in odds if column in data.columns]
odds_data = data[odds_columns_present]
data = data.drop(columns=odds_columns_present)
data = data.drop(columns=["league", "kick_off_time"])
# turning categorical into dummy vars
data_dum = pd.get_dummies(data)
# Remove all-NaN columns BEFORE filling missing values: once the NaNs are
# replaced by the sentinel below no column is all-NaN any more, so running
# the check afterwards could never drop anything.
data_dum = data_robustness_check(data=data_dum)
# -33 is the placeholder used for the remaining missing values.
data_dum = data_dum.fillna(-33)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [None]:
# Hyper-parameter grid for the random forest grid search.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
# Number of features to consider at every split.
# Only "sqrt" is searched; the former "auto" entry was never part of the grid
# and is not accepted by recent scikit-learn releases.
max_features = ["sqrt"]
# Maximum number of levels in tree (None is added as an extra candidate)
max_depth = [int(x) for x in np.linspace(10, 500, num=5)]
max_depth.append(None)
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "bootstrap": bootstrap,
}
print(random_grid)

In [None]:
# Separate the label column from the model inputs.
target_column = "full_time_result"
y = data_dum[target_column]
X = data_dum.drop(columns=[target_column])

In [None]:
# full_time_result is encoded as the class labels 0, 1 and 2 and the
# "f1_macro" scorer needs discrete class predictions, so a classifier is used;
# the continuous outputs of a regressor cannot be scored with F1.
# The fixed random_state makes repeated runs comparable.
rf = RandomForestClassifier(random_state=42)
# TimeSeriesSplit validates each split on samples that come after its
# training samples, which respects the date ordering of the rows.
tscv = TimeSeriesSplit(n_splits=5)
rf_model_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=random_grid,
    cv=tscv,
    scoring="f1_macro",
    n_jobs=-2,
)

In [None]:
# Run the grid search on the features X and the target y.
rf_model_grid_search.fit(X, y)

In [None]:
# Show the cross-validation results collected by the fitted grid search.
rf_model_grid_search.cv_results_

In [None]:
import pickle

In [None]:
# Load the pickled object stored for league E0 (presumably a fitted model,
# judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code; only open files from a
# trusted source.
# NOTE(review): absolute, machine-specific path; consider a configurable
# models directory.
with open(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/models/final_model_E0.pkl",
    "rb",
) as f:
    x = pickle.load(f)

In [None]:
# Display the object loaded from the pickle file.
x