In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

warnings.filterwarnings("ignore")
from sklearn.feature_selection import RFE
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Everything needed to scrape match data from football-data.co.uk.

beginning_url = "https://www.football-data.co.uk/"

# Season codes, newest first. Presumably "2223" stands for the 2022/23 season
# (confirm against the scraper that consumes this list). Every code appears
# exactly once: a duplicated entry would make any per-season loop process the
# same season twice.
years = [
    "2223",
    "2122",
    "2021",
    "1920",
    "1819",
    "1718",
    "1617",
    "1516",
    "1415",
    "1314",
    "1213",
    "1112",
]
# League registry keyed by league tag. Each entry carries the folder name,
# the tag itself, the league code ("Leaguename") and the league's overview
# page on football-data.co.uk.
Leagues = {
    tag: {
        "Foldername": folder,
        "Leaguetag": tag,
        "Leaguename": code,
        "Leagueurl": url,
    }
    for tag, folder, code, url in (
        ("PL", "PL_data", "E0", "https://www.football-data.co.uk/englandm.php"),
        ("BL", "BL_data", "D1", "https://www.football-data.co.uk/germanym.php"),
        ("PD", "PD_data", "SP1", "https://www.football-data.co.uk/spainm.php"),
        ("SA", "SA_data", "I1", "https://www.football-data.co.uk/italym.php"),
    )
}


# Columns kept from the raw data: match identification, results, in-game
# statistics, followed by the bookmaker odds columns. Each odds column is a
# bookmaker prefix plus an outcome suffix "H", "D" or "A".
considered_features = [
    # match identification
    "league",
    "kick_off_time",
    "HomeTeam",
    "AwayTeam",
    # full-time and half-time results
    "full_time_goals_hometeam",
    "full_time_goals_awayteam",
    "full_time_result",
    "half_time_goals_hometeam",
    "half_time_goals_awayteam",
    "half_time_result",
    # in-game statistics, home column first
    "hometeam_shots",
    "awayteam_shots",
    "hometeam_shots_on_target",
    "awayteam_shots_on_target",
    "hometeam_corners",
    "awayteam_corners",
    "hometeam_fouls_done",
    "awayteam_fouls_done",
    "hometeam_yellow_cards",
    "awayteam_yellow_cards",
    "hometeam_red_cards",
    "awayteam_red_cards",
] + [
    f"{bookmaker}{outcome}"
    for bookmaker in (
        "B365", "BS", "BW", "GB", "IW", "LB", "PS", "SB", "SJ", "VC", "WH",
    )
    for outcome in "HDA"
]

# Columns to be treated as categorical.
categorical_features = (
    ["league", "HomeTeam", "AwayTeam"]
    + ["full_time_result", "half_time_result"]
)

# Columns holding integer counts: goals per period first, then the in-game
# statistics, always with the home-team column before the away-team column.
integer_features = [
    f"{period}_time_goals_{side}"
    for period in ("full", "half")
    for side in ("hometeam", "awayteam")
] + [
    f"{side}_{statistic}"
    for statistic in (
        "shots",
        "shots_on_target",
        "corners",
        "fouls_done",
        "yellow_cards",
        "red_cards",
    )
    for side in ("hometeam", "awayteam")
]
# Bookmaker odds columns: one "H", "D" and "A" column per bookmaker prefix,
# grouped by bookmaker.
odd_features = [
    bookmaker + outcome
    for bookmaker in (
        "B365",
        "BS",
        "BW",
        "GB",
        "IW",
        "LB",
        "PS",
        "SB",
        "SJ",
        "VC",
        "WH",
    )
    for outcome in ("H", "D", "A")
]


# Columns whose values are not known on game day: goal counts, the half-time
# result, the in-game statistics and the per-team points columns.
not_known_on_game_day = (
    [
        f"{period}_time_goals_{side}"
        for period in ("full", "half")
        for side in ("hometeam", "awayteam")
    ]
    + ["half_time_result"]
    + [
        f"{side}_{statistic}"
        for statistic in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ]
    + ["HomeTeam_points", "AwayTeam_points"]
)
# Odds columns handed to data_preparation: for every bookmaker prefix the
# three outcome columns ending in "H", "D" and "A", in that order.
odds = [
    f"{bookmaker}{outcome}"
    for bookmaker in "B365 BS BW GB IW LB PS SB SJ VC WH".split()
    for outcome in "HDA"
]

In [None]:
def compute_consensus_odds(df, columns_with_odds):
    """Add row-wise mean ("consensus") odds for home win, draw and away win.

    Odds columns are grouped by their last character: names ending in "H"
    feed ``consensus_odds_home``, "D" feeds ``consensus_odds_draw`` and "A"
    feeds ``consensus_odds_away``. Names in ``columns_with_odds`` that are not
    columns of ``df`` are silently skipped.

    Input:
        df: dataframe
        columns_with_odds: iterable of candidate odds column names
    Output:
        A copy of ``df`` with the three consensus columns added. The input
        frame is left untouched, so passing a slice of a larger frame is safe.
    """
    # Work on a copy: assigning new columns to a frame that is itself a slice
    # of another frame is the classic SettingWithCopy hazard.
    df = df.copy()
    available = [col for col in columns_with_odds if col in df.columns]

    for suffix, consensus_column in (
        ("H", "consensus_odds_home"),
        ("D", "consensus_odds_draw"),
        ("A", "consensus_odds_away"),
    ):
        outcome_columns = [col for col in available if col.endswith(suffix)]
        # mean(axis=1) skips NaN by default, so a bookmaker that is missing
        # for a given match simply does not contribute to that row.
        df[consensus_column] = df[outcome_columns].mean(axis=1)
    return df


def add_indexer_column(data: pd.DataFrame, number_of_folds: int):
    """Label the rows of ``data`` with consecutive fold numbers.

    Rows are split, in their current order, into ``number_of_folds`` runs of
    ``int(len(data) / number_of_folds)`` rows labelled 1, 2, ...; any rows
    left over are given the last label ``number_of_folds``. The labels are
    written to the column "indexer" of ``data`` itself, which is returned.
    """
    total_rows = len(data)
    fold_size = int(total_rows / number_of_folds)
    fold_labels = [
        fold
        for fold in range(1, number_of_folds + 1)
        for _ in range(fold_size)
    ]
    leftover = total_rows - len(fold_labels)
    if leftover > 0:
        # Remainder rows join the final fold.
        fold_labels += [number_of_folds] * leftover
    data["indexer"] = fold_labels
    return data


def create_test_train_data_per_iteration(i, data_dum, scaler):
    """Build the scaled train/test split for fold ``i``.

    Input:
        i: fold label to test on (values of the "indexer" column)
        data_dum: dataframe with the columns "indexer" and "full_time_result"
        scaler: object offering ``fit_transform`` and ``transform``
            (e.g. a scikit-learn scaler)
    Output:
        x_train_subset, y_train_subset, x_test_subset, y_test_subset
    Raises:
        ValueError: if fold ``i`` has no earlier rows to train on, or no
            rows to test on.

    The training set is every row of an earlier fold ("indexer" < i) and the
    test set is fold ``i`` itself, so the two never overlap.
    NOTE(review): the previous code filtered with ``<= i`` but discarded the
    result, so the intended window could not be read off the code; the
    strict ``<`` is chosen here to keep the test fold out of training -
    confirm this matches the intended evaluation scheme.
    """
    # "indexer" is fold bookkeeping, not a match feature, so it is removed
    # from the model inputs together with the target.
    non_feature_columns = ["full_time_result", "indexer"]

    train_subset = data_dum[data_dum["indexer"] < i]
    test_subset = data_dum[data_dum["indexer"] == i]
    if train_subset.empty:
        raise ValueError(f"No training rows with indexer < {i}")
    if test_subset.empty:
        raise ValueError(f"No test rows with indexer == {i}")

    x_train_subset = train_subset.drop(columns=non_feature_columns)
    y_train_subset = train_subset["full_time_result"]
    x_test_subset = test_subset.drop(columns=non_feature_columns)
    y_test_subset = test_subset["full_time_result"]

    # Fit the scaler on the training rows only and reuse those parameters
    # for the test rows; refitting on the test rows would scale them
    # differently from the data the model was trained on.
    x_train_subset = scaler.fit_transform(x_train_subset)
    x_test_subset = scaler.transform(x_test_subset)
    return x_train_subset, y_train_subset, x_test_subset, y_test_subset


def compute_MAE(y_pred, y_real):
    """Return the misclassification rate of ``y_pred`` against ``y_real``.

    Despite the name, this is not a mean absolute error: the result is
    ``1 - (share of positions where prediction and truth are equal)``, i.e.
    0.0 when every prediction is right and 1.0 when every one is wrong.

    Input:
        y_pred: sequence of predicted labels
        y_real: sequence of true labels, same length as ``y_pred``
    Output:
        Share of positions where the two sequences differ.
    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(y_pred) != len(y_real):
        raise ValueError("Arrays must be of the same length")
    matches = np.asarray(y_pred) == np.asarray(y_real)
    return 1 - np.mean(matches)

In [None]:
def data_robustness_check(data):
    """Remove every column that consists of NaN values only.

    Input:
        data: dataframe
    Output:
        dataframe without the all-NaN columns; columns with at least one
        non-missing entry are kept as they are.
    """
    return data.dropna(axis="columns", how="all")


def compute_percentages_out_of_consensus_odds(df):
    """Turn the consensus odds into percentages that sum to one per row.

    For each outcome the reciprocal of the consensus odds is taken; the three
    reciprocals are then divided by their sum, so the resulting
    ``consensus_percentage_home/draw/away`` add up to 1 in every row.

    Input:
        df: dataframe with the columns "consensus_odds_home",
            "consensus_odds_draw" and "consensus_odds_away"
    Output:
        A copy of ``df`` with "consensus_percentage_home",
        "consensus_percentage_draw", "consensus_percentage_away" and
        "consensus_sum_of_percentages" (the sum of the raw reciprocals)
        added. The input frame is not modified.
    """
    # Copy first so that a frame passed in as a slice is never written to.
    df = df.copy()
    outcomes = ("home", "draw", "away")

    for outcome in outcomes:
        df[f"consensus_percentage_{outcome}"] = 1 / df[f"consensus_odds_{outcome}"]

    # Plain "+" (not DataFrame.sum) on purpose: a missing consensus odd must
    # propagate as NaN instead of being treated as zero.
    df["consensus_sum_of_percentages"] = (
        df["consensus_percentage_home"]
        + df["consensus_percentage_draw"]
        + df["consensus_percentage_away"]
    )

    for outcome in outcomes:
        df[f"consensus_percentage_{outcome}"] = (
            df[f"consensus_percentage_{outcome}"] / df["consensus_sum_of_percentages"]
        )
    return df


def rolling_forecast_origin_generator(data, min_train_size, horizon):
    """Yield (train, test) pairs for a rolling forecast origin.

    The first pair trains on the first ``min_train_size`` rows and tests on
    the ``horizon`` rows that follow. The origin then moves forward one row
    at a time, growing the training window, until the test window reaches
    the end of ``data``. Nothing is yielded if ``data`` is too short for a
    single full split.
    """
    last_origin = len(data) - horizon
    for origin in range(min_train_size, last_origin + 1):
        yield data[:origin], data[origin : origin + horizon]


def cross_validation_score(model, cv, metric, y):
    """Fit and score ``model`` on every (train, test) pair of ``cv``.

    Input:
        model: estimator with ``fit(X, y=...)`` and ``predict(X)``
        cv: iterable of (train dataframe, test dataframe) pairs
        metric: callable accepting ``y_true=`` and ``y_pred=``
        y: name of the target column. For backward compatibility any
            non-string value falls back to "full_time_result", the column
            that used to be hard-coded here.
    Output:
        numpy array with one score per pair, in iteration order.

    The same ``model`` object is refitted for every pair.
    """
    target = y if isinstance(y, str) else "full_time_result"

    fold_scores = []
    for fold_train, fold_test in cv:
        model.fit(fold_train.drop(columns=[target]), y=fold_train[target])
        predictions = model.predict(fold_test.drop(columns=[target]))
        fold_scores.append(metric(y_true=fold_test[target], y_pred=predictions))
    return np.array(fold_scores)

In [None]:
# Select one league, derive the consensus columns and turn the categorical
# columns into dummy variables.
def data_preparation(data, league, not_known_on_game_day, odds):
    """Prepare the data of one league for use in the model.

    Input:
        data: dataframe with (at least) the columns "index", "Date",
            "league" and "kick_off_time"
        league: value of the "league" column to keep
        not_known_on_game_day: list of columns, which are not known on game
            day; they are dropped from the model data
        odds: iterable of odds column names; names that are not columns of
            ``data`` are ignored
    Output:
        data_dum: model dataframe, indexed by "Date", with dummy-encoded
            categoricals and the consensus columns; all-NaN columns are
            removed and remaining NaN are replaced by -33
        odds_data: dataframe with the raw odds columns of the selected league
    """
    data = data.drop(columns="index").set_index("Date")
    # Explicit copy: the league subset is extended with new columns below and
    # must not be a view onto the caller's frame.
    data = data.loc[data["league"] == league].copy()
    data = compute_consensus_odds(df=data, columns_with_odds=odds)
    data = compute_percentages_out_of_consensus_odds(df=data)

    # Keep the column names and the extracted frame under separate names;
    # reusing `odds` for both made the later drop receive a DataFrame.
    odds_columns = [column for column in odds if column in data.columns]
    odds_data = data[odds_columns]

    data = data.drop(columns=not_known_on_game_day)
    data = data.drop(columns=["league", "kick_off_time"] + odds_columns)

    # NOTE(review): get_dummies encodes every object/category column; if
    # "full_time_result" is stored as text it would be split into dummy
    # columns here, while later cells read data_dum["full_time_result"].
    # Presumably it is already numeric in the input file - confirm.
    data_dum = pd.get_dummies(data)

    # Remove all-NaN columns BEFORE filling: once NaN are replaced no column
    # is "all NaN" any more and the check would never drop anything.
    data_dum = data_robustness_check(data=data_dum)
    # -33 is the sentinel for the remaining missing values (presumably chosen
    # to lie outside every real value range - confirm).
    data_dum = data_dum.fillna(-33)
    return data_dum, odds_data

In [None]:
# Feature table that serves as input for data_preparation.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs on a machine where exactly this location exists.
DATA_FEATURES_CSV = "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/data_features_added.csv"

data = pd.read_csv(DATA_FEATURES_CSV, index_col=False)

In [None]:
# Prepare the model data for league "E0".
# Note that this rebinds `odds`: before the call it holds the odds column
# names, afterwards the odds dataframe returned by data_preparation.
data_dum, odds = data_preparation(data, "E0", not_known_on_game_day, odds)

In [None]:
def __best_feature_selection_rfe(scaler, clf, i, data_dum, n_train=3500):
    """Select ``i`` features with RFE and return the scaled training data.

    Input:
        scaler: scaler with ``fit_transform`` (e.g. MinMaxScaler)
        clf: estimator used inside RFE (e.g. LogisticRegression)
        i: number of features to select
        data_dum: dataframe containing the target column "full_time_result"
        n_train: number of leading rows that form the training set
            (default 3500, the value that used to be hard-coded)
    Output:
        X_train: scaled array holding the selected features of the
            training rows
        Y_train: target values of the training rows

    RFE is fitted on the training rows only. It used to be fitted on every
    row of ``data_dum``, so rows after the training cut-off influenced which
    features were selected.
    NOTE(review): RFE sees the unscaled features; scaling happens after the
    selection, as before. If the estimator's ranking is scale-sensitive,
    consider scaling first.
    """
    rfe = RFE(estimator=clf, n_features_to_select=i, step=1)
    features = data_dum.drop(columns=["full_time_result"])
    target = data_dum["full_time_result"]

    # Positional slicing: "the first n_train rows", independent of the index.
    features_train = features.iloc[:n_train]
    Y_train = target.iloc[:n_train]

    rfe.fit(features_train, Y_train)
    X_selected = rfe.transform(features_train)
    X_train = scaler.fit_transform(X_selected)
    return X_train, Y_train


# Loop over candidate feature counts to see which one works best for
# Logistic Regression.
def best_feature_selection_LogisticRegression(data_dum, min_feat, max_feat):
    """Cross-validate Logistic Regression for a range of feature counts.

    For every count in ``range(min_feat, max_feat)`` (``max_feat`` itself is
    not included) the features are selected via RFE and the mean accuracy of
    a 5-fold cross validation on the training data is recorded.

    Input:
        data_dum: dataframe
        min_feat: first number of features to try
        max_feat: upper bound of the numbers to try (exclusive)
    Output:
        acc_results: list of mean accuracy results
        n_features: list of the numbers of features tried
    """
    scaler = MinMaxScaler()
    clf = LogisticRegression(max_iter=1000, multi_class="multinomial")

    n_features = list(range(min_feat, max_feat))
    acc_results = []
    for feature_count in n_features:
        X_train, Y_train = __best_feature_selection_rfe(
            scaler=scaler,
            clf=clf,
            i=feature_count,
            data_dum=data_dum,
        )
        fold_accuracies = cross_val_score(
            clf, X_train, Y_train, scoring="accuracy", cv=5
        )
        acc_results.append(fold_accuracies.mean())
    return acc_results, n_features

In [None]:
# Try the feature counts in range(3, 5), i.e. 3 and 4 selected features.
acc_results, n_features = best_feature_selection_LogisticRegression(data_dum, 3, 5)

In [None]:
acc_results
## takes about 11 minutes per feature count

In [None]:
# Seed first, so every step of this cell runs under the same seed.
np.random.seed(101)

# NOTE(review): `plt` is not imported anywhere in the visible notebook; this
# cell needs `import matplotlib.pyplot as plt` in the import cell to run on
# a fresh kernel.
plt.plot(n_features, acc_results)
plt.ylabel("Accuracy")
plt.xlabel("N features")
plt.show()


# getting the best 13 features from RFE
# X, y and clf used to exist only as local variables inside the helper
# functions, so this cell failed after "Restart & Run All". They are defined
# here the same way the helpers define them.
X = data_dum.drop(columns=["full_time_result"])
y = data_dum["full_time_result"]
clf = LogisticRegression(max_iter=1000, multi_class="multinomial")

rfe = RFE(estimator=clf, n_features_to_select=13, step=1)
rfe.fit(X, y)
X_transformed = rfe.transform(X)


# NOTE(review): `logisticRegr` was not defined anywhere in the visible
# notebook. It is assumed to use the same configuration as the classifier in
# best_feature_selection_LogisticRegression - confirm.
logisticRegr = LogisticRegression(max_iter=1000, multi_class="multinomial")

cv_rolling = rolling_forecast_origin_generator(
    data=data_dum,
    min_train_size=400,
    horizon=50,
)
cv_scores1 = cross_validation_score(
    model=logisticRegr,
    cv=cv_rolling,
    metric=balanced_accuracy_score,
    y="full_time_result",
)

In [None]:
# Export the prepared model data for manual inspection; the file is written
# relative to the current working directory.
data_dum.to_excel(excel_writer="datasandbox.xlsx")