In [None]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler

In [None]:
# all information needed to scrape data from football-data.co.uk


# Base URL of football-data.co.uk.
beginning_url = "https://www.football-data.co.uk/"
# Season codes (two-digit start year followed by two-digit end year), newest
# first. Every season is listed exactly once; a repeated entry would make any
# per-season loop process that season twice.
years = [
    "2223",
    "2122",
    "2021",
    "1920",
    "1819",
    "1718",
    "1617",
    "1516",
    "1415",
    "1314",
    "1213",
    "1112",
]
# Per-league metadata keyed by the league tag. Every entry is derived from a
# (tag, division code, league page URL) triple.
Leagues = {
    tag: {
        "Foldername": f"{tag}_data",
        "Leaguetag": tag,
        "Leaguename": division,
        "Leagueurl": url,
    }
    for tag, division, url in (
        ("PL", "E0", "https://www.football-data.co.uk/englandm.php"),
        ("BL", "D1", "https://www.football-data.co.uk/germanym.php"),
        ("PD", "SP1", "https://www.football-data.co.uk/spainm.php"),
        ("SA", "I1", "https://www.football-data.co.uk/italym.php"),
    )
}


# Column names considered for the analysis: match identifiers, full/half time
# goals and results, per-team match statistics and three odds columns
# (suffixes H, D, A) per bookmaker prefix.
considered_features = [
    "league",
    "kick_off_time",
    "HomeTeam",
    "AwayTeam",
    *(
        f"{period}_time_{field}"
        for period in ("full", "half")
        for field in ("goals_hometeam", "goals_awayteam", "result")
    ),
    *(
        f"{side}_{stat}"
        for stat in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ),
    *(
        f"{bookmaker}{outcome}"
        for bookmaker in (
            "B365", "BS", "BW", "GB", "IW", "LB", "PS", "SB", "SJ", "VC", "WH",
        )
        for outcome in "HDA"
    ),
]

# Columns to be treated as categorical.

categorical_features = ["league", "HomeTeam", "AwayTeam"] + [
    f"{period}_time_result" for period in ("full", "half")
]
# Columns listed as integer features: goals per period first, then the match
# statistics, always with the home team column before the away team column.
integer_features = [
    f"{period}_time_goals_{side}"
    for period in ("full", "half")
    for side in ("hometeam", "awayteam")
] + [
    f"{side}_{stat}"
    for stat in (
        "shots",
        "shots_on_target",
        "corners",
        "fouls_done",
        "yellow_cards",
        "red_cards",
    )
    for side in ("hometeam", "awayteam")
]
# Odds columns: one H/D/A triple per bookmaker prefix, followed by the
# consensus columns.
odd_features = [
    f"{bookmaker}{outcome}"
    for bookmaker in (
        "B365", "BS", "BW", "GB", "IW", "LB", "PS", "SB", "SJ", "VC", "WH",
    )
    for outcome in "HDA"
] + [
    "consensus_odds_home",
    "consensus_odds_draw",
    "consensus_odds_away",
    "consensus_sum_of_percentages",
]


# Columns whose values are not available before kick-off (goals, half-time
# result, match statistics and the points earned in the match itself).
not_known_on_game_day = [
    *(
        f"{period}_time_goals_{side}"
        for period in ("full", "half")
        for side in ("hometeam", "awayteam")
    ),
    "half_time_result",
    *(
        f"{side}_{stat}"
        for stat in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ),
    "HomeTeam_points",
    "AwayTeam_points",
]
# Candidate odds columns handed to data_preparation: one H/D/A triple per
# bookmaker prefix, followed by the consensus columns.
odds = [
    f"{bookmaker}{outcome}"
    for bookmaker in (
        "B365", "BS", "BW", "GB", "IW", "LB", "PS", "SB", "SJ", "VC", "WH",
    )
    for outcome in "HDA"
] + [
    "consensus_odds_home",
    "consensus_odds_draw",
    "consensus_odds_away",
    "consensus_sum_of_percentages",
]

In [None]:
def compute_consensus_odds(df, columns_with_odds):
    """Add the row-wise average odds for home win, draw and away win.

    Only the entries of ``columns_with_odds`` that are columns of ``df`` are
    used. A column is counted as home, draw or away odds when its name ends
    in "H", "D" or "A" respectively.

    Input:
        df: dataframe (modified in place)
        columns_with_odds: iterable of candidate odds column names
    Output:
        df: the same dataframe with the columns consensus_odds_home,
            consensus_odds_draw and consensus_odds_away added.
    """
    existing = set(df.columns)
    available = [name for name in columns_with_odds if name in existing]
    targets = (
        ("H", "consensus_odds_home"),
        ("D", "consensus_odds_draw"),
        ("A", "consensus_odds_away"),
    )
    for suffix, target in targets:
        group = [name for name in available if name.endswith(suffix)]
        df[target] = df[group].mean(axis=1)
    return df

In [None]:
def data_robustness_check(data):
    """Remove every column that consists of missing values only.

    Input:
        data: dataframe
    Output:
        data: new dataframe without the all-NaN columns.
    """
    cleaned = data.dropna(how="all", axis="columns")
    return cleaned


def compute_percentages_out_of_consensus_odds(df):
    """Turn the consensus odds into percentages that sum to one.

    The inverse of each consensus odds column is taken first. The sum of the
    three inverses is stored as consensus_sum_of_percentages, and each inverse
    is then divided by that sum.

    Input:
        df: dataframe (modified in place) with the columns consensus_odds_home,
            consensus_odds_draw and consensus_odds_away
    Output:
        df: the same dataframe with consensus_percentage_home/draw/away and
            consensus_sum_of_percentages added.
    """
    outcomes = ("home", "draw", "away")
    for outcome in outcomes:
        df[f"consensus_percentage_{outcome}"] = 1 / df[f"consensus_odds_{outcome}"]

    total = (
        df["consensus_percentage_home"]
        + df["consensus_percentage_draw"]
        + df["consensus_percentage_away"]
    )
    df["consensus_sum_of_percentages"] = total

    for outcome in outcomes:
        column = f"consensus_percentage_{outcome}"
        df[column] = df[column] / df["consensus_sum_of_percentages"]
    return df

In [None]:
# turning categorical into dummy vars
def data_preparation(data, league, not_known_on_game_day, odds):
    """Prepare the data of one league for model fitting.

    The "index" column is dropped, "Date" becomes the index and only the rows
    of ``league`` are kept. Consensus odds and percentages are added, the odds
    columns are split off, and the remaining categorical columns are turned
    into dummy variables. Missing values are replaced by the sentinel -33.

    Input:
        data: dataframe with at least the columns "index", "Date", "league"
            and "kick_off_time"
        league: value of the "league" column to keep
        not_known_on_game_day: list of columns, which are not known on game day
            (all of them must exist in ``data``)
        odds: iterable of odds column names; names that are not present in the
            data are ignored
    Output:
        data_dum: dataframe with dummy variables and without odds columns
        odds: dataframe with the odds columns that were present.
    """
    data = data.drop(columns="index")
    data = data.set_index("Date")
    # Work on a copy: new columns are assigned to the league subset below.
    data = data.loc[data["league"] == league].copy()
    data = compute_consensus_odds(df=data, columns_with_odds=odds)
    data = compute_percentages_out_of_consensus_odds(df=data)
    # Not every bookmaker column has to exist; selecting a missing column
    # would raise a KeyError, so restrict the selection to the present ones.
    odds_columns = [column for column in odds if column in data.columns]
    odds = data[odds_columns]
    data = data.drop(columns=not_known_on_game_day)
    data = data.drop(columns=["league", "kick_off_time"])
    data = data.drop(columns=odds_columns)
    data_dum = pd.get_dummies(data)
    data_dum = data_dum.fillna(-33)
    # NOTE(review): missing values are already filled at this point, so the
    # all-NaN column check below cannot find anything - confirm the order.
    data_dum = data_robustness_check(data=data_dum)
    return data_dum, odds

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read "data_features_added.csv" into `data`.
# NOTE(review): absolute, user-specific path - it will not resolve on another
# machine.
data = pd.read_csv(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/data_features_added.csv",
    index_col=False,
)

In [None]:
# The original statement `from config` was incomplete (a syntax error); the
# names below are the ones the notebook uses from the project configuration.
from epp_final_project_sbp.config import BLD, ODD_FEATURES, TEST_DIR

In [None]:
# Unpickle "processed_I1_LOGIT_model.pkl" into `x`.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted
# files. The path is absolute and user-specific.
with open(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/data/processed_I1_LOGIT_model.pkl",
    "rb",
) as f:
    x = pickle.load(f)

In [None]:
# Load "project_info.yaml" with the project's read_yaml helper.
# NOTE(review): the path is absolute and user-specific.
from epp_final_project_sbp.utilities import read_yaml

project_info_store = read_yaml(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/src/epp_final_project_sbp/data/project_info.yaml",
)

In [None]:
# Unpickle "simulation_results_SP1.pkl" into `y` and display it.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted
# files. The path is absolute and user-specific.
with open(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/simulation_results_SP1.pkl",
    "rb",
) as f:
    y = pickle.load(f)
y

In [None]:
# Prepare the model data for the league with code "E0".
# NOTE(review): the second return value is bound to `odds`, so the list of
# odds column names defined earlier is replaced by the returned dataframe.
data_dum, odds = data_preparation(
    data=data,
    league="E0",
    not_known_on_game_day=not_known_on_game_day,
    odds=odds,
)

In [None]:
def best_feature_selection_RFECV_logit(scaler, clf, min_feat, data_dum, cv_split):
    """Select features with RFECV and scale the selected features.

    RFECV is fitted with ``clf`` on all columns except "full_time_result"
    (scoring "f1_macro", one feature removed per step). The selected features
    are then passed through ``scaler.fit_transform``.

    Input:
        scaler: object with a fit_transform method (e.g. MinMaxScaler)
        clf: estimator handed to RFECV (e.g. LogisticRegression)
        min_feat: minimum number of features RFECV has to keep
        data_dum: dataframe with the features and the column "full_time_result"
        cv_split: value passed to RFECV as ``cv``
    Output:
        model_rfecv: the fitted RFECV object
        X_train: the scaled, selected features
        Y_train: the column "full_time_result".
    """
    features = data_dum.drop(columns=["full_time_result"])
    target = data_dum["full_time_result"]

    selector = RFECV(
        estimator=clf,
        step=1,
        min_features_to_select=min_feat,
        cv=cv_split,
        scoring="f1_macro",
        n_jobs=-2,
    )
    selector.fit(features, target)
    selected_features = selector.transform(features)
    scaled_features = scaler.fit_transform(selected_features)
    return selector, scaled_features, target


def cv_get_rf_model(
    max_depth_of_trees,
    n_bootstrap_iterations,
    time_series_split,
    data,
):
    """Run a grid search for a random forest on the given data.

    The grid consists of
    - n_estimators: ten evenly spaced values between 50 and
      ``n_bootstrap_iterations`` (truncated to integers),
    - max_depth: five evenly spaced values between 5 and
      ``max_depth_of_trees`` (truncated to integers) plus None,
    - max_features "sqrt" and bootstrap True.

    Input:
        max_depth_of_trees: largest tree depth in the grid
        n_bootstrap_iterations: largest number of trees in the grid
        time_series_split: number of splits of the TimeSeriesSplit
        data: dataframe with the features and the column "full_time_result"
    Output:
        rf_model_grid_search: the grid search returned by random_forests_model.
    """
    n_estimators = [
        int(x) for x in np.linspace(start=50, stop=n_bootstrap_iterations, num=10)
    ]
    max_features = ["sqrt"]
    depth_of_trees = [int(x) for x in np.linspace(5, max_depth_of_trees, num=5)]
    depth_of_trees.append(None)
    tscv = TimeSeriesSplit(n_splits=time_series_split)
    grid = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "max_depth": depth_of_trees,
        "bootstrap": [True],
    }
    X_train = data.drop(columns=["full_time_result"])
    Y_train = data["full_time_result"]

    # The former trailing GridSearchCV(clf, parameters, ...) statement was
    # removed: its result was discarded and `parameters` is not defined, so it
    # raised a NameError after the grid search above had already finished.
    return random_forests_model(
        split=tscv,
        X_train=X_train,
        Y_train=Y_train,
        random_grid=grid,
    )


def random_forests_model(split, X_train, Y_train, random_grid):
    """Fit a grid search over random forest classifiers.

    Input:
        split: value passed to GridSearchCV as ``cv``
        X_train: training features
        Y_train: training target
        random_grid: parameter grid for the RandomForestClassifier
    Output:
        the fitted GridSearchCV object (scoring "f1_macro").
    """
    search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=random_grid,
        scoring="f1_macro",
        cv=split,
        n_jobs=-2,
    )
    search.fit(X=X_train, y=Y_train)
    return search

In [None]:
# general information
# Fraction of the rows that data_split puts into the training set; the
# remaining rows form the test set.
train_share = 0.8


def data_split(data, train_share):
    """Split the data into a training and a test part by row position.

    The first ``int(len(data) * train_share)`` rows are the training set and
    all later rows the test set. The data is expected to be sorted by date
    already, so the test set holds the most recent rows.

    Input:
        data: dataframe
        train_share: float
    Output:
        train_data: dataframe with the leading rows
        test_data: dataframe with the remaining rows.
    """
    cutoff = int(len(data) * train_share)
    return data.iloc[:cutoff], data.iloc[cutoff:]


# Split the prepared data into training and test rows.
train_data, test_data = data_split(data=data_dum, train_share=train_share)
# Time series splitter with 10 splits; passed as `cv_split` to the logit
# feature selection.
tscv = TimeSeriesSplit(n_splits=10, max_train_size=None, test_size=None)

# logit model
# Minimum number of features the feature selection has to keep.
min_feat = 4
clf = LogisticRegression(
    max_iter=1000,
    C=0.01,
    multi_class="multinomial",
    fit_intercept=True,
)
scaler = MinMaxScaler()


## rf model
# Upper bounds of the random forest grid and the number of time series splits.
max_depth_of_trees = 100
n_bootstrap_iterations = 100
time_series_split = 5
# NOTE(review): rebinds `data` (so far the full CSV) to the training split.
data = train_data

In [None]:
# Display the training split for inspection.
train_data

In [None]:
# Run the RFECV feature selection for the logit model on the training split;
# returns the fitted selector, the scaled selected features and the target.
logit_model, x_train_used, y_train_used = best_feature_selection_RFECV_logit(
    scaler=scaler,
    clf=clf,
    min_feat=min_feat,
    data_dum=train_data,
    cv_split=tscv,
)

In [None]:
# Run the random forest grid search on the training split.
rf_model_grid_search = cv_get_rf_model(
    max_depth_of_trees=max_depth_of_trees,
    n_bootstrap_iterations=n_bootstrap_iterations,
    time_series_split=time_series_split,
    data=train_data,
)

In [None]:
# fine tune logistic regression
# `gs` was used here without ever being defined (NameError), so the grid
# search is built in this cell.
# NOTE(review): the candidate values are a reconstruction and need to be
# confirmed; they include C=0.01, class_weight=None, fit_intercept=False and
# solver="lbfgs".
parameters = {
    "C": [0.001, 0.01, 0.1, 1, 10],
    "class_weight": [None, "balanced"],
    "fit_intercept": [True, False],
    "solver": ["lbfgs", "newton-cg"],
}
gs = GridSearchCV(
    clf,
    parameters,
    scoring="f1_macro",
    cv=tscv,
    n_jobs=-2,
    verbose=1,
)
# printing best fits and time elapsed
gs.fit(x_train_used, y_train_used)
warnings.filterwarnings("ignore")

In [None]:
x_test = test_data.drop(columns=["full_time_result"])
y_test = test_data["full_time_result"]
# Apply the same preprocessing as for the training features: first the RFECV
# feature selection, then the scaler that was fitted on the selected training
# features. Without the scaling step the estimator, which was fitted on scaled
# features, would be evaluated on unscaled test features.
x_test_logit = scaler.transform(logit_model.transform(x_test))
tpred_lr = gs.best_estimator_.predict(x_test_logit)

In [None]:
# Take the best estimator of the random forest grid search, fit it on the
# training split and predict the test split.
X_train = train_data.drop(columns=["full_time_result"])
Y_train = train_data["full_time_result"]
rf_model = rf_model_grid_search.best_estimator_
rf_model.fit(X_train, Y_train)
tpred_rf = rf_model.predict(x_test)

In [None]:
import pickle

# Pasted cell output (was a syntax error as bare text) - presumably the best
# score and parameters of the logit grid search:
# 0.5483108108108109 {'C': 0.01, 'class_weight': None, 'fit_intercept': False, 'solver': 'lbfgs'}

In [None]:
# testing models on unseen data
# Fit a k-nearest-neighbours classifier with default settings on the training
# split and predict the test split.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
tpred_knn = knn.predict(x_test)
# Share of test rows whose prediction equals the observed result. The stray
# undefined name that followed this line (NameError) was removed.
np.mean(y_test == tpred_knn)

In [None]:
# Unpickle "simulation_results_D1.pkl" into `y` and display it.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted
# files. The path is absolute and user-specific.
with open(
    "/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/simulation_results_D1.pkl",
    "rb",
) as f:
    y = pickle.load(f)
y

In [None]:
def compute_features_last_n_games(df, n):
    """This function computes the features for the last n games.

    The feature steps are applied one after another. Afterwards the column
    "full_time_result" is encoded as 2 for "H", 1 for "A" and 0 for every
    other value (including missing results).

    Input:
        df: dataframe
        n: number of games
    Output:
        df: dataframe with the features added.
    Raises:
        AssertionError: if df is not a pd.DataFrame or n is not an integer.
        Exception: if one of the feature steps fails; the original error is
            attached as the cause.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    assert isinstance(n, int), "n needs to be an integer"

    # (feature step, message raised when the step fails)
    feature_steps = [
        (
            __compute_sum_of_points_last_n_games,
            "Could not compute sum of points last n games. Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_shots_on_target,
            "Could not compute mean shots on target.  Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_shots_on_target_opponents,
            "Could not compute mean shots on target opponents.  Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_goals_shot_last_n_matches,
            "Could not compute mean goals shot last n matches.  Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_goals_against_team_last_n_matches,
            "Could not compute mean goals against team last n matches.  Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_goal_difference_last_n_matches,
            "Could not compute mean goal difference last n matches.  Please check, if the right datasource is given.",
        ),
        (
            __compute_mean_corners_got_last_n_games,
            "Could not compute mean corners got last n games.  Please check, if the right datasource is given.",
        ),
    ]
    for step, error_message in feature_steps:
        try:
            df = step(df=df, number_of_matches=n)
        # `except Exception` instead of a bare `except`, so KeyboardInterrupt
        # and SystemExit are not turned into a data error.
        except Exception as error:
            raise Exception(error_message) from error

    # The points step above needs the letter-coded result, so the numeric
    # encoding is applied only after all feature steps.
    df["full_time_result"] = np.where(
        df.full_time_result == "H",
        2,
        np.where((df.full_time_result == "A"), 1, 0),
    )

    df = df.fillna(np.nan)

    return df


def add_percentages_to_odds(df, columns):
    """Add the inverse of every given odds column as a new column.

    Input:
        df: dataframe (modified in place)
        columns: list of columns with odds
    Output:
        df: the same dataframe with one "<column>_percentage" column per
            entry of ``columns``, holding 1 / odds.
    """
    for odds_column in columns:
        target = odds_column + "_percentage"
        df[target] = df[odds_column].rdiv(1)
    return df


def __add_points_based_on_game_outcome(df):
    """Add the points each team earned in the match.

    A team gets 3 points when "full_time_result" names it as the winner
    ("H" for the home team, "A" for the away team), 1 point when the result
    is "D" and 0 points otherwise.

    Input:
        df: dataframe (modified in place) with the column "full_time_result"
    Output:
        df: the same dataframe with HomeTeam_points and AwayTeam_points added.
    """
    result = df["full_time_result"]
    for column, winning_code in (("HomeTeam_points", "H"), ("AwayTeam_points", "A")):
        points_if_not_won = np.where(result == "D", 1, 0)
        df[column] = np.where(result == winning_code, 3, points_if_not_won)
    return df


def __get_home_and_away_team(df, row_number):
    """Return the home and the away team of the row at a given position.

    Input:
        df: dataframe with the columns "HomeTeam" and "AwayTeam"
        row_number: position of the row
    Output:
        home_team: home team
        away_team: away team.
    """
    match = df.iloc[row_number]
    return match["HomeTeam"], match["AwayTeam"]


def __get_last_n_matches(df, number_of_matches, row_number):
    """Return the most recent earlier matches of both teams of a row.

    A match counts as earlier when its index label is smaller than
    ``row_number``. At most ``number_of_matches`` matches are returned per
    team, taken from the end of the earlier matches.
    NOTE(review): the teams are read by row position while "earlier" compares
    index labels - this presumably expects a default 0..n-1 index; confirm.

    Input:
        df: dataframe with the columns "HomeTeam" and "AwayTeam"
        number_of_matches: number of matches
        row_number: row number
    Output:
        home_matches: dataframe with the last n matches of the home team
        away_matches: dataframe with the last n matches of the away team.
    """
    current_match = df.iloc[row_number]
    is_earlier = df.index < row_number

    def recent_matches_of(team):
        took_part = (df["HomeTeam"] == team) | (df["AwayTeam"] == team)
        history = df[took_part & is_earlier]
        if history.shape[0] >= number_of_matches:
            history = history.tail(number_of_matches)
        return history

    return (
        recent_matches_of(current_match["HomeTeam"]),
        recent_matches_of(current_match["AwayTeam"]),
    )


def __extract_list_of_points(matches, team):
    """Collect the points a team earned in each of the given matches.

    For matches in which the team is the home team the value of
    "HomeTeam_points" is taken, otherwise the value of "AwayTeam_points".

    Input:
        matches: dataframe with the matches
        team: team name
    Output:
        matches_points: list of points, one entry per match.
    """
    return [
        match["HomeTeam_points"]
        if match["HomeTeam"] == team
        else match["AwayTeam_points"]
        for _, match in matches.iterrows()
    ]


def __compute_sum_of_points_last_n_games(df, number_of_matches):
    """Add the sum of points of the last n games, excluding the current game.

    The points per match are added first. For every row the points of the
    home and of the away team in their earlier matches are summed and written
    to "HomeTeam_sum_points_last_<n>_matches" and
    "AwayTeam_sum_points_last_<n>_matches". Rows for which a team has no
    earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of games
    Output:
        df: dataframe with the sum of points added.
    """
    df = __add_points_based_on_game_outcome(df)
    column_suffix = f"_sum_points_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(
            df=df,
            number_of_matches=number_of_matches,
            row_number=position,
        )
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            points = __extract_list_of_points(matches=history, team=team)
            df.loc[position, side + column_suffix] = sum(points)

    return df


def __extract_list_of_shots_on_target(matches, team):
    """Collect the shots on target of a team in each of the given matches.

    For matches in which the team is the home team the value of
    "hometeam_shots_on_target" is taken, otherwise the value of
    "awayteam_shots_on_target".

    Input:
        matches: dataframe with the matches
        team: team
    Output:
        list_of_shots_on_target: list with the number of shots on target.
    """
    return [
        match["hometeam_shots_on_target"]
        if match["HomeTeam"] == team
        else match["awayteam_shots_on_target"]
        for _, match in matches.iterrows()
    ]


def __compute_mean_shots_on_target(df, number_of_matches):
    """Add the mean shots on target of both teams in their last n matches.

    For every row the mean is written to
    "HomeTeam_mean_shots_on_target_last_<n>_matches" and
    "AwayTeam_mean_shots_on_target_last_<n>_matches". Rows for which a team
    has no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of matches
    Output:
        df: dataframe with the mean shots on target added.
    """
    column_suffix = f"_mean_shots_on_target_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            shots = __extract_list_of_shots_on_target(matches=history, team=team)
            df.loc[position, side + column_suffix] = np.mean(shots)

    return df


def __extract_list_of_shots_on_target_opponents(matches, team):
    """Collect the shots on target of a team's opponents in the given matches.

    For matches in which the team is the home team the value of
    "awayteam_shots_on_target" is taken, otherwise the value of
    "hometeam_shots_on_target".

    Input:
        matches: dataframe with the matches
        team: team
    Output:
        list_of_shots_on_target: list with the number of shots on target of the opponents.
    """
    return [
        match["awayteam_shots_on_target"]
        if match["HomeTeam"] == team
        else match["hometeam_shots_on_target"]
        for _, match in matches.iterrows()
    ]


def __compute_mean_shots_on_target_opponents(df, number_of_matches):
    """Add the mean shots on target of the opponents in the last n matches.

    For every row the mean is written to
    "HomeTeam_mean_shots_on_target_opponents_last_<n>_matches" and
    "AwayTeam_mean_shots_on_target_opponents_last_<n>_matches". Rows for
    which a team has no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of matches
    Output:
        df: dataframe with the mean shots on target of the opponents added.
    """
    column_suffix = (
        f"_mean_shots_on_target_opponents_last_{number_of_matches}_matches"
    )

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            opponent_shots = __extract_list_of_shots_on_target_opponents(
                matches=history,
                team=team,
            )
            df.loc[position, side + column_suffix] = np.mean(opponent_shots)

    return df


def __extract_list_of_goals(matches, team):
    """Collect the full time goals of a team in each of the given matches.

    For matches in which the team is the home team the value of
    "full_time_goals_hometeam" is taken, otherwise the value of
    "full_time_goals_awayteam".

    Input:
        matches: dataframe with the matches
        team: team
    Output:
        list_of_goals: list of goals.
    """
    return [
        match["full_time_goals_hometeam"]
        if match["HomeTeam"] == team
        else match["full_time_goals_awayteam"]
        for _, match in matches.iterrows()
    ]


def __compute_mean_goals_shot_last_n_matches(df, number_of_matches):
    """Add the mean goals scored by both teams in their last n matches.

    For every row the mean is written to
    "HomeTeam_mean_goals_shot_last_<n>_matches" and
    "AwayTeam_mean_goals_shot_last_<n>_matches". Rows for which a team has
    no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of matches
    Output:
        df: dataframe with the mean goals shot added.
    """
    column_suffix = f"_mean_goals_shot_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            goals = __extract_list_of_goals(matches=history, team=team)
            df.loc[position, side + column_suffix] = np.mean(goals)
    return df


def __extract_list_of_goals_against(matches, team):
    """Collect the full time goals conceded by a team in the given matches.

    For matches in which the team is the home team the value of
    "full_time_goals_awayteam" is taken, otherwise the value of
    "full_time_goals_hometeam".

    Input:
        matches: dataframe with the matches
        team: team
    Output:
        list_of_goals: list of goals against the team.
    """
    return [
        match["full_time_goals_awayteam"]
        if match["HomeTeam"] == team
        else match["full_time_goals_hometeam"]
        for _, match in matches.iterrows()
    ]


def __compute_mean_goals_against_team_last_n_matches(df, number_of_matches):
    """Add the mean goals conceded by both teams in their last n matches.

    For every row the mean is written to
    "HomeTeam_mean_goals_against_last_<n>_matches" and
    "AwayTeam_mean_goals_against_last_<n>_matches". Rows for which a team
    has no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of matches
    Output:
        df: dataframe with the mean goals against added.
    """
    column_suffix = f"_mean_goals_against_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            goals_against = __extract_list_of_goals_against(
                matches=history,
                team=team,
            )
            df.loc[position, side + column_suffix] = np.mean(goals_against)
    return df


def __extract_the_goal_difference_list(matches, team):
    """Collect the goal difference of a team in each of the given matches.

    The difference is the team's own full time goals minus the full time
    goals of its opponent.

    Input:
        matches: dataframe with the matches
        team: team name
    Output:
        matches_goal_difference: list of goal differences.
    """
    differences = []
    for _, match in matches.iterrows():
        home_minus_away = (
            match["full_time_goals_hometeam"] - match["full_time_goals_awayteam"]
        )
        away_minus_home = (
            match["full_time_goals_awayteam"] - match["full_time_goals_hometeam"]
        )
        is_home_team = match["HomeTeam"] == team
        differences.append(home_minus_away if is_home_team else away_minus_home)
    return differences


def __compute_mean_goal_difference_last_n_matches(df, number_of_matches):
    """Add the mean goal difference of both teams in their last n matches.

    For every row the mean is written to
    "HomeTeam_mean_goal_difference_last_<n>_matches" and
    "AwayTeam_mean_goal_difference_last_<n>_matches". Rows for which a team
    has no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of matches
    Output:
        df: dataframe with the mean goal difference added.
    """
    column_suffix = f"_mean_goal_difference_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            differences = __extract_the_goal_difference_list(
                matches=history,
                team=team,
            )
            df.loc[position, side + column_suffix] = np.mean(differences)
    return df


def __extract_the_list_of_corners(matches, team):
    """Collect the corners of a team in each of the given matches.

    For matches in which the team is the home team the value of
    "hometeam_corners" is taken, otherwise the value of "awayteam_corners".

    Input:
        matches: dataframe with the matches
        team: team name
    Output:
        matches_corners: list of corners.
    """
    return [
        match["hometeam_corners"]
        if match["HomeTeam"] == team
        else match["awayteam_corners"]
        for _, match in matches.iterrows()
    ]


def __compute_mean_corners_got_last_n_games(df, number_of_matches):
    """Add the mean corners of both teams in their last n games.

    For every row the mean is written to
    "HomeTeam_mean_corners_got_last_<n>_matches" and
    "AwayTeam_mean_corners_got_last_<n>_matches". Rows for which a team has
    no earlier match are not assigned a value.

    Input:
        df: dataframe
        number_of_matches: number of games
    Output:
        df: dataframe with the mean corners got added.
    """
    column_suffix = f"_mean_corners_got_last_{number_of_matches}_matches"

    for position in range(len(df)):
        teams = __get_home_and_away_team(df=df, row_number=position)
        histories = __get_last_n_matches(df, number_of_matches, position)
        # Home team first, then away team.
        for side, team, history in zip(("HomeTeam", "AwayTeam"), teams, histories):
            if history.shape[0] == 0:
                continue
            corners = __extract_the_list_of_corners(matches=history, team=team)
            df.loc[position, side + column_suffix] = np.mean(corners)

    return df

In [None]:
# Show the type of the first entry stored under "years".
# NOTE(review): INFORMATION_SCRAPING is not defined or imported in any visible
# cell - confirm where it comes from.
type(INFORMATION_SCRAPING["years"][0])

In [None]:
import numpy as np
import pandas as pd
from epp_final_project_sbp.config import BLD, ODD_FEATURES, TEST_DIR

In [None]:
# Read the fixture "Fixture_output_data_engineering.csv" from the test
# directory and display it.
pd.read_csv(TEST_DIR / "data_management" / "Fixture_output_data_engineering.csv")

In [None]:
# Display the ODD_FEATURES value imported from the project configuration.
ODD_FEATURES

In [None]:
# Build the input fixture: the first 20 rows of the cleaned data with the
# "<column>_percentage" columns added for every column in ODD_FEATURES.
# The file is written relative to the current working directory, and the
# dataframe index is written as well (no index=False).
data = pd.read_csv(BLD / "python" / "data" / "data_cleaned.csv")
data = data.head(20)
data = add_percentages_to_odds(df=data, columns=ODD_FEATURES)
data.to_csv("Fixture_input_data_feature_engineering.csv")

In [None]:
# Add the features based on the last 5 games to the fixture data.
data = compute_features_last_n_games(df=data, n=5)

In [None]:
# Write the output fixture relative to the current working directory
# (the dataframe index is written as well).
data.to_csv("Fixture_output_data_engineering.csv")

In [None]:
# NOTE(review): `pytask` is not imported in any visible cell.
@pytask.mark.depends_on(
    {
        "data": BLD / "python" / "data" / "data_cleaned.csv",
        "scripts": ["feature_engineering.py"],
    },
)
@pytask.mark.produces(BLD / "python" / "data" / "data_features_added.csv")
def task_feature_engineering(depends_on, produces):
    """Write the dataframe `data` as CSV (without index) to `produces`.

    NOTE(review): the body writes the module-level variable `data` and never
    reads `depends_on` - confirm that this is intended.
    """
    data.to_csv(produces, index=False)