In [None]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler

In [None]:
# all information needed to scrape data from football-data.co.uk


beginning_url = "https://www.football-data.co.uk/"
# Season identifiers, newest first (e.g. "2223" presumably denotes the
# 2022/23 season — confirm against the scraper). Every season is listed
# exactly once so that no season is handled twice.
years = [
    "2223",
    "2122",
    "2021",
    "1920",
    "1819",
    "1718",
    "1617",
    "1516",
    "1415",
    "1314",
    "1213",
    "1112",
]
# Per-league settings, keyed by the short league tag. Each entry holds the
# folder name, the tag itself, the league name code and the league page URL.
Leagues = {
    "PL": dict(
        Foldername="PL_data",
        Leaguetag="PL",
        Leaguename="E0",
        Leagueurl="https://www.football-data.co.uk/englandm.php",
    ),
    "BL": dict(
        Foldername="BL_data",
        Leaguetag="BL",
        Leaguename="D1",
        Leagueurl="https://www.football-data.co.uk/germanym.php",
    ),
    "PD": dict(
        Foldername="PD_data",
        Leaguetag="PD",
        Leaguename="SP1",
        Leagueurl="https://www.football-data.co.uk/spainm.php",
    ),
    "SA": dict(
        Foldername="SA_data",
        Leaguetag="SA",
        Leaguename="I1",
        Leagueurl="https://www.football-data.co.uk/italym.php",
    ),
}


# Columns kept from the raw data, in this order: match identification,
# full/half-time goals and results, per-team match statistics, and the
# home/draw/away odds columns of each bookmaker prefix.
considered_features = (
    ["league", "kick_off_time", "HomeTeam", "AwayTeam"]
    + [
        period + "_time_" + item
        for period in ("full", "half")
        for item in ("goals_hometeam", "goals_awayteam", "result")
    ]
    + [
        side + "_" + statistic
        for statistic in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ]
    + [
        bookmaker + outcome
        for bookmaker in (
            "B365",
            "BS",
            "BW",
            "GB",
            "IW",
            "LB",
            "PS",
            "SB",
            "SJ",
            "VC",
            "WH",
        )
        for outcome in ("H", "D", "A")
    ]
)

# all elements to make categorical

categorical_features = [
    "league",
    "HomeTeam",
    "AwayTeam",
] + [period + "_time_result" for period in ("full", "half")]

# Goal counts first, then the per-team match statistics (home before away).
integer_features = [
    period + "_time_goals_" + side
    for period in ("full", "half")
    for side in ("hometeam", "awayteam")
] + [
    side + "_" + statistic
    for statistic in (
        "shots",
        "shots_on_target",
        "corners",
        "fouls_done",
        "yellow_cards",
        "red_cards",
    )
    for side in ("hometeam", "awayteam")
]

# Home/draw/away odds columns of every bookmaker prefix, followed by the
# derived consensus columns.
odd_features = [
    bookmaker + outcome
    for bookmaker in (
        "B365",
        "BS",
        "BW",
        "GB",
        "IW",
        "LB",
        "PS",
        "SB",
        "SJ",
        "VC",
        "WH",
    )
    for outcome in ("H", "D", "A")
] + [
    "consensus_odds_home",
    "consensus_odds_draw",
    "consensus_odds_away",
    "consensus_sum_of_percentages",
]


# all columns with features that are not known on game day
not_known_on_game_day = (
    [
        period + "_time_goals_" + side
        for period in ("full", "half")
        for side in ("hometeam", "awayteam")
    ]
    + ["half_time_result"]
    + [
        side + "_" + statistic
        for statistic in (
            "shots",
            "shots_on_target",
            "corners",
            "fouls_done",
            "yellow_cards",
            "red_cards",
        )
        for side in ("hometeam", "awayteam")
    ]
    + ["HomeTeam_points", "AwayTeam_points"]
)
# Home/draw/away odds columns of every bookmaker prefix, followed by the
# derived consensus columns.
odds = [
    bookmaker + outcome
    for bookmaker in (
        "B365",
        "BS",
        "BW",
        "GB",
        "IW",
        "LB",
        "PS",
        "SB",
        "SJ",
        "VC",
        "WH",
    )
    for outcome in ("H", "D", "A")
] + [
    "consensus_odds_home",
    "consensus_odds_draw",
    "consensus_odds_away",
    "consensus_sum_of_percentages",
]

In [None]:
def compute_consensus_odds(df, columns_with_odds):
    """Add the row-wise mean of the bookmaker odds as consensus odds.

    Odds columns are grouped by the last character of their name: "H"
    (home), "D" (draw) and "A" (away). Names in ``columns_with_odds`` that
    are not columns of ``df`` are ignored.

    Input:
        df: dataframe
        columns_with_odds: list of columns with the odds
    Output:
        df: copy of the dataframe with the columns "consensus_odds_home",
            "consensus_odds_draw" and "consensus_odds_away" added. The
            dataframe passed in is left unchanged.
    """
    # Work on a copy: the caller may pass a slice of a larger frame, and
    # assigning new columns to it in place would mutate (or warn about) it.
    df = df.copy()
    available_odds = [col for col in columns_with_odds if col in df.columns]

    for suffix, consensus_column in (
        ("H", "consensus_odds_home"),
        ("D", "consensus_odds_draw"),
        ("A", "consensus_odds_away"),
    ):
        matching_columns = [col for col in available_odds if col.endswith(suffix)]
        df[consensus_column] = df[matching_columns].mean(axis=1)
    return df

In [None]:
def data_robustness_check(data):
    """Remove every column that contains nothing but NaN values.

    Input:
        data: dataframe
    Output:
        data: dataframe without the columns whose entries are all NaN.
    """
    return data.dropna(how="all", axis="columns")


def compute_percentages_out_of_consensus_odds(df):
    """This function computes the percentages out of the consensus odds

    The raw percentage of an outcome is the inverse of its consensus odd.
    The three raw percentages are summed up and each one is then divided by
    that sum, so the stored percentages add up to one.

    Input:
        df: dataframe with the columns "consensus_odds_home",
            "consensus_odds_draw" and "consensus_odds_away"
    Output:
        df: copy of the dataframe with "consensus_percentage_home",
            "consensus_percentage_draw", "consensus_percentage_away" (the
            normalised values) and "consensus_sum_of_percentages" (the sum
            of the raw, un-normalised percentages) added. The dataframe
            passed in is left unchanged.
    """
    # Work on a copy so the caller's frame is not modified in place.
    df = df.copy()
    outcomes = ("home", "draw", "away")

    for outcome in outcomes:
        df["consensus_percentage_" + outcome] = 1 / df["consensus_odds_" + outcome]

    df["consensus_sum_of_percentages"] = (
        df["consensus_percentage_home"]
        + df["consensus_percentage_draw"]
        + df["consensus_percentage_away"]
    )

    # Rescale the raw percentages by their sum.
    for outcome in outcomes:
        column = "consensus_percentage_" + outcome
        df[column] = df[column] / df["consensus_sum_of_percentages"]
    return df

In [None]:
# turning categorical into dummy vars
def data_preparation(data, league, not_known_on_game_day, odds):
    """prepares the data, to be used in the model

    Steps: drop the "index" column, index the rows by "Date", keep only the
    rows of the requested league, add the consensus odds and percentages,
    split off the odds columns, drop everything that is not known on game
    day, dummy-encode the remaining columns, drop the columns that are
    entirely NaN and replace the remaining NaN entries by -33.

    Input:
        data: dataframe; must contain the columns "index", "Date", "league"
            and "kick_off_time"
        league: value of the "league" column whose rows are kept
        not_known_on_game_day: list of columns, which are not known on game day
        odds: list of columns, which are the odds. All of them must be
            present after the consensus columns have been added.
    Output:
        data_dum: dataframe with dummy-encoded columns and without the odds
            columns
        odds: dataframe with the odds columns of the selected league.
    """
    data = data.drop(columns="index").set_index("Date")
    data = data.loc[data["league"] == league]
    data = compute_consensus_odds(df=data, columns_with_odds=odds)
    data = compute_percentages_out_of_consensus_odds(df=data)

    # Keep the odds in their own frame and refer to the columns by name, so
    # the `odds` parameter is not rebound to a different kind of object.
    odds_columns = list(odds)
    odds_data = data[odds_columns]

    data = data.drop(columns=not_known_on_game_day)
    data = data.drop(columns=["league", "kick_off_time"])
    data = data.drop(columns=odds_columns)

    data_dum = pd.get_dummies(data)
    # The all-NaN check has to run before the NaNs are replaced; afterwards
    # no column contains NaN any more and the check could never drop anything.
    data_dum = data_robustness_check(data=data_dum)
    # -33 is the placeholder used for the remaining missing values.
    data_dum = data_dum.fillna(-33)
    return data_dum, odds_data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the feature-augmented data set from the local build directory.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
data = pd.read_csv(
    filepath_or_buffer="/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/data/data_features_added.csv",
    index_col=False,
)

In [None]:
# Unpickle the object stored in processed_I1_LOGIT_model.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project's own build.
with open(
    file="/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/data/processed_I1_LOGIT_model.pkl",
    mode="rb",
) as f:
    x = pickle.load(f)

In [None]:
# Show the object unpickled from processed_I1_LOGIT_model.pkl.
x

In [None]:
# Unpickle the object stored in processed_SP1_RF_model.pkl and display it.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project's own build.
with open(
    file="/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/data/processed_SP1_RF_model.pkl",
    mode="rb",
) as f:
    y = pickle.load(f)
y

In [None]:
# Build the model input for league "E0".
# NOTE(review): this rebinds `odds` from the list of column names to the
# dataframe returned by data_preparation.
data_dum, odds = data_preparation(
    league="E0",
    data=data,
    odds=odds,
    not_known_on_game_day=not_known_on_game_day,
)

In [None]:
def best_feature_selection_RFECV_logit(scaler, clf, min_feat, data_dum, cv_split):
    """Select features by recursive feature elimination with cross-validation.

    The target is the column "full_time_result"; all other columns of
    ``data_dum`` are candidate features. The elimination removes one feature
    per step and scores candidates by macro-averaged F1. The selection is
    fitted on the unscaled features; the scaler is fitted afterwards on the
    selected features only.

    Input:
        scaler: scaler object (e.g. MinMaxScaler); it is fitted by this call
        clf: estimator used for the elimination (e.g. LogisticRegression)
        min_feat: minimum number of features to select
        data_dum: dataframe containing the features and "full_time_result"
        cv_split: cross-validation splitter or number of folds
    Output:
        model_rfecv: fitted RFECV object
        X_train: selected features after scaling
        Y_train: the "full_time_result" column.
    """
    model_rfecv = RFECV(
        estimator=clf,
        min_features_to_select=min_feat,
        step=1,
        cv=cv_split,
        n_jobs=-2,
        scoring="f1_macro",
    )
    X_train = data_dum.drop(columns=["full_time_result"])
    Y_train = data_dum["full_time_result"]
    model_rfecv.fit(X_train, Y_train)
    # Keep only the selected features, then scale them.
    X_selected = model_rfecv.transform(X_train)
    X_train = scaler.fit_transform(X_selected)
    return model_rfecv, X_train, Y_train


def cv_get_rf_model(
    max_depth_of_trees,
    n_bootstrap_iterations,
    time_series_split,
    data,
):
    """Grid-search a random forest on time-series cross-validation splits.

    The grid consists of ten evenly spaced tree counts between 50 and
    ``n_bootstrap_iterations``, five evenly spaced tree depths between 5 and
    ``max_depth_of_trees`` plus ``None``, "sqrt" as the only max_features
    setting, and bootstrapping switched on.

    Input:
        max_depth_of_trees: upper end of the tree depth grid
        n_bootstrap_iterations: upper end of the number-of-trees grid
        time_series_split: number of splits of the TimeSeriesSplit
        data: dataframe containing the features and "full_time_result"
    Output:
        rf_model_grid_search: the search object returned by
            random_forests_model.
    """
    n_estimators = [
        int(x) for x in np.linspace(start=50, stop=n_bootstrap_iterations, num=10)
    ]
    max_features = ["sqrt"]
    depth_of_trees = [int(x) for x in np.linspace(5, max_depth_of_trees, num=5)]
    depth_of_trees.append(None)
    tscv = TimeSeriesSplit(n_splits=time_series_split)
    grid = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "max_depth": depth_of_trees,
        "bootstrap": [True],
    }
    X_train = data.drop(columns=["full_time_result"])
    Y_train = data["full_time_result"]

    # Only the random forest search belongs here. A second, discarded
    # GridSearchCV construction referenced the undefined name `parameters`
    # and made every call fail with a NameError.
    rf_model_grid_search = random_forests_model(
        split=tscv,
        X_train=X_train,
        Y_train=Y_train,
        random_grid=grid,
    )
    return rf_model_grid_search


def random_forests_model(split, X_train, Y_train, random_grid):
    """Fit a grid search over random forest classifiers.

    Input:
        split: cross-validation splitter
        X_train: training features
        Y_train: training target
        random_grid: parameter grid to search over
    Output:
        the fitted GridSearchCV object (scored by macro-averaged F1).
    """
    search = GridSearchCV(
        RandomForestClassifier(),
        random_grid,
        scoring="f1_macro",
        n_jobs=-2,
        cv=split,
    )
    # fit returns the search object itself.
    return search.fit(X_train, Y_train)

In [None]:
# general information
train_share = 0.8


def data_split(data, train_share):
    """Split the data into a leading training part and a trailing test part.

    The rows are not shuffled: the first ``train_share`` fraction of the rows
    (rounded down) becomes the training set and the remaining rows the test
    set, so the data is expected to be sorted by date already.

    Input:
        data: dataframe
        train_share: float, fraction of rows used for training
    Output:
        train_data: dataframe with the first rows
        test_data: dataframe with the remaining rows.
    """
    cutoff = int(train_share * len(data))
    return data.iloc[:cutoff], data.iloc[cutoff:]


# Chronological train/test split and the splitter used for the logit
# feature selection.
train_data, test_data = data_split(data=data_dum, train_share=train_share)
tscv = TimeSeriesSplit(n_splits=10, max_train_size=None, test_size=None)

# logit model
min_feat = 4
clf = LogisticRegression(
    max_iter=1000,
    C=0.01,
    multi_class="multinomial",
    fit_intercept=True,
)
scaler = MinMaxScaler()


## rf model
max_depth_of_trees = 100
n_bootstrap_iterations = 100
time_series_split = 5
# `data` is deliberately not rebound to `train_data` here: it still holds the
# raw frame, so the data preparation cell can be re-run. The model cells
# receive `train_data` explicitly.

In [None]:
# Show the training part returned by data_split.
train_data

In [None]:
# Feature selection for the logit model on the training data; returns the
# fitted selector, the scaled selected features and the target.
logit_model, x_train_used, y_train_used = best_feature_selection_RFECV_logit(
    data_dum=train_data,
    clf=clf,
    scaler=scaler,
    min_feat=min_feat,
    cv_split=tscv,
)

In [None]:
# Random forest grid search on the training data.
rf_model_grid_search = cv_get_rf_model(
    data=train_data,
    time_series_split=time_series_split,
    n_bootstrap_iterations=n_bootstrap_iterations,
    max_depth_of_trees=max_depth_of_trees,
)

In [None]:
# printing best fits and time elapsed
# NOTE(review): `gs` was not defined anywhere in this notebook, so this cell
# failed with a NameError on a fresh kernel. The search object is built here
# with the same scoring, n_jobs and splitter as the logit feature selection.
# The original parameter grid is not part of the notebook; the single
# candidate below is the parameter set shown in the notebook's saved output.
# TODO: restore the full grid of candidate values.
parameters = {
    "C": [0.01],
    "class_weight": [None],
    "fit_intercept": [False],
    "solver": ["lbfgs"],
}
gs = GridSearchCV(
    clf,
    parameters,
    scoring="f1_macro",
    cv=tscv,
    n_jobs=-2,
    verbose=1,
)
gs.fit(x_train_used, y_train_used)
warnings.filterwarnings("ignore")
print(gs.best_score_, gs.best_params_)

In [None]:
# Logit predictions on the held-out test data.
x_test = test_data.drop(columns=["full_time_result"])
y_test = test_data["full_time_result"]
# Apply the same preprocessing as in training: first the RFECV feature
# selection, then the scaler that was fitted on the selected training
# features. Without the scaling step the model would be evaluated on
# features on a different scale than the one it was trained on.
x_test_logit = scaler.transform(logit_model.transform(x_test))
tpred_lr = gs.best_estimator_.predict(x_test_logit)

In [None]:
# Refit the best random forest of the grid search on the training data and
# predict the test data.
Y_train = train_data["full_time_result"]
X_train = train_data.drop(columns=["full_time_result"])
# fit returns the estimator itself.
rf_model = rf_model_grid_search.best_estimator_.fit(X_train, Y_train)
tpred_rf = rf_model.predict(x_test)

In [None]:
import pickle

from epp_final_project_sbp.config import INFORMATION_SCRAPING

0.5483108108108109 {'C': 0.01, 'class_weight': None, 'fit_intercept': False, 'solver': 'lbfgs'}

In [None]:
# testing models on unseen data
# k-nearest-neighbours classifier with default settings; the last line is
# the share of test rows that are predicted correctly.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=Y_train)
tpred_knn = knn.predict(X=x_test)
np.mean(tpred_knn == y_test)

In [None]:
# Unpickle the object stored in final_model_SP1.pkl and display it.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project's own build.
with open(
    file="/Users/luisenriquekaiser/Documents/Final/epp_final_project_sbp/bld/python/models/final_model_SP1.pkl",
    mode="rb",
) as f:
    y = pickle.load(f)
y

In [None]:
# Show the INFORMATION_SCRAPING object imported from epp_final_project_sbp.config.
INFORMATION_SCRAPING