### Load dependencies

In [13]:
import sys
print(f"Python version: {sys.version}\n")

print("Importiere Dependencies...")

import pandas as pd
print(f"... Pandas version: {pd.__version__}")
      
from datetime import datetime
print("... datetime: pre-installed")

import numpy as np
print("... Numpy: version {}".format(np.__version__))

import IPython
print("... IPython: version {}".format(IPython.__version__))

import sklearn
print("... scikit-learn: version {}".format(sklearn.__version__))

import math
print("... math: pre-installed")

import matplotlib.pyplot as plt
%matplotlib inline

import itertools
print("... itertools: pre-installed")

Python version: 3.8.10 (default, Jun  2 2021, 10:49:15) 
[GCC 9.4.0]

Importiere Dependencies...
... Pandas version: 1.2.2
... datetime: pre-installed
... Numpy: version 1.19.4
... IPython: version 7.20.0
... scikit-learn: version 1.0.1
... math: pre-installed
... itertools: pre-installed


In [14]:
# Window size (in games) used in the names of the coach-substitution columns:
# it is interpolated into
# "<Home|Away> Coach Substituted Within Last {OFFSET_COACH} Games" in cat_features.
OFFSET_COACH = 3
# NOTE(review): not referenced in the visible cells - presumably the look-back
# window of the game-based features; confirm against the feature-engineering step.
OFFSET_GAMES = 3

In [15]:
# Load the pre-computed feature table. The first CSV column is read as the index
# and then discarded in favour of a fresh 0..n-1 RangeIndex. `drop=True` does this
# in one step and, unlike reset_index() followed by drop(columns=['index']), does
# not raise KeyError when the stored index column carries a name.
df_test = pd.read_csv('./src/tmp/df_test_ext.csv', index_col=0).reset_index(drop=True)

# Work on an independent deep copy so that changes to `df` never leak into `df_test`.
df = df_test.copy(deep=True)

In [16]:
# df = df.loc[df['Season'].isin(['15/16', '16/17', '17/18', '18/19', '20/21'])]

In [17]:
#overview of all features
def info_me(df):
    """Print an overview of all features (columns) of ``df`` by calling ``df.info()``."""
    df.info()

#describes numerical features
def describe_me(df):
    """Plot the summary statistics of every column reported by ``df.describe()``.

    One panel is drawn per column, three panels per figure row. Each panel plots
    the column's mean, std, min, 25%, 50%, 75% and max (the ``count`` row is left
    out) against the name of the statistic.

    Args:
        df: DataFrame whose ``describe()`` output contains the rows
            'mean', 'std', 'min', '25%', '50%', '75%' and 'max'.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    # Compute the summary table once instead of once per plotted value.
    stats = df.describe()
    cols = stats.columns
    rows = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

    n_rows = math.ceil(len(cols) / 3)
    # squeeze=False keeps `ax` two-dimensional even when a single row of panels is
    # enough; with the default, ax[row, col] raises IndexError for <= 3 columns.
    fig, ax = plt.subplots(nrows=n_rows,
                           ncols=3,
                           figsize=(15, n_rows * 5),
                           squeeze=False)

    for position, col in enumerate(cols):
        # Fill the grid row by row, three panels per row.
        panel = ax[position // 3, position % 3]
        Y = [stats[col][row] for row in rows]

        panel.set_ylabel(col, fontsize=14)
        panel.plot(rows, Y, color='tab:blue')
        panel.tick_params(axis='y', labelsize=14)
        panel.tick_params(axis='x', rotation=60, labelsize=14)

    fig.tight_layout()

#correlation matrix of numerical features
def corr_me(df):
    """Draw the correlation matrix of ``df`` as an annotated heat map.

    The matrix comes from ``df.corr()``; every cell is labelled with its
    coefficient rounded to two decimals. The figure is square and its side
    length grows with the number of correlated columns (0.7 per column).

    Args:
        df: DataFrame to correlate.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    # Compute the correlation matrix once and derive everything else from it.
    corr = df.corr()
    labels = corr.columns
    matrix = np.around(corr.values, decimals=2)
    size = len(labels) * 0.7

    fig, ax = plt.subplots(figsize=(size, size))
    ax.imshow(matrix)

    ticks = np.arange(len(labels))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.set_xticklabels(labels, fontsize=14)
    ax.set_yticklabels(labels, fontsize=14)

    # Rotate the x labels around their right end so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=60, ha="right",
             rotation_mode="anchor")

    # Write each coefficient into its cell (x = column index, y = row index).
    for i in range(len(labels)):
        for j in range(len(labels)):
            ax.text(j, i, matrix[i, j],
                    ha="center",
                    va="center",
                    color="w",
                    fontsize=14)

    fig.tight_layout()

In [18]:
# describe_me(df)

In [19]:
# Columns handed to the 'num' branch of the ColumnTransformer (standard scaling
# followed by binning). The list order is the order of the printed bin edges.
# NOTE(review): 'Aggresive' is presumably spelled this way in the CSV header;
# do not correct it here without renaming the data columns as well.
num_features = ['Home Offensive Shape',
                'Away Offensive Shape',
                'Home Aggresive Shape',
                'Away Aggresive Shape',
                'Home Offensive Shape Against',
                'Away Offensive Shape Against',
                'Home Aggresive Shape Against',
                'Away Aggresive Shape Against',
                'Home Rate',
                'Deuce Rate',
                'Away Rate',
                'Home Current Position On Table', # can also be treated as categorical feature
                'Away Current Position On Table', # can also be treated as categorical feature
                'Weighted Season'
               ]

# Columns handed to the 'cat' branch of the ColumnTransformer (one-hot encoding).
# The two coach-substitution column names embed OFFSET_COACH, so that constant
# has to match the value used when the columns were created.
cat_features = ['Home Team',
                'Away Team',
                'Matchday',
                'Weekday',
                'Season',
                'Home Coach',
                'Away Coach',
                f'Home Coach Substituted Within Last {OFFSET_COACH} Games',
                f'Away Coach Substituted Within Last {OFFSET_COACH} Games',
                'Home Promoted Last Year',
                'Away Promoted Last Year',
                'Kick Off Before 17:00',
               ]

In [20]:
from sklearn.model_selection import train_test_split

# Separate the prediction target from the predictor columns.
target_column = 'Full Time Result'
label = df[target_column]
features = df.drop(columns=[target_column])

# Size of the hold-out set: one season (306 games) minus 3 matchdays of 9 games.
test_size = 306 - (3 * 9)

# shuffle=False keeps the row order, so the last `test_size` rows become the test set.
split = train_test_split(
    features,
    label,
    test_size=test_size,
    random_state=0,
    shuffle=False,
)
X_train, X_test, y_train, y_test = split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    KBinsDiscretizer,
    MinMaxScaler,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)

# Numeric branch: standardise, then discretise into bins. The step names are
# referenced by the grid-search parameter keys, so they must not change.
numeric_steps = [
    ('scaler', StandardScaler()),
    ('binning', KBinsDiscretizer(encode='onehot-dense')),
    # ('poly', PolynomialFeatures()),  # disabled experiment
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: one-hot encode; categories unseen during fit are ignored.
categorical_steps = [
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.neighbors import KNeighborsClassifier

# Final estimator. Despite the variable name `rfc`, this is a k-nearest-neighbours classifier.
rfc = KNeighborsClassifier()

# Feature selectors backed by random forests. Only `sfm` is wired into the
# pipeline below; `rfecv` is constructed but not used in this cell.
sfm = SelectFromModel(estimator=RandomForestClassifier(random_state=42))
rfecv = RFECV(
    estimator=RandomForestClassifier(n_estimators=10, random_state=42),
    step=10,
)

# The full model: preprocessing -> feature selection -> classifier. The step
# names are part of the grid-search parameter keys ('regressor__n_neighbors', ...).
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('selector', sfm),
    ('regressor', rfc),
]
pipe = Pipeline(pipeline_steps)

from sklearn.model_selection import GridSearchCV

# Settings tried for the binning step of the numeric branch.
binning_grid = {
    # 'preprocessor__num__poly__degree': [1, 2],
    'preprocessor__num__binning__strategy': ['uniform', 'quantile'],
    'preprocessor__num__binning__n_bins': [2, 5],
    'preprocessor__num__binning__encode': ['onehot', 'ordinal'],
}

# Settings tried for the final classifier.
classifier_grid = {
    # 'selector__threshold': ['0.5*median', '0.75*median', '1*median'],
    'regressor__n_neighbors': [2, 5, 10],
}

param_grid = {**binning_grid, **classifier_grid}

# Exhaustive search with 3-fold cross-validation on all cores; refit=True retrains
# the best candidate on the whole training set so `search` can predict afterwards.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    refit=True,
    verbose=3,
)

search.fit(X_train, y_train)

from sklearn.metrics import classification_report

# --- Cross-validation result of the grid search ---
print()
print('GridSearchCV:')
print(f"Best score : {search.best_score_}")
print(f"Best params: {search.best_params_}")

# --- Score of the refitted best model on the train and hold-out sets ---
print()
print("train_test_split() data:")
print(f"Train: {search.score(X_train, y_train)}")
print(f"Test : {search.score(X_test, y_test)}")

# --- Bin edges learned by the numeric branch (one array per entry of num_features) ---
print()
print('Binning:')
print(search.best_estimator_.named_steps['preprocessor'].transformers_[0][1].named_steps['binning'].bin_edges_)

# --- Distribution of the predicted classes on the hold-out set ---
print()
prediction = search.predict(X_test)
(unique, counts) = np.unique(prediction, return_counts=True)
for u, c in zip(unique, counts):
    print(f"value: {u}, count: {c}")

# Precision: Of all predicted positives, how many were actually positive?
# Recall: Of all actual positives, how many did the model predict as positive?
# F1-score: The harmonic mean of precision and recall.
# classification_report expects (y_true, y_pred). Passing the predictions first
# would swap precision and recall and report the support of the predictions.
print()
print(classification_report(y_test, prediction))

Fitting 3 folds for each of 24 candidates, totalling 72 fits





GridSearchCV:
Best score : 0.48234508650917435
Best params: {'preprocessor__num__binning__encode': 'ordinal', 'preprocessor__num__binning__n_bins': 2, 'preprocessor__num__binning__strategy': 'quantile', 'regressor__n_neighbors': 10}

train_test_split() data:
Train: 0.5701125895598772
Test : 0.5017921146953405

Binning:
[array([-2.55732928, -0.10270387,  4.04705429])
 array([-2.78056771, -0.0898195 ,  3.98970198])
 array([-3.12542458e+00, -5.11890256e-04,  3.44800578e+00])
 array([-3.2698666 ,  0.00660538,  3.69429449])
 array([-2.85484302, -0.01601399,  4.1746384 ])
 array([-2.76610877, -0.02840234,  3.62416579])
 array([-3.29333932, -0.02992606,  3.88271649])
 array([-3.20068758, -0.03538998,  3.87764211])
 array([-0.87763959, -0.25957726,  9.03894985])
 array([-0.80512866, -0.38133792, 14.4513378 ])
 array([-0.92156931, -0.3073998 , 13.25551028])
 array([-1.66245626,  0.0740581 ,  1.61762643])
 array([-1.61591062, -0.07421482,  1.66019296])
 array([-0.35910682,  3.41344546])]

value

In [21]:
def get_selected_features():
    """Placeholder for reporting the selected features.

    Not implemented yet: a fixed TODO marker string is returned.
    """
    placeholder = 'TODOOOOO...'
    return placeholder


# Blank line, heading, then whatever the placeholder currently returns.
print("\nSelected features:")
print(get_selected_features())


Selected features:
TODOOOOO...


In [251]:
# Import the project's simulation module and reload it right away, so that edits
# made to the module on disk take effect without restarting the kernel.
from importlib import reload
import modules.make_simulations as msim
reload(msim)

# Run one betting simulation on the hold-out games with the model's predictions.
# The commented-out calls are alternative simulations from the same module.
# NOTE(review): relies on X_test, y_test and prediction from the training cell;
# the stake and odds handling live inside msim and are not visible here.
# msim.simulate_betting_each_matchday(X_test, y_test, prediction)
msim.simulate_betting_each_game(X_test, y_test, prediction)
# msim.simulate_betting_each_game_on_favorite(X_test, y_test)

Bet per game: 10 Euro

Augsburg - RB Leipzig (prediction=A result=A)
Win     --> 5.3 Euro
Balance --> 5.3 Euro

Freiburg - Werder Bremen (prediction=H result=D)
Win     --> -10 Euro
Balance --> -4.7 Euro

Hertha - Stuttgart (prediction=H result=A)
Win     --> -10 Euro
Balance --> -14.7 Euro

Hoffenheim - Dortmund (prediction=A result=A)
Win     --> 7.5 Euro
Balance --> -7.2 Euro

Mainz - Leverkusen (prediction=A result=A)
Win     --> 5.7 Euro
Balance --> -1.5 Euro

Bielefeld - Bayern Munich (prediction=A result=A)
Win     --> 1.6 Euro
Balance --> 0.1 Euro

M'gladbach - Wolfsburg (prediction=H result=D)
Win     --> -10 Euro
Balance --> -9.9 Euro

FC Koln - Ein Frankfurt (prediction=H result=D)
Win     --> -10 Euro
Balance --> -19.9 Euro

Schalke 04 - Union Berlin (prediction=H result=D)
Win     --> -10 Euro
Balance --> -29.9 Euro

Stuttgart - FC Koln (prediction=H result=D)
Win     --> -10 Euro
Balance --> -39.9 Euro

Bayern Munich - Ein Frankfurt (prediction=H result=H)
Win     --> 1.6

Balance --> -26.0 Euro

RB Leipzig - Bayern Munich (prediction=A result=A)
Win     --> 13.7 Euro
Balance --> -12.3 Euro

M'gladbach - Freiburg (prediction=H result=H)
Win     --> 7.5 Euro
Balance --> -4.8 Euro

Stuttgart - Werder Bremen (prediction=H result=H)
Win     --> 9.0 Euro
Balance --> 4.2 Euro

Union Berlin - Hertha (prediction=A result=D)
Win     --> -10 Euro
Balance --> -5.8 Euro

Bielefeld - Freiburg (prediction=H result=H)
Win     --> 18.7 Euro
Balance --> 12.9 Euro

Bayern Munich - Union Berlin (prediction=H result=D)
Win     --> -10 Euro
Balance --> 2.9 Euro

Ein Frankfurt - Wolfsburg (prediction=A result=H)
Win     --> -10 Euro
Balance --> -7.1 Euro

Hertha - M'gladbach (prediction=A result=D)
Win     --> -10 Euro
Balance --> -17.1 Euro

Werder Bremen - RB Leipzig (prediction=A result=A)
Win     --> 4.0 Euro
Balance --> -13.1 Euro

Stuttgart - Dortmund (prediction=A result=A)
Win     --> 8.3 Euro
Balance --> -4.8 Euro

Schalke 04 - Augsburg (prediction=D result=H)
Win   