# Model Building

This file is part of my work on Udacity's Nano Degree Programme.

As a capstone project, I compare the performance of a machine learning model in predicting matches of the 2020 UEFA European Football Championship with my personal bets in a football guessing game played on the platform www.kicktipp.de.

In this notebook, a simple model is trained that predicts the number of goals a team scores in a match. The features are the rolling averages produced by the previous feature engineering step.

The model is a simple Random Forest classifier tuned with GridSearch. The GridSearch parameters are the maximum depth of the trees and the type of rolling average from the feature engineering step.

In [22]:
import numpy as np
import pandas as pd

In [23]:
import os

In [24]:
import pickle

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

In [26]:
from sklearn.pipeline import Pipeline

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [28]:
from sklearn.ensemble import RandomForestClassifier

## Input parameter

In [29]:
# Directories (relative to the location of this notebook)
path_input = '../data/'
path_models = '../models/'

# File names: engineered training features (input) and pickled model (output)
file_matches_training = '20210825_features_matches_training.xlsx'
name_output_model = 'model_new.pkl'

## Model building

In [30]:
class FeatureSelectionTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the columns of one rolling-average type.

    A column is kept when its name contains ``type_feat`` as a substring
    (e.g. ``'weighted_mean_10'``). The step is stateless: nothing is learned
    during ``fit``.

    Parameters
    ----------
    type_feat : str, default 'weighted_mean_10'
        Substring identifying the rolling-average type to select.
    """

    def __init__(self, type_feat = 'weighted_mean_10'):
        # Stored under the same name as the argument so that the parameter
        # can be set from outside (e.g. through a parameter grid).
        self.type_feat = type_feat

    def fit(self, X, y = None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose name contains ``type_feat``, sorted by name."""
        selected = sorted(name for name in X.columns if self.type_feat in name)
        return X[selected]

### Preparation of the Train/Test Split

In [31]:
# Reading of the training data.
# No `encoding` argument is passed: pd.read_excel does not have such a parameter
# (current pandas versions raise a TypeError for it), and it is not needed for
# .xlsx workbooks anyway.
df_train = pd.read_excel(os.path.join(path_input, file_matches_training))

In [32]:
# Quick visual check of the loaded columns (head() shows the first 5 rows by default)
df_train.head()

Unnamed: 0,date,team_A,team_B,goals_A,goals_weighted_mean_5_A,goals_weighted_mean_10_A,goals_normal_mean_5_A,goals_normal_mean_10_A,attempts_total_weighted_mean_5_A,attempts_total_weighted_mean_10_A,...,blocks_normal_mean_5_B,blocks_normal_mean_10_B,clearances_weighted_mean_5_B,clearances_weighted_mean_10_B,clearances_normal_mean_5_B,clearances_normal_mean_10_B,passes_accuracy_weighted_mean_5_B,passes_accuracy_weighted_mean_10_B,passes_accuracy_normal_mean_5_B,passes_accuracy_normal_mean_10_B
0,2020-09-04 18:45:00,Italy,Bosnia and Herzegovina,1,4.244941,4.116726,4.2,3.7,18.396572,19.617894,...,3.4,2.7,0.0,0.0,0.0,0.0,78.923203,84.556986,87.6,86.6
1,2020-09-04 18:45:00,Netherlands,Poland,1,2.50994,2.637424,2.8,2.7,17.41733,17.55574,...,3.4,2.8,0.0,0.0,0.0,0.0,75.829301,81.102871,84.8,83.2
2,2020-09-07 18:45:00,Bosnia and Herzegovina,Poland,1,1.505589,1.764878,1.8,1.9,14.557373,16.215489,...,3.0,2.9,0.0,0.0,0.0,0.0,74.230419,80.144467,83.8,83.5
3,2020-09-07 18:45:00,Netherlands,Italy,0,1.942797,2.272537,2.2,2.4,16.164877,17.235452,...,2.8,2.2,0.0,0.0,0.0,0.0,80.241055,86.41834,89.8,89.4
4,2020-10-11 16:00:00,Bosnia and Herzegovina,Netherlands,0,1.066147,1.536431,1.2,1.9,14.56612,15.793928,...,2.0,2.3,0.0,0.938661,0.0,2.2,77.367331,83.784879,87.0,87.2


In [33]:
# Columns that must not be passed to the model as features:
# match metadata and the goal counts themselves
cols_blacklist = [
    'date',
    'team_A', 'team_B',
    'goals_A', 'goals_B',
]

# Every remaining column, in its original order
cols_to_keep = [name for name in df_train.columns if name not in cols_blacklist]

In [34]:
# Target: goals of team A, cast to int so that each goal count acts as a class label
y = df_train['goals_A'].astype(int)

# Feature matrix: all columns that are not blacklisted
X = df_train[cols_to_keep]

In [35]:
# Hold out 25 % of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state = 42,
)

### Pipeline and GridSearch

In [36]:
# We train a very simple model.
# It just consists of a feature selection (which type of rolling average we choose) and a RandomForest.
# random_state is fixed so that repeated runs build the same forest; without it the
# grid-search scores and the exported model differ from run to run.
pipe = Pipeline([('Transformer_FeatSelection', FeatureSelectionTransformer()),
                 ('clf', RandomForestClassifier(n_estimators = 200, n_jobs = 4, random_state = 42))
                ])

In [37]:
# Search grid: the type of rolling average used as features and the maximum depth of the trees.
# The keys follow the '<pipeline step>__<parameter>' naming of the pipeline steps.
parameters = dict(
    Transformer_FeatSelection__type_feat = [
        'weighted_mean_5',
        'weighted_mean_10',
        'normal_mean_5',
        'normal_mean_10',
    ],
    clf__max_depth = [3, 5, 10, 20],
)

In [38]:
# Exhaustive search over the parameter grid; no explicit `cv` argument is given,
# so the default cross-validation scheme is used. verbose = 3 logs every single fit.
cv = GridSearchCV(
    estimator = pipe,
    param_grid = parameters,
    verbose = 3,
)

In [39]:
# Run the grid search on the training split only; X_test / y_test stay untouched here
cv.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.356, total=   4.5s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.4s remaining:    0.0s


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.356, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.1s remaining:    0.0s


[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.381, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.371, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=3, score=0.371, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.338, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.369, total=   1.0s
[CV] Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=weighted_mean_5, clf__max_depth=5, score=0.362,

[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.381, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.340, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=5, score=0.384, total=   0.6s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.350, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.381, total=   0.7s
[CV] Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10 
[CV]  Transformer_FeatSelection__type_feat=normal_mean_5, clf__max_depth=10, score=0.356, total=   0.7s
[

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   59.0s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('Transformer_FeatSelection',
                                        FeatureSelectionTransformer(type_feat='weighted_mean_10')),
                                       ('clf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                    

## Export of Model

In [41]:
# Persist the fitted grid search. The context manager guarantees that the file
# handle is flushed and closed even if pickling fails.
# NOTE: pickle stores classes by reference, so FeatureSelectionTransformer must
# also be defined in whatever session loads this file.
with open(os.path.join(path_models, name_output_model), 'wb') as file_model:
    pickle.dump(cv, file_model)