# Setup Notebook

* **Colab Users:** Setup data location in your drive accordingly.
* **Local Setup**: Put the notebook into a folder with csv files and bypass the first cell.

In [1]:
## NOTE: 'Colab Users' only

# import drive 
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

# move files to current folder
!mv 'drive/My Drive/IE582Fall2019_data_files/' .

# unzip all of the files
!unzip 'IE582Fall2019_data_files/bets.zip'
!unzip 'IE582Fall2019_data_files/booking.zip'
!unzip 'IE582Fall2019_data_files/goals.zip'
!unzip 'IE582Fall2019_data_files/matches.zip'
!unzip 'IE582Fall2019_data_files/stats.zip'

Mounted at /content/drive/
mv: inter-device move failed: 'drive/My Drive/IE582Fall2019_data_files/' to './IE582Fall2019_data_files'; unable to remove target: Directory not empty


KeyboardInterrupt: ignored

In [82]:
!pip install scikit-learn==0.22

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [0]:
LEAGUE_ID = 148 # PREMIER LEAGUE ID

In [94]:
# read match csv data into pandas frame
matches = pd.read_csv('matches.csv')

# filter unfinished matches
matches = matches[matches['match_status'] == 'Finished']

# filter desired league matches
matches = matches[matches['league_id'] == LEAGUE_ID]

# visualize data for sanity check
matches.head()

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id
1,2614,2619,13331,1505561400,Finished,0,Crystal Palace,Southampton,0.0,1.0,0.0,1.0,0.0,1.0,,,148
5,2626,2623,13329,1505570400,Finished,0,Watford,Manchester City,0.0,6.0,0.0,3.0,0.0,3.0,,,148
6,2629,2621,13327,1505570400,Finished,0,Liverpool,Burnley,1.0,1.0,1.0,1.0,1.0,1.0,,,148
7,2641,2654,13456,1505570400,Finished,0,Sheffield Utd,Norwich,0.0,1.0,0.0,1.0,0.0,1.0,,,148
27,2617,2616,13324,1505651400,Finished,0,Chelsea,Arsenal,0.0,0.0,0.0,0.0,0.0,0.0,,,148


# Feature Extraction

In [0]:
features = ['last_n_home_goals', 'last_n_away_goals', 'last_n_home_points', 'last_n_away_points', 'last_n_goals', 'last_n_points']

In [96]:
# GOALS IN LAST X HOME MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_home_goals'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_hometeam_id == team_id]
    cumulative_goals = team_matches.match_hometeam_score.cumsum()
    matches.loc[matches.match_hometeam_id == team_id, 'last_n_home_goals'] = (cumulative_goals.shift(1) -
                                                                              cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals
1,2614,2619,13331,1505561400,Finished,0,Crystal Palace,Southampton,0.0,1.0,0.0,1.0,0.0,1.0,,,148,
5,2626,2623,13329,1505570400,Finished,0,Watford,Manchester City,0.0,6.0,0.0,3.0,0.0,3.0,,,148,
6,2629,2621,13327,1505570400,Finished,0,Liverpool,Burnley,1.0,1.0,1.0,1.0,1.0,1.0,,,148,
7,2641,2654,13456,1505570400,Finished,0,Sheffield Utd,Norwich,0.0,1.0,0.0,1.0,0.0,1.0,,,148,
27,2617,2616,13324,1505651400,Finished,0,Chelsea,Arsenal,0.0,0.0,0.0,0.0,0.0,0.0,,,148,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5801,2615,2616,273263,1576332000,Finished,0,Chelsea,Bournemouth,0.0,1.0,0.0,0.0,0.0,0.0,,,148,1.333333
5803,2630,2629,273262,1576332000,Finished,0,Burnley,Newcastle,1.0,0.0,0.0,0.0,0.0,0.0,,,148,1.333333
5804,2632,2654,273268,1576332000,Finished,0,Sheffield Utd,Aston Villa,2.0,0.0,0.0,0.0,0.0,0.0,,,148,2.000000
5808,2641,2611,273265,1576332000,Finished,0,Leicester,Norwich,1.0,1.0,1.0,1.0,1.0,1.0,,,148,2.000000


In [97]:
# GOALS IN LAST X AWAY MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_away_goals'] = float('NaN')

team_ids = matches.match_awayteam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_awayteam_id == team_id]
    cumulative_goals = team_matches.match_awayteam_score.cumsum()
    matches.loc[matches.match_awayteam_id == team_id, 'last_n_away_goals'] = (cumulative_goals.shift(1) -
                                                                              cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals
1,2614,2619,13331,1505561400,Finished,0,Crystal Palace,Southampton,0.0,1.0,0.0,1.0,0.0,1.0,,,148,,
5,2626,2623,13329,1505570400,Finished,0,Watford,Manchester City,0.0,6.0,0.0,3.0,0.0,3.0,,,148,,
6,2629,2621,13327,1505570400,Finished,0,Liverpool,Burnley,1.0,1.0,1.0,1.0,1.0,1.0,,,148,,
7,2641,2654,13456,1505570400,Finished,0,Sheffield Utd,Norwich,0.0,1.0,0.0,1.0,0.0,1.0,,,148,,
27,2617,2616,13324,1505651400,Finished,0,Chelsea,Arsenal,0.0,0.0,0.0,0.0,0.0,0.0,,,148,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5801,2615,2616,273263,1576332000,Finished,0,Chelsea,Bournemouth,0.0,1.0,0.0,0.0,0.0,0.0,,,148,1.333333,1.000000
5803,2630,2629,273262,1576332000,Finished,0,Burnley,Newcastle,1.0,0.0,0.0,0.0,0.0,0.0,,,148,1.333333,1.666667
5804,2632,2654,273268,1576332000,Finished,0,Sheffield Utd,Aston Villa,2.0,0.0,0.0,0.0,0.0,0.0,,,148,2.000000,1.333333
5808,2641,2611,273265,1576332000,Finished,0,Leicester,Norwich,1.0,1.0,1.0,1.0,1.0,1.0,,,148,2.000000,1.000000


In [102]:
# POINTS IN LAST X HOME MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['match_hometeam_points'] = (matches['match_hometeam_score'] > matches['match_awayteam_score']) * 2 + (matches['match_hometeam_score'] >= matches['match_awayteam_score']) * 1
matches['last_n_home_points'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_hometeam_id == team_id]
    cumulative_points = team_matches['match_hometeam_points'].cumsum()
    matches.loc[matches.match_hometeam_id == team_id, 'last_n_home_points'] = (cumulative_points.shift(1) -
                                                                              cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,last_n_goals
1,2614,2619,13331,1505561400,Finished,0,Crystal Palace,Southampton,0.0,1.0,0.0,1.0,0.0,1.0,,,148,,,0,,
5,2626,2623,13329,1505570400,Finished,0,Watford,Manchester City,0.0,6.0,0.0,3.0,0.0,3.0,,,148,,,0,,
6,2629,2621,13327,1505570400,Finished,0,Liverpool,Burnley,1.0,1.0,1.0,1.0,1.0,1.0,,,148,,,1,,
7,2641,2654,13456,1505570400,Finished,0,Sheffield Utd,Norwich,0.0,1.0,0.0,1.0,0.0,1.0,,,148,,,0,,
27,2617,2616,13324,1505651400,Finished,0,Chelsea,Arsenal,0.0,0.0,0.0,0.0,0.0,0.0,,,148,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5801,2615,2616,273263,1576332000,Finished,0,Chelsea,Bournemouth,0.0,1.0,0.0,0.0,0.0,0.0,,,148,1.333333,1.000000,0,2.000000,
5803,2630,2629,273262,1576332000,Finished,0,Burnley,Newcastle,1.0,0.0,0.0,0.0,0.0,0.0,,,148,1.333333,1.666667,3,1.000000,
5804,2632,2654,273268,1576332000,Finished,0,Sheffield Utd,Aston Villa,2.0,0.0,0.0,0.0,0.0,0.0,,,148,2.000000,1.333333,3,1.333333,
5808,2641,2611,273265,1576332000,Finished,0,Leicester,Norwich,1.0,1.0,1.0,1.0,1.0,1.0,,,148,2.000000,1.000000,1,3.000000,


In [104]:
# POINTS IN LAST X AWAY MATCHES
MATCH_COUNT = 3

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['match_awayteam_points'] = (matches['match_awayteam_score'] > matches['match_hometeam_score']) * 2 + (matches['match_awayteam_score'] >= matches['match_hometeam_score']) * 1
matches['last_n_away_points'] = float('NaN')

team_ids = matches.match_awayteam_id.unique()
for team_id in team_ids:
    team_matches = matches[matches.match_awayteam_id == team_id]
    cumulative_points = team_matches['match_awayteam_points'].cumsum()
    matches.loc[matches.match_awayteam_id == team_id, 'last_n_away_points'] = (cumulative_points.shift(1) -
                                                                              cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,last_n_goals,match_awayteam_points,last_n_away_points
5673,2641,2614,273248,1575484200,Finished,0,Southampton,Norwich,2.0,1.0,2.0,0.0,2.0,0.0,,,148,1.0,0.666667,3,1.0,,0,1.333333
5676,2612,2621,273250,1575486900,Finished,0,Liverpool,Everton,5.0,2.0,4.0,2.0,4.0,2.0,,,148,2.333333,1.666667,3,3.0,,0,1.0
5677,2630,2654,273244,1575570600,Finished,0,Sheffield Utd,Newcastle,0.0,2.0,0.0,1.0,0.0,1.0,,,148,2.333333,1.0,0,2.333333,,3,1.0
5679,2613,2617,273241,1575573300,Finished,0,Arsenal,Brighton,1.0,2.0,0.0,1.0,0.0,1.0,,,148,1.666667,1.0,0,1.0,,3,0.0
5692,2616,2612,273254,1575718200,Finished,0,Everton,Chelsea,3.0,1.0,1.0,0.0,1.0,0.0,,,148,1.0,2.333333,3,1.333333,,0,2.0
5702,2619,2623,273259,1575727200,Finished,0,Watford,Crystal Palace,0.0,0.0,0.0,0.0,0.0,0.0,,,148,0.333333,1.333333,1,0.333333,,1,1.333333
5703,2621,2615,273252,1575727200,Finished,0,Bournemouth,Liverpool,0.0,3.0,0.0,2.0,0.0,2.0,,,148,0.666667,1.666667,0,1.333333,,3,2.333333
5705,2629,2628,273258,1575727200,Finished,0,Tottenham,Burnley,5.0,0.0,3.0,0.0,3.0,0.0,,,148,1.666667,1.333333,3,1.666667,,0,1.0
5719,2627,2626,273255,1575736200,Finished,0,Manchester City,Manchester Utd,1.0,2.0,0.0,2.0,0.0,2.0,,,148,2.333333,1.666667,0,3.0,,3,1.333333
5745,2611,2632,273251,1575810000,Finished,0,Aston Villa,Leicester,1.0,4.0,1.0,2.0,1.0,2.0,,,148,1.666667,4.333333,0,2.0,,3,3.0


In [113]:
# GOALS IN LAST X MATCHES
MATCH_COUNT = 5

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_goals'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[(matches.match_hometeam_id == team_id) | (matches.match_awayteam_id == team_id)]
    cumulative_goals = (team_matches.match_hometeam_score * (team_matches.match_hometeam_id == team_id) +
                        (team_matches.match_awayteam_score * (team_matches.match_awayteam_id == team_id))).cumsum()
    matches.loc[(matches.match_hometeam_id == team_id) |
                (matches.match_awayteam_id == team_id), 'last_n_goals'] = (cumulative_goals.shift(1) -
                                                                           cumulative_goals.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,last_n_goals,match_awayteam_points,last_n_away_points,last_n_points
5673,2641,2614,273248,1575484200,Finished,0,Southampton,Norwich,2.0,1.0,2.0,0.0,2.0,0.0,,,148,1.0,0.666667,3,1.0,1.0,0,1.333333,
5676,2612,2621,273250,1575486900,Finished,0,Liverpool,Everton,5.0,2.0,4.0,2.0,4.0,2.0,,,148,2.333333,1.666667,3,3.0,1.2,0,1.0,
5677,2630,2654,273244,1575570600,Finished,0,Sheffield Utd,Newcastle,0.0,2.0,0.0,1.0,0.0,1.0,,,148,2.333333,1.0,0,2.333333,1.6,3,1.0,
5679,2613,2617,273241,1575573300,Finished,0,Arsenal,Brighton,1.0,2.0,0.0,1.0,0.0,1.0,,,148,1.666667,1.0,0,1.0,1.4,3,0.0,
5692,2616,2612,273254,1575718200,Finished,0,Everton,Chelsea,3.0,1.0,1.0,0.0,1.0,0.0,,,148,1.0,2.333333,3,1.333333,1.2,0,2.0,
5702,2619,2623,273259,1575727200,Finished,0,Watford,Crystal Palace,0.0,0.0,0.0,0.0,0.0,0.0,,,148,0.333333,1.333333,1,0.333333,0.8,1,1.333333,
5703,2621,2615,273252,1575727200,Finished,0,Bournemouth,Liverpool,0.0,3.0,0.0,2.0,0.0,2.0,,,148,0.666667,1.666667,0,1.333333,1.0,3,2.333333,
5705,2629,2628,273258,1575727200,Finished,0,Tottenham,Burnley,5.0,0.0,3.0,0.0,3.0,0.0,,,148,1.666667,1.333333,3,1.666667,1.4,0,1.0,
5719,2627,2626,273255,1575736200,Finished,0,Manchester City,Manchester Utd,1.0,2.0,0.0,2.0,0.0,2.0,,,148,2.333333,1.666667,0,3.0,2.2,3,1.333333,
5745,2611,2632,273251,1575810000,Finished,0,Aston Villa,Leicester,1.0,4.0,1.0,2.0,1.0,2.0,,,148,1.666667,4.333333,0,2.0,1.4,3,3.0,


In [114]:
# POINTS IN LAST X MATCHES
MATCH_COUNT = 5

# sort according to epoch
matches.sort_values(by=['epoch'])
matches['last_n_points'] = float('NaN')

team_ids = matches.match_hometeam_id.unique()
for team_id in team_ids:
    team_matches = matches[(matches.match_hometeam_id == team_id) | (matches.match_awayteam_id == team_id)]
    cumulative_points = (team_matches.match_hometeam_points * (team_matches.match_hometeam_id == team_id) +
                         (team_matches.match_awayteam_points * (team_matches.match_awayteam_id == team_id))).cumsum()
    matches.loc[(matches.match_hometeam_id == team_id) |
                (matches.match_awayteam_id == team_id), 'last_n_points'] = (cumulative_points.shift(1) -
                                                                           cumulative_points.shift(1+MATCH_COUNT)) / MATCH_COUNT
matches.tail(20)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,last_n_home_goals,last_n_away_goals,match_hometeam_points,last_n_home_points,last_n_goals,match_awayteam_points,last_n_away_points,last_n_points
5673,2641,2614,273248,1575484200,Finished,0,Southampton,Norwich,2.0,1.0,2.0,0.0,2.0,0.0,,,148,1.0,0.666667,3,1.0,1.0,0,1.333333,0.8
5676,2612,2621,273250,1575486900,Finished,0,Liverpool,Everton,5.0,2.0,4.0,2.0,4.0,2.0,,,148,2.333333,1.666667,3,3.0,1.2,0,1.0,1.4
5677,2630,2654,273244,1575570600,Finished,0,Sheffield Utd,Newcastle,0.0,2.0,0.0,1.0,0.0,1.0,,,148,2.333333,1.0,0,2.333333,1.6,3,1.0,1.6
5679,2613,2617,273241,1575573300,Finished,0,Arsenal,Brighton,1.0,2.0,0.0,1.0,0.0,1.0,,,148,1.666667,1.0,0,1.0,1.4,3,0.0,0.8
5692,2616,2612,273254,1575718200,Finished,0,Everton,Chelsea,3.0,1.0,1.0,0.0,1.0,0.0,,,148,1.0,2.333333,3,1.333333,1.2,0,2.0,0.8
5702,2619,2623,273259,1575727200,Finished,0,Watford,Crystal Palace,0.0,0.0,0.0,0.0,0.0,0.0,,,148,0.333333,1.333333,1,0.333333,0.8,1,1.333333,0.6
5703,2621,2615,273252,1575727200,Finished,0,Bournemouth,Liverpool,0.0,3.0,0.0,2.0,0.0,2.0,,,148,0.666667,1.666667,0,1.333333,1.0,3,2.333333,0.6
5705,2629,2628,273258,1575727200,Finished,0,Tottenham,Burnley,5.0,0.0,3.0,0.0,3.0,0.0,,,148,1.666667,1.333333,3,1.666667,1.4,0,1.0,1.2
5719,2627,2626,273255,1575736200,Finished,0,Manchester City,Manchester Utd,1.0,2.0,0.0,2.0,0.0,2.0,,,148,2.333333,1.666667,0,3.0,2.2,3,1.333333,2.0
5745,2611,2632,273251,1575810000,Finished,0,Aston Villa,Leicester,1.0,4.0,1.0,2.0,1.0,2.0,,,148,1.666667,4.333333,0,2.0,1.4,3,3.0,0.8


# Dataset & Metrics

In [0]:
matches['total_goals'] = matches.match_hometeam_score + matches.match_awayteam_score
matches['over_2.5_goals'] = matches['total_goals'] > 2
matches = matches.dropna(subset=features)

epoch = 1575484200
train_matches = matches[matches['epoch'] < epoch]
test_matches = matches[matches['epoch'] >= epoch]

In [0]:
from sklearn.model_selection import GridSearchCV

# Penalized Regression

In [117]:
from sklearn.linear_model import Lasso

reg_cv = GridSearchCV(Lasso(),
                      param_grid={'alpha': np.arange(0.01, 0.1, 0.02)},
                      cv=10,
                      n_jobs=-1,
                      return_train_score=True)
reg_cv.fit(train_matches[features], train_matches['total_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([0.01, 0.03, 0.05, 0.07, 0.09])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [118]:
def cv_results(cv, regressor=True):
    param = 'total_goals' if regressor else 'over_2.5_goals'

    print('--- PARAMS ---')
    print(cv.cv_results_['params'])
    print()

    print('--- BEST PARAMS ---')
    print(cv.best_params_)
    print()

    print('--- TRAIN SCORE (LOSS/ACC) ---')
    print(cv.best_estimator_.score(train_matches[features], train_matches[param]))
    print()

    print('--- TEST SCORE (LOSS/ACC) ---')
    print(cv.best_estimator_.score(test_matches[features], test_matches[param]))
    print()

cv_results(reg_cv, regressor=True)

--- PARAMS ---
[{'alpha': 0.01}, {'alpha': 0.03}, {'alpha': 0.049999999999999996}, {'alpha': 0.06999999999999999}, {'alpha': 0.08999999999999998}]

--- BEST PARAMS ---
{'alpha': 0.049999999999999996}

--- TRAIN SCORE (LOSS/ACC) ---
0.01548673056526051

--- TEST SCORE (LOSS/ACC) ---
0.021878477967836374



# Decision Trees

In [119]:
from sklearn.tree import DecisionTreeClassifier

DecisionTreeClassifier()

dt_cls_cv = GridSearchCV(DecisionTreeClassifier(),
                         param_grid={
                             'min_samples_leaf': np.arange(1, 10, 2),
                             'ccp_alpha': np.arange(0, 1, 0.2)
                         },
                         cv=10,
                         n_jobs=-1)
dt_cls_cv.fit(train_matches[features], train_matches['over_2.5_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': array([0. , 0.2, 0.4, 0.6, 0.8]),
       

In [120]:
cv_results(dt_cls_cv, regressor=False)

--- PARAMS ---
[{'ccp_alpha': 0.0, 'min_samples_leaf': 1}, {'ccp_alpha': 0.0, 'min_samples_leaf': 3}, {'ccp_alpha': 0.0, 'min_samples_leaf': 5}, {'ccp_alpha': 0.0, 'min_samples_leaf': 7}, {'ccp_alpha': 0.0, 'min_samples_leaf': 9}, {'ccp_alpha': 0.2, 'min_samples_leaf': 1}, {'ccp_alpha': 0.2, 'min_samples_leaf': 3}, {'ccp_alpha': 0.2, 'min_samples_leaf': 5}, {'ccp_alpha': 0.2, 'min_samples_leaf': 7}, {'ccp_alpha': 0.2, 'min_samples_leaf': 9}, {'ccp_alpha': 0.4, 'min_samples_leaf': 1}, {'ccp_alpha': 0.4, 'min_samples_leaf': 3}, {'ccp_alpha': 0.4, 'min_samples_leaf': 5}, {'ccp_alpha': 0.4, 'min_samples_leaf': 7}, {'ccp_alpha': 0.4, 'min_samples_leaf': 9}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 1}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 3}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 5}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 7}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 9}, {'ccp_alpha': 0.8, 'min_samples_leaf': 1}, {'ccp_

In [121]:
from sklearn.tree import DecisionTreeRegressor

dt_reg_cv = GridSearchCV(DecisionTreeRegressor(),
                         param_grid={
                             'min_samples_leaf': np.arange(1, 10, 2),
                             'ccp_alpha': np.arange(0, 1, 0.2)
                         },
                         cv=10,
                         n_jobs=-1)
dt_reg_cv.fit(train_matches[features], train_matches['total_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'ccp_alpha': array([0. , 0.2, 0.4, 0.6, 0.8]),
                         'min_samples_leaf': array([1, 3, 5, 7, 9])},
             p

In [122]:
cv_results(dt_reg_cv, regressor=True)

--- PARAMS ---
[{'ccp_alpha': 0.0, 'min_samples_leaf': 1}, {'ccp_alpha': 0.0, 'min_samples_leaf': 3}, {'ccp_alpha': 0.0, 'min_samples_leaf': 5}, {'ccp_alpha': 0.0, 'min_samples_leaf': 7}, {'ccp_alpha': 0.0, 'min_samples_leaf': 9}, {'ccp_alpha': 0.2, 'min_samples_leaf': 1}, {'ccp_alpha': 0.2, 'min_samples_leaf': 3}, {'ccp_alpha': 0.2, 'min_samples_leaf': 5}, {'ccp_alpha': 0.2, 'min_samples_leaf': 7}, {'ccp_alpha': 0.2, 'min_samples_leaf': 9}, {'ccp_alpha': 0.4, 'min_samples_leaf': 1}, {'ccp_alpha': 0.4, 'min_samples_leaf': 3}, {'ccp_alpha': 0.4, 'min_samples_leaf': 5}, {'ccp_alpha': 0.4, 'min_samples_leaf': 7}, {'ccp_alpha': 0.4, 'min_samples_leaf': 9}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 1}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 3}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 5}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 7}, {'ccp_alpha': 0.6000000000000001, 'min_samples_leaf': 9}, {'ccp_alpha': 0.8, 'min_samples_leaf': 1}, {'ccp_

# Random Forest

In [123]:
from sklearn.ensemble import RandomForestClassifier

rf_cls_cv = GridSearchCV(RandomForestClassifier(n_estimators=500, min_samples_leaf=5),
                         param_grid={
                             'max_features': np.arange(1, 6, 1),
                         },
                         cv=10,
                         n_jobs=-1)
rf_cls_cv.fit(train_matches[features], train_matches['over_2.5_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=5,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=500, n_jobs=None,
                                              oob_score=False,
                                              rand

In [124]:
cv_results(rf_cls_cv, regressor=False)

--- PARAMS ---
[{'max_features': 1}, {'max_features': 2}, {'max_features': 3}, {'max_features': 4}, {'max_features': 5}]

--- BEST PARAMS ---
{'max_features': 1}

--- TRAIN SCORE (LOSS/ACC) ---
0.7631133671742809

--- TEST SCORE (LOSS/ACC) ---
0.5



In [125]:
from sklearn.ensemble import RandomForestRegressor

rf_reg_cv = GridSearchCV(RandomForestRegressor(n_estimators=500, min_samples_leaf=5),
                         param_grid={
                             'max_features': np.arange(1, 6, 1),
                         },
                         cv=10,
                         n_jobs=-1)
rf_reg_cv.fit(train_matches[features], train_matches['total_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=5,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=500, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [126]:
cv_results(rf_reg_cv, regressor=True)

--- PARAMS ---
[{'max_features': 1}, {'max_features': 2}, {'max_features': 3}, {'max_features': 4}, {'max_features': 5}]

--- BEST PARAMS ---
{'max_features': 1}

--- TRAIN SCORE (LOSS/ACC) ---
0.22345093713947028

--- TEST SCORE (LOSS/ACC) ---
0.06925058545449225



# Stochastic Gradient Boosting

In [127]:
from sklearn.ensemble import GradientBoostingClassifier

gb_cls_cv = GridSearchCV(GradientBoostingClassifier(min_samples_leaf=10),
                         param_grid={
                             'learning_rate': np.arange(0.1, 1, 0.2),
                             'n_estimators': np.arange(100, 200, 20),
                             'max_depth': np.arange(1, 6, 1)
                         },
                         cv=10,
                         n_jobs=-1)
gb_cls_cv.fit(train_matches[features], train_matches['over_2.5_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=10,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no...
                 

In [128]:
cv_results(gb_cls_cv, regressor=False)

--- PARAMS ---
[{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}, {'learni

In [129]:
from sklearn.ensemble import GradientBoostingRegressor

gb_reg_cv = GridSearchCV(GradientBoostingRegressor(min_samples_leaf=10),
                         param_grid={
                             'learning_rate': np.arange(0.1, 1, 0.2),
                             'n_estimators': np.arange(100, 200, 20),
                             'max_depth': np.arange(1, 6, 1)
                         },
                         cv=10,
                         n_jobs=-1)
gb_reg_cv.fit(train_matches[features], train_matches['total_goals'])

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=10,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter...
                            

In [130]:
cv_results(gb_reg_cv, regressor=True)

--- PARAMS ---
[{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 120}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 140}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 160}, {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 180}, {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}, {'learni

# Comparison

* For evaluating regression, we use R^2 regression score function. R^2 = 1.0 means perfect fit. A constant model that predicts expected value of the target variable achieves R^2 = 0.0. R^2 < 0 possible if the fit is bad.
* For evaluating classification, we use mean accuracy.
* We see observe significant improvements in test performance for penalized regression and slight improvements for decision trees and gradient boosted trees with classifcation. There is sharp drop in training performance for both random forest models and in gradient boosted trees with regression.
* Our tree-based regression models are worse than penalized regression. In classification, even though gradient boosted trees perform the best decision tree is very close. My intuition is that our models are not satisfactory and that we can do better by spending more time on feature engineering.
* Random forest models are perform significantly worse in testing than training which indicates that those models might be overfitted.  