# Football match prediction

Part of the code used in this notebook is adapted from this [Kaggle kernel](https://www.kaggle.com/airback/match-outcome-prediction-in-football).

In [1]:
import os,sys
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score,
                             recall_score, mean_squared_error, mean_absolute_error, r2_score, classification_report)
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from libs.loaders import load_football
from libs.football import get_fifa_data, create_feables
import pickle
import pkg_resources

# Data mount location. NOTE(review): presumably read by the project loaders in
# libs.loaders -- confirm. setdefault keeps a MOUNT_POINT that is already
# exported in the environment, so the notebook also runs on machines where the
# data is not mounted at '/strata'.
os.environ.setdefault('MOUNT_POINT', '/strata')
# Record interpreter and library versions next to the results.
print("System version: {}".format(sys.version))
print("XGBoost version: {}".format(pkg_resources.get_distribution('xgboost').version))
print("LightGBM version: {}".format(pkg_resources.get_distribution('lightgbm').version))

System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
XGBoost version: 0.6
LightGBM version: 0.2


In [2]:
%%time
countries, matches, leagues, teams, players = load_football()
print(countries.shape)
print(matches.shape)
print(leagues.shape)
print(teams.shape)
print(players.shape)

(11, 2)
(25979, 115)
(11, 3)
(299, 5)
(183978, 42)
CPU times: user 4.73 s, sys: 728 ms, total: 5.46 s
Wall time: 19.2 s


In [3]:
leagues

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [4]:
matches.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [5]:
matches.tail()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
25974,25975,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992091,10190,10191,1,...,,,,,,,,,,
25975,25976,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992092,9824,10199,1,...,,,,,,,,,,
25976,25977,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992093,9956,10179,2,...,,,,,,,,,,
25977,25978,24558,24558,2015/2016,9,2015-09-22 00:00:00,1992094,7896,10243,0,...,,,,,,,,,,
25978,25979,24558,24558,2015/2016,9,2015-09-23 00:00:00,1992095,10192,9931,4,...,,,,,,,,,,


In [6]:
# Reduce match data to fulfill run time requirements: keep only the matches in
# which every identifier, both goal counts and all 22 home/away player columns
# are present.
player_columns = ["{}_player_{}".format(side, number)
                  for side in ("home", "away")
                  for number in range(1, 12)]
rows = ["country_id", "league_id", "season", "stage", "date", "match_api_id",
        "home_team_api_id", "away_team_api_id", "home_team_goal",
        "away_team_goal"] + player_columns
match_data = matches.dropna(subset=rows)
print(match_data.shape)


(21374, 115)


In [None]:
%%time
fifa_data_filename = 'fifa_data.pk'
if os.path.isfile(fifa_data_filename):
    fifa_data = pd.read_pickle(fifa_data_filename)
else:
    fifa_data = get_fifa_data(match_data, players)
    fifa_data.to_pickle(fifa_data_filename)
print(fifa_data.shape)


(21374, 23)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.94 ms


In [None]:
%%time
# bk_cols lists column prefixes (presumably one per bookkeeper -- TODO confirm);
# it is not used in this cell, only the bk_cols_selected subset is passed on.
bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bk_cols_selected = ['B365', 'BW']      
# Build the modelling table ("feables") from the matches and the FIFA data.
feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)
print(feables.shape)


Generating match features...


In [33]:
feables.head()

Unnamed: 0,match_api_id,league_id,home_team_goals_difference,away_team_goals_difference,games_won_home_team,games_won_away_team,games_against_won,games_against_lost,season,home_player_1_overall_rating,home_player_2_overall_rating,home_player_3_overall_rating,home_player_4_overall_rating,home_player_5_overall_rating,home_player_6_overall_rating,home_player_7_overall_rating,home_player_8_overall_rating,home_player_9_overall_rating,home_player_10_overall_rating,home_player_11_overall_rating,away_player_1_overall_rating,away_player_2_overall_rating,away_player_3_overall_rating,away_player_4_overall_rating,away_player_5_overall_rating,away_player_6_overall_rating,away_player_7_overall_rating,away_player_8_overall_rating,away_player_9_overall_rating,away_player_10_overall_rating,away_player_11_overall_rating,B365_Win,B365_Draw,B365_Defeat,BW_Win,BW_Draw,BW_Defeat,label
0,530023.0,21518.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,65.0,80.0,82.0,79.0,76.0,80.0,79.0,67.0,85.0,68.0,89.0,67.0,74.0,71.0,76.0,72.0,71.0,76.0,76.0,78.0,74.0,77.0,0.556783,0.262925,0.180292,0.548686,0.270248,0.181066,Win
1,530084.0,21518.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,75.0,63.0,71.0,65.0,68.0,73.0,74.0,72.0,70.0,71.0,68.0,78.0,73.0,73.0,77.0,80.0,80.0,75.0,84.0,79.0,77.0,78.0,0.336872,0.285831,0.377297,0.311479,0.277935,0.410586,Draw
2,530085.0,21518.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,74.0,65.0,76.0,78.0,74.0,69.0,74.0,77.0,79.0,71.0,75.0,91.0,85.0,84.0,80.0,75.0,80.0,83.0,77.0,82.0,84.0,86.0,0.268293,0.284553,0.447154,0.23212,0.278545,0.489335,Win
3,530087.0,21518.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,79.0,74.0,82.0,56.0,57.0,79.0,79.0,68.0,74.0,82.0,72.0,82.0,76.0,81.0,68.0,77.0,83.0,74.0,80.0,78.0,83.0,86.0,0.336872,0.285831,0.377297,0.342518,0.279284,0.378197,Draw
4,530089.0,21518.0,0.0,0.0,0.0,0.0,0.0,0.0,2008.0,64.0,79.0,68.0,67.0,65.0,77.0,63.0,79.0,73.0,69.0,76.0,71.0,74.0,75.0,73.0,71.0,74.0,72.0,76.0,72.0,72.0,70.0,0.466919,0.287335,0.245747,0.518664,0.275049,0.206287,Defeat


In [34]:
# Everything except the match id and the target is used as a feature, so
# 'season' and 'league_id' stay in the feature matrix.
feature_names = feables.columns.difference(['match_api_id', 'label'])
features = feables[feature_names]
labs = feables['label']
for part in (features, labs):
    print(part.shape)

(2706, 36)
(2706,)


In [35]:
# First hold out 20% of the rows as the test set, then hold out 20% of the
# remainder as the validation set; both splits are stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    features, labs, test_size=0.2, random_state=42, stratify=labs)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Report each partition and check that the row counts add up to the full set.
partition_rows = []
for part in (X_train, X_val, X_test):
    print(part.shape)
    partition_rows.append(part.shape[0])
print(sum(partition_rows))

(1731, 36)
(433, 36)
(542, 36)
2706


In [36]:
# XGBoost model: 100 trees of depth up to 8 trained on all rows (subsample=1).
# NOTE(review): scale_pos_weight is described by XGBoost as a binary
# class-imbalance weight; confirm it has any effect with the multi-class
# objective shown in the fitted model repr ('multi:softprob').
clf = XGBClassifier(max_depth=8, 
                    learning_rate=0.1, 
                    scale_pos_weight=2,
                    min_child_weight=5,
                    n_estimators=100,
                    subsample=1)

In [37]:
%%time
# The validation split is passed as eval_set, so 'merror' is logged per boosting
# round. No early-stopping argument is passed in this call.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='merror')

[0]	validation_0-merror:0.51963
[1]	validation_0-merror:0.505774
[2]	validation_0-merror:0.494226
[3]	validation_0-merror:0.484988
[4]	validation_0-merror:0.496536
[5]	validation_0-merror:0.496536
[6]	validation_0-merror:0.496536
[7]	validation_0-merror:0.498845
[8]	validation_0-merror:0.501155
[9]	validation_0-merror:0.489607
[10]	validation_0-merror:0.491917
[11]	validation_0-merror:0.489607
[12]	validation_0-merror:0.487298
[13]	validation_0-merror:0.494226
[14]	validation_0-merror:0.491917
[15]	validation_0-merror:0.491917
[16]	validation_0-merror:0.487298
[17]	validation_0-merror:0.491917
[18]	validation_0-merror:0.487298
[19]	validation_0-merror:0.484988
[20]	validation_0-merror:0.482679
[21]	validation_0-merror:0.491917
[22]	validation_0-merror:0.496536
[23]	validation_0-merror:0.496536
[24]	validation_0-merror:0.494226
[25]	validation_0-merror:0.496536
[26]	validation_0-merror:0.498845
[27]	validation_0-merror:0.489607
[28]	validation_0-merror:0.496536
[29]	validation_0-merror:

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=1)

In [38]:
# Compare accuracy on the rows the model was fit on with accuracy on the
# held-out test split; a large gap between the two indicates overfitting.
y_pred = clf.predict(X_test)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test, y_pred)))

Score of XGBClassifier for training set: 0.9977.
Score of XGBClassifier for test set: 0.4926.


In [39]:
#https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py
def classification_metrics_multilabel(y_true, y_pred, labels):
    """Compute summary classification metrics for a multi-class prediction.

    Args:
        y_true: True class label of every sample.
        y_pred: Predicted class label of every sample.
        labels: Class labels to include. The same list is passed to the F1,
            precision, recall and confusion-matrix computations.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1' (precision,
        recall and F1 use average='weighted') and 'Confusion Matrix'.
    """
    m_acc = accuracy_score(y_true, y_pred)
    # `labels` is passed by keyword: it is keyword-only in current scikit-learn
    # releases, and the keyword form is also accepted by older ones.
    m_f1 = f1_score(y_true, y_pred, labels=labels, average='weighted')
    m_precision = precision_score(y_true, y_pred, labels=labels, average='weighted')
    m_recall = recall_score(y_true, y_pred, labels=labels, average='weighted')
    m_conf = confusion_matrix(y_true, y_pred, labels=labels)
    report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1, 'Confusion Matrix':m_conf}
    return report

In [40]:
# Class labels handed to the metric helper; the helper forwards this list to
# confusion_matrix together with the test labels and predictions.
labels = ["Win", "Draw", "Defeat"]
report = classification_metrics_multilabel(y_test, y_pred, labels)
report

{'Accuracy': 0.49261992619926198, 'Confusion Matrix': array([[191,  30,  42],
        [ 75,  13,  39],
        [ 71,  18,  63]]), 'F1': 0.46071930963702967, 'Precision': 0.44764745563007513, 'Recall': 0.49261992619926198}

In [41]:
# LightGBM counterpart of the XGBoost model above; `clf` is rebound, so the
# XGBoost model is no longer reachable under this name.
# NOTE(review): confirm scale_pos_weight has any effect with the multi-class
# objective shown in the fitted model repr ('multiclass').
clf = LGBMClassifier(num_leaves=255,
                    learning_rate=0.1, 
                    scale_pos_weight=2,
                    min_child_weight=5,
                    n_estimators=100,
                    subsample=1)

In [42]:
%%time
# The validation split is passed as eval_set, so 'multi_error' is logged per
# boosting round. No early-stopping argument is passed in this call.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True, eval_metric='multi_error')

[1]	valid_0's multi_error: 0.528868
[2]	valid_0's multi_error: 0.517321
[3]	valid_0's multi_error: 0.528868
[4]	valid_0's multi_error: 0.51963
[5]	valid_0's multi_error: 0.52194
[6]	valid_0's multi_error: 0.52194
[7]	valid_0's multi_error: 0.503464
[8]	valid_0's multi_error: 0.510393
[9]	valid_0's multi_error: 0.51963
[10]	valid_0's multi_error: 0.510393
[11]	valid_0's multi_error: 0.505774
[12]	valid_0's multi_error: 0.503464
[13]	valid_0's multi_error: 0.508083
[14]	valid_0's multi_error: 0.512702
[15]	valid_0's multi_error: 0.501155
[16]	valid_0's multi_error: 0.503464
[17]	valid_0's multi_error: 0.505774
[18]	valid_0's multi_error: 0.505774
[19]	valid_0's multi_error: 0.498845
[20]	valid_0's multi_error: 0.496536
[21]	valid_0's multi_error: 0.501155
[22]	valid_0's multi_error: 0.498845
[23]	valid_0's multi_error: 0.501155
[24]	valid_0's multi_error: 0.505774
[25]	valid_0's multi_error: 0.505774
[26]	valid_0's multi_error: 0.510393
[27]	valid_0's multi_error: 0.508083
[28]	valid_0's

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [43]:
# y_pred is rebound to the LightGBM predictions (it previously held the
# XGBoost predictions); accuracy is reported on the training and test splits.
y_pred = clf.predict(X_test)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test, y_pred)))

Score of LGBMClassifier for training set: 0.9988.
Score of LGBMClassifier for test set: 0.4926.


In [44]:
# Metric report for the LightGBM predictions; reuses the `labels` list defined
# in the earlier report cell.
report = classification_metrics_multilabel(y_test, y_pred, labels)
report

{'Accuracy': 0.49261992619926198, 'Confusion Matrix': array([[185,  34,  44],
        [ 73,  17,  37],
        [ 70,  17,  65]]), 'F1': 0.46698421561522618, 'Precision': 0.4571211208687429, 'Recall': 0.49261992619926198}

# 2) Concept drift

In [45]:
# 21518 is the id of 'Spain LIGA BBVA' in the leagues table shown earlier.
la_liga_id = 21518
# Restrict the raw match table to that single league.
matches_target = matches[matches['league_id'] == la_liga_id]
print(matches_target.shape)
print(matches_target.head(5))
print(matches_target.tail(5))

(3040, 115)
          id  country_id  league_id     season  stage                 date  \
21517  21518       21518      21518  2008/2009      1  2008-08-30 00:00:00   
21518  21519       21518      21518  2008/2009      1  2008-08-31 00:00:00   
21519  21520       21518      21518  2008/2009      1  2008-08-31 00:00:00   
21520  21521       21518      21518  2008/2009      1  2008-08-31 00:00:00   
21521  21522       21518      21518  2008/2009      1  2008-08-31 00:00:00   

       match_api_id  home_team_api_id  away_team_api_id  home_team_goal  \
21517        530023             10267              8661               3   
21518        530084              8371             10205               1   
21519        530085              9783              8633               2   
21520        530086              8388              8634               1   
21521        530087              8696              8302               1   

       away_team_goal  home_player_X1  home_player_X2  home_player_X

In [46]:
# Keep only the La Liga matches in which every identifier, both goal counts and
# all 22 home/away player columns are present.
# Note: this rebinds match_data, which earlier held the all-leagues subset.
lineup_columns = ["{}_player_{}".format(side, number)
                  for side in ("home", "away")
                  for number in range(1, 12)]
cols = ["country_id", "league_id", "season", "stage", "date", "match_api_id",
        "home_team_api_id", "away_team_api_id", "home_team_goal",
        "away_team_goal"] + lineup_columns
match_data = matches_target.dropna(subset=cols)
print(match_data.shape)

(2707, 115)


In [47]:
%%time
fifa_data = get_fifa_data(match_data, players)


CPU times: user 4min 14s, sys: 4 ms, total: 4min 14s
Wall time: 4min 14s


In [48]:
# Show all columns when pandas renders a DataFrame.
pd.options.display.max_columns = None

In [49]:
print(fifa_data.shape)
fifa_data.head()

(2707, 23)


Unnamed: 0,home_player_1_overall_rating,home_player_2_overall_rating,home_player_3_overall_rating,home_player_4_overall_rating,home_player_5_overall_rating,home_player_6_overall_rating,home_player_7_overall_rating,home_player_8_overall_rating,home_player_9_overall_rating,home_player_10_overall_rating,home_player_11_overall_rating,away_player_1_overall_rating,away_player_2_overall_rating,away_player_3_overall_rating,away_player_4_overall_rating,away_player_5_overall_rating,away_player_6_overall_rating,away_player_7_overall_rating,away_player_8_overall_rating,away_player_9_overall_rating,away_player_10_overall_rating,away_player_11_overall_rating,match_api_id
21517,65.0,80.0,82.0,79.0,76.0,80.0,79.0,67.0,85.0,68.0,89.0,67.0,74.0,71.0,76.0,72.0,71.0,76.0,76.0,78.0,74.0,77.0,530023.0
21518,75.0,63.0,71.0,65.0,68.0,73.0,74.0,72.0,70.0,71.0,68.0,78.0,73.0,73.0,77.0,80.0,80.0,75.0,84.0,79.0,77.0,78.0,530084.0
21519,74.0,65.0,76.0,78.0,74.0,69.0,74.0,77.0,79.0,71.0,75.0,91.0,85.0,84.0,80.0,75.0,80.0,83.0,77.0,82.0,84.0,86.0,530085.0
21521,79.0,74.0,82.0,56.0,57.0,79.0,79.0,68.0,74.0,82.0,72.0,82.0,76.0,81.0,68.0,77.0,83.0,74.0,80.0,78.0,83.0,86.0,530087.0
21523,64.0,79.0,68.0,67.0,65.0,77.0,63.0,79.0,73.0,69.0,76.0,71.0,74.0,75.0,73.0,71.0,74.0,72.0,76.0,72.0,72.0,70.0,530089.0


In [50]:
%%time
# bk_cols lists column prefixes (presumably one per bookkeeper -- TODO confirm);
# it is not used in this cell, only the bk_cols_selected subset is passed on.
bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bk_cols_selected = ['B365', 'BW']      
# Rebuild the modelling table for the La Liga subset only (all_leagues = False);
# this rebinds `feables`.
feables = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True, all_leagues = False)
print(feables.shape)
feables.head()

Generating match features...
Generating match labels...
Generating bookkeeper data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(2706, 38)
CPU times: user 1min 55s, sys: 0 ns, total: 1min 55s
Wall time: 1min 55s


In [51]:
# Partition the table by season; 'season' holds the first year of a season
# (2015 stands for 2015/2016, 2014 for 2014/2015).
is_2015 = feables['season'] == 2015
is_2014 = feables['season'] == 2014
feables_2015_2016 = feables[is_2015]
print(feables_2015_2016.shape)
feables_2014_2015 = feables[is_2014]
print(feables_2014_2015.shape)
# Every row that belongs to neither of the two most recent seasons.
feables_rest = feables[~(is_2014 | is_2015)]
print(feables_rest.shape)

(351, 38)
(364, 38)
(1991, 38)


In [52]:
# One shared column selection for the training set and both test sets: all
# columns except the match id, the target and the season used for splitting.
feature_cols = feables_rest.columns.difference(['match_api_id', 'label', 'season'])

# Train on the older seasons, test on 2014/2015 and on 2015/2016 separately.
X_train, y_train = feables_rest[feature_cols], feables_rest['label']
X_test1, y_test1 = feables_2014_2015[feature_cols], feables_2014_2015['label']
X_test2, y_test2 = feables_2015_2016[feature_cols], feables_2015_2016['label']

for part in (X_train, X_test1, X_test2):
    print(part.shape)


(1991, 35)
(364, 35)
(351, 35)


In [53]:
# Second training set: every season except 2015/2016, i.e. the first training
# set extended with the 2014/2015 season.
feables_up_to_2014_2015 = feables[feables['season'] != 2015]
print(feables_up_to_2014_2015.shape)
# Same feature selection as for the first training set (match id, target and
# season are excluded).
X_train2 = feables_up_to_2014_2015[feables_up_to_2014_2015.columns.difference(['match_api_id', 'label', 'season'])]
y_train2 = feables_up_to_2014_2015['label']

(2355, 38)


In [56]:
# Fresh XGBoost model for the concept-drift experiment, with the same
# hyperparameters as the XGBoost model in the first part of the notebook.
clf = XGBClassifier(max_depth=8, 
                    learning_rate=0.1, 
                    scale_pos_weight=2,
                    min_child_weight=5,
                    n_estimators=100,
                    subsample=1)

In [57]:
%%time
# Fit on the seasons before 2014/2015. No eval_set is passed here, and the
# recorded output shows no per-round metric log.
clf.fit(X_train, y_train, verbose=True, eval_metric='merror')

CPU times: user 3.54 s, sys: 3.52 s, total: 7.06 s
Wall time: 2.14 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=1)

In [58]:
# Accuracy on the training seasons versus the 2014/2015 season.
y_pred1 = clf.predict(X_test1)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))

Score of XGBClassifier for training set: 0.9965.
Score of XGBClassifier for test set: 0.5220.


In [59]:
# Same model, now scored on the 2015/2016 season, which lies one season further
# from the training data than 2014/2015.
y_pred2 = clf.predict(X_test2)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))

Score of XGBClassifier for training set: 0.9965.
Score of XGBClassifier for test set: 0.4872.


In [60]:
%%time
# Fit the same clf object again, this time on the training set that also
# contains the 2014/2015 season.
clf.fit(X_train2, y_train2, verbose=True, eval_metric='merror')

CPU times: user 3.77 s, sys: 4.36 s, total: 8.12 s
Wall time: 2.64 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=1)

In [61]:
# Score the refitted model on 2015/2016; y_pred2 is rebound and replaces the
# predictions of the model fit on the smaller training set.
y_pred2 = clf.predict(X_test2)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))

Score of XGBClassifier for training set: 0.9919.
Score of XGBClassifier for test set: 0.5242.


Now let's try with LightGBM

In [62]:
# Fresh LightGBM model for the concept-drift experiment, with the same
# hyperparameters as the LightGBM model in the first part of the notebook.
clf = LGBMClassifier(num_leaves=255,
                    learning_rate=0.1, 
                    scale_pos_weight=2,
                    min_child_weight=5,
                    n_estimators=100,
                    subsample=1)

In [63]:
%%time
# Fit on the seasons before 2014/2015. No eval_set is passed here, and the
# recorded output shows no per-round metric log.
clf.fit(X_train, y_train, verbose=True, eval_metric='multi_error')

CPU times: user 5.89 s, sys: 23.7 s, total: 29.6 s
Wall time: 10.8 s


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [64]:
# Accuracy on the training seasons versus the 2014/2015 season.
y_pred1 = clf.predict(X_test1)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test1, y_pred1)))

Score of LGBMClassifier for training set: 1.0000.
Score of LGBMClassifier for test set: 0.5192.


In [65]:
# Same model, now scored on the 2015/2016 season, which lies one season further
# from the training data than 2014/2015.
y_pred2 = clf.predict(X_test2)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train, clf.predict(X_train))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))

Score of LGBMClassifier for training set: 1.0000.
Score of LGBMClassifier for test set: 0.5128.


In [66]:
%%time
# Fit the same clf object again, this time on the training set that also
# contains the 2014/2015 season.
clf.fit(X_train2, y_train2, verbose=True, eval_metric='multi_error')

CPU times: user 7.12 s, sys: 25.7 s, total: 32.8 s
Wall time: 12.9 s


LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_unbalance=False, learning_rate=0.1, max_bin=255, max_depth=-1,
        max_drop=50, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=100, nthread=-1, num_leaves=255,
        objective='multiclass', reg_alpha=0, reg_lambda=0,
        scale_pos_weight=2, seed=0, sigmoid=1.0, silent=True,
        skip_drop=0.5, subsample=1, subsample_for_bin=50000,
        subsample_freq=1, uniform_drop=False, xgboost_dart_mode=False)

In [67]:
# Score the refitted model on 2015/2016; y_pred2 is rebound and replaces the
# predictions of the model fit on the smaller training set.
y_pred2 = clf.predict(X_test2)
print("Score of {} for training set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_train2, clf.predict(X_train2))))
print("Score of {} for test set: {:.4f}.".format(clf.__class__.__name__, accuracy_score(y_test2, y_pred2)))

Score of LGBMClassifier for training set: 0.9996.
Score of LGBMClassifier for test set: 0.5584.
