In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb


DATA_PATH = 'data/'

In [3]:
training_data = pd.read_csv('data/training_data.csv')
training_data.drop(['Unnamed: 0', 'Date', 'HomeTeam', 'AwayTeam', 
                 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HM1', 'HM2', 'HM3',
                 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4',  
                 'AM5', 'MW', 'HTFormPts', 'ATFormPts',
                 'HTFormPtsStr', 'ATFormPtsStr'], 1, inplace=True)
training_data.columns

Index(['season', 'FTR', 'HTP', 'ATP', 'B365H', 'B365D', 'B365A', 'gameId',
       'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts', 'DiffLP'],
      dtype='object')

In [13]:
EMA_data = pd.read_csv('data/EMA_data.csv')
EMA_data.drop(['Unnamed: 0', 'f_DateHome', 'f_seasonHome', 'HomeTeam',
               'homeGame_x', 'f_cornersAgainstHome', 'f_cornersForHome',
               'f_freesAgainstHome', 'f_freesForHome', 'f_halfTimeGoalsAgainstHome',
               'f_halfTimeGoalsForHome', 'f_redsAgainstHome', 'f_redsForHome',
               'f_yellowsAgainstHome', 'f_yellowsForHome', 'f_DateAway', 'f_seasonAway', 
               'AwayTeam', 'homeGame_y', 'f_cornersAgainstAway', 'f_cornersForAway', 
               'f_freesAgainstAway', 'f_freesForAway', 'f_halfTimeGoalsAgainstAway', 'f_halfTimeGoalsForAway',
               'f_redsAgainstAway', 'f_redsForAway', 'f_yellowsAgainstAway', 'f_yellowsForAway'], 1, inplace=True)
EMA_data.columns

Index(['gameId', 'f_goalsAgainstHome', 'f_goalsForHome', 'f_shotsAgainstHome',
       'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_goalsAgainstAway', 'f_goalsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway'],
      dtype='object')

In [14]:
df = pd.merge(EMA_data, training_data, left_on='gameId', right_index=True)

In [15]:
df['season'].apply(str)

0        708
1        708
2        708
3        708
4        708
        ... 
4499    1819
4500    1819
4501    1819
4502    1819
4503    1819
Name: season, Length: 4504, dtype: object

In [16]:
df.head()

Unnamed: 0,gameId,gameId_x,f_goalsAgainstHome,f_goalsForHome,f_shotsAgainstHome,f_shotsForHome,f_shotsOnTargetAgainstHome,f_shotsOnTargetForHome,f_goalsAgainstAway,f_goalsForAway,...,ATP,B365H,B365D,B365A,gameId_y,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
0,22,22,1.0,0.51,11.84,5.98,4.43,4.98,1.02,0.49,...,0.333333,1.44,4.0,7.5,23,-1.0,-0.333333,-0.333333,-0.333333,-13
1,23,23,1.946674,2.307026,17.680976,9.02613,9.613518,5.640181,3.45,0.0,...,1.333333,2.2,3.25,3.2,24,0.0,0.333333,-0.333333,-0.333333,-1
2,24,24,0.49,1.98,13.37,14.53,5.45,8.55,0.0,4.98,...,1.333333,3.1,3.2,2.25,25,0.333333,0.333333,0.0,0.0,6
3,25,25,0.49,0.98,16.57,9.39,6.49,6.41,2.0,1.53,...,2.0,2.62,3.1,3.1,26,0.333333,0.666667,-0.666667,-0.666667,1
4,27,27,0.51,1.49,10.53,14.53,5.53,9.04,3.333156,2.986402,...,2.0,1.53,3.8,6.0,28,0.333333,1.0,-0.666667,-0.666667,-10


In [17]:
df.columns

Index(['gameId', 'gameId_x', 'f_goalsAgainstHome', 'f_goalsForHome',
       'f_shotsAgainstHome', 'f_shotsForHome', 'f_shotsOnTargetAgainstHome',
       'f_shotsOnTargetForHome', 'f_goalsAgainstAway', 'f_goalsForAway',
       'f_shotsAgainstAway', 'f_shotsForAway', 'f_shotsOnTargetAgainstAway',
       'f_shotsOnTargetForAway', 'season', 'FTR', 'HTP', 'ATP', 'B365H',
       'B365D', 'B365A', 'gameId_y', 'HTGD', 'ATGD', 'DiffPts', 'DiffFormPts',
       'DiffLP'],
      dtype='object')

In [18]:
X = df.drop(['gameId', 'gameId_x', 'gameId_y', 'FTR', 
           'season', 'gameId_y', ], 1)
y = df['FTR']
# Use 'season' columns to create training batches
seasons = df['season']

In [19]:
print('Number of matches:', X.shape[0])
print('Number of features:', X.shape[1])

Number of matches: 4504
Number of features: 22


In [20]:
# create features variable to we can study feature importance later
features = X.columns

# Prepare the data

We will now split the data into training and testing and scale our data.

We can come back and try feature engineering later.

In [21]:
def transform_results(results):
    transformed = []
    for i in range(len(results)):
        if results[i] == 'H':
            transformed.append(0)
        elif results[i] == 'A':
            transformed.append(2)
        else:
            transformed.append(1)
    return np.array(transformed)
            
y = transform_results(y)

In [22]:
# Scale our numeric columns
scaler = StandardScaler()

X = scaler.fit_transform(X)
    

In [23]:
# rf_model = RandomForestClassifier(n_estimators=1500, max_depth=8,
#                                min_samples_leaf=3, n_jobs=-1,
#                                random_state=42)

In [24]:
# gb_model = GradientBoostingClassifier(n_estimators=250,
#                                       max_depth=3,
#                                       random_state=42)

In [25]:
# X_train = X[:-380]
# y_train = y[:-380]
# X_test = X[-380:]
# y_test = y[-380:]

# model.fit(X_train, y_train)
# y_preds = model.predict(X_test)
# accuracy_score(y_test, y_preds)

NameError: name 'model' is not defined

In [26]:
gb_model = GradientBoostingClassifier(n_estimators=1500,
                                      learning_rate=0.01,
                                      min_samples_split=2,
                                      min_weight_fraction_leaf=0.01,
                                      subsample=0.9,
                                      max_depth=2,
                                      min_samples_leaf=1,
                                   random_state=42)


In [27]:
rf_model = RandomForestClassifier(n_estimators=1000,
                                  max_depth=14,
                                  random_state=42,
                                  min_samples_leaf=40,
                                  n_jobs=-1)

In [28]:
cb_model = CatBoostClassifier(depth=6,
                              iterations=60,
                              learning_rate=0.01,
                              random_seed=42,
                              logging_level='Silent')

In [137]:
# XGB params
xgb_params = {'max_depth': 2,
          'eta': 0.01,
          'objective': 'multi:softmax',
          'min_child_weight': 5,
          'subsample': 0.7,
          'colsample_bytree': 0.6,
          'num_class': 3
         }

In [133]:
# LightGBM params
lgb_params = {'num_leaves': 20,
              'max_depth': 6,
              'metric': 'multi_logloss',
              'learning_rate': 0.001,
              'boosting_type': 'gbdt',
              'feature_fraction': 0.9,
              'objective': 'multiclass'}

In [138]:
idx = 0
results = []

for season_len in seasons.value_counts():
    X_train = np.concatenate((X[:idx], X[idx+season_len:]))
    y_train = np.concatenate((y[:idx], y[idx+season_len:]))
    X_val = X[idx:idx+season_len]
    y_val = y[idx:idx+season_len]
    
#     dtrain = xgb.DMatrix(data=X_train, label=y_train)
#     dtest = xgb.DMatrix(data=X_val, label=y_val)
#     bst = xgb.train(xgb_params, dtrain, 2)
#     xgb_preds = bst.predict(dtest)
#     print(xgb_preds)
    
#     model = rf_model
#     model.fit(X_train, y_train)
#     rf_preds = model.predict(X_val)
#     print(rf_preds)
    
#     model = gb_model
#     model.fit(X_train, y_train)
#     gb_preds = model.predict(X_val)
#     print(gb_preds)

    model = cb_model
    model.fit(X_train, y_train)
    cb_preds = model.predict(X_val)
    print(cb_preds)
    
    break
    
    
    accuracy = accuracy_score(y_val, y_preds)
    print(accuracy)
    # print(classification_report(y_val, y_preds))
    results.append(accuracy)
    idx +=season_len


[0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 2. 0.
 0. 0. 2. 2. 0. 1. 0. 0. 0. 2. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 2. 0. 2. 0.
 0. 2. 2. 2. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 2. 0. 2. 0. 0. 2. 2.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 1. 2. 0. 2. 0. 0. 2. 0.
 0. 2. 0. 0. 0. 0. 0. 2. 0. 2. 2. 2. 0. 0. 1. 2. 2. 2. 0. 2. 0. 0. 0. 0.
 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 0. 2. 0. 0. 2. 0. 0. 2. 0. 0. 0. 2. 1. 2.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 0.
 2. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 2. 0. 2. 0. 0. 0. 2. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 2. 0. 0. 0.
 2. 2. 0. 2. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 2. 0. 0. 0.
 2. 0. 0. 2. 2. 2. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 2. 0. 2. 0. 0. 0. 0. 2.
 0. 0. 2. 1. 2. 2. 0. 0. 0. 2. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 2. 2. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 1. 2. 0. 2. 0. 0. 0. 0. 0. 0.
 0. 2. 0. 2. 2. 0. 0. 2. 0. 0. 2. 0. 0. 0. 0. 0. 0.

In [135]:
print(f'Average accuracy :{sum(results)/len(results)}')
print(f'Min accuracy :{min(results)}')
print(f'Max accuracy :{max(results)}')
print(f'Standard Deviation :{np.std(results)}')

Average accuracy :0.4625254651344053
Min accuracy :0.43351063829787234
Max accuracy :0.5078947368421053
Standard Deviation :0.02198986346138475


In [111]:

feature_dict = {}
for feature, importance in zip(features, model.feature_importances_):
    feature_dict[feature] = importance
    
for k in sorted(feature_dict, key=feature_dict.get, reverse=True):
    print(k, feature_dict[k])

NameError: name 'model' is not defined

In [77]:
# X_train = X[:-380]
# y_train = y[:-380]
# X_test = X[-380:]
# y_test = y[-380:]

# dtrain = xgb.DMatrix(data=X_train, label=y_train)
# dtest = xgb.DMatrix(data=X_test)

# params = {'max_depth': 6,
#           'eta': 0.3,
#           'objective': 'multi:softmax',
#           'num_class': 3
#          }
# bst = xgb.train(params, dtrain, 2)
# y_preds = bst.predict(dtest)

print(accuracy_score(y_test, y_preds))
print(classification_report(y_test, y_preds))
print(confusion_matrix(y_test, y_preds))


ValueError: Found input variables with inconsistent numbers of samples: [380, 351]