In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from comet_ml import Experiment
from ipywidgets import interact, IntSlider

from src.data.dataset import split_dataset, tidy_plays_df
from src.data.data_query import StorageEngine
from src.data.plays_model import game_json_to_plays_list
from src.features.features import basic_features, advanced_features, normalize_plays_coords
#from src.models.xgboost import train_xgb

In [2]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("./data/processed/plays_2015-2020.csv", index_col=False)
advanced_df = advanced_features(df)
advanced_df.head()

Unnamed: 0,seconds_elapsed,period_idx,x_coord,y_coord,x_coord_norm,y_coord_norm,dist_from_net,angle_from_net,Backhand,Deflected,...,SHOT,STOP,TAKEAWAY,previous_x_coord,previous_y_coord,seconds_from_previous,dist_from_previous,rebound,angle_change,speed
0,33.0,1,-83.0,13.0,83.0,-13.0,14.317821,-65.224859,0,0,...,0,0,0,0.0,0.0,33.0,84.011904,0,0.0,2.545815
1,96.0,1,-34.0,2.0,34.0,-2.0,55.036352,-2.082565,0,0,...,0,0,0,-69.0,-22.0,18.0,42.43819,0,0.0,2.357677
2,174.0,1,-57.0,-29.0,57.0,29.0,43.185646,42.184443,0,0,...,0,0,0,-97.0,-5.0,1.0,46.647615,0,0.0,46.647615
3,221.0,1,61.0,4.0,61.0,4.0,28.284271,8.130102,0,0,...,0,0,0,88.0,-39.0,14.0,50.774009,0,0.0,3.626715
4,240.0,1,64.0,7.0,64.0,7.0,25.96151,15.642246,0,0,...,1,0,0,61.0,4.0,19.0,4.242641,1,7.512144,0.223297


In [4]:
df3 = pd.get_dummies(df.shot_type)
df2 = pd.get_dummies(df.period_type)



In [60]:
df.columns

Index(['event_idx', 'event_type_id', 'period_idx', 'period_type', 'game_time',
       'period_time', 'shot_type', 'team_initiative_id',
       'team_initiative_name', 'x_coord', 'y_coord', 'shooter_id',
       'shooter_name', 'goalie_id', 'goalie_name', 'strength',
       'empty_net_bool', 'previous_event_idx', 'previous_event_period',
       'previous_event_period_time', 'previous_event_time',
       'previous_event_type', 'previous_event_x_coord',
       'previous_event_y_coord', 'gamePk', 'game_season', 'game_type',
       'game_start_time', 'x_coord_norm', 'y_coord_norm', 'angle_from_net',
       'rebound', 'previous_event_x_coord_norm', 'previous_event_y_coord_norm',
       'angle_change'],
      dtype='object')

In [12]:
df['game_time'] = df['game_time'].astype('datetime64[ns]')
result_df = pd.concat([advanced_df, df2, df3, df[['game_time']]], axis=1).round(2).fillna(0)
result_df['game_time'] = result_df['game_time'].astype('datetime64[ns]')
dff = result_df[result_df['game_time'].dt.year<=2019]
test = result_df[result_df['game_time'].dt.year>2019]
dff.drop(['game_time'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
# indices_to_keep = ~dff.isin([np.nan, np.inf, -np.inf]).any(1)
dff = dff.replace({np.nan: 0, np.inf:0, -np.inf:0})

In [18]:
dff.GOAL.value_counts(normalize=True)

0    0.998074
1    0.001926
Name: GOAL, dtype: float64

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dff.drop(['GOAL'],axis=1).fillna(0), dff['GOAL'], test_size=0.30, random_state=42)


In [9]:
boolean_features=['BLOCKED_SHOT','FACEOFF','GIVEAWAY', 'HIT','MISSED_SHOT','OTHER','PENALTY','PERIOD_START','SHOT','TAKEAWAY']+df2.columns.tolist()+df3.columns.tolist()


In [10]:
X_train.describe()

Unnamed: 0,seconds_elapsed,period_idx,x_coord,y_coord,x_coord_norm,y_coord_norm,dist_from_net,angle_from_net,Backhand,Deflected,...,OVERTIME,REGULAR,SHOOTOUT,Backhand.1,Deflected.1,Slap Shot,Snap Shot,Tip-In,Wrap-around,Wrist Shot
count,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,...,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0,274521.0
mean,1830.952095,2.031054,-0.1495,-0.276238,58.489125,-0.040175,35.984031,-0.797621,0.078948,0.016032,...,0.01688,0.982078,0.001042,0.078948,0.016032,0.163587,0.146557,0.050958,0.01017,0.533249
std,1057.198979,0.847041,63.262602,19.045173,24.107863,19.047134,24.010439,36.662272,0.269659,0.125597,...,0.128823,0.132669,0.03226,0.269659,0.125597,0.369901,0.353664,0.219912,0.100335,0.498894
min,0.0,1.0,-99.0,-42.0,-99.0,-42.0,0.0,-147.99,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,929.0,1.0,-63.0,-15.0,45.0,-14.0,17.46,-28.77,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1813.0,2.0,-3.0,0.0,63.0,0.0,33.42,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2722.0,3.0,63.0,14.0,76.0,14.0,49.41,27.47,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,6672.0,6.0,99.0,42.0,99.0,42.0,189.79,180.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
temp_df['GOAL'].value_counts()

0.0    392127
1.0        47
Name: GOAL, dtype: int64

In [23]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=30, step=1)
fit = rfe.fit(X_train, y_train)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Num Features: 30
Selected Features: [False  True False  True  True  True  True False False  True  True False
  True  True  True False  True  True  True  True  True False  True  True
  True False  True False False False  True  True  True  True  True  True
  True  True  True False False  True False  True]
Feature Ranking: [10  1  7  1  1  1  1  2  4  1  1 11  1  1  1  8  1  1  1  1  1 15  1  1
  1 13  1  9  3  6  1  1  1  1  1  1  1  1  1  5 12  1 14  1]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = RandomForestClassifier(random_state = 42, max_depth=6)
rfe = RFE(model, n_features_to_select=30, step=1)
fit = rfe.fit(X_train, y_train)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 30
Selected Features: [ True  True  True  True  True  True  True  True  True False  True  True
 False False  True False False False False False  True False  True  True
  True False False  True  True  True  True  True  True  True  True  True
  True  True False  True  True False False  True]
Feature Ranking: [ 1  1  1  1  1  1  1  1  1 11  1  1  9 14  1  3  6  8 10  2  1 15  1  1
  1  4  5  1  1  1  1  1  1  1  1  1  1  1 13  1  1  7 12  1]


In [None]:
feature_select['RFE_random_forest'] = = fit.support_

In [25]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = 42, max_depth=6)
sfs = SequentialFeatureSelector(model, n_features_to_select=30, direction='forward')
sfs.fit(X_train, y_train)

KeyboardInterrupt: 

In [32]:
X_train.head()

Unnamed: 0,seconds_elapsed,period_idx,x_coord,y_coord,x_coord_norm,y_coord_norm,dist_from_net,angle_from_net,Backhand,Deflected,...,OVERTIME,REGULAR,SHOOTOUT,Backhand.1,Deflected.1,Slap Shot,Snap Shot,Tip-In,Wrap-around,Wrist Shot
238968,3446.0,3,74.0,-3.0,74.0,-3.0,15.3,-11.31,0,0,...,0,1,0,0,0,0,0,0,0,1
113826,3292.0,3,-68.0,-3.0,68.0,3.0,21.21,8.13,0,0,...,0,1,0,0,0,0,0,0,0,1
998,2155.0,2,72.0,-1.0,72.0,-1.0,17.03,-3.37,0,0,...,0,1,0,0,0,0,0,0,0,1
229157,3224.0,3,37.0,29.0,37.0,29.0,59.54,29.15,0,0,...,0,1,0,0,0,0,0,0,0,1
118682,1196.0,1,-38.0,24.0,38.0,-24.0,56.36,-25.2,0,0,...,0,1,0,0,0,0,0,0,0,1


In [30]:
from xverse.transformer import MonotonicBinning

clf = MonotonicBinning()
clf.fit(X_train, y_train)

print(clf.bins)


ValueError: The input feature(s) should be numeric type. Some of the input features                             has character values in it. Please use a encoder before performing monotonic operations.

In [26]:
from xverse.transformer import WOE
clf = WOE()
clf.fit(X_train, y_train)

ValueError: The input feature(s) should be numeric type. Some of the input features                             has character values in it. Please use a encoder before performing monotonic operations.

In [None]:
## cross validation

In [None]:
#Random forest

## MLP
##Light gbm

In [None]:
### Random forest


In [None]:
##Light gbm
evals = {}
evals['lgb'] = {}
lparams = {}

seeds = [0, 1]
nfold = [5, 5]
stratified=[False, False]
shuffle=[True, True]
n_estimators = 30000
early_stopping_rounds = 300
verbose_eval = 0
learning_rate = 0.01
reg_alpha = [0.4, 1]
reg_lambda = [0.7, 1]
subsample = [0.45, 1]
colsample_bytree = [0.3, 0.225]
max_depth = -1
verbose = -1
n_jobs = 4

lparams[0] = dict(boosting_type='gbdt',
               objective='binary',
               metric='auc',
               learning_rate= learning_rate,
               num_leaves= 200,
               max_bin=500,
               min_child_weight= 0.035,
               subsample= subsample[0],
               colsample_bytree= colsample_bytree[0],
               min_data_in_leaf= 150,
               max_depth= max_depth,
               bagging_seed= seeds[0],
               reg_alpha= reg_alpha[0],
               reg_lambda= reg_lambda[0],
               verbose= verbose,
               seed= seeds[0],
               n_jobs= n_jobs,)

lparams[1] = dict(boosting_type='gbdt',
               objective='binary',
               metric='auc',
               learning_rate= learning_rate,
               n_estimators= n_estimators,
               subsample= subsample[1],
               colsample_bytree= colsample_bytree[1],
               max_depth= max_depth,
               bagging_seed= seeds[1],
               reg_alpha= reg_alpha[1],
               reg_lambda= reg_lambda[1],
               verbose= verbose,
               seed= seeds[1],
               n_jobs= n_jobs,)

test_preds = np.zeros(len(test))    
dtrain = lgb.Dataset(train, y)
dtest = test.copy()
testlen = test.shape[0]
del train, test, trainf, testf
gc.collect()

In [None]:
for i, seed in enumerate(seeds):
    print(f'Training Model with SEED : {seed}')
    evals['lgb'][i] = lgb.cv(lparams[i],
                             dtrain,
                             nfold=nfold[i], 
                             stratified=stratified[i],
                             shuffle=shuffle[i],
                             num_boost_round=n_estimators,
                             early_stopping_rounds=early_stopping_rounds,
                             verbose_eval=verbose_eval,
                             return_cvbooster=True,
                             seed = seed,
                             show_stdv=True)
#     filename = 'lgb_'+ i+'_.sav'
#     pickle.dump(evals['lgb'][i], open(filename, 'wb'))
    print(f'SEED {i} Average fold  AUC {np.round(max(evals["lgb"][i]["auc-mean"]),5)}')
    test_preds += stats.rankdata(np.mean(evals['lgb'][i]['cvbooster'].predict(dtest, num_iteration=evals['lgb'][i]['cvbooster'].best_iteration), axis=0)) / (testlen * len(seeds))