In [1]:
import csv
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Load the shot data and the sample submission file.
df_all=pd.read_csv('data_original.csv')
df_teid=pd.read_csv('sample_submission.csv')

In [3]:
# shot_ids listed in the sample submission; used later to split rows into training and testing
test_id=df_teid['shot_id'].tolist()

In [4]:
# Class balance of the target label (value_counts() leaves out missing values by default).
df_all['shot_made_flag'].value_counts()

0.0    14232
1.0    11465
Name: shot_made_flag, dtype: int64

In [5]:
# Number of shots per combined_shot_type value.
df_all['combined_shot_type'].value_counts()

Jump Shot    23485
Layup         5448
Dunk          1286
Tip Shot       184
Hook Shot      153
Bank Shot      141
Name: combined_shot_type, dtype: int64

In [6]:
# Number of shots per period; values above 4 also occur in the data.
df_all['period'].value_counts()

3    8296
1    8048
4    7260
2    6718
5     330
6      38
7       7
Name: period, dtype: int64

In [7]:
# Number of shots per shot_zone_area value.
df_all['shot_zone_area'].value_counts()

Center(C)                13455
Right Side Center(RC)     4776
Right Side(R)             4588
Left Side Center(LC)      4044
Left Side(L)              3751
Back Court(BC)              83
Name: shot_zone_area, dtype: int64

In [8]:
# Number of shots per shot_zone_basic value.
df_all['shot_zone_basic'].value_counts()

Mid-Range                12625
Restricted Area           7136
Above the Break 3         5620
In The Paint (Non-RA)     4578
Right Corner 3             387
Left Corner 3              280
Backcourt                   71
Name: shot_zone_basic, dtype: int64

In [9]:
# Two-point versus three-point attempts.
df_all['shot_type'].value_counts()

2PT Field Goal    24271
3PT Field Goal     6426
Name: shot_type, dtype: int64

In [10]:
# Number of shots per opponent.
df_all['opponent'].value_counts()

SAS    1978
PHX    1781
HOU    1666
SAC    1643
DEN    1642
POR    1539
UTA    1490
MIN    1474
GSW    1356
LAC    1285
DAL    1199
MEM    1030
BOS     946
SEA     828
IND     761
PHI     720
ORL     719
DET     715
OKC     677
TOR     664
NYK     657
MIA     627
CHA     620
CLE     619
CHI     610
WAS     600
MIL     586
NOH     581
NJN     520
ATL     519
NOP     344
VAN     246
BKN      55
Name: opponent, dtype: int64

In [11]:
# Parse the game_date strings into datetime values.
df_all['game_date'] = pd.to_datetime(df_all['game_date'])

In [12]:
#if it is a back to back game 
#(df_all['game_date'][100]-df_all['game_date'][1]).days #int
def btb(lst): #0,1
    """Flag, for every row, whether it belongs to a back-to-back game.

    A row gets 1 when its date is exactly one day after the previous row's
    date.  A row with the same date as the previous row inherits that row's
    flag.  Any other gap resets the flag to 0.  The first row is always 0.

    Parameters
    ----------
    lst : list or pandas Series of date-like values
        Differences between consecutive values must expose ``.days``.
        NOTE(review): rows are presumably in chronological order -- confirm.

    Returns
    -------
    list of int
        One 0/1 flag per input row; an empty list for empty input.
    """
    # Work on a plain list so the walk is positional: a Series whose index is
    # not 0..n-1 would otherwise be looked up by label.
    dates = list(lst)
    if not dates:
        return []
    btb_lst = [0]
    flag = 0
    for previous, current in zip(dates, dates[1:]):
        gap = (current - previous).days
        if gap == 1:
            flag = 1
        elif gap != 0:
            flag = 0
        # gap == 0: same date as the previous row, keep the current flag
        btb_lst.append(flag)
    return btb_lst

# Add the back-to-back flag as a new column.
df_all['btb']=btb(df_all['game_date'])

In [13]:
#home court game or not
def homecourt(row):
    """Classify a matchup string as a home court game or not.

    Returns 0 when the text contains '@', otherwise 1 when it contains
    'vs', and the string 'error' when neither marker is present.
    """
    # '@' is checked first, so it wins when both markers appear.
    for marker, label in (('@', 0), ('vs', 1)):
        if marker in row:
            return label
    return 'error'
    


# Label every matchup string with homecourt() and store the result as a column.
homecourt_label=df_all['matchup'].apply(homecourt)
df_all['homecourt']=homecourt_label

In [14]:
#game month
#w['female'] = w['female'].map({'female': 1, 'male': 0})
def gamemonth(lst):
    """Translate a Series of datetime values into three-letter month labels.

    Each element's ``.month`` is turned into a string and looked up in a
    '1'..'12' -> 'Jan'..'Dec' table; the mapped Series is returned.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    lookup = {str(number): label
              for number, label in enumerate(abbreviations, start=1)}
    return lst.apply(lambda stamp: str(stamp.month)).map(lookup)
    
# Build the month labels once and verify that every game date received one.
# (A bare expression in the middle of a cell is evaluated and thrown away, so
# the check is written as an assertion.)
month_labels = gamemonth(df_all['game_date'])
assert not month_labels.isnull().values.any(), "gamemonth() left some dates unmapped"
df_all['gamemonth'] = month_labels

In [15]:
#if last shot was made
def lastshot(lst):
    """Report, for every row, the outcome stored in the row before it.

    Parameters
    ----------
    lst : list or pandas Series
        Shot outcomes: 0, 1, or anything else (e.g. a missing value).

    Returns
    -------
    list
        One entry per input row: 0 or 1 when the previous row holds that
        value, 'unknown' otherwise.  The first row has no predecessor and is
        always 0.  Empty input gives an empty list.
    """
    # Work on a plain list so the walk is positional: a Series whose index is
    # not 0..n-1 would otherwise be looked up by label.
    outcomes = list(lst)
    if not outcomes:
        return []
    last = [0]
    for previous in outcomes[:-1]:
        if previous == 0:
            last.append(0)
        elif previous == 1:
            last.append(1)
        else:
            last.append('unknown') #due to the random test data
    return last

# Outcome of the preceding row's shot, stored as a feature column.
# NOTE(review): "preceding" means the previous row of df_all, whatever game it belongs to -- confirm intended.
df_all['last_shot_flag']=lastshot(df_all['shot_made_flag'])

In [16]:
# add column secondsToPeriodEnd: minutes_remaining converted to seconds plus seconds_remaining
df_all['secondsToPeriodEnd'] = df_all['minutes_remaining'].mul(60).add(df_all['seconds_remaining'])

In [17]:
# add column secondsFromGameStart: game clock elapsed when the shot was taken.
# Periods 1-4 are 12-minute quarters; every later period is a 5-minute
# overtime, so the end-of-period mark cannot be period * 12 minutes once the
# period number exceeds 4.
period_number = df_all['period'].astype(int)
period_end = (period_number.clip(upper=4) * 12 * 60
              + (period_number - 4).clip(lower=0) * 5 * 60)
df_all['secondsFromGameStart'] = period_end - df_all['secondsToPeriodEnd']

In [18]:
# Rows whose shot_id appears in test_id form the test set; all others are training rows.
# Series.isin does one hashed membership test instead of scanning the
# test_id list once per row.
criterion1 = df_all['shot_id'].isin(test_id)
criterion = ~criterion1
df_all_tr = df_all[criterion]
df_all_te = df_all[criterion1]

In [19]:
# Columns passed through pd.get_dummies; shot_id is listed so the encoded
# frame can be merged with the numeric frame later.
ctg_feature = [
    'combined_shot_type',
    'shot_id',
    'shot_type',
    'action_type',
    'shot_zone_area',
    'shot_zone_basic',
    'playoffs',
    'period',
    'opponent',
    'season',
    'homecourt',
    'btb',
    'gamemonth',
    'last_shot_flag',
]
# Columns used as-is; shot_id again serves as the merge key.
num_feature = [
    'loc_x',
    'loc_y',
    'shot_distance',
    'shot_id',
    'seconds_remaining',
    'secondsToPeriodEnd',
    'secondsFromGameStart',
]

In [20]:
# One-hot encode the categorical feature columns and store everything as int16.
# NOTE(review): by default pd.get_dummies expands only object/category columns,
# so integer-valued columns in ctg_feature presumably stay as plain numbers -- confirm intended.
# NOTE(review): shot_id is cast to int16 as well; values above 32767 would not fit -- confirm.
df_ctg = df_all[ctg_feature]
encoded_ctg = pd.get_dummies(df_ctg).astype(np.int16)

In [21]:
# Split the encoded frame into training and test rows by shot_id membership in test_id.
# Series.isin does one hashed membership test instead of scanning the
# test_id list once per row.
criterion11 = encoded_ctg['shot_id'].isin(test_id)
criterion01 = ~criterion11


df_tr_ctg = encoded_ctg[criterion01]
df_te_ctg = encoded_ctg[criterion11]

In [22]:
# Numeric feature columns for each split, plus the training labels.
df_tr_num = df_all_tr[num_feature]
df_te_num = df_all_te[num_feature]
flag = df_all_tr.loc[:, 'shot_made_flag']

In [23]:
# Join the encoded categorical columns with the numeric columns on shot_id.
# NOTE(review): shot_id remains a column of train/test after the merge, so it
# reaches the model as a feature -- confirm intended.
train = df_tr_ctg.merge(df_tr_num, on='shot_id')

test = df_te_ctg.merge(df_te_num, on='shot_id')

In [24]:
# (rows, columns) of the merged training frame.
train.shape

(25697, 154)

In [25]:
# what the new features look like
df_all[['homecourt', 'btb', 'gamemonth', 'last_shot_flag']]

Unnamed: 0,homecourt,btb,gamemonth,last_shot_flag
0,0,0,Oct,0
1,0,0,Oct,unknown
2,0,0,Oct,0
3,0,0,Oct,1
4,0,0,Oct,0
5,0,0,Oct,1
6,0,0,Oct,0
7,0,0,Oct,1
8,0,0,Oct,unknown
9,0,0,Oct,1


In [26]:
# Hold out 15% of the training rows; the held-out part drives early stopping.
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(
    train, flag, test_size=0.15, random_state=2046)
prior = 0.4  # passed to xgboost as base_score
dtrain = xgb.DMatrix(X_dtrain, label=y_dtrain)
deval = xgb.DMatrix(X_deval, label=y_deval)
watchlist = [(deval, 'eval')]
params = dict(
    booster='gbtree',
    objective='binary:logistic',
    colsample_bytree=0.8,
    eta=0.1,
    max_depth=3,
    seed=2017,
    silent=1,
    gamma=0.005,
    subsample=0.8,
    base_score=prior,
    eval_metric='logloss',
)

# Up to 200 boosting rounds; stop once eval logloss has not improved for 50 rounds.
clf = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist,
                early_stopping_rounds=50)

[0]	eval-logloss:0.674965
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.664048
[2]	eval-logloss:0.657258
[3]	eval-logloss:0.650059
[4]	eval-logloss:0.643236
[5]	eval-logloss:0.639566
[6]	eval-logloss:0.634409
[7]	eval-logloss:0.630276
[8]	eval-logloss:0.627382
[9]	eval-logloss:0.624279
[10]	eval-logloss:0.621496
[11]	eval-logloss:0.619179
[12]	eval-logloss:0.618018
[13]	eval-logloss:0.616575
[14]	eval-logloss:0.615107
[15]	eval-logloss:0.61381
[16]	eval-logloss:0.612625
[17]	eval-logloss:0.611673
[18]	eval-logloss:0.610787
[19]	eval-logloss:0.610081
[20]	eval-logloss:0.609325
[21]	eval-logloss:0.608787
[22]	eval-logloss:0.60846
[23]	eval-logloss:0.608008
[24]	eval-logloss:0.607603
[25]	eval-logloss:0.60732
[26]	eval-logloss:0.606964
[27]	eval-logloss:0.606663
[28]	eval-logloss:0.606476
[29]	eval-logloss:0.606028
[30]	eval-logloss:0.605751
[31]	eval-logloss:0.60572
[32]	eval-logloss:0.605538
[33]	eval-logloss:0.605303
[34]	eval-logloss:0.605225
[35]	eval

In [27]:
# Same split as before, retrained with a higher learning rate and less subsampling.
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(
    train, flag, test_size=0.15, random_state=2046)
prior = 0.4  # passed to xgboost as base_score
dtrain = xgb.DMatrix(X_dtrain, label=y_dtrain)
deval = xgb.DMatrix(X_deval, label=y_deval)
watchlist = [(deval, 'eval')]
params = dict(
    booster='gbtree',
    objective='binary:logistic',
    colsample_bytree=0.9,
    eta=0.2,
    max_depth=3,
    seed=2017,
    silent=1,
    gamma=0.015,
    subsample=0.9,
    base_score=prior,
    eval_metric='logloss',
)

# Up to 200 boosting rounds; stop once eval logloss has not improved for 50 rounds.
clf = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist,
                early_stopping_rounds=50)

[0]	eval-logloss:0.662168
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.645299
[2]	eval-logloss:0.636414
[3]	eval-logloss:0.62782
[4]	eval-logloss:0.62205
[5]	eval-logloss:0.618883
[6]	eval-logloss:0.615244
[7]	eval-logloss:0.612843
[8]	eval-logloss:0.611218
[9]	eval-logloss:0.609656
[10]	eval-logloss:0.608756
[11]	eval-logloss:0.60767
[12]	eval-logloss:0.606738
[13]	eval-logloss:0.606651
[14]	eval-logloss:0.60606
[15]	eval-logloss:0.605647
[16]	eval-logloss:0.605375
[17]	eval-logloss:0.604937
[18]	eval-logloss:0.60459
[19]	eval-logloss:0.604054
[20]	eval-logloss:0.603841
[21]	eval-logloss:0.603689
[22]	eval-logloss:0.603587
[23]	eval-logloss:0.603642
[24]	eval-logloss:0.603386
[25]	eval-logloss:0.603254
[26]	eval-logloss:0.603345
[27]	eval-logloss:0.603267
[28]	eval-logloss:0.602899
[29]	eval-logloss:0.602977
[30]	eval-logloss:0.602766
[31]	eval-logloss:0.602596
[32]	eval-logloss:0.60245
[33]	eval-logloss:0.602245
[34]	eval-logloss:0.602218
[35]	eval-l

In [32]:
import numpy as np
import pandas as pd

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split

# Load the data, drop the columns that are not used, then sort by game date
df = (
    pd.read_csv("./data_original.csv")
    .drop(columns=['game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name'])
    .sort_values('game_date')
)
# True for rows whose shot_made_flag is missing
mask = df['shot_made_flag'].isnull()


# Clean data
# Keep an action_type label only when it occurs more than 20 times; rarer
# labels fall back to the row's combined_shot_type.  Column-wise operations
# replace the row-by-row df.apply(axis=1) calls and give the same values.
actiontypes = dict(df.action_type.value_counts())
action_counts = df['action_type'].map(actiontypes)
df['type'] = df['action_type'].where(action_counts > 20, df['combined_shot_type'])
df = df.drop(columns=['action_type', 'combined_shot_type'])

# True when the matchup string contains '@'.
df['away'] = df.matchup.str.contains('@')
df = df.drop(columns='matchup')

# Cap shot_distance at 45 (values that are not below 45 become 45).
df['distance'] = df['shot_distance'].where(df['shot_distance'] < 45, 45)

# Seconds left in the period, and a 0/1 marker for fewer than 3 seconds left.
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
df['last_moments'] = (df['time_remaining'] < 3).astype(int)

# One-hot encode the shot type and every listed feature, then join all the
# blocks column-wise with a single concat instead of re-concatenating the
# growing frame once per feature.
features=["away", "period", "playoffs", "shot_type", "shot_zone_area", "shot_zone_basic", "season",
           "shot_zone_range", "opponent", "distance", "minutes_remaining", "last_moments"]
dummy_blocks = [pd.get_dummies(df['type'], prefix="action_type")]
dummy_blocks.extend(pd.get_dummies(df[f], prefix=f) for f in features)
data = pd.concat(dummy_blocks, axis=1)

# TODO: work game_date into the features. Adding it and increasing n_estimators can improve results, but costs time and memory.

# Keep only the rows that have a shot_made_flag.
X = data.loc[~mask]
y = df.loc[~mask, 'shot_made_flag']

# Write a copy of data.csv with every incomplete row removed.
# This rebinds `data`, which held the one-hot feature matrix until here.
# NOTE(review): dropna() also removes rows whose shot_made_flag is missing; if
# data.csv carries the same unlabelled rows as data_original.csv, a test split
# built from data_cleansed.csv will be empty -- confirm.
data = pd.read_csv("./data.csv").dropna()
# index=False keeps the row index out of the file, so re-reading it does not
# produce a spurious "Unnamed: 0" column.
data.to_csv("./data_cleansed.csv", index=False)

In [33]:
# Reload the cleansed shot data and the sample submission file.
df_all=pd.read_csv('data_cleansed.csv')
df_teid=pd.read_csv('sample_submission.csv')
# shot_ids listed in the sample submission; used below to split rows into training and testing
test_id=df_teid['shot_id'].tolist()
# parse the game_date strings into datetime values
df_all['game_date'] = pd.to_datetime(df_all['game_date'])
#if it is a back to back game
#(df_all['game_date'][100]-df_all['game_date'][1]).days #int
def btb(lst): #0,1
    """Flag, for every row, whether it belongs to a back-to-back game.

    A row gets 1 when its date is exactly one day after the previous row's
    date.  A row with the same date as the previous row inherits that row's
    flag.  Any other gap resets the flag to 0.  The first row is always 0.

    Parameters
    ----------
    lst : list or pandas Series of date-like values
        Differences between consecutive values must expose ``.days``.
        NOTE(review): rows are presumably in chronological order -- confirm.

    Returns
    -------
    list of int
        One 0/1 flag per input row; an empty list for empty input.
    """
    # Work on a plain list so the walk is positional: a Series whose index is
    # not 0..n-1 would otherwise be looked up by label.
    dates = list(lst)
    if not dates:
        return []
    btb_lst = [0]
    flag = 0
    for previous, current in zip(dates, dates[1:]):
        gap = (current - previous).days
        if gap == 1:
            flag = 1
        elif gap != 0:
            flag = 0
        # gap == 0: same date as the previous row, keep the current flag
        btb_lst.append(flag)
    return btb_lst

# Add the back-to-back flag as a new column.
df_all['btb']=btb(df_all['game_date'])
#home court game or not
def homecourt(row):
    """Classify a matchup string as a home court game or not.

    Returns 0 when the text contains '@', otherwise 1 when it contains
    'vs', and the string 'error' when neither marker is present.
    """
    # '@' is checked first, so it wins when both markers appear.
    for marker, label in (('@', 0), ('vs', 1)):
        if marker in row:
            return label
    return 'error'



# Label every matchup string with homecourt() and store the result as a column.
homecourt_label=df_all['matchup'].apply(homecourt)
df_all['homecourt']=homecourt_label
#game month
#w['female'] = w['female'].map({'female': 1, 'male': 0})
def gamemonth(lst):
    """Translate a Series of datetime values into three-letter month labels.

    Each element's ``.month`` is turned into a string and looked up in a
    '1'..'12' -> 'Jan'..'Dec' table; the mapped Series is returned.
    """
    abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    lookup = {str(number): label
              for number, label in enumerate(abbreviations, start=1)}
    return lst.apply(lambda stamp: str(stamp.month)).map(lookup)

# Build the month labels once and verify that every game date received one.
# (A bare expression in the middle of a cell is evaluated and thrown away, so
# the check is written as an assertion.)
month_labels = gamemonth(df_all['game_date'])
assert not month_labels.isnull().values.any(), "gamemonth() left some dates unmapped"
df_all['gamemonth'] = month_labels

#if last shot was made
def lastshot(lst):
    """Report, for every row, the outcome stored in the row before it.

    Parameters
    ----------
    lst : list or pandas Series
        Shot outcomes: 0, 1, or anything else (e.g. a missing value).

    Returns
    -------
    list
        One entry per input row: 0 or 1 when the previous row holds that
        value, 'unknown' otherwise.  The first row has no predecessor and is
        always 0.  Empty input gives an empty list.
    """
    # Work on a plain list so the walk is positional: a Series whose index is
    # not 0..n-1 would otherwise be looked up by label.
    outcomes = list(lst)
    if not outcomes:
        return []
    last = [0]
    for previous in outcomes[:-1]:
        if previous == 0:
            last.append(0)
        elif previous == 1:
            last.append(1)
        else:
            last.append('unknown') #due to the random test data
    return last

# Outcome of the preceding row's shot, stored as a feature column.
# NOTE(review): "preceding" means the previous row of df_all, whatever game it belongs to -- confirm intended.
df_all['last_shot_flag']=lastshot(df_all['shot_made_flag'])

#add column secondsToPeriodEnd
df_all['secondsToPeriodEnd'] = 60*df_all['minutes_remaining']+df_all['seconds_remaining']
# add column secondsFromGameStart: game clock elapsed when the shot was taken.
# Periods 1-4 are 12-minute quarters; every later period is a 5-minute
# overtime, so the end-of-period mark cannot be period * 12 minutes once the
# period number exceeds 4.
period_number = df_all['period'].astype(int)
period_end = (period_number.clip(upper=4) * 12 * 60
              + (period_number - 4).clip(lower=0) * 5 * 60)
df_all['secondsFromGameStart'] = period_end - df_all['secondsToPeriodEnd']
# Rows whose shot_id appears in test_id form the test set; all others are training rows.
# Series.isin does one hashed membership test instead of scanning the
# test_id list once per row.
criterion1 = df_all['shot_id'].isin(test_id)
criterion = ~criterion1
df_all_tr = df_all[criterion]
df_all_te = df_all[criterion1]

# Columns passed through pd.get_dummies; shot_id is listed so the encoded
# frame can be merged with the numeric frame later.
ctg_feature = [
    'combined_shot_type',
    'shot_id',
    'shot_type',
    'action_type',
    'shot_zone_area',
    'shot_zone_basic',
    'playoffs',
    'period',
    'opponent',
    'season',
    'homecourt',
    'btb',
    'gamemonth',
    'last_shot_flag',
]
# Columns used as-is; shot_id again serves as the merge key.
num_feature = [
    'loc_x',
    'loc_y',
    'shot_distance',
    'shot_id',
    'seconds_remaining',
    'secondsToPeriodEnd',
    'secondsFromGameStart',
]
# One-hot encode the categorical feature columns and store everything as int16.
# NOTE(review): shot_id is cast to int16 as well; values above 32767 would not fit -- confirm.
df_ctg = df_all[ctg_feature]
encoded_ctg = pd.get_dummies(df_ctg).astype(np.int16)

# Split the encoded frame into training and test rows by shot_id membership in
# test_id.  Series.isin does one hashed membership test instead of scanning
# the test_id list once per row.
criterion11 = encoded_ctg['shot_id'].isin(test_id)
criterion01 = ~criterion11


df_tr_ctg = encoded_ctg[criterion01]
df_te_ctg = encoded_ctg[criterion11]
# Numeric feature columns for each split, plus the training labels.
df_tr_num = df_all_tr[num_feature]
df_te_num = df_all_te[num_feature]
flag = df_all_tr.loc[:, 'shot_made_flag']

# Join the encoded categorical columns with the numeric columns on shot_id.
# NOTE(review): shot_id remains a column of train/test after the merge, so it
# reaches the model as a feature -- confirm intended.
train = df_tr_ctg.merge(df_tr_num, on='shot_id')

test = df_te_ctg.merge(df_te_num, on='shot_id')

In [34]:
# Hold out 15% of the training rows; the held-out part drives early stopping.
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(
    train, flag, test_size=0.15, random_state=2046)
prior = 0.4  # passed to xgboost as base_score
dtrain = xgb.DMatrix(X_dtrain, label=y_dtrain)
deval = xgb.DMatrix(X_deval, label=y_deval)
watchlist = [(deval, 'eval')]
params = dict(
    booster='gbtree',
    objective='binary:logistic',
    colsample_bytree=0.5,
    eta=0.2,
    max_depth=4,
    seed=2017,
    silent=1,
    gamma=0.015,
    subsample=0.8,
    base_score=prior,
    eval_metric='logloss',
)

# Up to 200 boosting rounds; stop once eval logloss has not improved for 50 rounds.
clf = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist,
                early_stopping_rounds=50)

[0]	eval-logloss:0.66742
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.659651
[2]	eval-logloss:0.645395
[3]	eval-logloss:0.641312
[4]	eval-logloss:0.635655
[5]	eval-logloss:0.62645
[6]	eval-logloss:0.620409
[7]	eval-logloss:0.616209
[8]	eval-logloss:0.612721
[9]	eval-logloss:0.610872
[10]	eval-logloss:0.610431
[11]	eval-logloss:0.609264
[12]	eval-logloss:0.608606
[13]	eval-logloss:0.606431
[14]	eval-logloss:0.605653
[15]	eval-logloss:0.605369
[16]	eval-logloss:0.604645
[17]	eval-logloss:0.603508
[18]	eval-logloss:0.60348
[19]	eval-logloss:0.603232
[20]	eval-logloss:0.602913
[21]	eval-logloss:0.602664
[22]	eval-logloss:0.602721
[23]	eval-logloss:0.602603
[24]	eval-logloss:0.602507
[25]	eval-logloss:0.60247
[26]	eval-logloss:0.602304
[27]	eval-logloss:0.60244
[28]	eval-logloss:0.602286
[29]	eval-logloss:0.601626
[30]	eval-logloss:0.601593
[31]	eval-logloss:0.60164
[32]	eval-logloss:0.601217
[33]	eval-logloss:0.600534
[34]	eval-logloss:0.600358
[35]	eval-l