In [2]:
import csv
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
df_all=pd.read_csv('data.csv')
df_teid=pd.read_csv('sample_submission.csv')

In [4]:
#split data into training and testing
test_id=df_teid['shot_id'].tolist()

In [5]:
df_all['shot_made_flag'].value_counts()

0.0    14232
1.0    11465
Name: shot_made_flag, dtype: int64

In [6]:
df_all['combined_shot_type'].value_counts()

Jump Shot    19710
Layup         4532
Dunk          1056
Tip Shot       152
Hook Shot      127
Bank Shot      120
Name: combined_shot_type, dtype: int64

In [7]:
df_all['period'].value_counts()

3    7002
1    6700
4    6043
2    5635
5     280
6      30
7       7
Name: period, dtype: int64

In [8]:
df_all['shot_zone_area'].value_counts()

Center(C)                11289
Right Side Center(RC)     3981
Right Side(R)             3859
Left Side Center(LC)      3364
Left Side(L)              3132
Back Court(BC)              72
Name: shot_zone_area, dtype: int64

In [9]:
df_all['shot_zone_basic'].value_counts()

Mid-Range                10532
Restricted Area           5932
Above the Break 3         4720
In The Paint (Non-RA)     3880
Right Corner 3             333
Left Corner 3              240
Backcourt                   60
Name: shot_zone_basic, dtype: int64

In [10]:
df_all['shot_type'].value_counts()

2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64

In [11]:
df_all['opponent'].value_counts()

SAS    1638
PHX    1535
HOU    1399
SAC    1397
DEN    1352
POR    1292
UTA    1238
MIN    1219
GSW    1143
LAC    1074
DAL    1033
MEM     871
BOS     783
SEA     694
IND     626
ORL     604
PHI     603
DET     587
NYK     566
OKC     561
TOR     556
MIA     517
CHI     516
CLE     514
MIL     507
WAS     501
CHA     500
NOH     475
ATL     438
NJN     422
NOP     287
VAN     204
BKN      45
Name: opponent, dtype: int64

In [12]:
#string format into datetime
df_all['game_date'] = pd.to_datetime(df_all['game_date'])

In [13]:
#if it is a back to back game 
#(df_all['game_date'][100]-df_all['game_date'][1]).days #int
def btb(lst): #0,1
    btb_lst=[0]
    flag=0
    for i in range(1,len(lst)):
        if (lst[i]-lst[i-1]).days==1:
            btb_lst.append(1)
            flag=1
        elif (lst[i]-lst[i-1]).days==0:
            btb_lst.append(flag)
        else:
            flag=0
            btb_lst.append(flag)
    return btb_lst

df_all['btb']=btb(df_all['game_date'])

In [14]:
#home court game or not
def homecourt(row):
    if '@' in row:
        return 0
    elif 'vs' in row:
        return 1
    else:
        return 'error'
    


homecourt_label=df_all['matchup'].apply(homecourt)
df_all['homecourt']=homecourt_label

In [15]:
#game month
#w['female'] = w['female'].map({'female': 1, 'male': 0})
def gamemonth(lst):
    dict_month={'1': 'Jan', '2': 'Feb', '3': 'Mar',
                '4': 'Apr', '5': 'May', '6': 'Jun',
                '7': 'Jul', '8': 'Aug', '9': 'Sep',
                '10': 'Oct', '11': 'Nov', '12': 'Dec'}
    
    splitseries=lst.apply(lambda row :str(row.month))
    
    newseries=splitseries.map(dict_month)
    
    return newseries
    
gamemonth(df_all['game_date']).isnull().values.any() #false
df_all['gamemonth']=gamemonth(df_all['game_date'])

In [16]:
#if last shot was made
def lastshot(lst):
    last=[0]
    for i in range(1,len(lst)):
        
        if lst[i-1]==0:
            flag=0
            last.append(0)
        elif lst[i-1]==1:
            flag=1
            last.append(1)
        else:
            last.append('unknown') #due to the random test data
    return last

df_all['last_shot_flag']=lastshot(df_all['shot_made_flag'])

In [17]:
#add column secondsToPeriodEnd
df_all['secondsToPeriodEnd'] = 60*df_all['minutes_remaining']+df_all['seconds_remaining']

In [18]:
#add column secondsFromPeriodEnd
df_all['secondsFromGameStart'] = df_all['period'].astype(int)*12*60 - df_all['secondsToPeriodEnd']

In [19]:
criterion = df_all['shot_id'].map(lambda x: x not in test_id)
criterion1 = df_all['shot_id'].map(lambda x: x in test_id)
df_all_tr=df_all[criterion]
df_all_te=df_all[criterion1]

In [20]:
ctg_feature=['combined_shot_type', 
             'shot_id','shot_type', 
             'action_type',
             'shot_zone_area', 'shot_zone_basic', 'playoffs', 'period','opponent','season',
             'homecourt',
             'btb',
             'gamemonth',
            'last_shot_flag'
             ]
num_feature=['loc_x', 'loc_y', 'shot_distance','shot_id','seconds_remaining',
             'secondsToPeriodEnd','secondsFromGameStart']

In [21]:
df_ctg = df_all.loc[:, lambda df: ctg_feature]
encoded_ctg=pd.get_dummies(df_ctg).astype(np.int16)

In [22]:
criterion01 = encoded_ctg['shot_id'].map(lambda x: x not in test_id)
criterion11 = encoded_ctg['shot_id'].map(lambda x: x in test_id)


df_tr_ctg=encoded_ctg[criterion01]
df_te_ctg=encoded_ctg[criterion11]

In [23]:
df_tr_num = df_all_tr.loc[:, lambda df: num_feature]
df_te_num = df_all_te.loc[:, lambda df: num_feature]
flag = df_all_tr['shot_made_flag']

In [24]:
train=pd.merge(df_tr_ctg, df_tr_num,on='shot_id')

test=pd.merge(df_te_ctg, df_te_num,on='shot_id')

In [25]:
train.shape

(25697, 150)

In [26]:
#how new features look like
df_all.loc[:, lambda df: ['homecourt','btb','gamemonth','last_shot_flag']]

Unnamed: 0,homecourt,btb,gamemonth,last_shot_flag
0,0,0,Oct,0
1,0,0,Oct,0
2,0,0,Oct,1
3,0,0,Oct,0
4,0,0,Oct,1
5,0,0,Oct,0
6,0,0,Oct,1
7,0,0,Oct,1
8,0,0,Oct,0
9,1,1,Nov,0


In [27]:
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(train, flag, random_state=2046, test_size=0.15)
prior = 0.4
dtrain = xgb.DMatrix(X_dtrain, y_dtrain)
deval = xgb.DMatrix(X_deval, y_deval)
watchlist = [(deval, 'eval')]
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'colsample_bytree': 0.8,
    'eta': 0.1,
    'max_depth': 3,
    'seed': 2017,
    'silent': 1,
    'gamma':0.005,
    'subsample':0.8,
     'base_score': prior,
    'eval_metric': 'logloss'
}

clf = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds=50)

[0]	eval-logloss:0.674937
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.667116
[2]	eval-logloss:0.657217
[3]	eval-logloss:0.652736
[4]	eval-logloss:0.645292
[5]	eval-logloss:0.639265
[6]	eval-logloss:0.634124
[7]	eval-logloss:0.630073
[8]	eval-logloss:0.62643
[9]	eval-logloss:0.624042
[10]	eval-logloss:0.6223
[11]	eval-logloss:0.619831
[12]	eval-logloss:0.617842
[13]	eval-logloss:0.616216
[14]	eval-logloss:0.615113
[15]	eval-logloss:0.613701
[16]	eval-logloss:0.612545
[17]	eval-logloss:0.61151
[18]	eval-logloss:0.610881
[19]	eval-logloss:0.610365
[20]	eval-logloss:0.609603
[21]	eval-logloss:0.609072
[22]	eval-logloss:0.608465
[23]	eval-logloss:0.608008
[24]	eval-logloss:0.607695
[25]	eval-logloss:0.607445
[26]	eval-logloss:0.607129
[27]	eval-logloss:0.606901
[28]	eval-logloss:0.606595
[29]	eval-logloss:0.606409
[30]	eval-logloss:0.606025
[31]	eval-logloss:0.605812
[32]	eval-logloss:0.605603
[33]	eval-logloss:0.605475
[34]	eval-logloss:0.605217
[35]	eval

In [28]:
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(train, flag, random_state=2046, test_size=0.15)
prior = 0.4
dtrain = xgb.DMatrix(X_dtrain, y_dtrain)
deval = xgb.DMatrix(X_deval, y_deval)
watchlist = [(deval, 'eval')]
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9,
    'eta': 0.2,
    'max_depth': 3,
    'seed': 2017,
    'silent': 1,
    'gamma':0.015,
    'subsample':0.9,
     'base_score': prior,
    'eval_metric': 'logloss'
}

clf = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds=50)

[0]	eval-logloss:0.662168
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.64953
[2]	eval-logloss:0.636394
[3]	eval-logloss:0.627791
[4]	eval-logloss:0.621829
[5]	eval-logloss:0.617892
[6]	eval-logloss:0.614725
[7]	eval-logloss:0.612674
[8]	eval-logloss:0.610937
[9]	eval-logloss:0.609544
[10]	eval-logloss:0.608477
[11]	eval-logloss:0.607777
[12]	eval-logloss:0.607051
[13]	eval-logloss:0.606893
[14]	eval-logloss:0.606624
[15]	eval-logloss:0.605896
[16]	eval-logloss:0.60548
[17]	eval-logloss:0.605175
[18]	eval-logloss:0.605225
[19]	eval-logloss:0.605136
[20]	eval-logloss:0.604933
[21]	eval-logloss:0.604876
[22]	eval-logloss:0.604694
[23]	eval-logloss:0.604642
[24]	eval-logloss:0.604439
[25]	eval-logloss:0.604117
[26]	eval-logloss:0.604036
[27]	eval-logloss:0.603953
[28]	eval-logloss:0.603882
[29]	eval-logloss:0.603671
[30]	eval-logloss:0.6035
[31]	eval-logloss:0.60368
[32]	eval-logloss:0.603473
[33]	eval-logloss:0.603408
[34]	eval-logloss:0.603706
[35]	eval-

In [39]:
X_dtrain, X_deval, y_dtrain, y_deval = train_test_split(train, flag, random_state=2046, test_size=0.15)
prior = 0.4
dtrain = xgb.DMatrix(X_dtrain, y_dtrain)
deval = xgb.DMatrix(X_deval, y_deval)
watchlist = [(deval, 'eval')]
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'colsample_bytree': 0.8,
    'eta': 0.2,
    'max_depth': 4,
    'seed': 2017,
    'silent': 1,
    'gamma':0.005,
    'subsample':0.8,
    'base_score': prior,
    'eval_metric': 'logloss'
}

clf = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds=50)

[0]	eval-logloss:0.661156
Will train until eval-logloss hasn't improved in 50 rounds.
[1]	eval-logloss:0.64826
[2]	eval-logloss:0.634817
[3]	eval-logloss:0.630179
[4]	eval-logloss:0.62209
[5]	eval-logloss:0.616394
[6]	eval-logloss:0.612301
[7]	eval-logloss:0.609827
[8]	eval-logloss:0.60762
[9]	eval-logloss:0.606529
[10]	eval-logloss:0.605359
[11]	eval-logloss:0.604008
[12]	eval-logloss:0.603444
[13]	eval-logloss:0.602582
[14]	eval-logloss:0.602344
[15]	eval-logloss:0.601653
[16]	eval-logloss:0.601769
[17]	eval-logloss:0.60154
[18]	eval-logloss:0.601848
[19]	eval-logloss:0.601745
[20]	eval-logloss:0.601535
[21]	eval-logloss:0.601728
[22]	eval-logloss:0.601576
[23]	eval-logloss:0.601373
[24]	eval-logloss:0.601421
[25]	eval-logloss:0.601411
[26]	eval-logloss:0.601697
[27]	eval-logloss:0.601864
[28]	eval-logloss:0.601933
[29]	eval-logloss:0.60178
[30]	eval-logloss:0.60141
[31]	eval-logloss:0.6014
[32]	eval-logloss:0.60117
[33]	eval-logloss:0.600783
[34]	eval-logloss:0.600586
[35]	eval-logl

In [32]:
pred=clf.predict(xgb.DMatrix(test))

ValueError: feature_names mismatch: ['shot_id', 'playoffs', 'period', 'homecourt', 'btb', 'last_shot_flag', 'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk', 'combined_shot_type_Hook Shot', 'combined_shot_type_Jump Shot', 'combined_shot_type_Layup', 'combined_shot_type_Tip Shot', 'shot_type_2PT Field Goal', 'shot_type_3PT Field Goal', 'action_type_Alley Oop Dunk Shot', 'action_type_Alley Oop Layup shot', 'action_type_Cutting Layup Shot', 'action_type_Driving Bank shot', 'action_type_Driving Dunk Shot', 'action_type_Driving Finger Roll Layup Shot', 'action_type_Driving Finger Roll Shot', 'action_type_Driving Floating Bank Jump Shot', 'action_type_Driving Floating Jump Shot', 'action_type_Driving Hook Shot', 'action_type_Driving Jump shot', 'action_type_Driving Layup Shot', 'action_type_Driving Reverse Layup Shot', 'action_type_Driving Slam Dunk Shot', 'action_type_Dunk Shot', 'action_type_Fadeaway Bank shot', 'action_type_Fadeaway Jump Shot', 'action_type_Finger Roll Layup Shot', 'action_type_Finger Roll Shot', 'action_type_Floating Jump shot', 'action_type_Follow Up Dunk Shot', 'action_type_Hook Bank Shot', 'action_type_Hook Shot', 'action_type_Jump Bank Shot', 'action_type_Jump Hook Shot', 'action_type_Jump Shot', 'action_type_Layup Shot', 'action_type_Pullup Bank shot', 'action_type_Pullup Jump shot', 'action_type_Putback Dunk Shot', 'action_type_Putback Layup Shot', 'action_type_Putback Slam Dunk Shot', 'action_type_Reverse Dunk Shot', 'action_type_Reverse Layup Shot', 'action_type_Reverse Slam Dunk Shot', 'action_type_Running Bank shot', 'action_type_Running Dunk Shot', 'action_type_Running Finger Roll Layup Shot', 'action_type_Running Finger Roll Shot', 'action_type_Running Hook Shot', 'action_type_Running Jump Shot', 'action_type_Running Layup Shot', 'action_type_Running Pull-Up Jump Shot', 'action_type_Running Reverse Layup Shot', 'action_type_Running Slam Dunk Shot', 'action_type_Running Tip Shot', 'action_type_Slam Dunk Shot', 'action_type_Step Back Jump shot', 'action_type_Tip Layup Shot', 'action_type_Tip Shot', 'action_type_Turnaround Bank shot', 'action_type_Turnaround Fadeaway shot', 'action_type_Turnaround Finger Roll Shot', 'action_type_Turnaround Hook Shot', 'action_type_Turnaround Jump Shot', 'shot_zone_area_Back Court(BC)', 'shot_zone_area_Center(C)', 'shot_zone_area_Left Side Center(LC)', 'shot_zone_area_Left Side(L)', 'shot_zone_area_Right Side Center(RC)', 'shot_zone_area_Right Side(R)', 'shot_zone_basic_Above the Break 3', 'shot_zone_basic_Backcourt', 'shot_zone_basic_In The Paint (Non-RA)', 'shot_zone_basic_Left Corner 3', 'shot_zone_basic_Mid-Range', 'shot_zone_basic_Restricted Area', 'shot_zone_basic_Right Corner 3', 'opponent_ATL', 'opponent_BKN', 'opponent_BOS', 'opponent_CHA', 'opponent_CHI', 'opponent_CLE', 'opponent_DAL', 'opponent_DEN', 'opponent_DET', 'opponent_GSW', 'opponent_HOU', 'opponent_IND', 'opponent_LAC', 'opponent_MEM', 'opponent_MIA', 'opponent_MIL', 'opponent_MIN', 'opponent_NJN', 'opponent_NOH', 'opponent_NOP', 'opponent_NYK', 'opponent_OKC', 'opponent_ORL', 'opponent_PHI', 'opponent_PHX', 'opponent_POR', 'opponent_SAC', 'opponent_SAS', 'opponent_SEA', 'opponent_TOR', 'opponent_UTA', 'opponent_VAN', 'opponent_WAS', 'season_1996-97', 'season_1997-98', 'season_1998-99', 'season_1999-00', 'season_2000-01', 'season_2001-02', 'season_2002-03', 'season_2003-04', 'season_2004-05', 'season_2005-06', 'season_2006-07', 'season_2007-08', 'season_2008-09', 'season_2009-10', 'season_2010-11', 'season_2011-12', 'season_2012-13', 'season_2013-14', 'season_2014-15', 'season_2015-16', 'gamemonth_Apr', 'gamemonth_Dec', 'gamemonth_Feb', 'gamemonth_Jan', 'gamemonth_Jun', 'gamemonth_Mar', 'gamemonth_May', 'gamemonth_Nov', 'gamemonth_Oct', 'loc_x', 'loc_y', 'shot_distance', 'seconds_remaining', 'secondsToPeriodEnd', 'secondsFromGameStart'] ['playoffs', 'period', 'homecourt', 'btb', 'last_shot_flag', 'combined_shot_type_Bank Shot', 'combined_shot_type_Dunk', 'combined_shot_type_Hook Shot', 'combined_shot_type_Jump Shot', 'combined_shot_type_Layup', 'combined_shot_type_Tip Shot', 'shot_type_2PT Field Goal', 'shot_type_3PT Field Goal', 'action_type_Alley Oop Dunk Shot', 'action_type_Alley Oop Layup shot', 'action_type_Cutting Layup Shot', 'action_type_Driving Bank shot', 'action_type_Driving Dunk Shot', 'action_type_Driving Finger Roll Layup Shot', 'action_type_Driving Finger Roll Shot', 'action_type_Driving Floating Bank Jump Shot', 'action_type_Driving Floating Jump Shot', 'action_type_Driving Hook Shot', 'action_type_Driving Jump shot', 'action_type_Driving Layup Shot', 'action_type_Driving Reverse Layup Shot', 'action_type_Driving Slam Dunk Shot', 'action_type_Dunk Shot', 'action_type_Fadeaway Bank shot', 'action_type_Fadeaway Jump Shot', 'action_type_Finger Roll Layup Shot', 'action_type_Finger Roll Shot', 'action_type_Floating Jump shot', 'action_type_Follow Up Dunk Shot', 'action_type_Hook Bank Shot', 'action_type_Hook Shot', 'action_type_Jump Bank Shot', 'action_type_Jump Hook Shot', 'action_type_Jump Shot', 'action_type_Layup Shot', 'action_type_Pullup Bank shot', 'action_type_Pullup Jump shot', 'action_type_Putback Dunk Shot', 'action_type_Putback Layup Shot', 'action_type_Putback Slam Dunk Shot', 'action_type_Reverse Dunk Shot', 'action_type_Reverse Layup Shot', 'action_type_Reverse Slam Dunk Shot', 'action_type_Running Bank shot', 'action_type_Running Dunk Shot', 'action_type_Running Finger Roll Layup Shot', 'action_type_Running Finger Roll Shot', 'action_type_Running Hook Shot', 'action_type_Running Jump Shot', 'action_type_Running Layup Shot', 'action_type_Running Pull-Up Jump Shot', 'action_type_Running Reverse Layup Shot', 'action_type_Running Slam Dunk Shot', 'action_type_Running Tip Shot', 'action_type_Slam Dunk Shot', 'action_type_Step Back Jump shot', 'action_type_Tip Layup Shot', 'action_type_Tip Shot', 'action_type_Turnaround Bank shot', 'action_type_Turnaround Fadeaway shot', 'action_type_Turnaround Finger Roll Shot', 'action_type_Turnaround Hook Shot', 'action_type_Turnaround Jump Shot', 'shot_zone_area_Back Court(BC)', 'shot_zone_area_Center(C)', 'shot_zone_area_Left Side Center(LC)', 'shot_zone_area_Left Side(L)', 'shot_zone_area_Right Side Center(RC)', 'shot_zone_area_Right Side(R)', 'shot_zone_basic_Above the Break 3', 'shot_zone_basic_Backcourt', 'shot_zone_basic_In The Paint (Non-RA)', 'shot_zone_basic_Left Corner 3', 'shot_zone_basic_Mid-Range', 'shot_zone_basic_Restricted Area', 'shot_zone_basic_Right Corner 3', 'opponent_ATL', 'opponent_BKN', 'opponent_BOS', 'opponent_CHA', 'opponent_CHI', 'opponent_CLE', 'opponent_DAL', 'opponent_DEN', 'opponent_DET', 'opponent_GSW', 'opponent_HOU', 'opponent_IND', 'opponent_LAC', 'opponent_MEM', 'opponent_MIA', 'opponent_MIL', 'opponent_MIN', 'opponent_NJN', 'opponent_NOH', 'opponent_NOP', 'opponent_NYK', 'opponent_OKC', 'opponent_ORL', 'opponent_PHI', 'opponent_PHX', 'opponent_POR', 'opponent_SAC', 'opponent_SAS', 'opponent_SEA', 'opponent_TOR', 'opponent_UTA', 'opponent_VAN', 'opponent_WAS', 'season_1996-97', 'season_1997-98', 'season_1998-99', 'season_1999-00', 'season_2000-01', 'season_2001-02', 'season_2002-03', 'season_2003-04', 'season_2004-05', 'season_2005-06', 'season_2006-07', 'season_2007-08', 'season_2008-09', 'season_2009-10', 'season_2010-11', 'season_2011-12', 'season_2012-13', 'season_2013-14', 'season_2014-15', 'season_2015-16', 'gamemonth_Apr', 'gamemonth_Dec', 'gamemonth_Feb', 'gamemonth_Jan', 'gamemonth_Jun', 'gamemonth_Mar', 'gamemonth_May', 'gamemonth_Nov', 'gamemonth_Oct', 'loc_x', 'loc_y', 'shot_distance', 'shot_id', 'seconds_remaining', 'secondsToPeriodEnd', 'secondsFromGameStart']

In [30]:
lstY1103=pred.tolist()

NameError: name 'pred' is not defined

In [31]:
lstY1103_1=['shot_made_flag']+lstY1103
lstID=['shot_id']+test_id

NameError: name 'lstY1103' is not defined

In [32]:
lstY1103_1

NameError: name 'lstY1103_1' is not defined