# Imports

In [37]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

# Seed NumPy's legacy global RNG so any np.random draws in this notebook repeat across runs.
np.random.seed(seed=23)

In [5]:
# Load the cleaned dataset from the Data/ directory into the working frame.
df = pd.read_csv(filepath_or_buffer='Data/data_cleaned.csv')

In [6]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,action_type,combined_shot_type,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent
0,Jump Shot,Jump Shot,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR
1,Jump Shot,Jump Shot,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
2,Jump Shot,Jump Shot,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR
3,Driving Dunk Shot,Dunk,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR
4,Jump Shot,Jump Shot,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR


## Dummy-Encoding Features & Concatenating the DataFrame

In [14]:
# List every column name available before choosing features.
df.columns.tolist()

['action_type',
 'combined_shot_type',
 'minutes_remaining',
 'period',
 'playoffs',
 'season',
 'seconds_remaining',
 'shot_distance',
 'shot_made_flag',
 'shot_type',
 'shot_zone_area',
 'shot_zone_basic',
 'shot_zone_range',
 'game_date',
 'matchup',
 'opponent']

In [17]:
# Calendar year of each shot: the first four characters of the
# YYYY-MM-DD `game_date` string. The result stays a string (object dtype),
# not an integer. The vectorized .str accessor avoids a per-row Python loop
# and leaves missing dates as NaN instead of raising a TypeError.
df['year'] = df['game_date'].str[:4]

In [21]:
# Inspect the dtype of every column, including the newly added `year`.
df.dtypes

action_type            object
combined_shot_type     object
minutes_remaining       int64
period                  int64
playoffs                int64
season                 object
seconds_remaining       int64
shot_distance           int64
shot_made_flag        float64
shot_type              object
shot_zone_area         object
shot_zone_basic        object
shot_zone_range        object
game_date              object
matchup                object
opponent               object
year                   object
dtype: object

In [29]:
# Predictor columns handed to get_dummies. Only the object-dtype columns
# (combined_shot_type, shot_type, year, opponent) are one-hot encoded;
# minutes_remaining and period are int64 and pass through unchanged.
features = ['combined_shot_type', 'minutes_remaining',
            'period', 'shot_type', 'year', 'opponent']

# Snake-case the category labels so the generated dummy column names contain
# no spaces. The vectorized .str accessor replaces the per-element Python loop
# and leaves missing values as NaN instead of raising; regex=False makes the
# replacement a literal substring swap.
for feature in features:
    if df[feature].dtype == object:
        df[feature] = df[feature].str.replace(' ', '_', regex=False)

# drop_first=True omits the first level of each encoded column so the dummies
# for one feature are not perfectly collinear.
df_dum = pd.get_dummies(df[features], drop_first=True)

# Re-attach the remaining numeric predictors and the target, which were not
# part of `features`, column-wise on the shared row index.
df_final = pd.concat(
    [df_dum, df[['playoffs', 'shot_distance', 'shot_made_flag']]],
    axis=1,
)

In [31]:
# Preview the first five rows of the encoded frame.
df_final.head(n=5)

Unnamed: 0,minutes_remaining,period,combined_shot_type_Dunk,combined_shot_type_Hook_Shot,combined_shot_type_Jump_Shot,combined_shot_type_Layup,combined_shot_type_Tip_Shot,shot_type_3PT_Field_Goal,year_1997,year_1998,...,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS,playoffs,shot_distance,shot_made_flag
0,10,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15,0.0
1,7,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,16,1.0
2,6,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,22,0.0
3,6,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,9,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,14,0.0


## Train Test Split

In [32]:
# Separate the target (shot_made_flag) from the predictor matrix.
y = df_final['shot_made_flag']
X = df_final.drop(columns='shot_made_flag')

In [34]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

# Standard Scaling

In [41]:
# Learn the per-column mean and standard deviation from the training rows
# only, so the held-out rows do not influence the scaling parameters.
ss = StandardScaler().fit(X_train)

# Apply the training-set scaling to both splits. The scaler returns a bare
# array, so wrap each result back into a DataFrame carrying the original
# row index and column labels.
X_train_ss = pd.DataFrame(
    ss.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_ss = pd.DataFrame(
    ss.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

# SelectKBest

In [42]:
# Build a selector that keeps the 20 highest-scoring features and fit it on
# the scaled training data only. fit() returns the estimator itself, so the
# construction and fit can be chained.
selector = SelectKBest(k=20).fit(X_train_ss, y_train)
selector

In [43]:
# Positional indices of the features the selector kept: take the boolean
# support mask and convert it to the integer positions of its True entries.
cols = np.flatnonzero(selector.get_support())

In [44]:
# Keep only the selected column positions in both splits, using the same
# `cols` for train and test so their columns stay aligned.
X_train_feat_select = X_train_ss.take(cols, axis=1)
X_test_feat_select = X_test_ss.take(cols, axis=1)

In [47]:
# Names of the features that survived selection.
list(X_train_feat_select.columns)

['minutes_remaining',
 'period',
 'combined_shot_type_Dunk',
 'combined_shot_type_Jump_Shot',
 'combined_shot_type_Layup',
 'shot_type_3PT_Field_Goal',
 'year_2000',
 'year_2001',
 'year_2004',
 'year_2008',
 'year_2009',
 'year_2014',
 'year_2015',
 'year_2016',
 'opponent_BOS',
 'opponent_IND',
 'opponent_MIL',
 'opponent_NYK',
 'opponent_PHX',
 'shot_distance']

# Exports

In [54]:
# Write the processed targets and selected features to disk. The row index is
# dropped from every file; X and y for a given split remain in the same row
# order, so they line up positionally when read back.
for frame, path in [
    (y_train, 'Data/y_train_processed.csv'),
    (y_test, 'Data/y_test_processed.csv'),
    (X_train_feat_select, 'Data/x_train_processed.csv'),
    (X_test_feat_select, 'Data/x_test_processed.csv'),
]:
    frame.to_csv(path, index=False)