In [1]:
# Import libraries
from sklearn.model_selection import train_test_split, StratifiedKFold
import sys
sys.path.append("../src/features")
from FeatureSelectionOptuna import FeatureSelectionOptuna
import pandas as pd
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler


# Allow up to 500 columns to be displayed so the wide shots frame is not truncated.
pd.set_option('display.max_columns', 500)

# Full processed shots table; the first CSV column is used as the index.
df = pd.read_csv("../data/processed/all_shots-v5.csv", index_col=0)

# Candidate columns for feature selection, plus 'target' and 'PLAYER1_NAME'.
# .copy() makes df_fs an independent frame: it is modified in place right after
# this step (update_shot_type writes through .loc), and writing into a bare
# column slice of df is what triggers pandas' SettingWithCopyWarning.
df_fs = df[['Shot Distance', 'X Location', 'Y Location', 'Season Type',
        'Shot Zone Basic_Above the Break 3', 'Shot Zone Basic_Backcourt', 'Shot Zone Basic_In The Paint (Non-RA)',
       'Shot Zone Basic_Left Corner 3', 'Shot Zone Basic_Mid-Range','Shot Zone Basic_Restricted Area', 'Shot Zone Basic_Right Corner 3',
       'Shot Zone Area_Back Court(BC)', 'Shot Zone Area_Center(C)', 'Shot Zone Area_Left Side Center(LC)', 'Shot Zone Area_Left Side(L)',
       'Shot Zone Area_Right Side Center(RC)', 'Shot Zone Area_Right Side(R)', 'Shot Zone Range_16-24 ft.', 'Shot Zone Range_24+ ft.',
       'Shot Zone Range_8-16 ft.', 'Shot Zone Range_Back Court Shot', 'Shot Zone Range_Less Than 8 ft.', 'target',
       'PERIOD', 'PLAYER1_NAME', 'at_home', 'PREVIOUS_OFF_REBOUND', 'PREVIOUS_DEF_REBOUND', 'dunk_shot',
       'PREVIOUS_OFF_TURNOVER', 'PREVIOUS_OFF_MISSED', 'DETAILLED_SHOT_TYPE',  'seconds_left', 'Age', 'TS%', 'PTM', 'ASTM', 'ORBM', 'STLM', 'BLKM', 'TOVM', 'USG%',
       'FG%', '2P%', '3P%', 'FT%', 'PTS', 'height', 'weight', 'C', 'PF', 'PG', 'PG-SG', 'SF', 'SF-SG', 'SG', 'SG-PG',
       'E_DEF_RATING', 'E_OFF_RATING', 'PCT_PREV_ACTION', 'PCT_AREA', 'YEARS_EXP']].copy()

def update_shot_type(data):
    """Collapse every shot type except 'JUMP SHOT' and 'FREE THROW' into 'OTHER'.

    The 'DETAILLED_SHOT_TYPE' column of ``data`` is overwritten in place, and
    the very same DataFrame object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'DETAILLED_SHOT_TYPE' column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, after modification.
    """
    kept_types = ['JUMP SHOT', "FREE THROW"]
    # Rows whose shot type is not one of the two kept labels (missing values included).
    is_other = ~data['DETAILLED_SHOT_TYPE'].isin(kept_types)
    data.loc[is_other, 'DETAILLED_SHOT_TYPE'] = 'OTHER'
    return data

# Reduce DETAILLED_SHOT_TYPE to JUMP SHOT / FREE THROW / OTHER on the feature-selection frame.
df_fs = df_fs.pipe(update_shot_type)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single seed shared by every randomized step in this notebook.
SEED = 32

# Feature matrix: everything except the label and the player name,
# with non-numeric columns one-hot encoded.
X = pd.get_dummies(df_fs.drop(['target', 'PLAYER1_NAME'], axis=1))
y = df_fs['target']

# Hold out 20% of the rows, stratified on the target.
# random_state=SEED pins the shuffle; without it the train set (and therefore
# the CV folds and the selected features) changed on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, stratify=y, random_state=SEED
)

# Stratified kfold over the train set for cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# Materialize the (train_idx, valid_idx) pairs once so the same folds can be reused.
splits = list(skf.split(X_train, y_train))

In [5]:
# Candidate features: every column of the training matrix.
features = list(X_train.columns)

# Fixed XGBoost configuration used for every trial of the study.
xgb_params = dict(
    n_estimators=200,
    booster="gbtree",
    objective="binary:logistic",
    colsample_bytree=0.85,
    max_depth=6,
    learning_rate=0.1,
    gamma=0.9,
    reg_lambda=0.9,
    grow_policy="lossguide",
)
model = XGBClassifier(**xgb_params)

# Seeded TPE sampler; the study maximizes the objective value.
sampler = TPESampler(seed=SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)

# We first try the model using all features
default_features = dict.fromkeys(features, True)
study.enqueue_trial(default_features)

# Project-local objective (FeatureSelectionOptuna); presumably it scores a
# feature subset on the precomputed splits, with `penalty` applied to the
# score - confirm against ../src/features.
objective = FeatureSelectionOptuna(
    model=model,
    features=features,
    X=X_train,
    y=y_train,
    splits=splits,
    penalty=1e-5,
)
study.optimize(objective, n_trials=30)

[I 2024-08-23 15:43:44,296] A new study created in memory with name: no-name-efde170e-c5ac-4d5f-878a-897e51b49f15
[I 2024-08-23 15:44:13,199] Trial 0 finished with value: 0.6839942905905109 and parameters: {'Shot Distance': True, 'X Location': True, 'Y Location': True, 'Season Type': True, 'Shot Zone Basic_Above the Break 3': True, 'Shot Zone Basic_Backcourt': True, 'Shot Zone Basic_In The Paint (Non-RA)': True, 'Shot Zone Basic_Left Corner 3': True, 'Shot Zone Basic_Mid-Range': True, 'Shot Zone Basic_Restricted Area': True, 'Shot Zone Basic_Right Corner 3': True, 'Shot Zone Area_Back Court(BC)': True, 'Shot Zone Area_Center(C)': True, 'Shot Zone Area_Left Side Center(LC)': True, 'Shot Zone Area_Left Side(L)': True, 'Shot Zone Area_Right Side Center(RC)': True, 'Shot Zone Area_Right Side(R)': True, 'Shot Zone Range_16-24 ft.': True, 'Shot Zone Range_24+ ft.': True, 'Shot Zone Range_8-16 ft.': True, 'Shot Zone Range_Back Court Shot': True, 'Shot Zone Range_Less Than 8 ft.': True, 'PERIO

In [6]:
# Names of the features switched on in the best trial of the study
selected_features = [name for name, keep in study.best_params.items() if keep]
selected_features

['Shot Distance',
 'X Location',
 'Y Location',
 'Season Type',
 'Shot Zone Basic_Backcourt',
 'Shot Zone Basic_In The Paint (Non-RA)',
 'Shot Zone Basic_Left Corner 3',
 'Shot Zone Basic_Mid-Range',
 'Shot Zone Basic_Restricted Area',
 'Shot Zone Basic_Right Corner 3',
 'Shot Zone Area_Back Court(BC)',
 'Shot Zone Area_Left Side(L)',
 'Shot Zone Area_Right Side Center(RC)',
 'Shot Zone Area_Right Side(R)',
 'Shot Zone Range_8-16 ft.',
 'Shot Zone Range_Back Court Shot',
 'Shot Zone Range_Less Than 8 ft.',
 'at_home',
 'PREVIOUS_OFF_REBOUND',
 'PREVIOUS_DEF_REBOUND',
 'dunk_shot',
 'PREVIOUS_OFF_MISSED',
 'Age',
 'TS%',
 'PTM',
 'ASTM',
 'ORBM',
 'STLM',
 'BLKM',
 'USG%',
 'FG%',
 '2P%',
 'FT%',
 'PTS',
 'height',
 'weight',
 'C',
 'PG',
 'PG-SG',
 'SF',
 'SF-SG',
 'SG',
 'SG-PG',
 'E_DEF_RATING',
 'E_OFF_RATING',
 'PCT_PREV_ACTION',
 'PCT_AREA',
 'YEARS_EXP',
 'DETAILLED_SHOT_TYPE_FREE THROW',
 'DETAILLED_SHOT_TYPE_OTHER']

In [7]:
# save the final dataset for further modeling - adding back the player name and target fields
#
# The selected names are columns of X (the one-hot encoded feature matrix), so
# they are read from X directly instead of re-attaching dummy columns to df.
# Nothing upstream is modified: df, df_fs and selected_features keep their
# values, so this cell can be re-run without duplicating columns or failing
# on a missing DETAILLED_SHOT_TYPE column.
shots_selected = pd.concat(
    [X[selected_features], df_fs[['PLAYER1_NAME', 'target']]],
    axis=1,
)
shots_selected.to_csv("../data/processed/all_shots_only_selected_features.csv")