In [5]:
import numpy as np
import xgboost as xgb
import optuna
from optuna.visualization import plot_optimization_history
import pandas as pd 
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../data/processed/all_shots_final-v2.csv", index_col=0)
df.drop(['PLAYER1_NAME'], axis=1, inplace=True)
df.dropna(inplace=True)
print(df.columns)
print(df.shape)

Index(['Shot Distance', 'Season Type', 'Shot Zone Basic_In The Paint (Non-RA)',
       'Shot Zone Basic_Right Corner 3', 'Shot Zone Area_Right Side(R)',
       'Shot Zone Range_8-16 ft.', 'at_home', 'PREVIOUS_OFF_MISSED', 'Age',
       'ASTM', 'ORBM', 'FT%', 'height', 'weight', 'C', 'SG-PG', 'E_DEF_RATING',
       'PCT_AREA', 'DETAILLED_SHOT_TYPE_JUMP SHOT', 'target'],
      dtype='object')
(428835, 20)


In [6]:
# Delete outliers
Q1 = df['Shot Distance'].quantile(0.25)
Q3 = df['Shot Distance'].quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df = df[(df['Shot Distance'] >= lower_bound) & (df['Shot Distance'] <= upper_bound)]

print(df.shape)

(427956, 20)


## Séparation des données

In [7]:
# prepare data for modeling   
X = df.drop('target', axis = 1)
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, shuffle=True)

## Standardisation et réduction de dimension

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

df_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_scaled['target'] = y_train

# réduction de dimensions
sel_var = VarianceThreshold(0.0001)
X_train_scaled = sel_var.fit_transform(X_train_scaled, y_train)
X_test_scaled = sel_var.transform(X_test_scaled)

## OverSampling

In [11]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Check the distribution of the target variable after resampling
print("Before resampling:")
print(y_train.value_counts())
print("\nAfter resampling:")
print(pd.Series(y_train_resampled).value_counts())

Before resampling:
target
1    193652
0    148712
Name: count, dtype: int64

After resampling:
target
0    193652
1    193652
Name: count, dtype: int64


## XGBoost

In [12]:
train = xgb.DMatrix(data=X_train_resampled, label=y_train_resampled)
test = xgb.DMatrix(data=X_test_scaled, label=y_test)


def objective(trial):
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, train)
    preds = bst.predict(test)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy

# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-10-14 10:20:00,749] A new study created in memory with name: no-name-63e09786-1323-40a4-ad7c-847e90436ab7
[I 2024-10-14 10:20:01,649] Trial 0 finished with value: 0.6633563884475184 and parameters: {'booster': 'dart', 'lambda': 5.689654697855204e-05, 'alpha': 0.0001168925830775831, 'subsample': 0.42233188323332604, 'colsample_bytree': 0.9796199744029025, 'max_depth': 3, 'min_child_weight': 7, 'eta': 0.0029354220427314946, 'gamma': 0.1556252869609715, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.0006478468520881318, 'skip_drop': 0.00048216268037225236}. Best is trial 0 with value: 0.6633563884475184.
[I 2024-10-14 10:20:01,895] Trial 1 finished with value: 0.6666861388914852 and parameters: {'booster': 'gblinear', 'lambda': 4.15227464207405e-08, 'alpha': 1.5862732264009e-07, 'subsample': 0.7968380142861375, 'colsample_bytree': 0.8076524386714253}. Best is trial 1 with value: 0.6666861388914852.
[I 2024-10-14 10:20:02,066] Trial 

Number of finished trials:  100
Best trial:
  Value: 0.6767688569025142
  Params: 
    booster: dart
    lambda: 0.0018109587959834871
    alpha: 0.03840781339264432
    subsample: 0.6467111165571134
    colsample_bytree: 0.9866470353145465
    max_depth: 7
    min_child_weight: 8
    eta: 0.4080194023189088
    gamma: 6.331558669340217e-07
    grow_policy: depthwise
    sample_type: uniform
    normalize_type: tree
    rate_drop: 8.034540233857643e-06
    skip_drop: 2.568561037020507e-05


In [13]:
optuna.visualization.plot_optimization_history(study)

In [14]:
optuna.visualization.plot_param_importances(study)

In [15]:
optuna.visualization.plot_slice(study, params = ['subsample',
'lambda',
'colsample_bytree',
'alpha',
'booster'])

In [16]:
params = {
    "booster": "dart",
    "lambda": 1.0097915738299128e-06,
    "alpha": 2.3445234888982956e-08,
    "subsample": 0.7152405139734603,
    "colsample_bytree": 0.8437252255677583,
    "max_depth": 9,
    "min_child_weight": 9,
    "eta": 0.2357930936747691,
    "gamma": 0.13557767476363425,
    "grow_policy": "lossguide",
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 1.2065021262266629e-08,
    "skip_drop": 3.1483383573651435e-05
}
model_xgb = xgb.train(params, train)

In [17]:
# save model
import bentoml
bentoml.xgboost.save_model("nba_shot_prediction", model_xgb)

Model(tag="nba_shot_prediction:3qfarcmkaw5sxdax", path="C:\Users\Sarah\bentoml\models\nba_shot_prediction\3qfarcmkaw5sxdax\")

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from optuna.samplers import TPESampler


def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    criterion = trial.suggest_categorical('criterion', ["gini", "entropy", "log_loss"])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        criterion=criterion
    )

    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# callback to stop the trial early 
pruning_callback = optuna.pruners.MedianPruner()

# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize', pruner=[pruning_callback], sampler=TPESampler())
study.optimize(objective_rf, n_trials=20)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-10-14 10:25:30,027] A new study created in memory with name: no-name-b39e2d5f-3880-441e-9f1d-4ff3c73a57c6
[I 2024-10-14 10:28:31,667] Trial 0 finished with value: 0.6772946069726142 and parameters: {'n_estimators': 476, 'max_depth': 20, 'min_samples_split': 24, 'min_samples_leaf': 18, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.6772946069726142.
[I 2024-10-14 10:29:42,696] Trial 1 finished with value: 0.6759393401252454 and parameters: {'n_estimators': 225, 'max_depth': 12, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 0 with value: 0.6772946069726142.
[I 2024-10-14 10:31:27,064] Trial 2 finished with value: 0.6771544069539209 and parameters: {'n_estimators': 291, 'max_depth': 16, 'min_samples_split': 23, 'min_samples_leaf': 29, 'max_features': 'sqrt', 'criterion': 'gini'}. Best is trial 0 with value: 0.6772946069726142.
[I 2024-10-14 10:32:27,896] Trial 3 finished with value: 0.

Number of finished trials:  20
Best trial:
  Value: 0.6778086737078232
  Params: 
    n_estimators: 353
    max_depth: 19
    min_samples_split: 22
    min_samples_leaf: 25
    max_features: sqrt
    criterion: entropy


In [19]:
plot_optimization_history(study)

In [20]:
optuna.visualization.plot_param_importances(study)

In [21]:
optuna.visualization.plot_slice(study, params = ['n_estimators',
'max_depth',
'min_samples_split',
'min_samples_leaf',
'max_features',
'criterion',])

In [23]:
model_rf = RandomForestClassifier(
        n_estimators=311,
        max_depth=14,
        min_samples_split=23,
        min_samples_leaf=4,
        max_features="log2",
        criterion="gini"
    )
model_rf.fit(X_train_resampled, y_train_resampled)


In [27]:
model_rf.score(X_test_scaled, y_test)

0.6825935383072743

In [33]:
# save model
import pickle
with open("../models/trained_rf.save", "wb") as f:
    pickle.dump(model_rf, f)

In [None]:
# save model
import bentoml
bentoml.sklearn.save_model("nba_shot_prediction", model_rf)

# Analyse des erreurs