In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from comet_ml import Experiment

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna

from src.data.dataset import split_dataset
from src.features.features import advanced_features
from src.models.eval_plots import plot_roc_auc, plot_goal_rate, plot_cumulative_proportion, plot_calibration_curve

#plt.rcParams["figure.figsize"] = (16, 4)
np.random.seed(0)

# Data Load

In [2]:
season_plays_df = pd.read_csv("./data/processed/plays_2015-2020.csv", index_col=False)
train_df, test_df = split_dataset(season_plays_df)
y_train = np.where(train_df.event_type_id=="GOAL", 1, 0)
y_test = np.where(test_df.event_type_id=="GOAL", 1, 0)

# Preprocess

In [3]:
pre_train_df = advanced_features(train_df)
pre_train_df = pre_train_df.drop(columns=["empty_net"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
x_train = pre_train_df[["angle_from_net", "dist_from_net"]]

# XGBoost

In [None]:
def evaluation_plots(model, X, y):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]
    
    plot_roc_auc(y, y_pred)
    plot_goal_rate(y, y_proba)
    plot_cumulative_proportion(y, y_proba)
    plot_calibration_curve(y, y_proba)

## Base model

In [None]:
base_params={
    "n_estimators": 100,
    "max_depth": 4,
    "learning_rate": 0.1,
}

def base_xgb(x_train, x_test, y_train, y_test, params={}):      
    model = XGBClassifier(objective="binary:logistic", use_label_encoder=False, **params)
    
    model.fit(x_train, y_train,
              eval_set=[(x_test, y_test)],
              eval_metric=["logloss", "error", "auc"]
    )
    return model

def run_base_xgb(X, y, params, save_run=False):
    x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
    if save_run:
        experiment = Experiment(project_name="hockey-all-star-analytics")
    model = base_xgb(x_train, x_val, y_train, y_val)
    evaluation_plots(model, x_val, y_val)
    
    if save_run:
        experiment.end()
    return model

In [None]:
base_model = run_xgb(x_train, y_train, base_params)

In [None]:
y_train

## Tuned XGBoost

In [6]:
def objective(trial, x_df, y_df):
    hyperparams = {
        # structure
        "max_depth": trial.suggest_int("max_depth", 3, 12, step=1),
        # accuracy
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_categorical("n_estimators", [50]),
        # overfitting
        "reg_alpha": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "reg_lambda": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_child_weight": trial.suggest_float("min_gain_to_split", 0, 15),
    }
    
    kfold_cv = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = []
    
    for idx, (train_idx, test_idx) in enumerate(kfold_cv.split(x_df, y_df)):
        x_train, x_test = x_df.iloc[train_idx], x_df.iloc[test_idx]
        y_train, y_test = y_df.iloc[train_idx], y_df.iloc[test_idx]
        
        clf = XGBClassifier(**hyperparams)
        clf.fit(
            x_train,
            y_train,
            eval_set=[(x_test, y_test)],
            eval_metric=["logloss", "error", "auc"],
            verbose=False,
        )
        best_score = clf.evals_result()["validation_0"]["logloss"]
        cv_scores.append(best_score)
    
    return np.mean(cv_scores)


def run_tuned_xgb(x_df, y_df, save_run=False):
    if save_run:
        experiment = Experiment(project_name="hockey-all-star-analytics")
        
        
    study = optuna.create_study(direction="minimize", study_name="tuned_xgboost")
    optimize = lambda trial: objective(trial, x_df, y_df)
    study.optimize(optimize, n_trials=20)
    
    
    x_train, x_val, y_train, y_val = train_test_split(x_df.values, y_df.values, test_size=0.2, stratify=y_df.values)
    best_model = XGBClassifier(**study.best_params)
    best_model.fit(x_train, y_train,
                   eval_set=[(x_val, y_val)],
                   eval_metric=["logloss", "error", "auc"])
    

    y_pred = best_model.predict(x_val)
    y_proba = best_model.predict_proba(x_val)[:, 1]
    
    plot_roc_auc(y_val, y_pred)
    if save_run:
        experiment.log_figure()
        
    plot_goal_rate(y_val, y_proba)
    if save_run:
        experiment.log_figure()
        
    plot_cumulative_proportion(y_val, y_proba)
    if save_run:
        experiment.log_figure()
        
    plot_calibration_curve(y_val, y_proba)
    if save_run:
        experiment.log_figure()
    
    plt.plot(best_model.feature_importances_)
    if save_run:
        experiment.log_figure()
    
    if save_run:
        experiment.log_model("xgboost_best", "./models/run3")
        experiment.end()
    return best_model

In [7]:
best_model = run_tuned_xgb(pre_train_df, pd.Series(y_train), save_run=True)

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/zilto/hockey-all-star-analytics/e43106b786f649ca9f73829c64558ec1
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     validation_0_auc [5050]     : (0.663647, 0.772287)
COMET INFO:     validation_0_error [5050]   : (0.092075, 0.09405)
COMET INFO:     validation_0_logloss [5050] : (0.267527, 0.667749)
COMET INFO:   Parameters:
COMET INFO:     begin_iteration  : 0
COMET INFO:     booster          : gbtree
COMET INFO:     end_iteration    : 50
COMET INFO:     eval_metric      : ['logloss', 'error', 'auc']
COMET INFO:     feature_names    : ['seconds_elapsed', 'period_idx', 'x_coord', 'y_coord', 'x_coord_norm', 'y_coord_norm', 'dist_from_net', 'angle_from_net', 'Backhand', 'Deflected', 'Slap Shot', 'Snap Shot', 'Tip-In', 'Wrap-around', 'Wrist Sho

Parameters: { "lambda_l1", "lambda_l2", "min_gain_to_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0	validation_0-logloss:0.51882	validation_0-error:0.09363	validation_0-auc:0.74853




1	validation_0-logloss:0.42438	validation_0-error:0.09313	validation_0-auc:0.75554
2	validation_0-logloss:0.36788	validation_0-error:0.09317	validation_0-auc:0.75878
3	validation_0-logloss:0.33263	validation_0-error:0.09325	validation_0-auc:0.76040
4	validation_0-logloss:0.31012	validation_0-error:0.09322	validation_0-auc:0.76204
5	validation_0-logloss:0.29575	validation_0-error:0.09323	validation_0-auc:0.76317
6	validation_0-logloss:0.28629	validation_0-error:0.09313	validation_0-auc:0.76490
7	validation_0-logloss:0.28044	validation_0-error:0.09320	validation_0-auc:0.76518
8	validation_0-logloss:0.27653	validation_0-error:0.09325	validation_0-auc:0.76628
9	validation_0-logloss:0.27400	validation_0-error:0.09318	validation_0-auc:0.76699
10	validation_0-logloss:0.27241	validation_0-error:0.09307	validation_0-auc:0.76718
11	validation_0-logloss:0.27133	validation_0-error:0.09307	validation_0-auc:0.76734
12	validation_0-logloss:0.27041	validation_0-error:0.09310	validation_0-auc:0.76843
1

NameError: name 'best_mmodel' is not defined