In [1]:
import random
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from scipy.stats import norm
from models import model_prep_on_base, create_model_pipeline
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Seed every global RNG the analysis may draw from.
# scikit-learn components left at random_state=None fall back to NumPy's global
# RandomState, not to Python's `random` module, so seeding `random` alone does not
# pin them.
# NOTE(review): whether `models` leaves random_state unset (or runs work in separate
# processes) is not visible from this notebook — confirm there for full reproducibility.
random.seed(123)
np.random.seed(123)
# Project-local helpers used by the modelling cells below.
from models import model_prep_on_base, create_model_pipeline

## Logit

In [2]:
# Load data (scan_parquet builds a lazy query; the file is only read when collected)
on_base_path = "../data/throw_home_runner_on_third_wide_sprint_arm.parquet"
on_base_lf = pl.scan_parquet(on_base_path)

# Feature column names
responses = ["is_out"]

# Numeric predictors handed to the pipeline as `num_predictors_drop`.
# NOTE(review): the name suggests rows with missing values in these columns are
# dropped — confirm against `models`.
num_predictors_drop = [
    "hang_time",
    "distance_catch_to_home",
    "distance_traveled_fielder",
    "distance_traveled_all_fielders",
    "distance_to_home_diff",
    "distance_catch_to_first",
    "distance_catch_to_second",
    "distance_catch_to_third",
]

# Numeric predictors handed to the pipeline as `num_predictors_median`.
# NOTE(review): presumably median-imputed — confirm against `models`.
num_predictors_median = [
    "seconds_since_hit_085_mlb_person_id_R3",
    "arm_overall_fielder_mlb_person_id",
]

# Scoring method
# Class-wise helper scorers; they are not part of the `scoring` dict given to the
# grid search below. Specificity is the recall of the negative class, hence
# pos_label=0 (pos_label=1 would measure sensitivity instead).
specificity_scorer = make_scorer(recall_score, pos_label=0)
precision_pos_scorer = make_scorer(precision_score, pos_label=1, zero_division=0)
f1_pos_scorer = make_scorer(f1_score, pos_label=1, zero_division=0)

# Both metrics use scikit-learn's negated scorer names (higher is better);
# `refit` names the `scoring` key used to pick the final model.
scoring = {
    'log_loss': 'neg_log_loss',
    'brier_score': 'neg_brier_score'
}
refit = "brier_score"

# Parameter grid (1 * 6 * 5 * 1 * 4 = 120 elastic-net candidates)
param_grid = {
    'classifier__penalty': ['elasticnet'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'classifier__solver': ['saga'],
    'classifier__max_iter': [100, 250, 500, 1000]
}

# Train model
grid_search = create_model_pipeline(
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    model_type="LogisticRegression",
    oversampling_method="SMOTE",
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=5,
)

# Censored Train and Censored Test
results = model_prep_on_base(
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    test_size=0.30,
    is_out_censored=True,
    test_stay_to_out=True,
    test_stay_to_out_threshold=False,
)

# Expose the pieces of the result dict under their own names for later inspection.
pipeline = results['pipeline']
X_train = results['X_train']
X_test = results['X_test']
y_train = results['y_train']
y_test = results['y_test']
y_pred = results['y_pred']
y_pred_proba = results['y_pred_proba']
brier_score = results['brier_score']
log_loss = results['log_loss']
feature_names = results['feature_names']
response_names = results['response_names']

# Last expression: display the fitted pipeline as the cell output.
pipeline

Total features: 11
Total Predictors: 10
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits





Best parameters: 
classifier__C = 100
classifier__l1_ratio = 0.9
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.0465

              precision    recall  f1-score   support

       False       0.95      0.93      0.94      1444
        True       0.78      0.83      0.80       406

    accuracy                           0.91      1850
   macro avg       0.86      0.88      0.87      1850
weighted avg       0.91      0.91      0.91      1850

Brier Score: 0.0660
log loss: 0.2229

Predictors:
hang_time
distance_catch_to_home
distance_traveled_fielder
distance_traveled_all_fielders
distance_to_home_diff
distance_catch_to_first
distance_catch_to_second
distance_catch_to_third
seconds_since_hit_085_mlb_person_id_R3
arm_overall_fielder_mlb_person_id

Response:
is_out




In [3]:
# Random-forest hyper-parameter candidates (3 * 2 * 2 * 2 * 2 * 1 = 48 combinations)
param_grid = {
    "classifier__n_estimators": [100, 200, 500],
    "classifier__max_depth": [None, 20],
    "classifier__min_samples_split": [2, 10],
    "classifier__min_samples_leaf": [1, 4],
    "classifier__max_features": ["sqrt", None],
    "classifier__random_state": [42],
}

# Build the grid search around a RandomForestClassifier; the predictor lists,
# scoring dict and refit key are the ones defined in the logistic-regression cell.
grid_search = create_model_pipeline(
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    model_type="RandomForestClassifier",
    oversampling_method="SMOTE",
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=5,
)

# Censored train / censored test, 30% held out
results = model_prep_on_base(
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    test_size=0.30,
    is_out_censored=True,
    test_stay_to_out=True,
    test_stay_to_out_threshold=False,
)

# Pull every entry of the result dict into its own notebook-level name.
(
    pipeline,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_pred_proba,
    brier_score,
    log_loss,
    feature_names,
    response_names,
) = (
    results[key]
    for key in (
        'pipeline',
        'X_train',
        'X_test',
        'y_train',
        'y_test',
        'y_pred',
        'y_pred_proba',
        'brier_score',
        'log_loss',
        'feature_names',
        'response_names',
    )
)

# Last expression: display the fitted pipeline as the cell output.
pipeline

Total features: 11
Total Predictors: 10
Total Responses: 1
Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best parameters: 
classifier__max_depth = None
classifier__max_features = sqrt
classifier__min_samples_leaf = 1
classifier__min_samples_split = 2
classifier__n_estimators = 500
classifier__random_state = 42

Best cross-validation score: -0.0413

              precision    recall  f1-score   support

       False       0.93      0.95      0.94      1444
        True       0.81      0.74      0.77       406

    accuracy                           0.90      1850
   macro avg       0.87      0.85      0.86      1850
weighted avg       0.90      0.90      0.90      1850

Brier Score: 0.0706
log loss: 0.2634

Predictors:
hang_time
distance_catch_to_home
distance_traveled_fielder
distance_traveled_all_fielders
distance_to_home_diff
distance_catch_to_first
distance_catch_to_second
distance_catch_to_third
seconds_since_hit_085_mlb_person_id_R3
arm_overall_fielder_mlb_person_

In [4]:
# Gradient-boosting hyper-parameter candidates (2 ** 6 = 64 combinations)
param_grid = {
    "classifier__n_estimators": [100, 500],
    "classifier__max_depth": [3, 10],
    "classifier__learning_rate": [0.01, 0.2],
    "classifier__subsample": [0.8, 1.0],
    "classifier__max_features": ["sqrt", None],
    "classifier__min_samples_split": [2, 10],
    "classifier__random_state": [42],
}

# Build the grid search around a GradientBoostingClassifier; the predictor lists,
# scoring dict and refit key are the ones defined in the logistic-regression cell.
grid_search = create_model_pipeline(
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    model_type="GradientBoostingClassifier",
    oversampling_method="SMOTE",
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=5,
)

# Censored train / censored test, 30% held out
results = model_prep_on_base(
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    test_size=0.30,
    is_out_censored=True,
    test_stay_to_out=True,
    test_stay_to_out_threshold=False,
)

# Pull every entry of the result dict into its own notebook-level name.
(
    pipeline,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_pred_proba,
    brier_score,
    log_loss,
    feature_names,
    response_names,
) = (
    results[key]
    for key in (
        'pipeline',
        'X_train',
        'X_test',
        'y_train',
        'y_test',
        'y_pred',
        'y_pred_proba',
        'brier_score',
        'log_loss',
        'feature_names',
        'response_names',
    )
)

# Last expression: display the fitted pipeline as the cell output.
pipeline

Total features: 11
Total Predictors: 10
Total Responses: 1
Fitting 5 folds for each of 64 candidates, totalling 320 fits

Best parameters: 
classifier__learning_rate = 0.01
classifier__max_depth = 10
classifier__max_features = sqrt
classifier__min_samples_split = 10
classifier__n_estimators = 500
classifier__random_state = 42
classifier__subsample = 0.8

Best cross-validation score: -0.0433

              precision    recall  f1-score   support

       False       0.93      0.95      0.94      1444
        True       0.81      0.74      0.77       406

    accuracy                           0.90      1850
   macro avg       0.87      0.84      0.85      1850
weighted avg       0.90      0.90      0.90      1850

Brier Score: 0.0789
log loss: 0.2859

Predictors:
hang_time
distance_catch_to_home
distance_traveled_fielder
distance_traveled_all_fielders
distance_to_home_diff
distance_catch_to_first
distance_catch_to_second
distance_catch_to_third
seconds_since_hit_085_mlb_person_id_R3
arm_

In [5]:
# MLP hyper-parameter candidates (3 * 2 * 2 * 2 * 2 * 1 = 48 combinations)
# NOTE(review): the recorded run of this cell emitted repeated lbfgs
# "TOTAL NO. OF ITERATIONS REACHED LIMIT" warnings, i.e. some lbfgs candidates
# stopped at max_iter without converging. Check whether the pipeline scales the
# predictors, or raise max_iter for lbfgs, before trusting those candidates' scores.
param_grid = {
    "classifier__hidden_layer_sizes": [(50,), (100,), (100, 50)],
    "classifier__activation": ["relu", "tanh"],
    "classifier__solver": ["adam", "lbfgs"],
    "classifier__alpha": [0.001, 0.01],
    "classifier__max_iter": [500, 1000],
    "classifier__random_state": [42],
}

# Build the grid search around an MLPClassifier; the predictor lists,
# scoring dict and refit key are the ones defined in the logistic-regression cell.
grid_search = create_model_pipeline(
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    model_type="MLPClassifier",
    oversampling_method="SMOTE",
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=5,
)

# Censored train / censored test, 30% held out
results = model_prep_on_base(
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    test_size=0.30,
    is_out_censored=True,
    test_stay_to_out=True,
    test_stay_to_out_threshold=False,
)

# Pull every entry of the result dict into its own notebook-level name.
(
    pipeline,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_pred_proba,
    brier_score,
    log_loss,
    feature_names,
    response_names,
) = (
    results[key]
    for key in (
        'pipeline',
        'X_train',
        'X_test',
        'y_train',
        'y_test',
        'y_pred',
        'y_pred_proba',
        'brier_score',
        'log_loss',
        'feature_names',
        'response_names',
    )
)

# Last expression: display the fitted pipeline as the cell output.
pipeline

Total features: 11
Total Predictors: 10
Total Responses: 1
Fitting 5 folds for each of 48 candidates, totalling 240 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("


Best parameters: 
classifier__activation = relu
classifier__alpha = 0.01
classifier__hidden_layer_sizes = (50,)
classifier__max_iter = 500
classifier__random_state = 42
classifier__solver = adam

Best cross-validation score: -0.0449

              precision    recall  f1-score   support

       False       0.94      0.94      0.94      1444
        True       0.80      0.79      0.79       406

    accuracy                           0.91      1850
   macro avg       0.87      0.87      0.87      1850
weighted avg       0.91      0.91      0.91      1850

Brier Score: 0.0738
log loss: 0.2896

Predictors:
hang_time
distance_catch_to_home
distance_traveled_fielder
distance_traveled_all_fielders
distance_to_home_diff
distance_catch_to_first
distance_catch_to_second
distance_catch_to_third
seconds_since_hit_085_mlb_person_id_R3
arm_overall_fielder_mlb_person_id

Response:
is_out


In [6]:
# Write the ASCII bell character (BEL) to stdout.
# NOTE(review): presumably an audible cue that the long-running grid searches above
# have finished — confirm; whether it actually sounds depends on the front end.
print("\a")


