In [8]:
import os
import polars as pl
import pandas as pd
from models import model_prep_on_base
from models import create_model_pipeline
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score

In [9]:

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# scan_parquet builds a polars LazyFrame over the file rather than loading it
# eagerly; the frame is handed to model_prep_on_base in the cells below.
on_base_path = "../data/throw_home_runner_on_third_wide_sprint_arm.parquet"
on_base_lf = pl.scan_parquet(on_base_path)

# ---------------------------------------------------------------------------
# Feature column names
# ---------------------------------------------------------------------------
responses = ["is_out"]

# Numeric predictors are split into two groups that are passed separately to
# create_model_pipeline / model_prep_on_base.
# NOTE(review): presumably "drop" = drop rows with missing values and
# "median" = median-impute; confirm against models.py.
num_predictors_drop = [
    "distance_catch_to_home"
]

num_predictors_median = [
]

# ---------------------------------------------------------------------------
# Parameter grid
# ---------------------------------------------------------------------------
# 6 (C) x 5 (l1_ratio) x 4 (max_iter) = 120 candidates.
# The elastic-net penalty is only supported by the 'saga' solver.
# NOTE(review): max_iter is an iteration budget rather than a regularisation
# hyperparameter; searching over it multiplies the grid by four.
param_grid = {
    'classifier__penalty': ['elasticnet'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'classifier__solver': ['saga'],
    'classifier__max_iter': [100, 250, 500, 1000]
}

# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------
# Class-specific scorers. They are defined for convenience and are NOT part of
# the `scoring` dict used by the grid search below.
#
# Specificity is the recall of the NEGATIVE class, so pos_label must be 0.
# (With pos_label=1 this scorer actually measured sensitivity.)
specificity_scorer = make_scorer(recall_score, pos_label=0, zero_division=0)
# Recall of the positive class, i.e. what the scorer above used to compute.
sensitivity_scorer = make_scorer(recall_score, pos_label=1, zero_division=0)
precision_pos_scorer = make_scorer(precision_score, pos_label=1, zero_division=0)
f1_pos_scorer = make_scorer(f1_score, pos_label=1, zero_division=0)

# Both metrics are scikit-learn's negated losses, so higher (closer to 0) is
# better; the candidate with the best `refit` metric is refitted at the end.
scoring = {
    'log_loss': 'neg_log_loss',
    'brier_score': 'neg_brier_score'
}
refit = "brier_score"

# ---------------------------------------------------------------------------
# Build the grid-search pipeline
# ---------------------------------------------------------------------------
grid_search = create_model_pipeline(
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    model_type="LogisticRegression",
    oversampling_method="SMOTE",
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=5,
)

# Display the configured (not yet fitted) search object.
grid_search

In [10]:
# Scenario: unmodified training labels, unmodified test labels.
results = model_prep_on_base(
    # Data and estimator
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    # Column selection
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    # Split size (presumably the test fraction -- confirm in models.py)
    test_size=0.30,
    # Label-modification switches: all disabled for this baseline run
    is_out_censored=False,
    test_stay_to_out=False,
    test_stay_to_out_threshold=False,
)

# Sum of the test-split labels (the number of True values for a boolean response).
results["y_test"].sum()

Total features: 2
Total Predictors: 1
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits

Best parameters: 
classifier__C = 100
classifier__l1_ratio = 0.9
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.1208

              precision    recall  f1-score   support

       False       1.00      0.83      0.91      1450
        True       0.13      0.86      0.22        42

    accuracy                           0.83      1492
   macro avg       0.56      0.84      0.57      1492
weighted avg       0.97      0.83      0.89      1492

Brier Score: 0.1195

Predictors:
distance_catch_to_home

Response:
is_out


np.int64(42)

In [11]:
# Scenario: censored training labels, unmodified test labels.
results = model_prep_on_base(
    # Data and estimator
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    # Column selection
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    # Split size (presumably the test fraction -- confirm in models.py)
    test_size=0.30,
    # Only the training-side censoring switch is enabled
    is_out_censored=True,
    test_stay_to_out=False,
    test_stay_to_out_threshold=False,
)

# Sum of the test-split labels (the number of True values for a boolean response).
results["y_test"].sum()

Total features: 2
Total Predictors: 1
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits

Best parameters: 
classifier__C = 10
classifier__l1_ratio = 0.5
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.0535

              precision    recall  f1-score   support

       False       0.99      0.92      0.95      1450
        True       0.17      0.60      0.27        42

    accuracy                           0.91      1492
   macro avg       0.58      0.76      0.61      1492
weighted avg       0.96      0.91      0.93      1492

Brier Score: 0.0658

Predictors:
distance_catch_to_home

Response:
is_out


np.int64(42)

In [12]:
# Scenario: censored training labels, censored test labels.
results = model_prep_on_base(
    # Data and estimator
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    # Column selection
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    # Split size (presumably the test fraction -- confirm in models.py)
    test_size=0.30,
    # Censoring enabled for training, threshold-based switch enabled for the test set
    is_out_censored=True,
    test_stay_to_out=False,
    test_stay_to_out_threshold=True,
)

# Sum of the test-split labels (the number of True values for a boolean response).
results["y_test"].sum()

Total features: 2
Total Predictors: 1
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits

Best parameters: 
classifier__C = 10
classifier__l1_ratio = 0.5
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.0535

              precision    recall  f1-score   support

       False       0.99      0.92      0.95      1450
        True       0.72      0.95      0.82       323

    accuracy                           0.92      1773
   macro avg       0.85      0.93      0.88      1773
weighted avg       0.94      0.92      0.93      1773

Brier Score: 0.0568

Predictors:
distance_catch_to_home

Response:
is_out


np.int64(323)

In [13]:
# Scenario: censored training labels, test labels with "stay" converted to "out".
results = model_prep_on_base(
    # Data and estimator
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    # Column selection
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    # Split size (presumably the test fraction -- confirm in models.py)
    test_size=0.30,
    # Censoring enabled for training, stay-to-out conversion enabled for the test set
    is_out_censored=True,
    test_stay_to_out=True,
    test_stay_to_out_threshold=False,
)

# Sum of the test-split labels (the number of True values for a boolean response).
results["y_test"].sum()

Total features: 2
Total Predictors: 1
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits

Best parameters: 
classifier__C = 10
classifier__l1_ratio = 0.5
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.0535

              precision    recall  f1-score   support

       False       0.94      0.92      0.93      1450
        True       0.73      0.81      0.77       408

    accuracy                           0.89      1858
   macro avg       0.84      0.86      0.85      1858
weighted avg       0.90      0.89      0.89      1858

Brier Score: 0.0759

Predictors:
distance_catch_to_home

Response:
is_out


np.int64(408)

In [20]:
# Scenario: train and test on the `is_successful` response, with every
# censoring / stay-to-out switch disabled.
#
# NOTE(review): this rebinds the notebook-level `responses` list, so any
# earlier cell re-run after this one will silently model `is_successful`
# instead of `is_out`. Re-run the configuration cell to restore it.
responses = ["is_successful"]

results = model_prep_on_base(
    # Data and estimator
    on_base_lf=on_base_lf,
    grid_search=grid_search,
    # Column selection
    responses=responses,
    num_predictors_drop=num_predictors_drop,
    num_predictors_median=num_predictors_median,
    # Split size (presumably the test fraction -- confirm in models.py)
    test_size=0.30,
    # Label-modification switches: all disabled
    is_out_censored=False,
    test_stay_to_out=False,
    test_stay_to_out_threshold=False,
)

# Number of False (unsuccessful) labels in the test split.
(results["y_test"] == False).sum()

Total features: 2
Total Predictors: 1
Total Responses: 1
Fitting 5 folds for each of 120 candidates, totalling 600 fits

Best parameters: 
classifier__C = 1
classifier__l1_ratio = 0.5
classifier__max_iter = 100
classifier__penalty = elasticnet
classifier__solver = saga

Best cross-validation score: -0.0800

              precision    recall  f1-score   support

       False       0.67      0.89      0.76       411
        True       0.97      0.88      0.92      1448

    accuracy                           0.88      1859
   macro avg       0.82      0.88      0.84      1859
weighted avg       0.90      0.88      0.88      1859

Brier Score: 0.0860

Predictors:
distance_catch_to_home

Response:
is_successful


np.int64(411)