# Lab 08: MLB Swing Probability Modeling

---
author: Your Name Here
date: April 8, 2024
embed-resources: true
---

## Introduction

## Methods

In [56]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
import matplotlib.pyplot as plt
from calibration import calibration_error, plot_calibration_plot
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
# 导入决策树
from sklearn.tree import DecisionTreeClassifier

### Data

In [38]:
pitches_train = pd.read_csv("https://cs307.org/lab-08/data/pitches-train.csv")
pitches_test = pd.read_csv("https://cs307.org/lab-08/data/pitches-test.csv")

# create X and y for train data
X_train = pitches_train.drop(columns=["swing"])
y_train = pitches_train["swing"]

# create X and y for test data
X_test = pitches_test.drop(columns=["swing"])
y_test = pitches_test["swing"]

In [39]:
pitches_train

Unnamed: 0,pitch_name,release_extension,release_pos_x,release_pos_y,release_pos_z,release_speed,release_spin_rate,spin_axis,plate_x,plate_z,...,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,stand,sz_top,sz_bot,swing
0,Cutter,6.6,-2.76,53.86,5.81,92.6,2376.0,195.0,-0.09,2.79,...,3,1,0,0,0,1,L,3.15,1.52,1
1,Changeup,6.8,-2.87,53.74,5.66,86.3,1511.0,226.0,-1.47,1.84,...,2,1,0,0,0,1,L,3.13,1.56,0
2,Changeup,6.7,-2.83,53.82,5.68,87.9,1570.0,224.0,-1.52,2.38,...,1,1,0,0,0,1,L,3.12,1.51,0
3,Knuckle Curve,6.7,-2.70,53.78,5.78,82.4,2398.0,32.0,0.20,1.04,...,1,0,0,0,0,1,L,3.15,1.52,1
4,Cutter,6.7,-2.64,53.83,5.81,91.0,2427.0,189.0,0.89,1.65,...,0,0,0,0,0,1,L,3.12,1.51,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648,4-Seam Fastball,6.8,-2.64,53.75,5.99,93.4,2411.0,206.0,0.59,2.91,...,0,0,0,0,0,1,L,3.50,1.81,0
2649,Cutter,6.3,-2.91,54.19,5.86,87.3,2541.0,113.0,1.38,1.73,...,1,2,0,0,0,0,R,3.19,1.48,1
2650,Knuckle Curve,6.4,-2.91,54.13,5.87,84.7,2539.0,35.0,0.81,0.11,...,1,1,0,0,0,0,R,3.19,1.48,1
2651,4-Seam Fastball,6.4,-2.67,54.13,6.04,94.3,2531.0,201.0,1.05,1.79,...,0,1,0,0,0,0,R,3.03,1.48,0


#### Summary Statistics

In [40]:
# Counts and Proportions
pitches_count=pitches_train.groupby(["pitch_name", "swing"]).agg('count').reset_index()
pitches_count[['pitch_name', 'swing', 'stand']].rename(columns={'pitcher': 'count'})

Unnamed: 0,pitch_name,swing,stand
0,4-Seam Fastball,0,685
1,4-Seam Fastball,1,611
2,Changeup,0,166
3,Changeup,1,192
4,Cutter,0,141
5,Cutter,1,127
6,Knuckle Curve,0,341
7,Knuckle Curve,1,294
8,Slider,0,53
9,Slider,1,43


In [41]:
print("All: ", (611+192+127+294+43)/(685+611+166+192+141+127+341+294+53+43))
print("4-Seam Fastball: ", 611/(685+611))
print("Changeup: ", 192/(166+192))
print("Cutter: ", 127/(141+127))
print("Knuckle Curve: ", 294/(341+294))
print("Slider: ", 43/(53+43))

All:  0.47757255936675463
4-Seam Fastball:  0.4714506172839506
Changeup:  0.5363128491620112
Cutter:  0.47388059701492535
Knuckle Curve:  0.462992125984252
Slider:  0.4479166666666667


#### Visualizations

In [42]:
# visualizations

### Models

In [43]:
X_train.dtypes

pitch_name            object
release_extension    float64
release_pos_x        float64
release_pos_y        float64
release_pos_z        float64
release_speed        float64
release_spin_rate    float64
spin_axis            float64
plate_x              float64
plate_z              float64
pfx_x                float64
pfx_z                float64
balls                  int64
strikes                int64
on_3b                  int64
on_2b                  int64
on_1b                  int64
outs_when_up           int64
stand                 object
sz_top               float64
sz_bot               float64
dtype: object

In [81]:
# split the data into numerical and categorical
numeric_features = X_train.select_dtypes(include=["float64", "int64"]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

numeric_transformer = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy="most_frequent")),
        ("Standardization", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("Modal Imputer", SimpleImputer(strategy="most_frequent")),
        ("One-Hot Encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric Transformer", numeric_transformer, numeric_features),
        ("Categorical Transformer", categorical_transformer, categorical_features),
    ],
    remainder="drop",  
)

model_pipeline = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Classifier", RandomForestClassifier(n_estimators=50, random_state=0))  
])

model_pipeline.fit(X_train, y_train)

In [82]:
calibrated_classifier = CalibratedClassifierCV(model_pipeline, cv='prefit', method='sigmoid')
calibrated_classifier.fit(X_test, y_test)

# 现在，calibrated_classifier表示校准后的模型，你可以使用它来预测概率
y_proba_calibrated = calibrated_classifier.predict_proba(X_test)[:, 1]  # 获取属于正类的概率

In [83]:
# 计算ECE
ece = calibration_error(y_test, y_proba_calibrated, norm='l2', n_bins=10, strategy='uniform')

# 计算MCE
mce = calibration_error(y_test, y_proba_calibrated, norm='max', n_bins=10, strategy='uniform')

# 输出ECE和MCE的值
print(f"ECE: {ece}")
print(f"MCE: {mce}")

ECE: 0.0
MCE: 0.07737147823758396


In [84]:
dump(calibrated_classifier, 'swing-probability.joblib')

['swing-probability.joblib']

## Results

In [None]:
# report model metrics

## Discussion

### Conclusion