In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Location of the processed batter modeling CSV. This is a relative path, so it
# resolves against the kernel's current working directory when opened.
CLEANFILE = "../../MLBDATA/Processed/BatterData/ModelingBattingData.csv"

In [None]:
# T-learner estimate of the treatment effect on on-base percentage (OBP).

# 1) Load & prep
# Read through the CLEANFILE constant from the config cell instead of a
# machine-specific absolute path, so the notebook runs on other machines.
df = pd.read_csv(CLEANFILE)
df['LHB']     = (df['bat_side'] == 'L').astype(int)   # 1 when bat_side is 'L'
df['PostBan'] = (df['year'] >= 2023).astype(int)      # 1 when year is 2023 or later
df['T']       = df['LHB'] * df['PostBan']             # treatment indicator (LHB and PostBan)

# 2) Features & outcome
features = ['cluster', 'year']                        # plus any others you like
# NOTE(review): get_dummies only encodes object/string/category columns; numeric
# columns pass through unchanged. Confirm the dtypes of 'cluster' and 'year'
# give the encoding you intend.
X = pd.get_dummies(df[features], drop_first=True)
Y = df['on_base_percent']
T = df['T']

# 3) Fit two forests: one on treated rows, one on control rows.
# random_state is fixed so the summary below is reproducible across re-runs.
rf_treat   = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)
rf_control = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)

rf_treat.fit(X[T == 1], Y[T == 1])
rf_control.fit(X[T == 0], Y[T == 0])

# 4) Predict ITE = E[Y|T=1,X] - E[Y|T=0,X]
# NOTE(review): T == 1 requires year >= 2023, so for any rows with an earlier
# year rf_treat is predicting outside the range it was trained on.
df['ITE_OBP'] = rf_treat.predict(X) - rf_control.predict(X)

# 5) Summarize by cluster
print(df.groupby('cluster')['ITE_OBP'].mean())


cluster
0    0.008944
1   -0.017091
2    0.020602
Name: ITE_OBP, dtype: float64


In [13]:
# T-learner estimate of the treatment effect on BABIP.

# 1) Load & prep
# Read through the CLEANFILE constant from the config cell instead of a
# machine-specific absolute path, so the notebook runs on other machines.
df = pd.read_csv(CLEANFILE)
df['LHB']     = (df['bat_side'] == 'L').astype(int)   # 1 when bat_side is 'L'
df['PostBan'] = (df['year'] >= 2023).astype(int)      # 1 when year is 2023 or later
df['T']       = df['LHB'] * df['PostBan']             # treatment indicator (LHB and PostBan)

# 2) Features & outcome
features = ['cluster', 'year']                        # plus any others you like
# NOTE(review): get_dummies only encodes object/string/category columns; numeric
# columns pass through unchanged. Confirm the dtypes of 'cluster' and 'year'
# give the encoding you intend.
X = pd.get_dummies(df[features], drop_first=True)
Y = df['babip']
T = df['T']

# 3) Fit two forests: one on treated rows, one on control rows.
# random_state is fixed so the summary below is reproducible across re-runs.
rf_treat   = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)
rf_control = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)

rf_treat.fit(X[T == 1], Y[T == 1])
rf_control.fit(X[T == 0], Y[T == 0])

# 4) Predict ITE = E[Y|T=1,X] - E[Y|T=0,X]
# NOTE(review): T == 1 requires year >= 2023, so for any rows with an earlier
# year rf_treat is predicting outside the range it was trained on.
df['ITE_BABIP'] = rf_treat.predict(X) - rf_control.predict(X)

# 5) Summarize by cluster
print(df.groupby('cluster')['ITE_BABIP'].mean())

cluster
0   -0.012355
1   -0.014075
2    0.010024
Name: ITE_BABIP, dtype: float64


In [16]:
# T-learner estimate of the treatment effect on OBP, with the extreme_shift
# flag added as a feature and used as a second grouping key in the summary.

# 1) Load & prep
# Read through the CLEANFILE constant from the config cell instead of a
# machine-specific absolute path, so the notebook runs on other machines.
df = pd.read_csv(CLEANFILE)
df['LHB']       = (df['bat_side'] == 'L').astype(int)   # 1 when bat_side is 'L'
df['PostBan']   = (df['year']    >= 2023).astype(int)   # 1 when year is 2023 or later
df['T']         = df['LHB'] * df['PostBan']             # treatment indicator (LHB and PostBan)

# 2) Build feature matrix & outcome
features = ['cluster', 'year']
# NOTE(review): get_dummies only encodes object/string/category columns; numeric
# columns pass through unchanged. Confirm the dtypes of 'cluster' and 'year'
# give the encoding you intend.
X_base   = pd.get_dummies(df[features], drop_first=True)
X        = X_base.copy()                      # copy so X_base stays free of the extra column
X['extreme_shift'] = df['extreme_shift']      # add the new flag
Y        = df['on_base_percent']              # continuous OBP outcome
T        = df['T']

# 3) Fit two RandomForestRegressors (T-learner): treated rows vs. control rows
rf_treat   = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)
rf_control = RandomForestRegressor(n_estimators=200, min_samples_leaf=50, random_state=42)

rf_treat.fit(X[T == 1], Y[T == 1])
rf_control.fit(X[T == 0], Y[T == 0])

# 4) Compute Individual Treatment Effects (ITE)
# NOTE(review): T == 1 requires year >= 2023, so for any rows with an earlier
# year rf_treat is predicting outside the range it was trained on.
df['ITE_OBP'] = rf_treat.predict(X) - rf_control.predict(X)

# 5) Summarize average uplift by cluster
print("Average ITE_OBP by cluster:")
print(df.groupby('cluster')['ITE_OBP'].mean(), "\n")

# 6) Summarize by cluster AND extreme_shift status
print("Average ITE_OBP by cluster and extreme_shift:")
print(df.groupby(['cluster', 'extreme_shift'])['ITE_OBP'].mean().unstack())

# (Optional) Repeat for BABIP. The two forests named below are not defined in
# this cell; fit them on Y = df['babip'] first.
# df['ITE_BABIP'] = rf_treat_for_babip.predict(X) - rf_control_for_babip.predict(X)
# print(df.groupby('cluster')['ITE_BABIP'].mean())

Average ITE_OBP by cluster:
cluster
0    0.008406
1   -0.017382
2    0.021023
Name: ITE_OBP, dtype: float64 

Average ITE_OBP by cluster and extreme_shift:
extreme_shift         0         1
cluster                          
0              0.008399  0.008863
1             -0.017275 -0.017965
2              0.020826  0.021887
