In [1]:
# First we load the packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf

# I. Data Preparation

In [2]:
# Read in master data
# Note this master dataset was created in the Week 2 assignment
Master = pd.read_csv("../Data/Master.csv")
Master.dtypes

Unnamed: 0      int64
playerID       object
yearID          int64
stint           int64
G               int64
AB              int64
R               int64
H               int64
Doubles         int64
Triples         int64
HR              int64
RBI           float64
SB            float64
CS            float64
BB              int64
SO            float64
IBB           float64
HBP           float64
SH            float64
SF            float64
GIDP          float64
PA            float64
OBP           float64
SLG           float64
SalYear         int64
teamID         object
lgID           object
salary          int64
lnSal         float64
debutyr         int64
Exp             int64
Arb             int64
Free            int64
POS            object
Catch           int64
Infld           int64
dtype: object

In [3]:
# Create “Experience Squared” variable  
Master['Exp_SQ'] = Master['Exp']**2

In [4]:
# Calculate variables for batting average, isolated power, and eye as defined above (for “eye” do not include IBB in walks)
Master['Avg'] = Master['H']/Master['AB']

Master['Singles'] = Master['H'] - Master['Doubles'] - Master['Triples'] - Master['HR']
Master['SLG'] = (Master['Singles'] + 2*Master['Doubles'] + 3*Master['Triples'] + 4*Master['HR'])/Master['AB']

Master["HBP"] = Master["HBP"].fillna(0)
Master["SH"] = Master["SH"].fillna(0)
Master["SF"] = Master["SF"].fillna(0)

Master["PA"] = Master["AB"] + Master["BB"] + Master["HBP"] + Master["SH"] + Master["SF"] 
Master["OBP"] = ( Master["H"] + Master["BB"] + Master["HBP"] ) / ( Master["PA"] - Master["SH"] )

Master['Iso_Power'] = Master['SLG'] - Master['Avg']

Master['Eye'] = ( Master["BB"] + Master["HBP"] )/Master["PA"]

Batting Average = Hits/At-Bats

Isolated Power = Slugging Percentage – Batting Average

Eye = (Walks + Hit By Pitches)/Plate Appearances

In [5]:
# Subset the data to only include seasons (SalYear) 1995-2015
Yr_99_15 = Master[(Master['yearID']>=1995) & (Master['yearID']<=2015)].reset_index()
Yr_99_15.describe()

Unnamed: 0.1,index,Unnamed: 0,yearID,stint,G,AB,R,H,Doubles,Triples,...,Exp,Arb,Free,Catch,Infld,Exp_SQ,Avg,Singles,Iso_Power,Eye
count,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,...,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0,7176.0
mean,3889.258501,3889.258501,2004.958891,1.189242,117.465858,397.983696,56.445513,108.497213,21.623188,2.289994,...,5.903707,0.369287,0.396739,0.138239,0.347965,51.386706,0.267943,71.531912,0.158036,0.095609
std,2244.766522,2244.766522,6.037708,0.641971,32.343431,149.967104,27.857819,46.617716,10.614559,2.428963,...,4.066357,0.482645,0.489255,0.345174,0.476358,63.197508,0.031775,31.573646,0.0609,0.034388
min,0.0,0.0,1995.0,1.0,34.0,130.0,6.0,20.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.146018,9.0,0.006061,0.013514
25%,1946.75,1946.75,2000.0,1.0,93.0,265.0,33.0,67.0,13.0,1.0,...,3.0,0.0,0.0,0.0,0.0,9.0,0.247559,45.0,0.113636,0.07129
50%,3885.5,3885.5,2005.0,1.0,123.0,409.0,54.0,108.0,21.0,2.0,...,5.0,0.0,0.0,0.0,0.0,25.0,0.268571,70.0,0.153291,0.091421
75%,5830.25,5830.25,2010.0,1.0,146.0,530.0,77.0,147.0,29.0,3.0,...,9.0,1.0,1.0,0.0,1.0,81.0,0.289195,95.0,0.197133,0.1162
max,7783.0,7783.0,2015.0,10.0,163.0,716.0,152.0,262.0,59.0,23.0,...,24.0,1.0,1.0,1.0,1.0,576.0,0.378995,225.0,0.535714,0.3906


Question 1
What is the highest single season “Eye” measure for a player across all seasons in the data? 

In [6]:
Master[Master['Eye'] == Master['Eye'].max()]

Unnamed: 0.1,Unnamed: 0,playerID,yearID,stint,G,AB,R,H,Doubles,Triples,...,Arb,Free,POS,Catch,Infld,Exp_SQ,Avg,Singles,Iso_Power,Eye
743,743,bondsba01,2004,1,147,373,129,135,27,3,...,0,1,OF,0,0,324,0.36193,60,0.450402,0.3906


Question 2
Calculate the average “ISO” by team for all seasons in the data.  What season does the maximum average “ISO” by team value occur in?

In [7]:
ISO_Team = Master.groupby(["teamID","yearID"])["Iso_Power"].mean().reset_index()
ISO_Team[ ISO_Team["Iso_Power"] == ISO_Team["Iso_Power"].max()]

Unnamed: 0,teamID,yearID,Iso_Power
114,CHA,2008,0.209782


Question 3
Calculate the median batting average for every season in the data.  Which season had the highest median?

In [8]:
AVG_Season = Master.groupby(["yearID"])["Avg"].median().reset_index()
AVG_Season[ AVG_Season["Avg"] == AVG_Season["Avg"].max() ]

Unnamed: 0,yearID,Avg
6,1999,0.277868


# II. Running Regressions for Each Season

In [9]:
# Write a function to run the following regression for every season in the data:
# lnSal on batting average, isolated power, eye, plate appearances, experience, experience-squared,
# and positional fixed effects for free agents only

def salary_formula(Season):
    MB_Seas = Master[(Master["SalYear"] == Season) & (Master["Free"] == 1)]
    global lm
    MB_Seas = MB_Seas.dropna(subset=['lnSal', 'Avg', 'Iso_Power', 'Eye', 'PA', 'Exp', 'Exp_SQ', 'POS'])
    lm = smf.ols(formula = 'lnSal ~ Avg + Iso_Power + Eye + PA + Exp + Exp_SQ + C(POS)', data=MB_Seas).fit()
    return lm;

In [10]:
# Create a list to store seasons
seasons = Master["SalYear"].unique() #seasons is numpy type
seasons_list = sorted(list(seasons))
seasons_list

[1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016]

In [11]:
#Give each regression result a name which is the season it corresponds to
reg_result = {}
for season in seasons_list:
    reg_result[season] = salary_formula(season)
reg_result

{1994: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f2448cf8>,
 1995: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f2467f60>,
 1996: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f0409cc0>,
 1997: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f0419ac8>,
 1998: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f041a4a8>,
 1999: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f0437dd8>,
 2000: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f04426a0>,
 2001: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f04417f0>,
 2002: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f03d1fd0>,
 2003: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f03ea860>,
 2004: <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7743f03ea048>,
 2005: <statsmodels.r

In [12]:
# Divide results into following eras:
# Pre-MB: 1995-2000
# Moneyball Era: 2001-2008
# Post-MB: 2009-2014
Pre_MB = {}
MBall_Era = {}
Post_MB = {}

for season in reg_result:
    if (season >= 1995) & (season <= 2000):
        Pre_MB[season] = reg_result.get(season)
    elif (season >= 2001) & (season <= 2008):
        MBall_Era[season] = reg_result.get(season)
    elif season > 2008:
        Post_MB[season] = reg_result.get(season)

In [13]:
# Generate result tables for each era (should contain regression results for each individual season within each era).
# Make sure to include the R-squared and number of observations in the regression output.
from statsmodels.iolib.summary2 import summary_col

info_dict = {'R-squared': lambda x: f"{x.rsquared:.2f}",
             'No. observations': lambda x: f"{int(x.nobs)}"}

def generate_summary_table(models_dict):
    
    model_names = list(models_dict.keys())
    models = list(models_dict.values())
    
    summary_table = summary_col(models, model_names=model_names,
                                regressor_order=['Avg', 'Iso_Power', 'Eye', 'PA', 'Exp', 'Exp_SQ'], stars=True, info_dict=info_dict)
    return summary_table

In [14]:
print("Pre-Moneyball Era Regression Results:")
print(generate_summary_table(Pre_MB))

Pre-Moneyball Era Regression Results:

                    1995      1996       1997       1998       1999       2000   
---------------------------------------------------------------------------------
Avg              3.7261    1.6921     4.4403**   4.0121*    2.8133     2.8979    
                 (2.4150)  (2.8851)   (2.1651)   (2.1222)   (2.0204)   (1.8524)  
Iso_Power        3.5872*** 4.9581***  2.7886***  3.1340***  2.4689***  3.5986*** 
                 (1.1991)  (1.3861)   (1.0009)   (1.1908)   (0.9210)   (0.9752)  
Eye              2.4417    0.5096     2.0994     3.8206**   2.0140     1.0060    
                 (1.9611)  (2.0089)   (1.6643)   (1.6066)   (1.5924)   (1.5256)  
PA               0.0062*** 0.0033***  0.0028***  0.0036***  0.0036***  0.0029*** 
                 (0.0008)  (0.0006)   (0.0004)   (0.0004)   (0.0004)   (0.0004)  
Exp              0.0623    0.0415     -0.1466    0.1424     0.2264*    0.0510    
                 (0.1612)  (0.2007)   (0.1588)   (0.1502)  

In [15]:
print("Moneyball Era Regression Results:")
print(generate_summary_table(MBall_Era))

Moneyball Era Regression Results:

                    2001       2002       2003       2004       2005       2006       2007       2008   
--------------------------------------------------------------------------------------------------------
Avg              0.7142     2.2282     2.1671     2.8623     5.1070**   6.1979**   3.6265*    -0.7795   
                 (2.1011)   (2.6597)   (2.5275)   (2.6710)   (2.2766)   (2.5742)   (2.0114)   (2.0713)  
Iso_Power        5.0715***  2.7926**   1.4127     1.8573     3.1916***  2.6517**   3.0405***  3.1244**  
                 (1.0084)   (1.3380)   (1.3554)   (1.2495)   (1.2057)   (1.2112)   (1.0456)   (1.2600)  
Eye              -3.0326*   1.7225     3.0337     9.3959***  2.8751     2.9151     4.2304**   3.4829    
                 (1.6224)   (2.0778)   (2.1865)   (2.1082)   (1.8720)   (2.0959)   (1.8341)   (2.1335)  
PA               0.0040***  0.0031***  0.0044***  0.0027***  0.0034***  0.0037***  0.0037***  0.0037*** 
                 (0.

In [16]:
print("Post-Moneyball Era Regression Results:")
print(generate_summary_table(Post_MB))

Post-Moneyball Era Regression Results:

                    2009       2010      2011      2012       2013       2014       2015       2016   
------------------------------------------------------------------------------------------------------
Avg              7.6219***  7.9516*** 6.2201**  -1.5959    2.7462     5.7890***  5.0992***  4.0517    
                 (2.4116)   (2.7847)  (2.7746)  (2.6167)   (2.1495)   (2.1811)   (1.8966)   (2.5619)  
Iso_Power        1.6764     2.5375    3.1109**  3.2232**   2.7647*    2.9424**   0.8036     0.3273    
                 (1.4667)   (1.5673)  (1.4932)  (1.6000)   (1.4094)   (1.3941)   (1.5045)   (1.5051)  
Eye              4.3845**   6.2334*** 4.0906    2.6060     4.1774*    5.8097**   6.0030***  3.1391    
                 (2.1487)   (2.3060)  (2.7579)  (2.5512)   (2.3415)   (2.5174)   (2.1107)   (2.7139)  
PA               0.0033***  0.0033*** 0.0035*** 0.0042***  0.0032***  0.0032***  0.0033***  0.0025*** 
                 (0.0005)   (0.00

# III. Running the Pooled Regression

In [42]:
# Subset data to only include free agent players and add a “PreMB” dummy variable to your dataframe for seasons prior to 2004.  
FA_Prior_MB = Master[ (Master["Free"] == 1)].reset_index()
FA_Prior_MB["Pre_MB"] = np.where( (FA_Prior_MB["SalYear"] < 2004) , 1, 0)

In [43]:
# Run a pooled regression for free agent players using the format same model structure developed in your function in 6).
# Your pooled regression should have the form: lnSal ~ (model from 6)) + Pre-MB*(model from 6))

FA_Prior_MB = FA_Prior_MB.dropna(subset=['lnSal', 'Avg', 'Iso_Power', 'Eye', 'PA', 'Exp', 'Exp_SQ', 'POS'])
Pooled_lm = smf.ols(formula = 'lnSal ~ Avg + Iso_Power + Eye + PA + Exp + Exp_SQ + C(POS) \
                                + Pre_MB*(Avg + Iso_Power + Eye + PA + Exp + Exp_SQ + C(POS))',\
                                    data=FA_Prior_MB).fit()
Pooled_lm.summary()

0,1,2,3
Dep. Variable:,lnSal,R-squared:,0.517
Model:,OLS,Adj. R-squared:,0.513
Method:,Least Squares,F-statistic:,130.6
Date:,"Tue, 21 May 2024",Prob (F-statistic):,0.0
Time:,03:55:40,Log-Likelihood:,-3621.2
No. Observations:,3074,AIC:,7294.0
Df Residuals:,3048,BIC:,7451.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,11.6699,0.289,40.365,0.000,11.103,12.237
C(POS)[T.2B],-0.1017,0.082,-1.247,0.212,-0.262,0.058
C(POS)[T.3B],0.1645,0.078,2.117,0.034,0.012,0.317
C(POS)[T.C],0.0720,0.077,0.934,0.350,-0.079,0.223
C(POS)[T.DH],0.0528,0.095,0.555,0.579,-0.134,0.239
C(POS)[T.OF],0.1238,0.063,1.973,0.049,0.001,0.247
C(POS)[T.SS],0.0924,0.085,1.086,0.278,-0.074,0.259
Avg,2.6333,0.668,3.944,0.000,1.324,3.942
Iso_Power,2.1358,0.386,5.540,0.000,1.380,2.892

0,1,2,3
Omnibus:,14.081,Durbin-Watson:,1.328
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.924
Skew:,0.013,Prob(JB):,7.77e-05
Kurtosis:,3.384,Cond. No.,47000.0
