In [1]:
# First we load the packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# I. Building the Dataset: Player Stats

In [2]:
# Read in salary data retrieved from Lahman's Database
Salary = pd.read_csv("../Data/Salaries.csv")

# Read in batting
Batting = pd.read_csv("../Data/Batting.csv")

In [3]:
Salary.head()

Unnamed: 0,yearID,teamID,lgID,playerID,salary
0,1985,ATL,NL,barkele01,870000
1,1985,ATL,NL,bedrost01,550000
2,1985,ATL,NL,benedbr01,545000
3,1985,ATL,NL,campri01,633333
4,1985,ATL,NL,ceronri01,625000


In [4]:
Batting.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,Doubles,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,


In [5]:
# Load the “Salaries” file, drop any missing values, create a variable for the natural log (ln) of player salary,
# and rename the column “yearID” as “SalYear”

Salary = Salary.dropna()

Salary["log_sal"] = np.log(Salary["salary"])

Salary.rename(columns={'yearID': 'SalYear'}, inplace=True)

In [6]:
# Create a copy of the “Salaries” dataframe called “Master”.
Master = Salary.copy()

In [7]:
# Load the batting data and sum data across stints.
Batting["stint"].sum()

110819

In [8]:
# Subset batting data to only include batting seasons 1998-2006 and players with at least 130AB.
Subset = Batting[ (Batting["yearID"] <= 2006) & (Batting["yearID"] >= 1998) & (Batting["AB"] >= 130) ]
Subset.describe()

Unnamed: 0,yearID,stint,G,AB,R,H,Doubles,Triples,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
count,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0,3607.0
mean,2001.991128,1.052675,110.70779,370.049626,53.716662,101.393679,20.306072,2.128362,12.503743,51.393956,6.739673,2.922373,37.547269,66.154145,3.056279,3.976435,2.241475,3.213197,8.565567
std,2.579072,0.237845,35.282192,157.693047,29.679806,49.013804,11.030388,2.343704,10.747411,30.868556,9.367291,3.176027,24.836636,32.745994,4.707577,3.916807,3.071015,2.493199,5.273397
min,1998.0,1.0,35.0,130.0,7.0,21.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0
25%,2000.0,1.0,82.0,221.5,28.0,57.0,11.0,0.0,4.0,26.0,1.0,1.0,19.0,41.0,0.0,1.0,0.0,1.0,5.0
50%,2002.0,1.0,114.0,362.0,48.0,96.0,19.0,1.0,9.0,45.0,3.0,2.0,32.0,60.0,2.0,3.0,1.0,3.0,8.0
75%,2004.0,1.0,143.0,513.0,75.0,143.0,28.0,3.0,18.0,70.0,9.0,4.0,51.0,86.5,4.0,6.0,3.0,5.0,12.0
max,2006.0,4.0,163.0,704.0,152.0,262.0,59.0,20.0,73.0,165.0,72.0,24.0,232.0,195.0,120.0,31.0,24.0,16.0,32.0


In [9]:
# Calculate PA, OBP, SLG, and batting average
Subset["AVG"] = Subset["H"]/Subset["AB"]

Subset["Single"] = Subset["H"] - Subset["Doubles"] - Subset["Triples"] - Subset["HR"]
Subset["SLG"] = ( Subset["Single"] + 2*Subset["Doubles"] + 3*Subset["Triples"] + 4*Subset["HR"] ) / Subset["AB"]

Subset["HBP"] = Subset["HBP"].fillna(0)
Subset["SH"] = Subset["SH"].fillna(0)
Subset["SF"] = Subset["SF"].fillna(0)

Subset["PA"] = Subset["AB"] + Subset["BB"] + Subset["HBP"] + Subset["SH"] + Subset["SF"] 

Subset["OBP"] = ( Subset["H"] + Subset["BB"] + Subset["HBP"] ) / ( Subset["PA"] - Subset["SH"] )

Subset[["PA","OBP","SLG","AVG"]].mean()

PA     417.028001
OBP      0.337007
SLG      0.427798
AVG      0.268086
dtype: float64

In [10]:
# Create SalYear variable to create one year lag between batting performance and salary 
Subset['SalYear'] = Subset['yearID'] + 1 

In [11]:
# Merge batting data and master data.
Merge = pd.merge( Subset, Master, how = "inner", on = ['playerID', 'SalYear'] )
Merge.head()

Unnamed: 0,playerID,yearID,stint,teamID_x,lgID_x,G,AB,R,H,Doubles,...,AVG,Single,SLG,PA,OBP,SalYear,teamID_y,lgID_y,salary,log_sal
0,abbotje01,1998,1,CHA,AL,89,244,33,68,14,...,0.278689,41,0.491803,260.0,0.29845,1999,CHA,AL,255000,12.449019
1,abreubo01,1998,1,PHI,NL,151,497,68,155,29,...,0.311871,103,0.496982,589.0,0.408547,1999,PHI,NL,400000,12.89922
2,alexama02,1998,1,CHN,NL,108,264,34,60,10,...,0.227273,44,0.329545,289.0,0.278169,1999,CHN,NL,475000,13.07107
3,alfoned01,1998,1,NYN,NL,144,557,94,155,28,...,0.278276,108,0.427289,630.0,0.355096,1999,NYN,NL,2800000,14.84513
4,alicelu01,1998,1,TEX,AL,101,259,51,71,15,...,0.274131,47,0.42471,308.0,0.371711,1999,TEX,AL,825000,13.623139


Question 1
What was the average player salary in 1999?  What was the average player salary in 2006?

In [12]:
Merge[Merge["SalYear"]==1999]["salary"].mean()

2280982.486263736

In [13]:
Merge[Merge["SalYear"]==2006]["salary"].mean()

3716991.455882353

Question 2 Calculate the average player OBP and SLG for every season in the timeframe.  Which season had the highest average player OBP and what was its value?  Which season had the highest average player SLG and what was its value?    

In [14]:
OBP_Season = Merge.groupby("yearID")["OBP"].mean()
OBP_Season[ OBP_Season == OBP_Season.max() ]

yearID
1999    0.349643
Name: OBP, dtype: float64

In [15]:
SLG_Season = Merge.groupby("yearID")["SLG"].mean()
SLG_Season[ SLG_Season == SLG_Season.max() ]

yearID
2000    0.445443
Name: SLG, dtype: float64

Question 3
Sum HR by player across the entire timeframe.  What was the highest aggregate home run total over the timeframe 1998-2006?  

In [16]:
HR_Player = Merge.groupby("playerID")["HR"].sum()
HR_Player[ HR_Player == HR_Player.max() ]

playerID
rodrial01    400
Name: HR, dtype: int64

# II. Building the Dataset: Player Info

In [51]:
# Read in People data retrieved from Lahman's Database
People = pd.read_csv("../Data/People.csv")

# Read in player appearance data retrieved from Lahman's Database
Appearances = pd.read_csv("../Data/Appearances.csv")

In [52]:
People.head()

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,...,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01


In [53]:
Appearances.head()

Unnamed: 0,yearID,teamID,lgID,playerID,G_all,GS,G_batting,G_defense,G_p,G_c,...,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr
0,1871,TRO,,abercda01,1,1.0,1,1.0,0,0,...,0,0,1,0,0,0,0,0.0,0.0,0.0
1,1871,RC1,,addybo01,25,25.0,25,25.0,0,0,...,22,0,3,0,0,0,0,0.0,0.0,0.0
2,1871,CL1,,allisar01,29,29.0,29,29.0,0,0,...,2,0,0,0,29,0,29,0.0,0.0,0.0
3,1871,WS3,,allisdo01,27,27.0,27,27.0,0,27,...,0,0,0,0,0,0,0,0.0,0.0,0.0
4,1871,RC1,,ansonca01,25,25.0,25,25.0,0,5,...,2,20,0,1,0,0,1,0.0,0.0,0.0


In [54]:
# Read in “People” data and extract the player’s debut year 
People["debut"] = pd.to_datetime(People['debut'])
People['debutyr'] = People['debut'].dt.year
People['debutyr'].head()

0    2004.0
1    1954.0
2    1962.0
3    1977.0
4    2001.0
Name: debutyr, dtype: float64

In [55]:
# Merge debut year into master data and calculate years of experience
Exp = pd.merge( Master, People, how = "left", on = ["playerID"] )
Exp["exp_years"] = Exp["SalYear"] - Exp["debutyr"]

In [56]:
# Based on a player’s years of experience, 
# create indicator variables for arbitration eligible players (3-6 years)
# and free agent players (more than 6 years) 

Exp["arb_eligible"] = np.where( (Exp["exp_years"]>=3)&(Exp["exp_years"]<=6), 1, 0)
Exp["free_agent"] = np.where( (Exp["exp_years"]>6), 1, 0)

In [57]:
# Read in the data for player appearances and group by stint.
# Then identify the maximum number of games played at a given position for each year.  
Player_Appearances = Appearances.groupby(['playerID','yearID'])["G_c","G_1b","G_2b","G_3b","G_ss","G_of","G_dh"].sum()  
Player_Appearances.reset_index(inplace=True)
Player_Appearances.head()

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh
0,aardsda01,2004,0,0,0,0,0,0,0.0
1,aardsda01,2006,0,0,0,0,0,0,0.0
2,aardsda01,2007,0,0,0,0,0,0,0.0
3,aardsda01,2008,0,0,0,0,0,0,0.0
4,aardsda01,2009,0,0,0,0,0,0,0.0


In [58]:
Player_Appearances['Max_G'] = Player_Appearances[["G_c","G_1b","G_2b","G_3b","G_ss","G_of","G_dh"]].max(axis=1)
Player_Appearances.head()

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh,Max_G
0,aardsda01,2004,0,0,0,0,0,0,0.0,0.0
1,aardsda01,2006,0,0,0,0,0,0,0.0,0.0
2,aardsda01,2007,0,0,0,0,0,0,0.0,0.0
3,aardsda01,2008,0,0,0,0,0,0,0.0,0.0
4,aardsda01,2009,0,0,0,0,0,0,0.0,0.0


In [59]:
# Create a function to determine player position. 
def player_pos( row ):
    
    G_pos = ["G_c","G_1b","G_2b","G_3b","G_ss","G_of","G_dh"]
    pos = ""
    for i in G_pos:
        if row[i] == row["Max_G"]:
            if pos == "":
                pos += i[2:]
                row["pos"] = pos
            else:
                pos += ( ", " + i[2:] )
                row["pos"] = pos
    return pos

Player_Appearances["pos"] = Player_Appearances.apply(player_pos, axis=1)
'''
apply(player_pos, axis=1)

apply is a method in pandas that can apply a function to the rows or columns of a DataFrame.

"player_pos" takes a row of data as input and returns the player's position for that row.

axis=0 operates on columns, while axis=1 operates on rows.
'''

'\napply(player_pos, axis=1)\n\napply is a method in pandas that can apply a function to the rows or columns of a DataFrame.\n\n"player_pos" takes a row of data as input and returns the player\'s position for that row.\n\naxis=0 operates on columns, while axis=1 operates on rows.\n'

In [60]:
# Exclude non-position players.
Player_Appearances = Player_Appearances[ Player_Appearances['Max_G'] != 0 ]
Player_Appearances.head()

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh,Max_G,pos
9,aaronha01,1954,0,0,0,0,0,116,0.0,116.0,of
10,aaronha01,1955,0,0,27,0,0,126,0.0,126.0,of
11,aaronha01,1956,0,0,0,0,0,152,0.0,152.0,of
12,aaronha01,1957,0,0,0,0,0,150,0.0,150.0,of
13,aaronha01,1958,0,0,0,0,0,153,0.0,153.0,of


In [64]:
# Create an indicator variable for catcher and the infield (2B, SS, 3B) positions separately.
# Thus, you should have a separate indicator variable for 2B, SS, and 3B 
# individually as opposed to one infielder indicator variable combining these positions. 
Player_Appearances["catcher"] = np.where( Player_Appearances["pos"].str.contains("c"), 1, 0)
Player_Appearances["infield"] = np.where( (Player_Appearances["pos"].str.contains("2b"))|
                                          (Player_Appearances["pos"].str.contains("ss"))|
                                          (Player_Appearances["pos"].str.contains("3b")), 1, 0)

In [97]:
# Merge this into exp data.
salary_pos = pd.merge(Player_Appearances, Exp, how = 'right', left_on = ("playerID", "yearID"), right_on = ("playerID", "SalYear") )
salary_pos.head()

Unnamed: 0,playerID,yearID,G_c,G_1b,G_2b,G_3b,G_ss,G_of,G_dh,Max_G,...,bats,throws,debut,finalGame,retroID,bbrefID,debutyr,exp_years,arb_eligible,free_agent
0,abbotje01,1998.0,0.0,0.0,0.0,0.0,0.0,76.0,2.0,76.0,...,R,L,1997-06-10,2001-09-29,abboj002,abbotje01,1997.0,1.0,0,0
1,abbotje01,1999.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,17.0,...,R,L,1997-06-10,2001-09-29,abboj002,abbotje01,1997.0,2.0,0,0
2,abbotje01,2000.0,0.0,0.0,0.0,0.0,0.0,65.0,7.0,65.0,...,R,L,1997-06-10,2001-09-29,abboj002,abbotje01,1997.0,3.0,1,0
3,abbotje01,2001.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,17.0,...,R,L,1997-06-10,2001-09-29,abboj002,abbotje01,1997.0,4.0,1,0
4,abbotku01,1993.0,0.0,0.0,2.0,0.0,6.0,13.0,0.0,13.0,...,R,R,1993-09-07,2001-04-13,abbok002,abbotku01,1993.0,0.0,0,0


Question 1
What was the highest paid position on average in 1999?  What was the highest paid position on average in 2004?

In [98]:
salary_by_pos = salary_pos.groupby(["yearID","pos"])["salary"].mean().reset_index()
salary_by_pos[ salary_by_pos["salary"] == salary_by_pos["salary"].max() ]

Unnamed: 0,yearID,pos,salary
308,2016.0,"of, dh",11400000.0


In [99]:
salary_1999 = salary_by_pos[salary_by_pos["yearID"] == 1999]
salary_1999[ salary_1999["salary"] == salary_1999["salary"].max() ]

Unnamed: 0,yearID,pos,salary
139,1999.0,1b,2114334.0


In [100]:
salary_2004 = salary_by_pos[salary_by_pos["yearID"] == 2004]
salary_2004[ salary_2004["salary"] == salary_2004["salary"].max() ]

Unnamed: 0,yearID,pos,salary
192,2004.0,1b,4344392.02


Question 2
What percentage of observations in the data set are either flagged as arbitration eligible or free agent eligible?

In [101]:
(Exp["arb_eligible"].sum() + Exp["free_agent"].sum())/len(Exp)

0.737399727561677

Question 3
Sum years of experience by team for 2002. What is the highest and lowest aggregate years of experience for teams in 2002 data?   

In [102]:
Exp_2002 = Exp[Exp["SalYear"] == 2002].reset_index()
Exp_2002_team = Exp_2002.groupby("teamID")["exp_years"].sum().reset_index()
Exp_2002_team[ Exp_2002_team["exp_years"] == Exp_2002_team["exp_years"].min() ]

Unnamed: 0,teamID,exp_years
27,TBA,111.0


In [103]:
Exp_2002_team[ Exp_2002_team["exp_years"] == Exp_2002_team["exp_years"].max() ]

Unnamed: 0,teamID,exp_years
1,ARI,290.0


# III. Running Regressions

In [110]:
# lnSal on OBP, SLG, batting average, plate appearances, arbitration (dummy), free agent (dummy), 
# and all positional dummy variables during the seasons prior to the publication of Moneyball (1999-2003) combined. 
import statsmodels.formula.api as smf

whole_data = pd.merge( Merge, salary_pos, on = ["playerID","yearID"])
whole_data.head()

Unnamed: 0,playerID,yearID,stint,teamID_x,lgID_x,G,AB,R,H,Doubles,...,bats,throws,debut,finalGame,retroID,bbrefID,debutyr,exp_years,arb_eligible,free_agent
0,abbotje01,1998,1,CHA,AL,89,244,33,68,14,...,R,L,1997-06-10,2001-09-29,abboj002,abbotje01,1997.0,1.0,0,0
1,abreubo01,1998,1,PHI,NL,151,497,68,155,29,...,L,R,1996-09-01,2014-09-28,abreb001,abreubo01,1996.0,2.0,0,0
2,alexama02,1998,1,CHN,NL,108,264,34,60,10,...,R,R,1992-09-18,2006-09-28,alexm001,alexama02,1992.0,6.0,1,0
3,alfoned01,1998,1,NYN,NL,144,557,94,155,28,...,R,R,1995-04-26,2006-06-11,alfoe001,alfoned01,1995.0,3.0,1,0
4,alicelu01,1998,1,TEX,AL,101,259,51,71,15,...,B,R,1988-04-23,2002-09-25,alicl001,alicelu01,1988.0,10.0,0,1


In [117]:
lm = smf.ols(formula = 'log_sal_x ~ OBP + SLG + AVG + PA + arb_eligible + free_agent + catcher + infield', data = whole_data ).fit()
lm.summary()

0,1,2,3
Dep. Variable:,log_sal_x,R-squared:,0.61
Model:,OLS,Adj. R-squared:,0.609
Method:,Least Squares,F-statistic:,580.5
Date:,"Thu, 16 May 2024",Prob (F-statistic):,0.0
Time:,22:46:44,Log-Likelihood:,-3436.0
No. Observations:,2980,AIC:,6890.0
Df Residuals:,2971,BIC:,6944.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.4054,0.142,73.421,0.000,10.127,10.683
OBP,2.3342,0.597,3.909,0.000,1.163,3.505
SLG,3.2021,0.267,12.007,0.000,2.679,3.725
AVG,-2.6050,0.736,-3.538,0.000,-4.049,-1.161
PA,0.0026,9.13e-05,28.816,0.000,0.002,0.003
arb_eligible,1.2932,0.041,31.537,0.000,1.213,1.374
free_agent,1.8805,0.040,46.782,0.000,1.802,1.959
catcher,0.0220,0.045,0.487,0.626,-0.067,0.111
infield,-0.0220,0.033,-0.672,0.502,-0.086,0.042

0,1,2,3
Omnibus:,14.81,Durbin-Watson:,1.757
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.034
Skew:,0.063,Prob(JB):,7.36e-05
Kurtosis:,3.371,Cond. No.,28700.0


In [118]:
# Repeat step 1 but run the regression for the seasons 2004-2006 (all years combined).
whole_data_0406 = whole_data[ (whole_data["yearID"]>=2004) & (whole_data["yearID"]<=2006) ].reset_index()
lm2 = smf.ols(formula = 'log_sal_x ~ OBP + SLG +AVG + PA + arb_eligible + free_agent + catcher + infield', data = whole_data_0406 ).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,log_sal_x,R-squared:,0.574
Model:,OLS,Adj. R-squared:,0.57
Method:,Least Squares,F-statistic:,157.7
Date:,"Thu, 16 May 2024",Prob (F-statistic):,1.11e-167
Time:,22:46:52,Log-Likelihood:,-1110.9
No. Observations:,945,AIC:,2240.0
Df Residuals:,936,BIC:,2283.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.2935,0.270,38.119,0.000,9.764,10.823
OBP,3.1174,1.143,2.727,0.007,0.874,5.361
SLG,3.4290,0.501,6.848,0.000,2.446,4.412
AVG,-2.7269,1.335,-2.043,0.041,-5.346,-0.108
PA,0.0024,0.000,14.291,0.000,0.002,0.003
arb_eligible,1.2976,0.077,16.903,0.000,1.147,1.448
free_agent,1.9156,0.074,25.787,0.000,1.770,2.061
catcher,0.0471,0.083,0.571,0.568,-0.115,0.209
infield,-0.0245,0.059,-0.413,0.680,-0.141,0.092

0,1,2,3
Omnibus:,9.366,Durbin-Watson:,1.784
Prob(Omnibus):,0.009,Jarque-Bera (JB):,12.911
Skew:,0.08,Prob(JB):,0.00157
Kurtosis:,3.55,Cond. No.,29700.0


In [119]:
# Run the same regression model as in steps 1 and 2 separately for each season.
# It may be easiest to read output if you display your results in a couple of tables
# (one for Pre-Moneyball and one for Post-Moneyball

list = [2004,2005,2006]
regression_results = {}

for i in list:
    data = whole_data_0406[(whole_data_0406["yearID"] == i)].reset_index()
    lm = smf.ols(formula = 'log_sal_x ~ OBP + SLG + AVG + PA + arb_eligible + free_agent + catcher + infield', data = data ).fit()
    regression_results[i] = lm

from statsmodels.iolib.summary2 import summary_col
Header = ['All years','2004','2005','2006']
Table = summary_col([lm2, regression_results[2004], regression_results[2005], regression_results[2006]],
                     regressor_order=['OBP','SLG','AVG','PA','arb_eligible','free_agent','catcher','infield','Intercept'],stars=True,
                     float_format="'%.3f'",model_names = Header)
print(Table)    



              All years     2004        2005        2006   
-----------------------------------------------------------
OBP          '3.117'***  '3.731'*   '1.815'     '3.625'*   
             ('1.143')   ('2.094')  ('1.941')   ('1.923')  
SLG          '3.429'***  '4.580'*** '2.588'***  '2.881'*** 
             ('0.501')   ('0.909')  ('0.858')   ('0.840')  
AVG          '-2.727'**  '-3.646'   '-1.117'    '-3.533'*  
             ('1.335')   ('2.470')  ('2.405')   ('2.120')  
PA           '0.002'***  '0.002'*** '0.003'***  '0.003'*** 
             ('0.000')   ('0.000')  ('0.000')   ('0.000')  
arb_eligible '1.298'***  '1.186'*** '1.314'***  '1.439'*** 
             ('0.077')   ('0.142')  ('0.132')   ('0.127')  
free_agent   '1.916'***  '1.839'*** '1.986'***  '1.954'*** 
             ('0.074')   ('0.139')  ('0.127')   ('0.122')  
catcher      '0.047'     '0.033'    '0.059'     '0.101'    
             ('0.083')   ('0.149')  ('0.143')   ('0.139')  
infield      '-0.024'    '-0.036'   '-0

In [None]:
#Uncomment this cell once the assignment is complete in order to export your Master dataset
#Master.to_csv("../Data/Master.csv")