In [76]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from stargazer.stargazer import Stargazer
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('kickers_v2.csv')

In [61]:
data.head()

Unnamed: 0.1,Unnamed: 0,Team,Year,GameMinute,Kicker,Distance,ScoreDiff,Grass,Success,Intercept
0,1,PHI,2005,3,Akers,49,0,False,0,1
1,2,PHI,2005,29,Akers,49,-7,False,0,1
2,3,PHI,2005,51,Akers,44,-7,False,1,1
3,4,PHI,2005,14,Akers,43,14,True,0,1
4,5,PHI,2005,60,Akers,23,0,True,1,1


In [62]:
# It's important we check for NAN before we start our analysis.
data.isnull().values.any()

False

<h1> PSET 1 Econ 1042 Sports Economics </h1>
<h2> 1. Question </h2>
<ol>
    <li> What was the minimum distance of a field goal kicked in this sample? What was the maximum? Mean? Median!</li>
    <li> Why isn’t the minimum lower? (For those who are not familiar with football, please read about how field goal distance is measured and its relationship to where the ball is on the field.)</li>
    <li> What special circumstances might explain the maximum? (Hint: football is a game with 4, 15-minute quarters. At the end of the second quarter there is a halftime break and possession is assigned based on the result of a first-half coin toss) </li>
</ol>

In [14]:
print(f"The median distance of a field goal kicked was {np.median(data['Distance'])}")
data['Distance'].describe()

The median distance of a field goal kicked was 37.0


count    11187.000000
mean        36.897381
std         10.173351
min         18.000000
25%         28.000000
50%         37.000000
75%         45.000000
max         76.000000
Name: Distance, dtype: float64

1. The minimum distance of a field goal kicked in the sample was 18.00 yards. The maximum was 76.00 yards and the median was 37.0 yards. The mean was 36.897 yards
2. The minimum is 17 yards. This makes sense since the endzone is 10yards, and the ball has to be kicked from 7 yards from the line of scrimmage. Hence 10 + 7 = 17.

In [16]:
# Lets find out what play was kicked from 76 yards away?
max_yard = data.loc[data['Distance'] == 76.00]
max_yard

Unnamed: 0.1,Unnamed: 0,Team,Year,GameMinute,Kicker,Distance,ScoreDiff,Grass,Success
3557,3558,OAK,2008,30,Janikowski,76,15,True,0


3. The 76 yards field goal attempt from Janikowski was in the last second of the second quarter (video: https://www.youtube.com/watch?v=X7BepDe6Zoc). It makes sense to kick if far into the opponents end zone, if in the first half your team had the ball. Since, then the opposing team will start from further away from the kickers endzone. It's like as if the special team does a punt.

<h2> 2. Question </h2>
<p> Over the entire sample what percentage of kicks from 40 to 45 yards were made? Kicks over 45 yards? <p>

In [46]:
sample_size = len(data)
print(sample_size)
# let's find the number of successful kicks from 40-45 yards
kicks_40_45 = data.loc[(data['Distance'] > 40) & (data['Distance'] < 45)]
print(len(kicks_40_45))
# We find that 1325 kicks were made in that range
success_40_45 = kicks_40_45['Success'].value_counts()
# print(success_40_45)
ratio_success = success_40_45[1]/(success_40_45[0] + success_40_45[1])
ratio = (len(kicks_40_45)/sample_size) * 100
print(f'{ratio:.3f}% of Kicks were from between 40-45 yards')

# How many kicks were over 45 yards?
# kicks

11187
1325
11.844% of Kicks were from between 40-45 yards


In [49]:
kicks_above_45 = data.loc[data['Distance'] > 45]
ratio_above_45 = (len(kicks_above_45)/sample_size) * 100
print(f'{ratio_above_45:.3f}% of Kicks were from between 40-45 yards')

24.439% of Kicks were from between 40-45 yards


<h2> 3. Question </h2>
<p> Was the make rate higher on grass or on turf? Is that difference statistically significant? Do you think this is the true effect of surface? Why or why not?  (Answer this by doing an OLS regression. For the entire assignment, let’s use the heteroskedasticity robust standard errors, r in stata or the equivalent in R)<br> <br>
Let's compute the difference using $\Delta = \bar{Y}_{grass} - \bar{Y}_{turf}$ we shall report standard errors as heteroscedasticity robust (HC2) 
</p>

In [67]:
# define response variable
# statsmodel requires us to add a column where each value is 1 in order to compute intercept
data['Intercept'] = 1
# Since we find no NAN in our column
print(data['Grass'].isnull().values.any())
# We can conver the 'bool' values for Grass to the datatype 'int'
data['Grass'] = data['Grass'].astype(int)
Y = data[['Success']]
X = data[['Grass', 'Intercept']]
mod = sm.OLS(Y, X)
res = mod.fit(cov_type='HC2')
print(res.summary())

False
                            OLS Regression Results                            
Dep. Variable:                Success   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     7.500
Date:                Sun, 29 Jan 2023   Prob (F-statistic):            0.00618
Time:                        18:29:31   Log-Likelihood:                -4845.9
No. Observations:               11187   AIC:                             9696.
Df Residuals:                   11185   BIC:                             9710.
Df Model:                           1                                         
Covariance Type:                  HC2                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Grass         -0.0193      0.007     -2.739   

In [74]:
# Let's also try the Sklearn Library
# We can also use the Sklearn Library to do an OLS regression, but I don't think it has a summary function.
X = data[['Grass']]
reg = LinearRegression(fit_intercept=True).fit(X,Y)
parameters = reg.get_params()
print(reg.coef_)

[[-0.01932925]]


We find that the observed difference is statistically insignificant at the $\alpha = 0.05$ level. It seems as if the surface does not have an impact on the observed average success rates of field goal kicks.

<h2> 4. Question </h2>
<ol>
    <li>	How is distance of attempt correlated with surface? What might explain this? (Coaches get to choose when to kick a field goal, one is never forced) </li>
    <li> 	How is distance correlated with make percentage? </li>
</ol>

In [82]:
# Let's calculate the correlation between the columns
corr_surface = data['Distance'].corr(data['Grass'])
corr_success = data['Distance'].corr(data['Success'])
print(corr_surface)
print(corr_success)

-0.002551996001227438
-0.33693399701495164


The correlation coefficient for distance and or Grass is -0.0025, basically negligible. The correlation coefficiecnt for distance and success rates is -0.3369, meaning as distance increases the success rate goes down. 