## Load the data

In [None]:
# !pip install adjustText
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
import urllib.request
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import matplotlib as mpl
from scipy.interpolate import make_interp_spline, BSpline
from adjustText import adjust_text

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)
# Ask the inline plotting backend to emit figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Primary hex color for every team abbreviation that can appear in the play-by-play
# data. Relocated or renamed franchises are listed under each abbreviation they have
# used so that a lookup by `posteam` works for any season.
colors = {
    'ARI': '#97233F',
    'ATL': '#A71930',
    'BAL': '#241773',
    'BUF': '#00338D',
    'CAR': '#0085CA',
    'CHI': '#C83803',
    'CIN': '#FB4F14',
    'CLE': '#311D00',
    'DAL': '#041E42',
    'DEN': '#FB4F14',
    'DET': '#0076B6',
    'GB': '#203731',
    'HOU': '#03202F',
    'IND': '#002C5F',
    # Jacksonville appears under both spellings
    'JAX': '#006778',
    'JAC': '#006778',
    'KC': '#E31837',
    # Chargers: Los Angeles / San Diego
    'LAC': '#0080C6',
    'SD': '#0080C6',
    # Rams: Los Angeles (two spellings) / St. Louis
    'LAR': '#866D4B',
    'LA': '#866D4B',
    'STL': '#866D4B',
    'MIA': '#008E97',
    'MIN': '#4F2683',
    'NE': '#002244',
    'NO': '#D3BC8D',
    'NYG': '#0B2265',
    'NYJ': '#125740',
    # Raiders: Oakland / Las Vegas ('LV' was missing, so colors['LV'] raised KeyError)
    'OAK': '#A5ACAF',
    'LV': '#A5ACAF',
    'PHI': '#004C54',
    'PIT': '#FFB612',
    'SEA': '#69BE28',
    'SF': '#AA0000',
    'TB': '#D50A0A',
    'TEN': '#0C2340',
    'WAS': '#773141',
}

# Kansas City primary / secondary colors, used for KC-specific highlights.
kc_p = colors['KC']
kc_s = '#FFB81C'

In [None]:
#You can find csvs of play by play data here: https://github.com/guga31bb/nflfastR-data/tree/master/data
#I've got a huge working csv with all seasons appended together, which is too big to upload to github
#FILENAME must point at that combined csv; it is not defined in the cells shown here.
#pandas has no `pd.read` function - csv files are loaded with `pd.read_csv`
df = pd.read_csv(FILENAME)

In [None]:
# Peek at the first ten plays to sanity-check the load
df.head(n=10)

In [None]:
#There are a lot of columns and the names are easy to forget, so print them all before analyzing
df.columns.tolist()

## Colquitt - EPA and WPA Analysis

In [None]:
#Columns of interest for the punting analysis, grouped by what they describe.
#Everything else in the play-by-play frame is dropped to keep the working frame small.
columns = [
    #who punted and from where
    'punter_player_name', 'yardline_100',
    #outcome of the play
    'epa', 'wpa', 'yards',
    'return_yards',
    'punt_blocked',
    'punt_inside_twenty',
    #game environment
    'roof_type',
    'posteam', 'defteam',
    #identifiers
    'game_id',
    'season', 'week',
    'play_id', 'season_type',
]

#Keep only punt plays, and only the columns listed above
is_punt = df['play_type'] == 'punt'
punt_df = df.loc[is_punt, columns]

punt_df.head(10)

In [None]:
#Let's do a quick look at which punters have the highest EPA and WPA - only including punters with more than 150 attempts
punter_groups = punt_df.groupby('punter_player_name')

#Career totals. The columns are selected with a list: selecting several columns with a
#bare tuple (['epa','wpa',...]) is deprecated and raises on pandas >= 2.0
punters = punter_groups[['epa', 'wpa', 'punt_inside_twenty', 'yards']].sum()
#Get number of attempts (punts with a non-null epa)
punters['attempts'] = punter_groups['epa'].count()
#Get team abbr for the team the punter had the most punts for.
#Series.mode() returns every tied value, so take the first one to always get a single
#abbreviation (usable as a key into `colors`); NaN if the punter has no team recorded
punters['team'] = punter_groups['posteam'].agg(
    lambda teams: teams.mode().iloc[0] if teams.notna().any() else np.nan
)
#limit to punters with more than 150 attempts, and turn the punter name back into a column
punters = punters.loc[punters['attempts'] > 150].reset_index()
#sort by wpa, only show top 20
punters.sort_values(by='wpa', ascending=False).head(20)

In [None]:
#Quick visualization of this
#Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later removed
#the old names, so use whichever spelling this installation provides
talk_style = 'seaborn-talk' if 'seaborn-talk' in plt.style.available else 'seaborn-v0_8-talk'
plt.style.use(talk_style)
plt.style.use('fivethirtyeight')

#remove outlier for plotting purposes
punters = punters.loc[punters['punter_player_name'] != 'A.Podlesh']

#plot our punters, one scatter call per punter so each gets his team's color;
#marker size scales with the number of attempts
fig, ax = plt.subplots()
for _, punter in punters.iterrows():
    wpa_pct = punter['wpa'] * 100  #WPA is plotted in percentage points
    ax.scatter(wpa_pct, punter['epa'], c=colors[punter['team']],
               s=punter['attempts'] * .5, alpha=0.8)

    ax.annotate(punter['punter_player_name'], (wpa_pct, punter['epa']),
                textcoords="offset points",  # how to position the text
                xytext=(0, 12),  # distance from text to points (x,y)
                ha='center', fontsize=11)

#Graph labels
ax.set_title('Punter Total Expected Points & Win Probability Added since 2000\n')
ax.set_ylabel('Total Expected Points Added\n')
ax.set_xlabel('\nTotal Win Probability Added');

In [None]:
#Look at how these numbers compare to other KC players: total EPA/WPA by passer
#on plays where KC had the ball.
#The columns are selected with a list; a bare tuple (['epa','wpa']) raises on pandas >= 2.0
kc_qbs = (
    df.loc[df['posteam'] == 'KC']
      .groupby('passer_player_name')[['epa', 'wpa']]
      .sum()
)
kc_qbs.sort_values(by='epa', ascending=False).head(10)

In [None]:
#Look at how these numbers compare to other KC players: total EPA/WPA by receiver
#on plays where KC had the ball.
#NOTE: the result is stored under the existing name `kc_qbs` even though it holds receivers.
#The columns are selected with a list; a bare tuple (['epa','wpa']) raises on pandas >= 2.0
kc_qbs = (
    df.loc[df['posteam'] == 'KC']
      .groupby('receiver_player_name')[['epa', 'wpa']]
      .sum()
)
kc_qbs.sort_values(by='epa', ascending=False).head(20)

In [None]:
#Look at how these numbers compare to other KC players: total EPA/WPA by rusher
#on plays where KC had the ball.
#NOTE: the result is stored under the existing name `kc_qbs` even though it holds rushers.
#The columns are selected with a list; a bare tuple (['epa','wpa']) raises on pandas >= 2.0
kc_qbs = (
    df.loc[df['posteam'] == 'KC']
      .groupby('rusher_player_name')[['epa', 'wpa']]
      .sum()
)
kc_qbs.sort_values(by='epa', ascending=False).head(10)

 ## Punting Yards Over Expectation Model

In [None]:
#look at how field position affects punting yards and epa: average of each per yard line.
#The columns are selected with a list; a bare tuple (['yards','epa']) raises on pandas >= 2.0
punt_fp = punt_df.groupby('yardline_100')[['yards', 'epa']].mean()
#Plot of punt yards by field position
plt.scatter(punt_fp.index, punt_fp['yards'], c='k')

plt.title('Average Net Punt Yards by Field Position\n')
plt.ylabel('Net Punt Yards\n')
plt.xlabel('\nField Position (measured from opponent end zone)')

Looks like a polynomial regression would be good here - quadratic should be fine

In [None]:
#imports
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

#Get our data: one feature (field position) and the punt yards as the target
X = punt_df['yardline_100'].values.reshape(-1, 1)
y = punt_df['yards'].values

#Expand field position into quadratic features (degree-2 polynomial)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

#The expander is fitted once, on the raw feature above. A second poly.fit(X_poly, y)
#would re-fit it on its own expanded output and leave it expecting three input columns,
#so that call has been removed.
#Ordinary least squares on the expanded features gives the quadratic fit
lin2 = LinearRegression()
lin2.fit(X_poly, y)

In [None]:
#Evaluate the fitted curve at 101 evenly spaced field positions from 0 to 100
field_position = np.linspace(0, 100, 101).reshape(-1, 1)
#fit_transform (rather than transform) is kept so this cell does not depend on
#what `poly` was last fitted on
curve_features = poly.fit_transform(field_position)
fitted_yards = lin2.predict(curve_features)

#Red line: model prediction; black dots: observed average per yard line
plt.plot(field_position, fitted_yards, color='red')
plt.scatter(punt_fp.index, punt_fp.yards, c='k')

Looks good to me!

In [None]:
#Score every punt with the model and store the result on the punts dataframe:
#  exp_punt_yds - predicted yards given the field position
#  pyoe         - punt yards over expectation (actual minus predicted)
punt_yardlines = punt_df['yardline_100'].values.reshape(-1, 1)
#fit_transform (rather than transform) is kept so this cell does not depend on
#what `poly` was last fitted on
punt_features = poly.fit_transform(punt_yardlines)
punt_df['exp_punt_yds'] = lin2.predict(punt_features)
punt_df['pyoe'] = punt_df['yards'] - punt_df['exp_punt_yds']

In [None]:
#group punters to find best punter by pyoe
punter_groups = punt_df.groupby('punter_player_name')

#Per-punt averages. The columns are selected with a list: selecting several columns
#with a bare tuple (['yards','pyoe']) is deprecated and raises on pandas >= 2.0
punters = punter_groups[['yards', 'pyoe']].mean()
#Get number of attempts (punts with a non-null epa)
punters['attempts'] = punter_groups['epa'].count()
#Get team abbr for the team the punter had the most punts for.
#Series.mode() returns every tied value, so take the first one to always get a single
#abbreviation (usable as a key into `colors`); NaN if the punter has no team recorded
punters['team'] = punter_groups['posteam'].agg(
    lambda teams: teams.mode().iloc[0] if teams.notna().any() else np.nan
)
#Career total of yards over expectation
punters['total_pyoe'] = punter_groups['pyoe'].sum()

#limit to punters with more than 150 attempts, sort by average pyoe,
#and turn the punter name back into a column
punters = (
    punters.loc[punters['attempts'] > 150]
           .sort_values(by='pyoe', ascending=False)
           .reset_index()
)
#only show top 20
punters.head(20)

In [None]:
#Quick visualization of this
#Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later removed
#the old names, so use whichever spelling this installation provides
talk_style = 'seaborn-talk' if 'seaborn-talk' in plt.style.available else 'seaborn-v0_8-talk'
plt.style.use(talk_style)
plt.style.use('fivethirtyeight')

#plot our punters, one scatter call per punter so each gets his team's color;
#marker size scales with the number of attempts
fig, ax = plt.subplots()
for _, punter in punters.iterrows():
    point = (punter['pyoe'], punter['total_pyoe'])
    ax.scatter(point[0], point[1], c=colors[punter['team']],
               s=punter['attempts'] * .5, alpha=0.8)

    ax.annotate(punter['punter_player_name'], point,
                textcoords="offset points",  # how to position the text
                xytext=(0, 12),  # distance from text to points (x,y)
                ha='center', fontsize=11)

#Graph labels
ax.set_title('Punter Total and Average Punt Yards Over Expectation (PYOE) Since 2000\n')
ax.set_ylabel('Total Punt Yards Over Expectation\n')
ax.set_xlabel('\nAverage Punt Yards Over Expectation');