In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox, AnchoredText

In [None]:
# Notebook-wide pandas configuration.
# Show up to 500 rows and 300 columns when a frame is displayed.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 300
# Turn off pandas' chained-assignment check (no warning, no exception).
pd.set_option('mode.chained_assignment', None)

In [None]:
# 2018 regular-season play-by-play CSV from the nflscrapR-data GitHub repository.
PBP_2018_URL = (
    'https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/'
    'play_by_play_data/regular_season/reg_pbp_2018.csv'
)
# Downloads the file on every run; requires network access.
data = pd.read_csv(PBP_2018_URL)

In [None]:
#Get rid of quarters ending and other events: keep only rows that have an EPA
#value and whose play_type is no_play, pass or run.
#.copy() makes the result an independent frame, so the .loc assignments below
#never write into a slice of the downloaded data.
is_scrimmage_play = data['play_type'].isin(['no_play', 'pass', 'run'])
data = data.loc[data['epa'].notnull() & is_scrimmage_play].copy()

#Remove timeouts: rows whose description mentions 'Timeout' and that are not
#flagged as a replay or challenge. na=False treats a missing description as
#"no match" instead of letting NaN leak into the boolean mask.
is_timeout = (data['replay_or_challenge'] == 0) & data['desc'].str.contains('Timeout', na=False)
data = data.loc[~is_timeout].copy()

#Classify running plays with penalties as runs
run_phrases = 'left end|left tackle|left guard|up the middle|right guard|right tackle|right end|rushes'
data.loc[data['desc'].str.contains(run_phrases, na=False), 'play_type'] = 'run'

#Classify scrambles, sacks, and incomplete/complete with penalty as pass plays instead of runs/no play
#This runs after the run rule on purpose: a description matching both ends up as 'pass'.
data.loc[data['desc'].str.contains('scrambles|sacked|pass', na=False), 'play_type'] = 'pass'

#Remove kneels and spikes
#Kneels and spikes with penalties are classified as no_play, thus not removed by the filter above
#na=True also drops rows without a description, which is what the previous
#`== False` comparison did (NaN == False is False).
is_kneel_or_spike = data['desc'].str.contains('kneels|spiked', na=True)

#Reindex data dataframe so labels are 0..n-1 again
data = data.loc[~is_kneel_or_spike].reset_index(drop=True)

In [None]:
def _rusher_from_desc(description):
    """Return the word that precedes a run-direction word in a play description.

    The description is split on whitespace. Whenever a word is exactly
    'right', 'up' or 'left', the word before it is taken as the rusher name.
    If several direction words occur, the last one wins. A direction word in
    first position is skipped, because there is no word before it.

    Returns None when no direction word is found.
    """
    words = description.split()
    rusher = None
    for previous_word, word in zip(words, words[1:]):
        if word in ('right', 'up', 'left'):
            rusher = previous_word
    return rusher


#Create a smaller dataframe with plays where rusher_player_name is null
rusher_nan = data.loc[(data['play_type'] == 'run') &
                      (data['rusher_player_name'].isnull())]

#Create a list of the index labels for the plays where rusher_player_name is null
rusher_nan_indices = list(rusher_nan.index)

#Extract a name from each description; na_action='ignore' skips missing
#descriptions and dropna() discards plays where no name could be found,
#so those rows keep their null value.
recovered_rushers = rusher_nan['desc'].map(_rusher_from_desc, na_action='ignore').dropna()

#Write back with a single label-based .loc assignment. Assigning through
#data['col'].iloc[i] is chained assignment, which is not guaranteed to
#modify `data` itself.
if not recovered_rushers.empty:
    data.loc[recovered_rushers.index, 'rusher_player_name'] = recovered_rushers

In [None]:
def _passer_from_desc(description):
    """Return the word that precedes the word 'pass' in a play description.

    The description is split on whitespace. If 'pass' occurs several times,
    the last occurrence wins. A leading 'pass' is skipped, because there is
    no word before it.

    Returns None when the word 'pass' is not found.
    """
    words = description.split()
    passer = None
    for previous_word, word in zip(words, words[1:]):
        if word == 'pass':
            passer = previous_word
    return passer


#Pass plays where passer_player_name is null
passer_nan = data.loc[(data['play_type'] == 'pass') &
                      (data['passer_player_name'].isnull())]

#Index labels of those plays
passer_nan_indices = list(passer_nan.index)

#Extract a name from each description; na_action='ignore' skips missing
#descriptions and dropna() discards plays where no name could be found,
#so those rows keep their null value.
recovered_passers = passer_nan['desc'].map(_passer_from_desc, na_action='ignore').dropna()

#Write back with a single label-based .loc assignment. Assigning through
#data['col'].iloc[i] is chained assignment, which is not guaranteed to
#modify `data` itself.
if not recovered_passers.empty:
    data.loc[recovered_passers.index, 'passer_player_name'] = recovered_passers

#'Backward' is the word before 'pass' in "Backward pass", not a player name:
#reset it to missing wherever it appears in the column.
data.loc[data['passer_player_name'] == 'Backward', 'passer_player_name'] = float('NaN')

In [None]:
def _receiver_from_desc(description):
    """Return the word that follows '<direction> to' in a play description.

    The description is split on whitespace and scanned for a word equal to
    'left', 'right' or 'middle' that is immediately followed by 'to'; the
    word after 'to' is taken as the receiver name. If the pattern occurs
    several times, the last occurrence wins. Scanning three consecutive
    words at a time means a direction word at the very end of the
    description cannot index past the end of the list.

    Returns None when the pattern is not found.
    """
    words = description.split()
    receiver = None
    for direction, connector, target in zip(words, words[1:], words[2:]):
        if direction in ('left', 'right', 'middle') and connector == 'to':
            receiver = target
    return receiver


#Pass plays with no receiver name, excluding scrambles, sacks and incompletions.
#na=True keeps rows with a missing description out of the selection, which is
#what the previous `== False` comparison did.
receiver_nan = data.loc[(data['play_type'] == 'pass') &
                        (data['receiver_player_name'].isnull()) &
                        (~data['desc'].str.contains('scrambles|sacked|incomplete', na=True))]

#Index labels of those plays
receiver_nan_indices = list(receiver_nan.index)

#Extract a name from each description; dropna() discards plays where no name
#could be found, so those rows keep their null value.
recovered_receivers = receiver_nan['desc'].map(_receiver_from_desc, na_action='ignore').dropna()

#Write back with a single label-based .loc assignment. Assigning through
#data['col'].iloc[i] is chained assignment, which is not guaranteed to
#modify `data` itself.
if not recovered_receivers.empty:
    data.loc[recovered_receivers.index, 'receiver_player_name'] = recovered_receivers

In [None]:
#Flag successful plays: 1 when EPA is positive, otherwise 0 (including missing EPA).
success_flag = (data['epa'] > 0).astype('int64')

if 'success' in data.columns:
    #Re-running the cell: refresh the existing column instead of inserting it
    #again, which would raise "cannot insert success, already exists".
    data['success'] = success_flag
else:
    #Keep the column at position 69, clamped so that a frame with fewer
    #columns does not raise an out-of-bounds error.
    data.insert(min(69, data.shape[1]), 'success', success_flag)

In [None]:
#Run plays on downs 1-4 (rows with a missing down fail the comparison and are
#left out), grouped by the rusher's name.
run_plays_by_rusher = data.loc[
    (data['play_type'] == 'run') & (data['down'] <= 4)
].groupby('rusher_player_name')

#One row per rusher: mean EPA, mean success flag, and the number of run plays
#with a non-null EPA.
rb_run_value = pd.DataFrame({
    'run_epa': run_plays_by_rusher['epa'].mean(),
    'run_success': run_plays_by_rusher['success'].mean(),
    'run_attempts': run_plays_by_rusher['epa'].count(),
})
rb_run_value.index.name = 'player_name'

In [None]:
#Pass plays on downs 1-4 (rows with a missing down fail the comparison and are
#left out), grouped by the receiver's name.
pass_plays_by_receiver = data.loc[
    (data['play_type'] == 'pass') & (data['down'] <= 4)
].groupby('receiver_player_name')

#One row per receiver: mean EPA, mean success flag, and the number of pass
#plays with a non-null EPA on which the player is the named receiver.
#NOTE(review): no completion filter is applied here, so 'pass_receptions' may
#be closer to targets than to receptions - confirm the intended meaning.
rb_pass_value = pd.DataFrame({
    'pass_epa': pass_plays_by_receiver['epa'].mean(),
    'pass_success': pass_plays_by_receiver['success'].mean(),
    'pass_receptions': pass_plays_by_receiver['epa'].count(),
})
rb_pass_value.index.name = 'player_name'

In [None]:
#Inner join on the shared 'player_name' index: only names present in both the
#rushing and the receiving table are kept.
rb_value = rb_run_value.merge(rb_pass_value, how='inner', on='player_name')

In [None]:
#Volume cut-offs: keep players with at least 40 run attempts and at least 20
#plays counted in pass_receptions.
has_enough_runs = rb_value['run_attempts'].ge(40)
has_enough_passes = rb_value['pass_receptions'].ge(20)
rb_value = rb_value.loc[has_enough_runs & has_enough_passes]

In [None]:
#One point per player: average run EPA on x, average pass EPA on y
x_data, y_data = (rb_value[column].values for column in ('run_epa', 'pass_epa'))

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x_data, y_data, s=200, alpha=.6, edgecolor='black', color='darkcyan')

#Dashed y = x reference line spanning the x-limits chosen for the scatter;
#points above it have a higher pass EPA than run EPA.
x_vals = np.array(ax.get_xlim())
y_vals = x_vals.copy()
ax.plot(x_vals, y_vals, linestyle='--', color='black', alpha=.6)

#Labels and title
ax.set_xlabel('Average Run EPA', fontsize=14)
ax.set_ylabel('Average Pass EPA', fontsize=14)
ax.set_title('Running Back EPA Run vs. Pass - 2018', fontsize=18)

#Data credit in the lower-right corner (loc=4) on a fully transparent patch
text_box = AnchoredText('Data from nflscrapR', frameon=True, loc=4, pad=0.5, prop=dict(alpha=.7))
text_box.patch.set_facecolor('white')
text_box.patch.set_alpha(0)
ax.add_artist(text_box)

fig.savefig('rb_epa.png', dpi=600)

In [None]:
#One point per player: run success rate on x, pass success rate on y
x_data, y_data = (rb_value[column].values for column in ('run_success', 'pass_success'))

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x_data, y_data, s=200, alpha=.6, edgecolor='black', color='darkcyan')

#Dashed y = x reference line spanning the x-limits chosen for the scatter;
#points above it have a higher pass success rate than run success rate.
x_vals = np.array(ax.get_xlim())
y_vals = x_vals.copy()
ax.plot(x_vals, y_vals, linestyle='--', color='black', alpha=.6)

#Labels and title
ax.set_xlabel('Run Success', fontsize=14)
ax.set_ylabel('Pass Success', fontsize=14)
ax.set_title('Running Back Success Run vs. Pass - 2018', fontsize=18)

#Data credit in the lower-right corner (loc=4) on a fully transparent patch
text_box = AnchoredText('Data from nflscrapR', frameon=True, loc=4, pad=0.5, prop=dict(alpha=.7))
text_box.patch.set_facecolor('white')
text_box.patch.set_alpha(0)
ax.add_artist(text_box)

fig.savefig('rb_success.png', dpi=600)