In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#Let pandas display up to 500 rows and 300 columns so wide play-by-play frames are not truncated
pd.options.display.max_rows = 500
pd.options.display.max_columns = 300
#Turn off pandas' chained-assignment warning for the rest of the notebook
pd.set_option('mode.chained_assignment', None)

In [None]:
#Location of the 2018 regular-season play-by-play CSV in the nflscrapR-data repository
pbp_2018_url = 'https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/play_by_play_data/regular_season/reg_pbp_2018.csv'
#Read the whole file into a DataFrame (requires network access)
data = pd.read_csv(pbp_2018_url)

In [None]:
#Keep only rows that have an EPA value and are a pass, run, or no_play
#(this drops quarter endings and other non-play events)
has_epa = data['epa'].notnull()
is_kept_play_type = data['play_type'].isin(['no_play', 'pass', 'run'])
data = data.loc[has_epa & is_kept_play_type]

#Remove timeouts: rows that are not a replay/challenge and mention 'Timeout' in the description
is_timeout = (data['replay_or_challenge'] == 0) & (data['desc'].str.contains('Timeout'))
data = data.loc[~is_timeout]

#Classify running plays with penalties as runs
run_pattern = 'left end|left tackle|left guard|up the middle|right guard|right tackle|right end|rushes'
data.loc[data['desc'].str.contains(run_pattern), 'play_type'] = 'run'

#Classify scrambles, sacks, and incomplete/complete with penalty as pass plays instead of runs/no play
#(applied after the run rule, so a description matching both ends up as 'pass')
pass_pattern = 'scrambles|sacked|pass'
data.loc[data['desc'].str.contains(pass_pattern), 'play_type'] = 'pass'

#Remove kneels and spikes
#Kneels and spikes with penalties are classified as no_play, so the play_type filter above kept them
is_kneel_or_spike = data['desc'].str.contains('kneels|spiked')
data = data.loc[~is_kneel_or_spike]

#Renumber the rows 0..n-1 so that index labels and positions coincide
data = data.reset_index(drop=True)

In [None]:
#Create a smaller dataframe with run plays where rusher_player_name is null
rusher_nan = data.loc[(data['play_type'] == 'run') &
         (data['rusher_player_name'].isnull())]

#Create a list of the index labels for the plays where rusher_player_name is null
rusher_nan_indices = list(rusher_nan.index)

#Words that mark the run direction in a play description ('up' as in 'up the middle')
direction_words = {'right', 'up', 'left'}

for i in rusher_nan_indices:
    #Split the description on whitespace, isolating each word.
    #i is an index label, so read it with the label-based .at accessor
    desc = data.at[i, 'desc'].split()

    #Start at the second word: the first word has no predecessor, and desc[-1]
    #would silently wrap around to the last word of the description
    for j in range(1, len(desc)):
        if desc[j] in direction_words:
            #Set rusher_player_name to the word just before the direction.
            #Written through data.at in a single step: the previous
            #data['rusher_player_name'].iloc[i] = ... was chained assignment, which is
            #not guaranteed to modify data and has no effect under Copy-on-Write.
            #As before, if several direction words appear the last one wins.
            #NOTE(review): for descriptions like 'X rushes up the middle' this
            #stores 'rushes' rather than the player - confirm against the data
            data.at[i, 'rusher_player_name'] = desc[j-1]

In [None]:
#Create a smaller dataframe with pass plays where passer_player_name is null
passer_nan = data.loc[(data['play_type'] == 'pass') &
         (data['passer_player_name'].isnull())]

#Create a list of the index labels for the plays where passer_player_name is null
passer_nan_indices = list(passer_nan.index)

for i in passer_nan_indices:
    #Split the description on whitespace, isolating each word.
    #i is an index label, so read it with the label-based .at accessor
    desc = data.at[i, 'desc'].split()

    #Start at the second word: the first word has no predecessor, and desc[-1]
    #would silently wrap around to the last word of the description
    for j in range(1, len(desc)):
        if desc[j] == 'pass':
            #Set passer_player_name to the word just before 'pass'.
            #Written through data.at in a single step: the previous
            #data['passer_player_name'].iloc[i] = ... was chained assignment, which is
            #not guaranteed to modify data and has no effect under Copy-on-Write.
            #As before, if 'pass' appears several times the last occurrence wins
            data.at[i, 'passer_player_name'] = desc[j-1]

#'Backward pass' descriptions leave the word 'Backward' in the name column; blank those out again
data.loc[data['passer_player_name'] == 'Backward', 'passer_player_name'] = np.nan

In [None]:
#Add a 0/1 'success' column: 1 when the play gained expected points (epa > 0), else 0.
#The column is inserted at position 69 only if it does not exist yet, so re-running this
#cell no longer raises "cannot insert success, already exists"
if 'success' not in data.columns:
    data.insert(min(69, data.shape[1]), 'success', 0)

#Recompute the flag from scratch on every run so the cell is idempotent
data['success'] = (data['epa'] > 0).astype('int64')

In [None]:
#Count, per passer_player_name, the plays that carry an EPA value
plays_per_passer = data.groupby('passer_player_name')['epa'].count()

#Keep only passers with at least 80 such plays and retain just their names (an Index)
passers = plays_per_passer[plays_per_passer >= 80].index

In [None]:
#Masks shared by the EPA selections below
is_run = data['play_type'] == 'run'
rusher_is_passer = data['rusher_player_name'].isin(passers)

#EPA of run plays whose rusher is one of the qualifying passers
epa_qb_running = data.loc[is_run & rusher_is_passer, 'epa']
#EPA of every play whose description mentions 'scrambles'
epa_scrambles = data.loc[data['desc'].str.contains('scrambles'), 'epa']
#EPA of run plays whose rusher is not one of the qualifying passers
epa_nonqb_running = data.loc[is_run & ~rusher_is_passer, 'epa']

In [None]:
#Chart designed QB runs and scrambles
fig, ax = plt.subplots(figsize=(10,6))

#Overlay the two EPA distributions; the scramble histogram is semi-transparent so both stay visible
ax.hist(epa_qb_running, label='Designed QB Runs', bins=20, color='slategrey')
ax.hist(epa_scrambles, label='Scrambles', bins=20, color='springgreen', alpha=.6)

#The y-axis is linear in this chart (no set_yscale call), so the label must not claim a log scale
ax.set_ylabel('Number of Plays')
#One tick per whole EPA point from -9 to 9
ax.set_xticks(np.linspace(-9,9,19))
ax.set_xlabel('Expected Points Added')
ax.set_title('EPA on Designed QB Runs vs Scrambles - 2018')
#Data-source credit, positioned in data coordinates
ax.text(5.7, 5, 'Data from nflscrapR', alpha=.7)
ax.legend()

plt.savefig('epa_qb_runs_vs_scrambles.png', dpi=400)

In [None]:
#Chart designed qb runs and non-qb runs
fig, ax = plt.subplots(figsize=(10,6))

#Each label now matches the series it is drawn from (the two labels were previously swapped)
ax.hist(epa_nonqb_running, label='Non-QB Runs', bins=20, color='slategrey')
ax.hist(epa_qb_running, label='Designed QB Runs', bins=20, color='springgreen')

#Log-scaled counts so both series stay readable on one axis
ax.set_yscale('log')
ax.set_yticks([1, 10, 100, 1000, 10000])
ax.set_ylabel('Number of Plays (log scale)')
#One tick per whole EPA point from -9 to 9
ax.set_xticks(np.linspace(-9,9,19))
ax.set_xlabel('Expected Points Added')
#Title describes what is plotted: no scrambles are drawn and no neutral-situation filter is applied here
ax.set_title('EPA on Designed QB Runs vs Non-QB Runs - 2018')
#Data-source credit, positioned in data coordinates
ax.text(5.8, 2000, 'Data from nflscrapR', alpha=.7)
ax.legend()

plt.savefig('epa_qb_runs_vs_non_qb.png', dpi=400)

In [None]:
#Chart all three categories
fig, ax = plt.subplots(figsize=(10,6))

#Each label now matches the series it is drawn from (the first two labels were previously swapped)
ax.hist(epa_nonqb_running, label='Non-QB Runs', bins=20, color='slategrey')
ax.hist(epa_qb_running, label='Designed QB Runs', bins=20, color='springgreen')
ax.hist(epa_scrambles, label='Scrambles', bins= 20, color='dodgerblue', alpha=.7)

#Log-scaled counts so all three series stay readable on one axis
ax.set_yscale('log')
ax.set_yticks([1, 10, 100, 1000, 10000])
ax.set_ylabel('Number of Plays (log scale)')
#One tick per whole EPA point from -9 to 9
ax.set_xticks(np.linspace(-9,9,19))
ax.set_xlabel('Expected Points Added')
ax.set_title('EPA: Designed QB Runs vs Non-QB Runs vs Scrambles - 2018')
#Data-source credit, positioned in data coordinates
ax.text(5.8, 1500, 'Data from nflscrapR', alpha=.7)
ax.legend()

plt.savefig('all_runs.png', dpi=400)

In [None]:
#Success rates (mean of the 0/1 'success' flag) for designed runs and scrambles
run_play_mask = data['play_type'] == 'run'
passer_rusher_mask = data['rusher_player_name'].isin(passers)

#Runs by anyone who is not a qualifying passer
non_qb_run_success = data.loc[run_play_mask & ~passer_rusher_mask, 'success'].mean()
#Runs by a qualifying passer
qb_run_success = data.loc[run_play_mask & passer_rusher_mask, 'success'].mean()
#Plays whose description mentions 'scrambles'
scramble_success = data.loc[data['desc'].str.contains('scrambles'), 'success'].mean()

In [None]:
#Create a summary dataframe: one row per play category, built from its EPA series and success rate
summary_inputs = {
    'Non-QB Designed Runs': (epa_nonqb_running, non_qb_run_success),
    'QB Designed Runs': (epa_qb_running, qb_run_success),
    'Scrambles': (epa_scrambles, scramble_success),
}

df = pd.DataFrame(index=list(summary_inputs), columns=['Mean EPA', 'St. Dev', 'Success Rate', 'Attempts'])
for row_label, (epa_values, success_rate) in summary_inputs.items():
    #Mean EPA, its standard deviation, the success rate, and the number of plays
    df.loc[row_label] = [epa_values.mean(), epa_values.std(), success_rate, len(epa_values)]

#Convert the three statistic columns to float and round them for display
for column, decimals in (('Mean EPA', 3), ('St. Dev', 3), ('Success Rate', 2)):
    df[column] = df[column].astype(float).round(decimals)