In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

In [2]:
# Notebook-wide pandas settings: allow wide/long frames to render in full and
# silence the SettingWithCopyWarning raised by chained assignment.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 300
pd.set_option('mode.chained_assignment', None)

In [72]:
# Source CSV read straight from the nflscrapR-data GitHub repository
# (file name: reg_pbp_2018.csv).
PBP_2018_URL = 'https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/play_by_play_data/regular_season/reg_pbp_2018.csv'
data = pd.read_csv(PBP_2018_URL)

In [73]:
# Keep only rows that have an EPA value and whose play_type is a pass, a run,
# or a "no_play"; this discards quarter-end markers and other events.
scrimmage_types = ['no_play', 'pass', 'run']
has_epa = data['epa'].notnull()
data = data.loc[has_epa & data['play_type'].isin(scrimmage_types)]

# Drop timeouts: rows not flagged as a replay/challenge whose description
# mentions "Timeout".
timeout_rows = (data['replay_or_challenge'] == 0) & data['desc'].str.contains('Timeout')
data = data.drop(data.loc[timeout_rows].index)

# Relabel plays from their description so that penalised plays keep their
# real type. Runs are tagged first; the pass rule runs second and therefore
# wins when a description matches both patterns.
run_pattern = 'left end|left tackle|left guard|up the middle|right guard|right tackle|right end|rushes'
data.loc[data['desc'].str.contains(run_pattern), 'play_type'] = 'run'

# Scrambles, sacks and (in)complete passes count as pass plays.
pass_pattern = 'scrambles|sacked|pass'
data.loc[data['desc'].str.contains(pass_pattern), 'play_type'] = 'pass'

# Remove kneels, spikes and kicks. Kneels/spikes with a penalty are stored as
# no_play, so they survive the play_type filter above and are removed here.
is_kneel_or_spike = data['desc'].str.contains('kneels|spiked')
is_kick = data['desc'].str.contains('kicks')
data = data.loc[~is_kneel_or_spike & ~is_kick]

# Renumber rows 0..n-1 so index labels equal row positions from here on.
data = data.reset_index(drop=True)

In [74]:
# Back-fill rusher_player_name for run plays where it is missing by reading
# the name out of the play description.

# Run plays where rusher_player_name is null
rusher_nan = data.loc[(data['play_type'] == 'run') &
                      (data['rusher_player_name'].isnull())]

# Index labels of those plays
rusher_nan_indices = list(rusher_nan.index)

# Words that are treated as the start of the run direction in a description
direction_words = ('right', 'up', 'left')

for i in rusher_nan_indices:
    # Split the description on whitespace, isolating each word.
    # Label-based access is used for both the read and the write; `data` was
    # re-indexed 0..n-1 in the cleaning cell, so labels match row positions.
    desc = data.at[i, 'desc'].split()

    # Start at 1: a direction word in position 0 has no preceding word, and
    # desc[j-1] would otherwise wrap around to the LAST word of the description.
    for j in range(1, len(desc)):
        if desc[j] in direction_words:
            # The rusher is the word just before the direction word.
            # Write through data.at[...] instead of data[col].iloc[i] = ...:
            # the chained form assigns into a temporary Series and is not
            # guaranteed to update `data` (it never does under Copy-on-Write).
            # The loop does not stop at the first hit, so when several
            # direction words occur the last one determines the name.
            data.at[i, 'rusher_player_name'] = desc[j-1]

In [75]:
# Back-fill passer_player_name for pass plays where it is missing by reading
# the name out of the play description.

# Pass plays where passer_player_name is null
passer_nan = data.loc[(data['play_type'] == 'pass') &
                      (data['passer_player_name'].isnull())]

# Index labels of those plays
passer_nan_indices = list(passer_nan.index)

for i in passer_nan_indices:
    # Label-based access is used for both the read and the write; `data` was
    # re-indexed 0..n-1 in the cleaning cell, so labels match row positions.
    desc = data.at[i, 'desc'].split()

    # Start at 1: if 'pass' were the first word there is no preceding word,
    # and desc[j-1] would wrap around to the LAST word of the description.
    for j in range(1, len(desc)):
        if desc[j] == 'pass':
            # The passer is the word just before 'pass'.
            # Write through data.at[...] instead of data[col].iloc[i] = ...:
            # the chained form assigns into a temporary Series and is not
            # guaranteed to update `data` (it never does under Copy-on-Write).
            # The loop does not stop at the first hit, so when 'pass' occurs
            # several times the last occurrence determines the name.
            # NOTE(review): first-match may be what is wanted - confirm.
            data.at[i, 'passer_player_name'] = desc[j-1]

# A word-before-'pass' of 'Backward' is not a player name; reset those to NaN.
data.loc[data['passer_player_name'] == 'Backward', 'passer_player_name'] = float('NaN')

In [76]:
# Add an integer 'success' column at column position 69:
# 1 for plays with positive EPA, 0 for every other play.
data.insert(69, 'success', (data['epa'] > 0).astype('int64'))

In [110]:
# Distinct game identifiers present in the cleaned play-by-play data
game_ids = set(data['game_id'].tolist())

In [159]:
# Per-team, per-game accumulators. Every game contributes two entries to each
# list: the away team's first, then the home team's.
run_plays, run_epa, run_success = [], [], []
dropbacks, dropbacks_epa, dropbacks_success = [], [], []
score, wp = [], []

for game_id in game_ids:
    all_plays = data.loc[data['game_id'] == game_id]

    # Team names are read from the game's first row, before any filtering
    visitors = all_plays['away_team'].values[0]
    hosts = all_plays['home_team'].values[0]

    # Play constraints: first half only, downs 1-3
    in_first_half = all_plays['game_half'] == 'Half1'
    early_down = all_plays['down'] <= 3
    sample = all_plays.loc[in_first_half & early_down]

    # (team, score column, win-probability column) for each side, away first
    sides = (
        (visitors, 'total_away_score', 'away_wp'),
        (hosts, 'total_home_score', 'home_wp'),
    )

    for team, score_col, wp_col in sides:
        team_plays = sample.loc[sample['posteam'] == team]
        team_runs = team_plays.loc[team_plays['play_type'] == 'run']
        team_passes = team_plays.loc[team_plays['play_type'] == 'pass']

        # Volume, mean EPA and success rate on runs
        run_plays.append(len(team_runs))
        run_epa.append(team_runs['epa'].mean())
        run_success.append(team_runs['success'].mean())

        # Volume, mean EPA and success rate on dropbacks
        dropbacks.append(len(team_passes))
        dropbacks_epa.append(team_passes['epa'].mean())
        dropbacks_success.append(team_passes['success'].mean())

        # Score and win probability are taken from the last row of the team's
        # filtered plays, i.e. they are values recorded on a first-half play.
        score.append(team_plays[score_col].values[-1])
        wp.append(team_plays[wp_col].values[-1])

In [160]:
# Convert the count, score and win-probability lists to NumPy arrays
run_plays, dropbacks, score, wp = (
    np.asarray(values) for values in (run_plays, dropbacks, score, wp)
)

In [161]:
# Correlation between each team-game's mean EPA on run plays (first half,
# downs 1-3) and the score recorded for that team; [1,0] selects the
# off-diagonal entry of the 2x2 matrix returned by np.corrcoef.
np.corrcoef(run_epa,score)[1,0]

0.4187711484811011

In [162]:
# Correlation between each team-game's run success rate (mean of the 0/1
# 'success' flag, i.e. share of runs with positive EPA) and the score
# recorded for that team; [1,0] is the off-diagonal entry of the 2x2 matrix.
np.corrcoef(run_success,score)[1,0]

0.3401389652989098

In [163]:
# Correlation between each team-game's mean EPA on dropbacks (plays typed
# 'pass'; first half, downs 1-3) and the score recorded for that team;
# [1,0] is the off-diagonal entry of the 2x2 matrix.
np.corrcoef(dropbacks_epa,score)[1,0]

0.678077051570628

In [164]:
# Correlation between each team-game's dropback success rate (mean of the
# 0/1 'success' flag on plays typed 'pass') and the score recorded for that
# team; [1,0] is the off-diagonal entry of the 2x2 matrix.
np.corrcoef(dropbacks_success,score)[1,0]

0.5348761798357784