# Imports

In [343]:
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nfl_data_py as nfl

# Grabbing Data

In [3]:
# Seasons 2016 through 2023 inclusive; range() excludes its stop value, hence the + 1.
first_season, last_season = 2016, 2023
seasons = range(first_season, last_season + 1)
# Load the play-by-play rows for every requested season via nfl_data_py.
df_pbp = nfl.import_pbp_data(seasons)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
Downcasting floats.


# Grabbing Only Pass Data

Cleaning and creating our dedicated passing dataframe:

In [344]:
# Regular-season pass plays that have a recorded air-yards value.
# reset_index() without drop=True keeps the old row labels as an "index" column.
pass_play_filter = "play_type == 'pass' & air_yards.notnull() & season_type == 'REG'"
df_passes = df_pbp.query(pass_play_filter).reset_index()

In [345]:
df_passes.head()

Unnamed: 0,index,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
0,3,85.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,...,SHOTGUN,"2 RB, 1 TE, 2 WR",6.0,"4 DL, 3 LB, 4 DB",3.0,41302;40078;38540;35553;43295;38582;40053;4336...,00-0029892;00-0027714;00-0032965;00-0029893;00...,00-0031171;00-0029542;00-0029566;00-0030073;00...,11.0,11.0
1,5,130.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,...,I_FORM,"2 RB, 1 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",5.0,40078;38540;41302;35553;38582;43295;40053;4336...,00-0029892;00-0027714;00-0032965;00-0029893;00...,00-0029542;00-0031171;00-0029566;00-0030073;00...,11.0,11.0
2,9,216.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BUF,away,...,SHOTGUN,"1 RB, 2 TE, 2 WR",7.0,"3 DL, 4 LB, 4 DB",4.0,34479;40111;41277;37249;41230;36089;37100;3567...,00-0027004;00-0030046;00-0028112;00-0031325;00...,00-0031170;00-0027560;00-0027965;00-0031296;00...,11.0,10.0
3,11,261.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BUF,away,...,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"1 DL, 5 LB, 5 DB",3.0,34479;40111;41277;37249;41230;36089;40112;3710...,00-0027004;00-0030046;00-0028112;00-0031325;00...,00-0031170;00-0027560;00-0027965;00-0031296;00...,10.0,10.0
4,13,310.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,...,SHOTGUN,"1 RB, 2 TE, 2 WR",7.0,"4 DL, 3 LB, 4 DB",4.0,38540;41302;35553;38582;43295;40053;43369;4011...,00-0027714;00-0032965;00-0029893;00-0026223;00...,00-0029542;00-0031171;00-0029566;00-0030073;00...,11.0,11.0


Distinguishing between long and short passes.

In [346]:
# Label each pass by depth: 20 or more air yards is "long", everything else "short".
is_long_pass = df_passes["air_yards"] >= 20
df_passes["pass_length_air_yards"] = np.where(is_long_pass, "long", "short")

Setting null yardage values to 0

In [347]:
# Plays with no recorded passing yardage count as 0 yards.
df_passes["passing_yards"] = df_passes["passing_yards"].fillna(0)

Accumulating the play by play stats into season stats

In [348]:
# One row per passer per season. Every aggregation is given as a list so the
# result has two-level (column, statistic) headers, which a later cell flattens.
passer_season_keys = ["passer_id", "passer", "season"]
passer_season_aggs = {
    "passing_yards": ["sum", "count", "mean"],
    "epa": ["sum"],
    "pass_touchdown": ["sum"],
    "complete_pass": ["sum"],
}
df_passing_yards_by_season = df_passes.groupby(passer_season_keys).agg(passer_season_aggs)

In [349]:
df_passing_yards_by_season.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,passing_yards,passing_yards,passing_yards,epa,pass_touchdown,complete_pass
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count,mean,sum,sum,sum
passer_id,passer,season,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
00-0019596,T.Brady,2016,3554.0,431,8.245939,157.785248,28.0,291.0
00-0019596,T.Brady,2017,4577.0,581,7.877797,201.057358,32.0,385.0
00-0019596,T.Brady,2018,4355.0,568,7.667253,125.592216,29.0,375.0
00-0019596,T.Brady,2019,4057.0,613,6.618271,64.109032,24.0,373.0
00-0019596,T.Brady,2020,4633.0,606,7.645215,154.74057,40.0,401.0


In [350]:
# Collapse the two-level (column, statistic) headers into single names such as
# "passing_yards_sum", then move the group keys out of the index into columns.
df_passing_yards_by_season.columns = [
    "_".join(level_names) for level_names in df_passing_yards_by_season.columns
]
df_passing_yards_by_season = df_passing_yards_by_season.reset_index()

In [366]:
# Friendly column names, assigned by position: the order must match the frame's
# current columns (three group keys followed by the six aggregated statistics).
new_cols = [
    "passer_id",
    "passer",
    "season",
    "passing_yards",
    "passing_attempts",
    "yards_per_pass",
    "total_epa",
    "passing_touchdowns",
    "completions",
]
df_passing_yards_by_season.columns = new_cols

In [367]:
df_passing_yards_by_season.head()

Unnamed: 0,passer_id,passer,season,passing_yards,passing_attempts,yards_per_pass,total_epa,passing_touchdowns,completions
0,00-0019596,T.Brady,2016,3554.0,431,8.245939,157.785248,28.0,291.0
1,00-0019596,T.Brady,2017,4577.0,581,7.877797,201.057358,32.0,385.0
2,00-0019596,T.Brady,2018,4355.0,568,7.667253,125.592216,29.0,375.0
3,00-0019596,T.Brady,2019,4057.0,613,6.618271,64.109032,24.0,373.0
4,00-0019596,T.Brady,2020,4633.0,606,7.645215,154.74057,40.0,401.0


# Grabbing Rushing Stats

We need the passing data first: it lets us identify the quarterbacks and filter out players in non-QB position groups who may have thrown a pass.

In [291]:
# Regular-season run plays that have an identified rusher.
run_play_filter = "play_type == 'run' & season_type == 'REG' & rusher_id.notnull()"
df_runs = df_pbp.query(run_play_filter).reset_index()

In [296]:
df_passing_yards_by_season

Unnamed: 0,passer_id,passer,season,Passing Yards,Passing Attempts,Yards Per Pass,Total EPA,Passing Touchdowns,Completions
0,00-0019596,T.Brady,2016,3554.0,431,8.245939,157.785248,28.0,291.0
1,00-0019596,T.Brady,2017,4577.0,581,7.877797,201.057358,32.0,385.0
2,00-0019596,T.Brady,2018,4355.0,568,7.667253,125.592216,29.0,375.0
3,00-0019596,T.Brady,2019,4057.0,613,6.618271,64.109032,24.0,373.0
4,00-0019596,T.Brady,2020,4633.0,606,7.645215,154.740570,40.0,401.0
...,...,...,...,...,...,...,...,...,...
806,00-0038598,J.Hall,2023,101.0,10,10.100000,5.916939,0.0,8.0
807,00-0039150,B.Young,2023,2055.0,381,5.393701,-21.672869,9.0,231.0
808,00-0039152,W.Levis,2023,1266.0,183,6.918033,17.122780,7.0,107.0
809,00-0039163,C.Stroud,2023,3540.0,412,8.592233,116.298828,20.0,265.0


In [297]:
# Extract the unique passer IDs that have more than 25 completions in at least
# one season; this is the notebook's working definition of a quarterback.
# The season table's column is named `completions` (lower-case) by the rename
# cell above, and query() names are case-sensitive, so the spelling must match.
unique_passer_ids = df_passing_yards_by_season.query("completions > 25")['passer_id'].unique()

In [298]:
# Filter the rushing DataFrame to only include rows where the rusher_id is in unique_passer_ids.
# .copy() makes df_runs_qb an independent frame, so later assignments into it
# (e.g. filling missing rushing_yards) do not raise SettingWithCopyWarning.
df_runs_qb = df_runs[df_runs['rusher_id'].isin(unique_passer_ids)].copy()

In [299]:
df_runs_qb

Unnamed: 0,index,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
4,19,450.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,...,SHOTGUN,"1 RB, 2 TE, 2 WR",6.0,"3 DL, 3 LB, 5 DB",,41302;38540;37977;43295;38582;40494;37109;4025...,00-0028497;00-0032965;00-0029893;00-0026223;00...,00-0031171;00-0029542;00-0029566;00-0030041;00...,11.0,11.0
20,65,1478.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BUF,away,...,JUMBO,"6 OL, 2 RB, 2 TE, 0 WR",9.0,"4 DL, 4 LB, 3 DB",,34479;35575;40111;37249;43619;36089;43694;3567...,00-0027004;00-0030046;00-0028112;00-0032499;00...,00-0027736;00-0027560;00-0032574;00-0031296;00...,11.0,11.0
32,103,2316.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BUF,away,...,SHOTGUN,"1 RB, 1 TE, 3 WR",7.0,"2 DL, 4 LB, 5 DB",,36060;34479;41277;40111;37249;41230;37100;4124...,00-0027004;00-0030046;00-0028112;00-0031325;00...,00-0027539;00-0031170;00-0027965;00-0031296;00...,10.0,10.0
63,204,1018.0,2016_01_CAR_DEN,2016090800,DEN,CAR,REG,1,CAR,away,...,PISTOL,"1 RB, 1 TE, 3 WR",6.0,"3 DL, 4 LB, 4 DB",,39974;32252;33602;38566;37075;38672;37724;4125...,00-0025446;00-0026069;00-0031359;00-0031362;00...,00-0030577;00-0029586;00-0027940;00-0029620;00...,11.0,11.0
64,209,1119.0,2016_01_CAR_DEN,2016090800,DEN,CAR,REG,1,CAR,away,...,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"3 DL, 4 LB, 4 DB",,42546;32252;37075;35764;38672;42366;40038;3772...,00-0025446;00-0032055;00-0031362;00-0029164;00...,00-0032138;00-0027940;00-0027256;00-0029620;00...,11.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97787,375215,1725.0,2023_13_SF_PHI,2023120309,PHI,SF,REG,13,SF,away,...,SINGLEBACK,"1 RB, 1 TE, 3 WR",7.0,"3 DL, 3 LB, 5 DB",,45345;55873;53477;54599;54727;41257;54026;4781...,00-0036551;00-0037829;00-0037834;00-0035719;00...,00-0033445;00-0038386;00-0031372;00-0036813;00...,11.0,11.0
97796,375245,2466.0,2023_13_SF_PHI,2023120309,PHI,SF,REG,13,PHI,home,...,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",,53601;46757;47785;52461;46157;39950;55952;3726...,00-0036389;00-0030561;00-0028129;00-0038112;00...,00-0036563;00-0034573;00-0035717;00-0034754;00...,11.0,11.0
97797,375248,2543.0,2023_13_SF_PHI,2023120309,PHI,SF,REG,13,PHI,home,...,JUMBO,"1 RB, 2 TE, 2 WR",9.0,"5 DL, 2 LB, 4 DB",,53601;46757;52422;54663;47785;53579;46157;5246...,00-0037086;00-0036919;00-0036389;00-0030561;00...,00-0036563;00-0034573;00-0036260;00-0035717;00...,11.0,11.0
97798,375249,2565.0,2023_13_SF_PHI,2023120309,PHI,SF,REG,13,PHI,home,...,JUMBO,"1 RB, 2 TE, 2 WR",9.0,"5 DL, 2 LB, 4 DB",,53601;46757;52422;54663;47785;53579;46157;5246...,00-0037086;00-0036919;00-0036389;00-0030561;00...,00-0036563;00-0034573;00-0036260;00-0035717;00...,11.0,11.0


In [300]:
# Treat missing rushing yardage as 0 yards. Building a new frame with assign()
# avoids writing into a filtered slice of df_runs (SettingWithCopyWarning) and
# gives the same result however many times the cell is run.
df_runs_qb = df_runs_qb.assign(rushing_yards=df_runs_qb["rushing_yards"].fillna(0))

In [301]:
# One row per rusher per season. Every aggregation is given as a list so the
# result has two-level (column, statistic) headers, which the next step flattens.
rusher_season_keys = ["rusher_id", "rusher", "season"]
rusher_season_aggs = {
    "rushing_yards": ["sum", "count", "mean"],
    "epa": ["sum"],
    "rush_touchdown": ["sum"],
}
df_runs_qb_season = df_runs_qb.groupby(rusher_season_keys).agg(rusher_season_aggs)

In [302]:
df_runs_qb_season.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rushing_yards,rushing_yards,rushing_yards,epa,rush_touchdown
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count,mean,sum,sum
rusher_id,rusher,season,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
00-0019596,T.Brady,2016,10.0,7,1.428571,-3.487682,0.0
00-0019596,T.Brady,2017,14.0,8,1.75,-7.771545,0.0
00-0019596,T.Brady,2018,10.0,8,1.25,1.753143,1.0
00-0019596,T.Brady,2019,13.0,8,1.625,4.360135,3.0
00-0019596,T.Brady,2020,10.0,9,1.111111,-1.123093,3.0


In [303]:
# Collapse the two-level (column, statistic) headers into single names such as
# "rushing_yards_sum", then move the group keys out of the index into columns.
df_runs_qb_season.columns = [
    "_".join(level_names) for level_names in df_runs_qb_season.columns
]
df_runs_qb_season = df_runs_qb_season.reset_index()
df_runs_qb_season.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards_sum,rushing_yards_count,rushing_yards_mean,epa_sum,rush_touchdown_sum
0,00-0019596,T.Brady,2016,10.0,7,1.428571,-3.487682,0.0
1,00-0019596,T.Brady,2017,14.0,8,1.75,-7.771545,0.0
2,00-0019596,T.Brady,2018,10.0,8,1.25,1.753143,1.0
3,00-0019596,T.Brady,2019,13.0,8,1.625,4.360135,3.0
4,00-0019596,T.Brady,2020,10.0,9,1.111111,-1.123093,3.0


In [304]:
# Friendly column names, assigned by position: the order must match the frame's
# current columns (three group keys followed by the five aggregated statistics).
new_cols = [
    "rusher_id",
    "rusher",
    "season",
    "rushing_yards",
    "carries",
    "yards_per_carry",
    "total_epa",
    "rushing_touchdowns",
]
df_runs_qb_season.columns = new_cols

In [305]:
df_runs_qb_season.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards,carries,yards_per_carry,total_epa,rushing_touchdowns
0,00-0019596,T.Brady,2016,10.0,7,1.428571,-3.487682,0.0
1,00-0019596,T.Brady,2017,14.0,8,1.75,-7.771545,0.0
2,00-0019596,T.Brady,2018,10.0,8,1.25,1.753143,1.0
3,00-0019596,T.Brady,2019,13.0,8,1.625,4.360135,3.0
4,00-0019596,T.Brady,2020,10.0,9,1.111111,-1.123093,3.0


In [306]:
# Two independent copies of the season table: one stays as the current season,
# the other becomes the previous-season (lagged) side of the comparison.
df_now_runs, df_last_runs = df_runs_qb_season.copy(), df_runs_qb_season.copy()

In [307]:
# Suffix every statistic with "_last" so the lagged values stay distinguishable
# from the current-season columns once the two frames are joined.
last_season_names = {
    'rushing_yards': 'rushing_yards_last',
    'carries': 'carries_last',
    'yards_per_carry': 'yards_per_carry_last',
    'total_epa': 'total_epa_last',
    'rushing_touchdowns': 'rushing_touchdowns_last',
}
df_last_runs = df_last_runs.rename(columns=last_season_names)

In [308]:
# Shift last year's stats forward one season so they line up with the current
# season in the merge. The value is derived from df_runs_qb_season (same row
# index as df_last_runs) rather than `+= 1`, so re-running this cell cannot
# shift the seasons a second time and silently turn the join into a 2-year lag.
df_last_runs["season"] = df_runs_qb_season["season"] + 1

In [309]:
# Inner join: keep only player-seasons that also have a row for the season
# before, pairing each current stat with its "_last" counterpart.
lag_join_keys = ["rusher_id", "rusher", "season"]
df_lag_runs = df_now_runs.merge(df_last_runs, on=lag_join_keys, how="inner")

In [310]:
df_lag_runs.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards,carries,yards_per_carry,total_epa,rushing_touchdowns,rushing_yards_last,carries_last,yards_per_carry_last,total_epa_last,rushing_touchdowns_last
0,00-0019596,T.Brady,2017,14.0,8,1.75,-7.771545,0.0,10.0,7,1.428571,-3.487682,0.0
1,00-0019596,T.Brady,2018,10.0,8,1.25,1.753143,1.0,14.0,8,1.75,-7.771545,0.0
2,00-0019596,T.Brady,2019,13.0,8,1.625,4.360135,3.0,10.0,8,1.25,1.753143,1.0
3,00-0019596,T.Brady,2020,10.0,9,1.111111,-1.123093,3.0,13.0,8,1.625,4.360135,3.0
4,00-0019596,T.Brady,2021,14.0,7,2.0,6.495685,2.0,10.0,9,1.111111,-1.123093,3.0


In [319]:
# Player-seasons with strictly more than 20 carries in the current season.
df_lag_runs_20 = df_lag_runs.loc[df_lag_runs["carries"] > 20]

In [327]:
# Player-seasons with at least 200 rushing yards in the current season.
df_lag_runs_200_y = df_lag_runs.loc[df_lag_runs["rushing_yards"] >= 200]

In [336]:
# Player-seasons with at least 400 rushing yards in the current season.
df_lag_runs_400_y = df_lag_runs.loc[df_lag_runs["rushing_yards"] >= 400]

# Correlation for Rushing Stats

## Rushing Touchdowns

In [312]:
df_lag_runs[["rushing_touchdowns", "rushing_touchdowns_last"]].corr()

Unnamed: 0,rushing_touchdowns,rushing_touchdowns_last
rushing_touchdowns,1.0,0.536982
rushing_touchdowns_last,0.536982,1.0


In [320]:
df_lag_runs_20[["rushing_touchdowns", "rushing_touchdowns_last"]].corr()

Unnamed: 0,rushing_touchdowns,rushing_touchdowns_last
rushing_touchdowns,1.0,0.394226
rushing_touchdowns_last,0.394226,1.0


In [328]:
df_lag_runs_200_y[["rushing_touchdowns", "rushing_touchdowns_last"]].corr()

Unnamed: 0,rushing_touchdowns,rushing_touchdowns_last
rushing_touchdowns,1.0,-0.088577
rushing_touchdowns_last,-0.088577,1.0


In [337]:
df_lag_runs_400_y[["rushing_touchdowns", "rushing_touchdowns_last"]].corr()

Unnamed: 0,rushing_touchdowns,rushing_touchdowns_last
rushing_touchdowns,1.0,0.104369
rushing_touchdowns_last,0.104369,1.0


## EPA

In [313]:
df_lag_runs[["total_epa", "total_epa_last"]].corr()

Unnamed: 0,total_epa,total_epa_last
total_epa,1.0,0.215231
total_epa_last,0.215231,1.0


In [321]:
df_lag_runs_20[["total_epa", "total_epa_last"]].corr()

Unnamed: 0,total_epa,total_epa_last
total_epa,1.0,0.098788
total_epa_last,0.098788,1.0


In [330]:
df_lag_runs_200_y[["total_epa", "total_epa_last"]].corr()

Unnamed: 0,total_epa,total_epa_last
total_epa,1.0,-0.01465
total_epa_last,-0.01465,1.0


In [338]:
df_lag_runs_400_y[["total_epa", "total_epa_last"]].corr()

Unnamed: 0,total_epa,total_epa_last
total_epa,1.0,0.38201
total_epa_last,0.38201,1.0


## Yards per Carry

In [314]:
df_lag_runs[["yards_per_carry", "yards_per_carry_last"]].corr()

Unnamed: 0,yards_per_carry,yards_per_carry_last
yards_per_carry,1.0,0.401991
yards_per_carry_last,0.401991,1.0


In [322]:
df_lag_runs_20[["yards_per_carry", "yards_per_carry_last"]].corr()

Unnamed: 0,yards_per_carry,yards_per_carry_last
yards_per_carry,1.0,0.270109
yards_per_carry_last,0.270109,1.0


In [331]:
df_lag_runs_200_y[["yards_per_carry", "yards_per_carry_last"]].corr()

Unnamed: 0,yards_per_carry,yards_per_carry_last
yards_per_carry,1.0,0.123528
yards_per_carry_last,0.123528,1.0


In [339]:
df_lag_runs_400_y[["yards_per_carry", "yards_per_carry_last"]].corr()

Unnamed: 0,yards_per_carry,yards_per_carry_last
yards_per_carry,1.0,0.562382
yards_per_carry_last,0.562382,1.0


## Carries

In [315]:
df_lag_runs[["carries", "carries_last"]].corr()

Unnamed: 0,carries,carries_last
carries,1.0,0.770436
carries_last,0.770436,1.0


In [323]:
df_lag_runs_20[["carries", "carries_last"]].corr()

Unnamed: 0,carries,carries_last
carries,1.0,0.605299
carries_last,0.605299,1.0


In [332]:
df_lag_runs_200_y[["carries", "carries_last"]].corr()

Unnamed: 0,carries,carries_last
carries,1.0,0.348746
carries_last,0.348746,1.0


In [340]:
df_lag_runs_400_y[["carries", "carries_last"]].corr()

Unnamed: 0,carries,carries_last
carries,1.0,0.292299
carries_last,0.292299,1.0


## Rushing Yards

In [316]:
df_lag_runs[["rushing_yards", "rushing_yards_last"]].corr()

Unnamed: 0,rushing_yards,rushing_yards_last
rushing_yards,1.0,0.755456
rushing_yards_last,0.755456,1.0


In [324]:
df_lag_runs_20[["rushing_yards", "rushing_yards_last"]].corr()

Unnamed: 0,rushing_yards,rushing_yards_last
rushing_yards,1.0,0.590032
rushing_yards_last,0.590032,1.0


In [333]:
df_lag_runs_200_y[["rushing_yards", "rushing_yards_last"]].corr()

Unnamed: 0,rushing_yards,rushing_yards_last
rushing_yards,1.0,0.388971
rushing_yards_last,0.388971,1.0


In [341]:
df_lag_runs_400_y[["rushing_yards", "rushing_yards_last"]].corr()

Unnamed: 0,rushing_yards,rushing_yards_last
rushing_yards,1.0,0.639419
rushing_yards_last,0.639419,1.0
