# Imports

In [50]:
import pandas as pd
import numpy as np

In [2]:
import nfl_data_py as nfl

# Data

Some explanations of the stats used below:

**WOPR** (Weighted Opportunity Rating) is a weighted combination of the share of team targets a player receives and the share of team air yards. The formula is: WOPR = 1.5 × Target Market Share + 0.7 × Air Yards Market Share. Elite wide receivers typically post WOPRs of 0.70 or higher.

**Expected Points Added (EPA)** is a commonly used advanced statistic in football. In short, this stat measures how well a team performs compared to their expectation on a play-by-play basis. EPA and EPA per Play, along with other stats using EPA, can help us evaluate team and player performance.

**Passing Air Conversion Ratio (PACR)** measures the rate at which air yards are converted to passing yards. The formula is PACR = (passing yards)/(air yards).

**Receiver Air Conversion Ratio (RACR)** measures the rate at which targeted air yards are converted to receiving yards. The formula is RACR = (receiving yards)/(air yards).

**Completion Percentage Over Expectation (CPOE)**: (Completion percentage) - (expected completion percentage) for passers.

**dakota** is the adjusted EPA + CPOE composite based on coefficients which best predict adjusted EPA/play in the following year.

In [3]:
# Seasons to load; this list is passed to the play-by-play, weekly, seasonal
# and roster imports further down.
years = [2023]

## Play-By-Play

Returns play-by-play data for the years and columns specified

years : required, list of years to pull data for (earliest available is 1999)

columns : optional, list of columns to pull data for

downcast : optional, converts float64 columns to float32, reducing memory usage by ~30%. Will slow down initial load speed ~50%

cache : optional, determines whether to pull pbp data from github repo or local cache generated by nfl.cache_pbp()

alt_path : optional, required if nfl.cache_pbp() is called using an alternate path to the default cache

In [4]:
# columns=None requests every play-by-play column; cache=False pulls from the
# GitHub repo instead of a local cache.
columns = None
df_play = nfl.import_pbp_data(
    years=years,
    columns=columns,
    downcast=True,
    cache=False,
    alt_path=None,
)

2023 done.
Downcasting floats.


In [5]:
df_play.head(1)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
0,1.0,2023_01_ARI_WAS,2023091007,WAS,ARI,REG,1,,,,...,,,,,,,,,0,0


In [6]:
# Print the fields of the second play in consecutive chunks of ten.
play = df_play.iloc[1]
chunk_size = 10
for start in range(0, len(play), chunk_size):
    print(play.iloc[start:start + chunk_size])
    print()

play_id                    39.0
game_id         2023_01_ARI_WAS
old_game_id          2023091007
home_team                   WAS
away_team                   ARI
season_type                 REG
week                          1
posteam                     WAS
posteam_type               home
defteam                     ARI
Name: 1, dtype: object

side_of_field                       ARI
yardline_100                       35.0
game_date                    2023-09-10
quarter_seconds_remaining         900.0
half_seconds_remaining           1800.0
game_seconds_remaining           3600.0
game_half                         Half1
quarter_end                         0.0
drive                               1.0
sp                                  0.0
Name: 1, dtype: object

qtr                                                           1.0
down                                                          NaN
goal_to_go                                                    0.0
time                              

# Who Throws Deep?

Finding average depth of target (adot) for quarterbacks.

Grab only the pass plays. Note, in pandas, you should use the `&` operator for element-wise logical AND operations.

In [22]:
# Keep only pass plays that have a recorded air-yards value.
is_pass_play = df_play["play_type"] == "pass"
has_air_yards = df_play["air_yards"].notna()
df_pass_plays = df_play[is_pass_play & has_air_yards]

Group by 'passer_id' and 'passer', then aggregate air yards by count and mean.

In [23]:
# Per passer: number of passes with air yards and their mean depth.
passer_keys = ['passer_id', 'passer']
air_yards_stats = {"air_yards": ["count", "mean"]}
grouped = df_pass_plays.groupby(passer_keys).agg(air_yards_stats)

Creating a refined data frame where we only keep players who have attempted over 100 passes. We sort, descending, by adot.

In [27]:
# Flatten the (column, statistic) MultiIndex into names such as
# "air_yards_count". The guard makes re-running this cell a no-op: joining
# labels that are already flat strings would put "_" between every character.
if isinstance(grouped.columns, pd.MultiIndex):
    grouped.columns = ["_".join(levels) for levels in grouped.columns]

# Keep passers with more than 100 passes that have air yards, deepest average
# depth of target first.
sort_crit = "air_yards_count > 100"
print(
    grouped.query(sort_crit)
    .sort_values(by="air_yards_mean", ascending=False)
    .to_string()
)

                                air_yards_count  air_yards_mean
passer_id  passer                                              
00-0039152 W.Levis                          150       10.193334
00-0029701 R.Tannehill                      157        9.375796
00-0033537 D.Watson                         170        9.370588
00-0035228 K.Murray                         107        9.308412
00-0039163 C.Stroud                         385        9.241558
00-0036264 J.Love                           370        8.967567
00-0036389 J.Hurts                          358        8.575419
00-0034855 B.Mayfield                       380        8.497369
00-0034857 J.Allen                          432        8.284722
00-0033077 D.Prescott                       370        8.270270
00-0031280 D.Carr                           372        8.223118
00-0033275 P.Walker                         110        8.200000
00-0031345 J.Garoppolo                      167        8.119761
00-0037834 B.Purdy                      

## ADOT For Receivers

In [44]:
# Full list of available column names.
df_pass_plays.columns.tolist()

['play_id',
 'game_id',
 'old_game_id',
 'home_team',
 'away_team',
 'season_type',
 'week',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'td_player_name',
 'td_player_id',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'po

In [43]:
# Column names containing the substring "po".
[col for col in df_pass_plays.columns if "po" in col]

['posteam',
 'posteam_type',
 'extra_point_result',
 'two_point_conv_result',
 'posteam_timeouts_remaining',
 'posteam_score',
 'posteam_score_post',
 'defteam_score_post',
 'score_differential_post',
 'extra_point_prob',
 'two_point_conversion_prob',
 'home_wp_post',
 'away_wp_post',
 'extra_point_attempt',
 'two_point_attempt',
 'defensive_two_point_attempt',
 'defensive_two_point_conv',
 'defensive_extra_point_attempt',
 'defensive_extra_point_conv',
 'cpoe',
 'drive_time_of_possession',
 'possession_team']

In [39]:
# Per receiver: number of targets with air yards and their mean depth.
receiver_keys = ['receiver_id', 'receiver']
receivers = (
    df_pass_plays
    .groupby(receiver_keys)[["air_yards"]]
    .agg(["count", "mean"])
)

In [40]:
# Flatten the (column, statistic) MultiIndex into names such as
# "air_yards_count". The guard makes re-running this cell a no-op: joining
# labels that are already flat strings would put "_" between every character.
if isinstance(receivers.columns, pd.MultiIndex):
    receivers.columns = ["_".join(levels) for levels in receivers.columns]

# Keep receivers with more than 50 targets that have air yards, deepest
# average depth of target first.
sort_crit = "air_yards_count > 50"
print(
    receivers.query(sort_crit)
    .sort_values(by="air_yards_mean", ascending=False)
    .to_string()
)

                            air_yards_count  air_yards_mean
receiver_id receiver                                       
00-0036261  B.Aiyuk                      65       15.107693
00-0037545  R.Shaheed                    55       14.781818
00-0030564  D.Hopkins                    83       14.674699
00-0031408  M.Evans                      91       14.274725
00-0038977  N.Dell                       75       14.266666
00-0031544  A.Cooper                     86       14.139535
00-0036196  G.Davis                      67       13.880597
00-0037239  C.Olave                     103       13.757281
00-0035640  D.Metcalf                    80       13.550000
00-0037247  G.Pickens                    73       12.945206
00-0034837  C.Ridley                     76       12.671053
00-0036912  D.Smith                      76       12.460526
00-0035676  A.Brown                     105       12.447619
00-0035662  M.Brown                      98       11.959184
00-0035216  D.Johnson                   

# Sticky Stats

In [45]:
# Load play-by-play data for 2016 through 2023 inclusive.
first_season, last_season = 2016, 2023
seasons = range(first_season, last_season + 1)
df_pbp = nfl.import_pbp_data(seasons)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
Downcasting floats.


In [46]:
# Pass plays with a recorded air-yards value. reset_index() without drop=True
# keeps the previous row labels as an "index" column.
pass_with_air_yards = (df_pbp["play_type"] == "pass") & df_pbp["air_yards"].notnull()
df_passes = df_pbp[pass_with_air_yards].reset_index()

A *long pass* is a pass with 20 or more air yards; a *short pass* is one with fewer than 20 air yards.

In [52]:
# Label each pass "long" (20+ air yards) or "short" (everything else).
is_long_pass = df_passes["air_yards"] >= 20
df_passes["pass_length_air_yards"] = np.where(is_long_pass, "long", "short")

In [53]:
# Plays with no recorded passing yards count as zero yards.
yards_missing = df_passes["passing_yards"].isnull()
df_passes["passing_yards"] = np.where(yards_missing, 0, df_passes["passing_yards"])

In [56]:
df_passes["passing_yards"].describe()

count    143732.000000
mean          7.176259
std           9.645344
min         -20.000000
25%           0.000000
50%           5.000000
75%          11.000000
max          98.000000
Name: passing_yards, dtype: float64

## Passing yards by pass type

In [57]:
# Distribution of passing yards on short passes.
short_passes = df_passes[df_passes["pass_length_air_yards"] == "short"]
short_passes["passing_yards"].describe()

count    126811.000000
mean          6.514490
std           7.688124
min         -20.000000
25%           0.000000
50%           5.000000
75%          10.000000
max          95.000000
Name: passing_yards, dtype: float64

In [58]:
# Distribution of passing yards on long passes.
long_passes = df_passes[df_passes["pass_length_air_yards"] == "long"]
long_passes["passing_yards"].describe()

count    16921.000000
mean        12.135749
std         17.872271
min          0.000000
25%          0.000000
50%          0.000000
75%         26.000000
max         98.000000
Name: passing_yards, dtype: float64

## EPA by pass type

In [59]:
# Distribution of EPA on short passes.
is_short = df_passes["pass_length_air_yards"] == "short"
df_passes.loc[is_short, "epa"].describe()

count    126810.000000
mean          0.116065
std           1.428156
min         -13.031219
25%          -0.608661
50%          -0.005850
75%           0.954895
max           8.241420
Name: epa, dtype: float64

In [60]:
# Distribution of EPA on long passes.
is_long = df_passes["pass_length_air_yards"] == "long"
df_passes.loc[is_long, "epa"].describe()

count    16921.000000
mean         0.377189
std          2.179605
min        -10.477921
25%         -0.829438
50%         -0.466574
75%          2.129708
max          8.789743
Name: epa, dtype: float64

## Weekly Data

Returns weekly data for the years and columns specified. The weekly data is by player.

years : required, list of years to pull data for (earliest available is 1999)

columns : optional, list of columns to pull data for

downcast : converts float64 columns to float32, reducing memory usage by ~30%. Will slow down initial load speed ~50%

In [None]:
# columns=None requests every weekly column for the configured seasons.
columns = None
df_weekly = nfl.import_weekly_data(
    years=years,
    columns=columns,
    downcast=True,
)

In [None]:
df_weekly.head()

In [None]:
# Print the fields of the first weekly row in consecutive chunks of ten.
play = df_weekly.iloc[0]
fields_per_chunk = 10
for offset in range(0, len(play), fields_per_chunk):
    print(play.iloc[offset:offset + fields_per_chunk], end="\n\n")

In [None]:
# each row represents the performance of that player for the week
for row_label, player_name in df_weekly["player_name"].items():
    print(row_label, player_name)

## Seasonal Data

Returns seasonal data, including various calculated market share stats specific to receivers

years (List[int]) : required, list of years to pull data for (earliest available is 1999)

s_type (str) : optional (default 'REG') season type to include in average ('ALL','REG','POST')

In [None]:
# Regular-season totals ('REG' is the documented default season type).
df_seasonal = nfl.import_seasonal_data(years, s_type="REG")

In [None]:
# Call head(); the bare attribute only displays the bound-method repr.
df_seasonal.head()

In [None]:
# Print the fields of the first seasonal row in consecutive chunks of ten.
play = df_seasonal.iloc[0]
step = 10
for lo in range(0, len(play), step):
    print(play.iloc[lo:lo + step])
    print()

## Season Rosters

Returns yearly roster information for the seasons specified

years : required, list of years to pull data for (earliest available is 1999)

columns : optional, list of columns to pull data for

In [None]:
# Only the roster fields needed to build the player lookup tables.
columns = [
    "player_name",
    "player_id",
    "position",
]
df_roster = nfl.import_seasonal_rosters(years, columns=columns)

In [None]:
df_roster.iloc[0]

# Creating Player Map

In [None]:
# player_map: one dict of details per player; id_map: lookup from a player's
# name to the key used in player_map.
player_map, id_map = {}, {}

In [None]:
# Register every roster row under its player_id and index the id by name.
# Names are plain dict keys, so a later row with the same name overwrites
# the earlier id_map entry.
for roster_row in df_roster.itertuples():
    roster_id = roster_row.player_id
    player_map[roster_id] = {
        "name": roster_row.player_name,
        "position": roster_row.position,
    }
    id_map[roster_row.player_name] = roster_id

In [None]:
# Attach each seasonal row (as a dict) to the matching roster player, if any.
for season_row in df_seasonal.itertuples():
    entry = player_map.get(season_row.player_id)
    if entry is None:
        continue
    entry["seasonal_df"] = season_row._asdict()

In [None]:
# each row represents the performance of that player for the week
# Match on player_id (the same key the seasonal rows are joined on) instead of
# going through the display name: names are not unique, so a name lookup can
# attach weekly rows to a different player who shares the name.
weekly_rows_by_id = {}
for row in df_weekly.itertuples():
    if row.player_id in player_map:
        weekly_rows_by_id.setdefault(row.player_id, []).append(row._asdict())

# Assign rather than append so re-running the cell does not duplicate weeks.
for player_id, weekly_rows in weekly_rows_by_id.items():
    player_map[player_id]["week_dfs"] = weekly_rows

### Travis Kelce

In [None]:
kelce_id = id_map["Travis Kelce"]

In [None]:
player_map[kelce_id]["seasonal_df"]

In [None]:
player_map[kelce_id]["week_dfs"][0]

# Data by Position Group

In [None]:
# One (initially empty) list of player dicts per position group.
qbs, rbs, wrs, tes = [], [], [], []

In [None]:
# Route each player that has seasonal data into the list for its position;
# positions other than QB/RB/WR/TE are ignored.
buckets_by_position = {"QB": qbs, "RB": rbs, "WR": wrs, "TE": tes}
for player in player_map.values():
    if "seasonal_df" not in player:  # not all players have a seasonal df
        continue
    bucket = buckets_by_position.get(player["position"])
    if bucket is not None:
        bucket.append(player)

In [None]:
# Build the quarterback summary table: one row per QB who passes the filter
# below, with season totals and per-game versions of each metric.
# (The previous version still contained debugging print/break statements that
# made the row-building code unreachable, so df_qb was always empty.)
qb_columns = [
    'Name', 'Passing EPA', 'Rushing EPA', 'Total EPA', 'EPA Per Game',
    'Fantasy Points', 'Fantasy Points Per Game', 'Dakota', 'Dakota Per Game',
    'PACR', 'PACR Per Game',
]

qb_records = []
for player in qbs:
    season = player['seasonal_df']
    games = season['games']
    fantasy_points_ppr = season['fantasy_points_ppr']
    # Same filter as before: a fantasy total above -500 and more than one game
    # (the latter also keeps the per-game divisions away from zero).
    if not (fantasy_points_ppr > -500 and games > 1):
        continue

    passing_epa = season['passing_epa']
    rushing_epa = season['rushing_epa']
    total_epa = passing_epa + rushing_epa
    dakota = season['dakota']
    pacr = season['pacr']

    qb_records.append([
        player['name'],
        passing_epa, rushing_epa, total_epa, total_epa / games,
        fantasy_points_ppr, fantasy_points_ppr / games,
        dakota, dakota / games,
        pacr, pacr / games,
    ])

# Create the frame once from the collected rows; labels start at 1 as before.
df_qb = pd.DataFrame(
    qb_records,
    columns=qb_columns,
    index=pd.RangeIndex(1, len(qb_records) + 1),
)

In [None]:
names = ["Brock Purdy", "C.J. Stroud", "Trevor Lawrence", "Justin Herbert", "Justin Fields"]


def highlight_names(row):
    """Return one CSS string per cell of ``row``.

    Every cell gets a yellow background when ``row['Name']`` is in the
    module-level ``names`` list; otherwise every cell gets an empty style.
    """
    css = 'background-color: yellow' if row['Name'] in names else ''
    return [css] * len(row)

In [None]:
df_qb

In [None]:
# Quarterbacks by total fantasy points, styled with highlight_names.
qb_by_fantasy_points = df_qb.sort_values(by='Fantasy Points', ascending=False)
qb_by_fantasy_points.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by total EPA, styled with highlight_names.
qb_by_total_epa = df_qb.sort_values(by='Total EPA', ascending=False)
qb_by_total_epa.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by EPA per game, styled with highlight_names.
qb_by_epa_per_game = df_qb.sort_values(by='EPA Per Game', ascending=False)
qb_by_epa_per_game.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by Dakota, styled with highlight_names.
qb_by_dakota = df_qb.sort_values(by='Dakota', ascending=False)
qb_by_dakota.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by Dakota per game, styled with highlight_names.
qb_by_dakota_per_game = df_qb.sort_values(by='Dakota Per Game', ascending=False)
qb_by_dakota_per_game.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by PACR, styled with highlight_names.
qb_by_pacr = df_qb.sort_values(by='PACR', ascending=False)
qb_by_pacr.style.apply(highlight_names, axis=1)

In [None]:
# Quarterbacks by PACR per game, styled with highlight_names.
qb_by_pacr_per_game = df_qb.sort_values(by='PACR Per Game', ascending=False)
qb_by_pacr_per_game.style.apply(highlight_names, axis=1)

In [None]:
df_qb.head(1)

In [None]:
ls

In [None]:
# NOTE(review): with no path argument, to_csv() returns the CSV text as a
# string and writes nothing to disk; presumably a file path was intended, confirm.
df_qb.to_csv()

## WRs

In [None]:
# Build the wide-receiver summary table: one row per WR who passes the filter
# below, with season totals and per-game versions of each metric. Rows are
# collected first and the frame is created once, so the numeric columns get
# numeric dtypes instead of growing an empty object-typed frame row by row.
wr_columns = [
    'Name', 'Target Share', 'Weighted Opportunity X', 'Rushing EPA',
    'Receiving EPA', 'Total EPA', 'EPA Per Game', 'Fantasy Points',
    'Fantasy Points Per Game', 'RACR', 'RACR Per Game',
]

wr_records = []
for player in wrs:
    season = player['seasonal_df']
    games = season['games']
    fantasy_points_ppr = season['fantasy_points_ppr']
    # A fantasy total above -500 and more than one game played (the latter
    # also keeps the per-game divisions away from zero).
    if not (fantasy_points_ppr > -500 and games > 1):
        continue

    tgt_sh = season['tgt_sh']
    wopr_x = season['wopr_x']
    racr = season['racr']
    receiving_epa = season['receiving_epa']
    rushing_epa = season['rushing_epa']
    total_epa = receiving_epa + rushing_epa

    wr_records.append([
        player['name'],
        tgt_sh, wopr_x,
        rushing_epa, receiving_epa, total_epa, total_epa / games,
        fantasy_points_ppr, fantasy_points_ppr / games,
        racr, racr / games,
    ])

# Row labels start at 1, matching the previous row-by-row construction.
df_wr = pd.DataFrame(
    wr_records,
    columns=wr_columns,
    index=pd.RangeIndex(1, len(wr_records) + 1),
)

In [None]:
df_wr

In [None]:
# Top 48 wide receivers by fantasy points per game.
wr_by_fantasy_ppg = df_wr.sort_values(by='Fantasy Points Per Game', ascending=False)
wr_by_fantasy_ppg.head(48)

In [None]:
# Top 48 wide receivers by total EPA.
wr_by_total_epa = df_wr.sort_values(by='Total EPA', ascending=False)
wr_by_total_epa.head(48)

In [None]:
# Top 48 wide receivers by RACR.
wr_by_racr = df_wr.sort_values(by='RACR', ascending=False)
wr_by_racr.head(48)

In [None]:
# Top 48 wide receivers by RACR per game.
wr_by_racr_per_game = df_wr.sort_values(by='RACR Per Game', ascending=False)
wr_by_racr_per_game.head(48)

In [None]:
# Top 48 wide receivers by target share.
wr_by_target_share = df_wr.sort_values(by='Target Share', ascending=False)
wr_by_target_share.head(48)

In [None]:
# Top 48 wide receivers by weighted opportunity rating.
wr_by_wopr = df_wr.sort_values(by='Weighted Opportunity X', ascending=False)
wr_by_wopr.head(48)