# Imports

In [1]:
import plotly.express as px
import pandas as pd

In [5]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from helpers import plot_correlation_matrix, create_correlation_matrix, filter_df

# Read Data

This data set takes into account passing data from 2012 - 2022. This data is unfiltered.

In [7]:
df_roster = pd.read_pickle("../../interactive/df_roster_2012_2022.pkl")

In [8]:
df_seasonal = pd.read_pickle("../../interactive/df_seasonal_2012_2022.pkl")

In [9]:
df_pbp = pd.read_pickle("../../interactive/df_pbp_2012_2022.pkl")

# Grabbing NGS Data

In [10]:
df_ngs_rush = pd.read_pickle("../../interactive/df_ngs_rush_2016_2022.pkl")

In [11]:
df_ngs_rush = df_ngs_rush.query("season_type == 'REG'")

In [13]:
df_ngs_data = df_ngs_rush \
    .groupby(["player_gsis_id", "player_first_name", "player_last_name", "season"]) \
    .agg({
    'efficiency': ["mean"],
    'percent_attempts_gte_eight_defenders': ['mean'],
    'avg_time_to_los': ['mean'],
    'expected_rush_yards': ['mean'],
    'rush_yards_over_expected': ['sum'],
    'rush_yards_over_expected_per_att': ['mean'],
    'rush_pct_over_expected': ['mean']
})

In [14]:
df_ngs_data.columns = list(map("_".join, df_ngs_data.columns))
df_ngs_data.reset_index(inplace=True)

In [15]:
df_ngs_data.head()

Unnamed: 0,player_gsis_id,player_first_name,player_last_name,season,efficiency_mean,percent_attempts_gte_eight_defenders_mean,avg_time_to_los_mean,expected_rush_yards_mean,rush_yards_over_expected_sum,rush_yards_over_expected_per_att_mean,rush_pct_over_expected_mean
0,00-0023500,Frank,Gore,2016,3.864813,21.810003,2.327528,,0.0,,
1,00-0023500,Frank,Gore,2017,3.807271,25.913666,2.545626,,0.0,,
2,00-0023500,Frank,Gore,2018,3.247479,8.964702,2.468552,118.911784,109.882161,0.535431,0.406332
3,00-0023500,Frank,Gore,2019,4.521416,33.168447,2.544394,100.334709,-96.681794,-0.404407,0.314246
4,00-0023500,Frank,Gore,2020,4.20497,9.304508,2.617079,113.971097,-156.653169,-0.410539,0.406444


# Create Data Frame for Runs

In [16]:
df_runs = df_pbp.query("play_type == 'run' and season_type == 'REG'")

In [17]:
df_runs.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
2,53.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
3,74.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
7,165.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
11,256.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
16,359.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,KC,home,ATL,...,,,,,,,,,,


# Create DF for Rushing Stats by Season

In [18]:
df_rushing_yards_by_season = df_runs \
    .groupby(["rusher_id", "rusher", "season"]) \
    .agg({
        "rushing_yards": ["sum", "mean"],
        "rush_attempt": ["sum"],
        "fumble": ["sum"],
        "rush_touchdown": ["sum"],
        "tackled_for_loss": ["sum"],
        "epa": ["sum", "mean"],
        "success": ["sum"]
    })

In [19]:
df_rushing_yards_by_season.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rushing_yards,rushing_yards,rush_attempt,fumble,rush_touchdown,tackled_for_loss,epa,epa,success
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,sum,sum,sum,sum,sum,mean,sum
rusher_id,rusher,season,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
00-0007091,M.Hasselbeck,2012,0.0,0.0,1.0,1.0,0.0,0.0,-1.360733,-1.360733,0.0
00-0007091,M.Hasselbeck,2014,-5.0,-5.0,1.0,0.0,0.0,1.0,-1.280506,-1.280506,0.0
00-0007091,M.Hasselbeck,2015,4.0,4.0,1.0,0.0,0.0,0.0,-0.035615,-0.035615,0.0
00-0010346,P.Manning,2012,-2.0,-1.0,2.0,0.0,0.0,1.0,-0.817786,-0.408893,0.0
00-0010346,P.Manning,2013,-9.0,-0.9,10.0,5.0,1.0,2.0,-18.549723,-1.854972,1.0


In [20]:
df_rushing_yards_by_season.columns = list(map("_".join, df_rushing_yards_by_season.columns))
df_rushing_yards_by_season.reset_index(inplace=True)

In [21]:
df_rushing_yards_by_season_cleaned = df_rushing_yards_by_season.dropna()

In [22]:
df_rushing_yards_by_season_cleaned = df_rushing_yards_by_season_cleaned.query("season >= 2016")

In [23]:
df_full_2016_2022 = df_rushing_yards_by_season_cleaned.copy()

**Note:** the ngs data set does not have anything in it related to non-rb positions.

Rather unfortunate :(

In [24]:
for i, row in df_full_2016_2022.iterrows():
    r_id = row["rusher_id"]
    season = row["season"]
    
    seasonal_row = df_seasonal.query(f"player_id == '{r_id}' & season == {season} & season_type == 'REG'")
    roster_row = df_roster.query(f"player_id == '{r_id}' & season == {season}")
    
    # grabbing corresponding row from the ngs data set
    ngs_row = df_ngs_data.query(f"season == {season} & player_gsis_id == '{r_id}'")

    # completing out final df
    if ngs_row.empty or seasonal_row.empty or roster_row.empty:
        pass
    else:
        # from seasonal df
        df_full_2016_2022.at[i, "games"] = seasonal_row["games"].iloc[0]
        df_full_2016_2022.at[i, "fantasy_points"] = seasonal_row["fantasy_points"].iloc[0]

        # from roster df
        df_full_2016_2022.at[i, "age"] = roster_row["age"].iloc[0]
        df_full_2016_2022.at[i, "position"] = roster_row["position"].iloc[0]
        
        # from ngs data
        df_full_2016_2022.at[i, 'efficiency_mean'] = ngs_row['efficiency_mean'].iloc[0]
        df_full_2016_2022.at[i, 'percent_attempts_gte_eight_defenders_mean'] = ngs_row['percent_attempts_gte_eight_defenders_mean'].iloc[0]
        df_full_2016_2022.at[i, 'avg_time_to_los_mean'] = ngs_row['avg_time_to_los_mean'].iloc[0]
        df_full_2016_2022.at[i, 'expected_rush_yards_mean'] = ngs_row['expected_rush_yards_mean'].iloc[0]
        df_full_2016_2022.at[i, 'rush_yards_over_expected_sum'] = ngs_row['rush_yards_over_expected_sum'].iloc[0]
        df_full_2016_2022.at[i, 'rush_yards_over_expected_per_att_mean'] = ngs_row['rush_yards_over_expected_per_att_mean'].iloc[0]
        df_full_2016_2022.at[i, 'rush_pct_over_expected_mean'] = ngs_row['rush_pct_over_expected_mean'].iloc[0]

  df_full_2016_2022.at[i, "position"] = roster_row["position"].iloc[0]


In [25]:
df_final = df_full_2016_2022.dropna()

# Create Lag Version of Data

In [26]:
df_now = df_final.copy()
df_last = df_final.copy()

In [27]:
rename_dict = {}
for col in list(df_last.columns[3:]):
    rename_dict[col] = f"{col}_last"

In [28]:
df_last.rename(columns=rename_dict, inplace=True)

In [29]:
df_last["season"] += 1

In [30]:
df_lag = df_now.merge(df_last, how='inner', on=['rusher_id', 'rusher', 'season'])

In [31]:
df_lag.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards_sum,rushing_yards_mean,rush_attempt_sum,fumble_sum,rush_touchdown_sum,tackled_for_loss_sum,epa_sum,...,fantasy_points_last,age_last,position_last,efficiency_mean_last,percent_attempts_gte_eight_defenders_mean_last,avg_time_to_los_mean_last,expected_rush_yards_mean_last,rush_yards_over_expected_sum_last,rush_yards_over_expected_per_att_mean_last,rush_pct_over_expected_mean_last
0,00-0023500,F.Gore,2019,599.0,3.608434,166.0,0.0,2.0,18.0,-32.828232,...,90.6,35.0,RB,3.247479,8.964702,2.468552,118.911784,109.882161,0.535431,0.406332
1,00-0023500,F.Gore,2020,653.0,3.491979,187.0,1.0,2.0,17.0,-41.306019,...,81.9,36.0,RB,4.521416,33.168447,2.544394,100.334709,-96.681794,-0.404407,0.314246
2,00-0025394,A.Peterson,2019,898.0,4.255924,211.0,2.0,5.0,20.0,-25.582747,...,169.0,33.0,RB,5.157395,15.759431,2.962591,141.369825,87.19228,-0.051872,0.366966
3,00-0025394,A.Peterson,2020,604.0,3.871795,156.0,0.0,7.0,11.0,-12.136178,...,130.0,34.0,RB,4.276318,30.083278,2.854776,109.030173,220.577573,0.493684,0.412969
4,00-0025394,A.Peterson,2021,98.0,2.578947,38.0,1.0,2.0,5.0,-9.864626,...,112.5,35.0,RB,4.291046,30.683327,2.616606,139.261264,-145.090112,-0.610254,0.292767


# Data Manipulations

Preserving the df_lag to avoid manipulations on it. Can just re-run from here, if we run into any issues from data manipulations, or just want the original NGS + simple stats.

In [33]:
df = df_lag.copy()

## Correlation Matrix Features

In [66]:
x = [
#     'rushing_yards_sum',
#     'rushing_yards_mean',
#     'rush_attempt_sum',
#     'fumble_sum',
#     'rush_touchdown_sum',
#     'tackled_for_loss_sum',
#     'epa_sum',
#     'epa_mean',
#     'success_sum',
    'fantasy_points',
    'efficiency_mean',
    'percent_attempts_gte_eight_defenders_mean',
    'avg_time_to_los_mean',
    'expected_rush_yards_mean',
    'rush_yards_over_expected_sum',
    'rush_yards_over_expected_per_att_mean',
    'rush_pct_over_expected_mean',
]

In [67]:
y = [
#     'rushing_yards_sum_last',
#     'rushing_yards_mean_last',
#     'rush_attempt_sum_last',
#     'fumble_sum_last',
#     'rush_touchdown_sum_last',
#     'tackled_for_loss_sum_last',
#     'epa_sum_last',
#     'epa_mean_last',
#     'success_sum_last',
    'fantasy_points_last',
    'efficiency_mean_last',
    'percent_attempts_gte_eight_defenders_mean_last',
    'avg_time_to_los_mean_last',
    'expected_rush_yards_mean_last',
    'rush_yards_over_expected_sum_last',
    'rush_yards_over_expected_per_att_mean_last',
    'rush_pct_over_expected_mean_last'
]

## Exploration

In [68]:
base_path="../../interactive/RB/stability-rushing/season"

In [69]:
filter_df(df, "", x, y)

fantasy_points


Unnamed: 0,rusher_id,rusher,season,rushing_yards_sum,rushing_yards_mean,rush_attempt_sum,fumble_sum,rush_touchdown_sum,tackled_for_loss_sum,epa_sum,...,fantasy_points_last,age_last,position_last,efficiency_mean_last,percent_attempts_gte_eight_defenders_mean_last,avg_time_to_los_mean_last,expected_rush_yards_mean_last,rush_yards_over_expected_sum_last,rush_yards_over_expected_per_att_mean_last,rush_pct_over_expected_mean_last
0,00-0023500,F.Gore,2019,599.0,3.608434,166.0,0.0,2.0,18.0,-32.828232,...,90.6,35.0,RB,3.247479,8.964702,2.468552,118.911784,109.882161,0.535431,0.406332
1,00-0023500,F.Gore,2020,653.0,3.491979,187.0,1.0,2.0,17.0,-41.306019,...,81.9,36.0,RB,4.521416,33.168447,2.544394,100.334709,-96.681794,-0.404407,0.314246
2,00-0025394,A.Peterson,2019,898.0,4.255924,211.0,2.0,5.0,20.0,-25.582747,...,169.0,33.0,RB,5.157395,15.759431,2.962591,141.369825,87.192280,-0.051872,0.366966
3,00-0025394,A.Peterson,2020,604.0,3.871795,156.0,0.0,7.0,11.0,-12.136178,...,130.0,34.0,RB,4.276318,30.083278,2.854776,109.030173,220.577573,0.493684,0.412969
4,00-0025394,A.Peterson,2021,98.0,2.578947,38.0,1.0,2.0,5.0,-9.864626,...,112.5,35.0,RB,4.291046,30.683327,2.616606,139.261264,-145.090112,-0.610254,0.292767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,00-0036875,R.Stevenson,2022,1040.0,4.952381,210.0,3.0,5.0,18.0,-9.571356,...,100.9,23.0,RB,3.838662,38.513970,2.746166,119.071566,124.427475,0.429158,0.426022
233,00-0036893,N.Harris,2022,1034.0,3.801471,272.0,2.0,7.0,22.0,-28.651573,...,226.7,23.0,RB,4.660829,14.034710,2.801212,146.420609,-150.150347,-0.404157,0.314597
234,00-0036906,K.Herbert,2022,731.0,5.666667,129.0,0.0,4.0,8.0,1.171968,...,64.9,23.0,RB,3.970373,19.902740,2.934207,155.425127,-4.125636,0.130836,0.416662
235,00-0036924,Mi.Carter,2022,402.0,3.526316,114.0,1.0,3.0,9.0,-21.690918,...,118.4,22.0,RB,4.297025,20.274910,2.820964,112.951330,77.438028,0.071273,0.456307


In [83]:
query = "position == 'RB' and rush_attempt_sum > 100 and rush_attempt_sum_last > 100"
d = df.query(query)
d[["rush_pct_over_expected_mean", "rush_pct_over_expected_mean_last",
    "fantasy_points", "fantasy_points_last"]].corr()

Unnamed: 0,rush_pct_over_expected_mean,rush_pct_over_expected_mean_last,fantasy_points,fantasy_points_last
rush_pct_over_expected_mean,1.0,0.069803,0.152174,0.107934
rush_pct_over_expected_mean_last,0.069803,1.0,0.060958,0.022653
fantasy_points,0.152174,0.060958,1.0,0.359566
fantasy_points_last,0.107934,0.022653,0.359566,1.0


In [82]:
query = "position == 'RB' and rush_attempt_sum > 100 and rush_attempt_sum_last > 100"
d = df.query(query)
d[["avg_time_to_los_mean", "avg_time_to_los_mean_last",
    "fantasy_points", "fantasy_points_last"]].corr()

Unnamed: 0,avg_time_to_los_mean,avg_time_to_los_mean_last,fantasy_points,fantasy_points_last
avg_time_to_los_mean,1.0,0.586291,0.075436,0.113521
avg_time_to_los_mean_last,0.586291,1.0,0.15508,0.154988
fantasy_points,0.075436,0.15508,1.0,0.359566
fantasy_points_last,0.113521,0.154988,0.359566,1.0


In [81]:
query = "rush_attempt_sum > 100 and rush_attempt_sum_last > 100"
t = filter_df(df, query, x, y)

avg_time_to_los_mean


In [75]:
t[["avg_time_to_los_mean", "rush_yards_over_expected_sum", "fantasy_points",
  "avg_time_to_los_mean_last", "rush_yards_over_expected_sum_last", "fantasy_points_last"]].corr()

Unnamed: 0,avg_time_to_los_mean,rush_yards_over_expected_sum,fantasy_points,avg_time_to_los_mean_last,rush_yards_over_expected_sum_last,fantasy_points_last
avg_time_to_los_mean,1.0,0.134912,0.012071,0.589051,0.219092,0.00797
rush_yards_over_expected_sum,0.134912,1.0,0.774918,0.415707,0.545975,0.079122
fantasy_points,0.012071,0.774918,1.0,0.087277,0.335927,0.196065
avg_time_to_los_mean_last,0.589051,0.415707,0.087277,1.0,0.361377,0.126986
rush_yards_over_expected_sum_last,0.219092,0.545975,0.335927,0.361377,1.0,0.446147
fantasy_points_last,0.00797,0.079122,0.196065,0.126986,0.446147,1.0
