# Imports

In [1]:
import plotly.express as px
import pandas as pd

In [2]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from helpers import plot_correlation_matrix, create_correlation_matrix, filter_df

# Read Data

This data set takes into account passing data from 2012 - 2022. This data is unfiltered.

In [3]:
df_roster = pd.read_pickle("../../interactive/df_roster_2012_2022.pkl")

In [4]:
df_seasonal = pd.read_pickle("../../interactive/df_seasonal_2012_2022.pkl")

In [5]:
df_pbp = pd.read_pickle("../../interactive/df_pbp_2012_2022.pkl")

In [6]:
df_pbp.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
0,1.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,,,,...,,,,,,,,,,
1,35.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
2,53.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
3,74.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
4,95.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,


# Creat Data Frame for Runs

In [7]:
df_runs = df_pbp.query("play_type == 'run' and season_type == 'REG'")

In [8]:
df_runs.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
2,53.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
3,74.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
7,165.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
11,256.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,ATL,away,KC,...,,,,,,,,,,
16,359.0,2012_01_ATL_KC,2012090908,KC,ATL,REG,1,KC,home,ATL,...,,,,,,,,,,


# Creat DF for Rushing Stats by Season

In [9]:
df_rushing_yards_by_season = df_runs \
    .groupby(["rusher_id", "rusher", "season"]) \
    .agg({
        "rushing_yards": ["sum", "mean"],
        "rush_attempt": ["sum"],
        "fumble": ["sum"],
        "rush_touchdown": ["sum"],
        "tackled_for_loss": ["sum"],
        "epa": ["sum", "mean"],
        "success": ["sum"]
    })

In [10]:
df_rushing_yards_by_season.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rushing_yards,rushing_yards,rush_attempt,fumble,rush_touchdown,tackled_for_loss,epa,epa,success
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,sum,sum,sum,sum,sum,mean,sum
rusher_id,rusher,season,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
00-0007091,M.Hasselbeck,2012,0.0,0.0,1.0,1.0,0.0,0.0,-1.360733,-1.360733,0.0
00-0007091,M.Hasselbeck,2014,-5.0,-5.0,1.0,0.0,0.0,1.0,-1.280506,-1.280506,0.0
00-0007091,M.Hasselbeck,2015,4.0,4.0,1.0,0.0,0.0,0.0,-0.035615,-0.035615,0.0
00-0010346,P.Manning,2012,-2.0,-1.0,2.0,0.0,0.0,1.0,-0.817786,-0.408893,0.0
00-0010346,P.Manning,2013,-9.0,-0.9,10.0,5.0,1.0,2.0,-18.549723,-1.854972,1.0


In [11]:
df_rushing_yards_by_season.columns = list(map("_".join, df_rushing_yards_by_season.columns))
df_rushing_yards_by_season.reset_index(inplace=True)

In [12]:
df_rushing_yards_by_season.dropna(inplace=True)
df_rushing_yards_by_season_cleaned = df_rushing_yards_by_season.copy()

In [13]:
for i, row in df_rushing_yards_by_season_cleaned.copy().iterrows():
    r_id = row["rusher_id"]
    season = row["season"]
    
    seasonal_row = df_seasonal.query(f"player_id == '{r_id}' & season == {season} & season_type == 'REG'")
    roster_row = df_roster.query(f"player_id == '{r_id}' & season == {season}")
    
    if seasonal_row.empty:
        # from seasonal df
        df_rushing_yards_by_season_cleaned.at[i, "games"] = None
        df_rushing_yards_by_season_cleaned.at[i, "fantasy_points"] = None

        # from roster df
        df_rushing_yards_by_season_cleaned.at[i, "age"] = None
        df_rushing_yards_by_season_cleaned.at[i, "position"] = None
    
    else:
        # from seasonal df
        df_rushing_yards_by_season_cleaned.at[i, "games"] = seasonal_row["games"].iloc[0]
        df_rushing_yards_by_season_cleaned.at[i, "fantasy_points"] = seasonal_row["fantasy_points"].iloc[0]

        # from roster df
        df_rushing_yards_by_season_cleaned.at[i, "age"] = roster_row["age"].iloc[0]
        df_rushing_yards_by_season_cleaned.at[i, "position"] = roster_row["position"].iloc[0]

  df_rushing_yards_by_season_cleaned.at[i, "position"] = roster_row["position"].iloc[0]


In [14]:
df_rushing_yards_by_season_cleaned.dropna(inplace=True)
df_rushing_yards_by_season_cleaned.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards_sum,rushing_yards_mean,rush_attempt_sum,fumble_sum,rush_touchdown_sum,tackled_for_loss_sum,epa_sum,epa_mean,success_sum,games,fantasy_points,age,position
0,00-0007091,M.Hasselbeck,2012,0.0,0.0,1.0,1.0,0.0,0.0,-1.360733,-1.360733,0.0,8.0,76.48,36.0,QB
1,00-0007091,M.Hasselbeck,2014,-5.0,-5.0,1.0,0.0,0.0,1.0,-1.280506,-1.280506,0.0,4.0,16.94,38.0,QB
2,00-0007091,M.Hasselbeck,2015,4.0,4.0,1.0,0.0,0.0,0.0,-0.035615,-0.035615,0.0,8.0,91.1,39.0,QB
3,00-0010346,P.Manning,2012,-2.0,-1.0,2.0,0.0,0.0,1.0,-0.817786,-0.408893,0.0,16.0,310.96,36.0,QB
4,00-0010346,P.Manning,2013,-9.0,-0.9,10.0,5.0,1.0,2.0,-18.549723,-1.854972,1.0,16.0,409.98,37.0,QB


# Create Lag Version of Data

In [15]:
df_now = df_rushing_yards_by_season_cleaned.copy()
df_last = df_rushing_yards_by_season_cleaned.copy()

In [16]:
rename_dict = {}
for col in list(df_last.columns[3:]):
    rename_dict[col] = f"{col}_last"

In [17]:
df_last.rename(columns=rename_dict, inplace=True)

In [18]:
df_last["season"] += 1

In [19]:
df_lag = df_now.merge(df_last, how='inner', on=['rusher_id', 'rusher', 'season'])

In [20]:
df_lag.head()

Unnamed: 0,rusher_id,rusher,season,rushing_yards_sum,rushing_yards_mean,rush_attempt_sum,fumble_sum,rush_touchdown_sum,tackled_for_loss_sum,epa_sum,...,fumble_sum_last,rush_touchdown_sum_last,tackled_for_loss_sum_last,epa_sum_last,epa_mean_last,success_sum_last,games_last,fantasy_points_last,age_last,position_last
0,00-0007091,M.Hasselbeck,2015,4.0,4.0,1.0,0.0,0.0,0.0,-0.035615,...,0.0,0.0,1.0,-1.280506,-1.280506,0.0,4.0,16.94,38.0,QB
1,00-0010346,P.Manning,2013,-9.0,-0.9,10.0,5.0,1.0,2.0,-18.549723,...,0.0,0.0,1.0,-0.817786,-0.408893,0.0,16.0,310.96,36.0,QB
2,00-0010346,P.Manning,2014,-13.0,-1.857143,7.0,3.0,0.0,2.0,-12.568753,...,5.0,1.0,2.0,-18.549723,-1.854972,1.0,16.0,409.98,37.0,QB
3,00-0019596,T.Brady,2013,13.0,1.083333,12.0,3.0,0.0,2.0,-9.026926,...,1.0,3.0,2.0,4.275997,0.328923,9.0,16.0,344.28,35.0,QB
4,00-0019596,T.Brady,2014,20.0,1.428571,14.0,1.0,0.0,0.0,-2.157746,...,3.0,0.0,2.0,-9.026926,-0.752244,5.0,16.0,251.52,36.0,QB


# Data Manipulations

Keep a copy of our unfiltered data so it is untouched if we need to reset our exploration.

In [21]:
df = df_lag.copy()

Rename columns to labels that make more sense.

In [22]:
df.rename(columns={
    'rushing_yards_sum': "rushing_yards",
    'rushing_yards_mean': "yards_per_carry",
    'rush_attempt_sum': "carries",
    'fumble_sum': "fumbles",
    'rush_touchdown_sum': "rushing_touchdowns",
    'tackled_for_loss_sum': "times_tackled_for_loss",
    'epa_sum': "total_epa",
    'epa_mean': "epa_per_play",
    'success_sum': "successful_plays",
    'rushing_yards_sum_last': "rushing_yards_last",
    'rushing_yards_mean_last': "yards_per_carry_last",
    'rush_attempt_sum_last': "carries_last",
    'fumble_sum_last': "fumbles_last",
    'rush_touchdown_sum_last': "rushing_touchdowns_last",
    'tackled_for_loss_sum_last': "times_tackled_for_loss_last",
    'epa_sum_last': "total_epa_last",
    'epa_mean_last': "epa_per_play_last",
    'success_sum_last': "successful_plays_last"
}, inplace=True)

# Generate Graphs

## Correlation Matrix

# Filtered DFs

In [82]:
base_path = "../../interactive/RB/stability-rushing/season"

In [None]:
x = [
#     'rushing_yards',
#     'yards_per_carry',
#     'carries',
#     'fumbles',
#     'rushing_touchdowns',
    'times_tackled_for_loss',
    'total_epa',
    'epa_per_play',
    'successful_plays',
    'fantasy_points',
]

In [78]:
y = [
#     'rushing_yards_last',
#     'yards_per_carry_last',
#     'carries_last',
#     'fumbles_last',
#     'rushing_touchdowns_last',
    'times_tackled_for_loss_last',
    'total_epa_last',
    'epa_per_play_last',
    'successful_plays_last',
    'fantasy_points_last',
]

## No Filter (All)

In [None]:
temp_all = filter_df(df, "", x, y)

## RBs

In [None]:
thres = 100
query = f"position == 'RB' and \
fantasy_points > {thres} and fantasy_points_last > {thres}"
temp_rb = filter_df(df, query, x, y)

In [None]:
temp_rb["successful_plays"].mean() / temp_rb["carries"].mean()

In [None]:
temp_rb["yards_per_carry"].mean()

## QBs

In [None]:
query = "position == 'QB' and \
carries > 40 and carries_last > 40"
temp_qb = filter_df(df, query, x, y)

In [None]:
temp_qb["successful_plays"].mean() / temp_qb["carries"].mean()

In [None]:
temp_qb["yards_per_carry"].mean()

# Fantasy Points Correlation

In [138]:
def sort_corr(df, corr_col = "fantasy_points"):
    self_corr_dict = {}
    corr_dict = {}
    cols = list(df.columns[3:])
    for col in cols:
        if "age" not in col and "games" not in col and "fantasy" not in col and "position" not in col:
            temp_corr_mat = df[[col, corr_col]].corr()
            temp_corr_value = temp_corr_mat[corr_col].iloc[0]
            
            if "last" not in col:
                self_corr_mat = df[[col, f"{col}_last"]].corr()
                self_corr_value = self_corr_mat[col].iloc[1]
                
                self_corr_dict[f"{col}_last"] = self_corr_value
            else:
                self_corr_value = self_corr_dict[col]
             
            if self_corr_value >= 0.4:
                corr_dict[col] = temp_corr_value

    return sorted([x for (x, y) in list(corr_dict.items()) if y >= 0.4], key = lambda x: x[1], reverse=True)

In [139]:
def create_corr_for_fantasy_relevance(query, title):
    new_df = df
    if query:
        new_df = df.query(query)
    
    x = list( \
     set(sort_corr(new_df)) | \
     set(sort_corr(new_df, "fantasy_points_last")
    ))
    y = ['fantasy_points', 'fantasy_points_last']
    filter_df(new_df, "", x, y, base_path=base_path, title=title)

In [161]:
df[["rushing_touchdowns", "rushing_touchdowns_last"]].corr()

Unnamed: 0,rushing_touchdowns,rushing_touchdowns_last
rushing_touchdowns,1.0,0.597977
rushing_touchdowns_last,0.597977,1.0


In [159]:
df[["fumbles", "fumbles_last"]].corr()

Unnamed: 0,fumbles,fumbles_last
fumbles,1.0,0.433144
fumbles_last,0.433144,1.0


In [140]:
create_corr_for_fantasy_relevance("", title="fantasy-relevance-unfiltered")

fumbles

Save path: fantasy-relevance-unfiltered-scatter-fumbles



In [141]:
df[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.745423
successful_plays_last,0.745423,1.0


## RBs

In [164]:
df.query("position == 'RB'")[["rushing_yards", "rushing_yards_last", 
                            "successful_plays", "successful_plays_last",
                            "carries", "carries_last"
                           ]].corr()

Unnamed: 0,rushing_yards,rushing_yards_last,successful_plays,successful_plays_last,carries,carries_last
rushing_yards,1.0,0.643109,0.987551,0.639507,0.982299,0.62357
rushing_yards_last,0.643109,1.0,0.650333,0.985333,0.666171,0.979436
successful_plays,0.987551,0.650333,1.0,0.652068,0.983629,0.634229
successful_plays_last,0.639507,0.985333,0.652068,1.0,0.665067,0.982461
carries,0.982299,0.666171,0.983629,0.665067,1.0,0.654134
carries_last,0.62357,0.979436,0.634229,0.982461,0.654134,1.0


In [142]:
create_corr_for_fantasy_relevance("position == 'RB'", title="fantasy-relevance-unfiltered-rb")

rushing_yards

Save path: fantasy-relevance-unfiltered-rb-scatter-rushing_yards



In [143]:
df.query("position == 'RB'")[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.652068
successful_plays_last,0.652068,1.0


### 100 Carries

In [144]:
sort_corr(df.query("position == 'RB' and carries > 100 and carries_last > 100"))

[]

In [145]:
df.query("position == 'RB' and carries > 100 and carries_last > 100")[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.314945
successful_plays_last,0.314945,1.0


In [146]:
query = "position == 'RB' and carries > 100 and carries_last > 100"
create_corr_for_fantasy_relevance(query=query, title="fantasy-relevance-unfiltered-rb-100")

ValueError: nan is not in list

### 200 Carries

In [147]:
sort_corr(df.query("position == 'RB' and carries > 200 and carries_last > 200"))

[]

In [148]:
df.query("position == 'RB' and \
carries > 200 and carries_last > 200")[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.176429
successful_plays_last,0.176429,1.0


In [150]:
query = "position == 'RB' and carries > 200 and carries_last > 200"
create_corr_for_fantasy_relevance(query=query, title="fantasy-relevance-unfiltered-rb-200")

ValueError: nan is not in list

## QBs

In [151]:
sort_corr(df.query("position == 'QB'"))

['rushing_touchdowns', 'successful_plays', 'carries']

In [62]:
df.query("position == 'QB' and \
carries > 20 and carries_last > 20")[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.652673
successful_plays_last,0.652673,1.0


In [152]:
query = "position == 'QB'"
create_corr_for_fantasy_relevance(query=query, title="fantasy-relevance-unfiltered-qb")

carries

Save path: fantasy-relevance-unfiltered-qb-scatter-carries



### 20 Carries

In [154]:
sort_corr(df.query("position == 'QB' and carries > 20 and carries_last > 20"))

['rushing_yards', 'successful_plays', 'carries']

In [155]:
query = "position == 'QB' and carries > 20 and carries_last > 20"
create_corr_for_fantasy_relevance(query=query, title="fantasy-relevance-unfiltered-qb-20")

successful_plays

Save path: fantasy-relevance-unfiltered-qb-20-scatter-successful_plays



### 40 Carries

In [156]:
sort_corr(df.query("position == 'QB' and carries > 40 and carries_last > 40"))

['rushing_yards', 'successful_plays', 'carries']

In [157]:
df.query("position == 'QB' and \
carries > 40 and carries_last > 40")[["successful_plays", "successful_plays_last"]].corr()

Unnamed: 0,successful_plays,successful_plays_last
successful_plays,1.0,0.425633
successful_plays_last,0.425633,1.0


In [158]:
query = "position == 'QB' and carries > 40 and carries_last > 40"
create_corr_for_fantasy_relevance(query=query, title="fantasy-relevance-unfiltered-qb-40")

carries

Save path: fantasy-relevance-unfiltered-qb-40-scatter-carries

