# Imports

In [1]:
# pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_redzone-tgts_team-snap-data_1999_2023.pkl")

# Filter By Position

In [4]:
df_wr = df.copy().query("position == 'WR'")
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [5]:
# remove our na's
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [6]:
df_wr['Draft_Year'] = df_wr['Draft Year'].astype(str).str.replace('s', '')
df_wr["Draft_Year"] = df_wr["Draft_Year"].astype(int)

df_wr["Draft_Round"] = df_wr["Draft Round"].astype(str).str.replace('Undrafted', '8')
df_wr["Draft_Round"] = df_wr["Draft_Round"].astype(int)

df_wr["Draft_Overall"] = df_wr["Draft Overall"].astype(str).str.replace('Undrafted', '400')
df_wr["Draft_Overall"] = df_wr["Draft_Overall"].astype(int)

# WR Relveant Columns

In [7]:
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec',
    "redzone_targets", "total_snaps", 'Att_passing_totals', 'Att_rushing_totals'
]

In [8]:
df_wr = df_wr[wr_cols]

In [9]:
df_wr["yards_per_snap"] = df_wr["receiving_yards"] / df_wr["offense_snaps"]
df_wr["receptions_per_snap"] = df_wr["receiving_yards"] / df_wr["offense_snaps"]

**Note:** Proxy is based on roughly the league median in 2023! This will likely penalize pass happy offenses and improve run happy offenses

Two simplifying assumptions:
 - `freq_pass` is a fair representation of passing across each team in the league
 - `route_proxy` assumes that a WR will be on the field for the same number of rushing and passing plays, proportionally

In [10]:
df_wr["pass_freq"] = df_wr["Att_passing_totals"] / df_wr["total_snaps"]
total_passing_snaps = (df_wr["offense_snaps"] / df_wr["offense_pct"]) * df_wr["pass_freq"]

df_wr["route_proxy"] = total_passing_snaps * df_wr["offense_pct"]
df_wr["yards_per_route_run"] = df_wr["receiving_yards"] / df_wr["route_proxy"]

In [11]:
df_wr["tds_per_redzone_target"] = df_wr["receiving_tds"] / df_wr["redzone_targets"]

**Note:** filtering by snaps automatically removes any data prior to 2012!

In [12]:
df_wr = df_wr.query("games >= 6 and offense_snaps >= 100 and redzone_targets > 0")

In [13]:
len(df_wr)

1520

# Get Lag Version

For having the previous season as features and fantasy points/receiving first downs be the labels.

In [14]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season", 
                  cols_to_merge=["player_id", "season"]
    ):
    """"""
    df_now = df.copy()
    df_last = df.copy()

    rename_dict = {}
    for col in list(df_last.columns[cols_to_filter:]):
        rename_dict[col] = f"{col}_last"

    df_last.rename(columns=rename_dict, inplace=True)
    df_last[col_to_increment] += 1

    df_lag = df_now.merge(df_last, how='inner', on=cols_to_merge)

    return df_lag

In [15]:
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [16]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
495,Mike Evans,2017,1001.0,1321.0


In [17]:
import numpy as np

In [18]:
df_lag['improved'] = np.where(df_lag['fantasy_points_ppr'] >= df_lag['fantasy_points_ppr_last'], True, False)

# Select Feature Columns

I.e. the columns with last in them, and of numeric type.

In [19]:
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [20]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last', 'improved'],
      dtype='object')

In [21]:
assert(df_lag["Draft_Year"].dtype == 'int64')
assert(df_lag["Draft_Round"].dtype == 'int64')
assert(df_lag["Draft_Overall"].dtype == 'int64')

In [22]:
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_tds',
    # 'receiving_2pt_conversions_last',
    'receiving_fumbles_lost',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    # 'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',

    # our metrics
    "yards_per_snap",
    "receptions_per_snap",
    "route_proxy",
    "yards_per_route_run",
    "redzone_targets",
    "tds_per_redzone_target",

    # snap data
    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',

    
    # 'tgt_pfr_rec_last',
    # 'rec_pfr_rec_last',
    # 'td_pfr_rec_last',
    # 'x1d_pfr_rec_last',
    # 'int_pfr_rec_last',
    # 'rat_pfr_rec_last'
]

In [23]:
label = "improved"

In [24]:
cols_to_use = feature_columns + [label]
df_ML = df_lag[cols_to_use].copy()

In [25]:
df_ML.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,receiving_tds,receiving_fumbles_lost,racr_last,...,yards_per_snap,receptions_per_snap,route_proxy,yards_per_route_run,redzone_targets,tds_per_redzone_target,catch_percentage_last,offense_snaps_last,offense_pct_last,improved
0,45,59,544.0,589.0,148.0,30.0,27.585154,0,0.0,17.371708,...,1.0,1.0,68.315739,1.68336,1.0,0.0,,597.0,0.562667,False
1,73,138,1174.0,1832.0,268.0,51.0,25.508378,4,0.0,11.858338,...,0.967532,0.967532,380.972803,1.95552,16.0,0.25,,911.0,0.89125,False
2,64,110,745.0,1219.0,178.0,44.0,24.914914,6,1.0,10.475034,...,1.29562,1.29562,454.479042,2.343342,20.0,0.3,,770.0,0.78,True
3,79,134,1065.0,1413.0,363.0,45.0,39.161685,3,0.0,13.027294,...,1.930836,1.930836,221.503305,3.024786,9.0,0.333333,,822.0,0.766875,False
4,46,73,670.0,719.0,256.0,28.0,22.417599,5,0.0,6.641955,...,1.108183,1.108183,468.029637,1.707157,15.0,0.333333,,347.0,0.694286,True


In [26]:
for col in df_ML.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_ppr_last


# Create Train, Test, Validation Splits

In [27]:
X = df_ML.iloc[:, :-1]  # features
y = df_ML.iloc[:, -1]   # label

In [28]:
for col in X.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_ppr_last


In [29]:
y[:5]

0    False
1    False
2     True
3    False
4     True
Name: improved, dtype: bool

In [30]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [31]:
print(f"Length of train set: {len(X_train)}")
print(f"Length of test set: {len(X_test)}")
print(f"Length of val set: {len(X_val)}")

Length of train set: 591
Length of test set: 198
Length of val set: 197


# AutoGluon

In [32]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [33]:
label_column = 'improved'

## Fit

In [34]:
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240512_023101"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240512_023101"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [35]:
performance = predictor.evaluate(test_data)
print(performance)

{'accuracy': 0.9444444444444444, 'balanced_accuracy': 0.9484122852680895, 'mcc': 0.890039613614, 'roc_auc': 0.9926080166579907, 'f1': 0.9378531073446328, 'precision': 0.9021739130434783, 'recall': 0.9764705882352941}


In [36]:
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

613    False
451    False
731     True
436     True
275     True
Name: improved, dtype: bool


In [37]:
answers = test_data[label_column]

In [38]:
answers.head()

613    False
451    False
731     True
436     True
275     True
Name: improved, dtype: bool

In [40]:
negative = 0
close = 0
far = 0
for guess, answer in zip(predictions, answers):
    if answer == guess:
        close += 1
    else:
        far += 1

In [45]:
print(f"Correct guesses: {close}")
print(f"Sample space size: {len(answers)}")
print(f"Percent correct: {round(close / len(answers), 3)}")
print(f"Incorrect guesses: {far}")

Correct guesses: 187
Sample space size: 198
Percent correct: 0.944
Incorrect guesses: 11


## Visualization

In [72]:
map_to_player = dict()
j = 0
for i, row in test_data.iterrows():
    player = df_lag.loc[i, 'player_name']
    season = df_lag.loc[i, 'season']
    pair = (player, season)
    map_to_player[j] = pair
    j += 1

In [75]:
correct = list()
miss = list()
for i, (guess, answer) in enumerate(zip(predictions, answers)):
    pair = map_to_player[i]
    pair = (pair[0], pair[1], guess, answer)
    if guess == answer:
        correct.append(pair)
    else:
        miss.append(pair)

In [76]:
miss

[('Tyler Lockett', 2022, True, False),
 ('Adam Thielen', 2022, True, False),
 ('Jacoby Jones', 2013, False, True),
 ('Kadarius Toney', 2023, True, False),
 ('Michael Thomas', 2017, False, True),
 ('Brandon Aiyuk', 2021, True, False),
 ('Tim Patrick', 2021, True, False),
 ('DeVante Parker', 2017, True, False),
 ('Brandin Cooks', 2016, True, False),
 ('Mike Williams', 2019, True, False),
 ('Jakobi Meyers', 2022, True, False)]

## Further Information

In [46]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.959596,0.93401,accuracy,0.003433,0.003763,1.312872,0.003433,0.003763,1.312872,1,True,5
1,NeuralNetFastAI,0.944444,0.964467,accuracy,0.008627,0.00796,0.806458,0.008627,0.00796,0.806458,1,True,8
2,WeightedEnsemble_L2,0.944444,0.964467,accuracy,0.009683,0.008387,0.865112,0.001056,0.000427,0.058654,2,True,11
3,RandomForestEntr,0.939394,0.888325,accuracy,0.047249,0.028597,0.311518,0.047249,0.028597,0.311518,1,True,4
4,NeuralNetTorch,0.934343,0.949239,accuracy,0.0071,0.005312,1.955636,0.0071,0.005312,1.955636,1,True,10
5,ExtraTreesGini,0.934343,0.883249,accuracy,0.053016,0.039552,0.295227,0.053016,0.039552,0.295227,1,True,6
6,ExtraTreesEntr,0.934343,0.883249,accuracy,0.053565,0.028227,0.299848,0.053565,0.028227,0.299848,1,True,7
7,XGBoost,0.929293,0.93401,accuracy,0.011031,0.003193,0.552284,0.011031,0.003193,0.552284,1,True,9
8,RandomForestGini,0.924242,0.888325,accuracy,0.059221,0.039295,0.372194,0.059221,0.039295,0.372194,1,True,3
9,KNeighborsDist,0.792929,0.771574,accuracy,0.006656,0.003182,0.169248,0.006656,0.003182,0.169248,1,True,2


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [47]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['receptions_per_snap']
Computing feature importance via permutation shuffling for 31 features using 198 rows with 5 shuffle sets...
	3.94s	= Expected runtime (0.79s per shuffle set)
	0.41s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
route_proxy,0.1828283,0.029751,8.1e-05,5,0.244086,0.121571
fantasy_points_ppr_last,0.06969697,0.018694,0.000566,5,0.108188,0.031206
yards_per_snap,0.06969697,0.012576,0.000122,5,0.09559,0.043804
tier_last,0.06262626,0.017713,0.000692,5,0.099097,0.026155
yards_per_route_run,0.03636364,0.010949,0.000878,5,0.058908,0.013819
receiving_tds,0.03636364,0.013552,0.001941,5,0.064267,0.00846
rank_last,0.03333333,0.011065,0.001265,5,0.056117,0.01055
receptions_last,0.03131313,0.008299,0.00054,5,0.048401,0.014226
receiving_yards_last,0.02424242,0.010949,0.003878,5,0.046787,0.001698
receiving_first_downs_last,0.01818182,0.009175,0.005705,5,0.037073,-0.000709
