# Imports

In [None]:
# pip install autogluon

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [2]:
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

# Filter By Position

In [3]:
df_wr = df.copy().query("position == 'WR'")
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [4]:
# remove our na's
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [5]:
df_wr['Draft_Year'] = df_wr['Draft Year'].astype(str).str.replace('s', '')
df_wr["Draft_Year"] = df_wr["Draft_Year"].astype(int)

df_wr["Draft_Round"] = df_wr["Draft Round"].astype(str).str.replace('Undrafted', '8')
df_wr["Draft_Round"] = df_wr["Draft_Round"].astype(int)

df_wr["Draft_Overall"] = df_wr["Draft Overall"].astype(str).str.replace('Undrafted', '400')
df_wr["Draft_Overall"] = df_wr["Draft_Overall"].astype(int)

# WR Relveant Columns

In [6]:
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec'
]

In [7]:
df_wr = df_wr[wr_cols]

In [8]:
df_wr["yards_per_snap"] = df_wr["receiving_yards"] / df_wr["offense_snaps"]
df_wr["receptions_per_snap"] = df_wr["receiving_yards"] / df_wr["offense_snaps"]

In [9]:
freq_pass = 0.64  # just above league's best pass % of plays in 2023
df_wr["route_proxy"] = (df_wr["offense_snaps"] / df_wr["offense_pct"]) * freq_pass
df_wr["yards_per_route_run"] = df_wr["receiving_yards"] / df_wr["route_proxy"]

In [56]:
df_lag.query("player_name == 'Mike Evans' and season == 2019")[["player_name", "season", 
                                                                "receiving_yards", "route_proxy",
                                                                "offense_snaps", "offense_pct"]]

Unnamed: 0,player_name,season,receiving_yards,route_proxy,offense_snaps,offense_pct
511,Mike Evans,2019,1157.0,596.869565,792.0,0.849231


In [57]:
df_lag.query("player_name == 'Chris Godwin' and season == 2019")[["player_name", "season", 
                                                                "receiving_yards", "route_proxy",
                                                                "offense_snaps", "offense_pct"]]

Unnamed: 0,player_name,season,receiving_yards,route_proxy,offense_snaps,offense_pct
762,Chris Godwin,2019,1333.0,649.305491,937.0,0.923571


**Note:** filtering by snaps automatically removes any data prior to 2012!

In [10]:
df_wr = df_wr.query("games >= 6 and offense_snaps >= 100")

In [11]:
len(df_wr)

1571

# Get Lag Version

For having the previous season as features and fantasy points/receiving first downs be the labels.

In [12]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season", 
                  cols_to_merge=["player_id", "season"]
    ):
    """"""
    df_now = df.copy()
    df_last = df.copy()

    rename_dict = {}
    for col in list(df_last.columns[cols_to_filter:]):
        rename_dict[col] = f"{col}_last"

    df_last.rename(columns=rename_dict, inplace=True)
    df_last[col_to_increment] += 1

    df_lag = df_now.merge(df_last, how='inner', on=cols_to_merge)

    return df_lag

In [13]:
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [14]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
509,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

I.e. the columns with last in them, and of numeric type.

In [15]:
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [16]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last'],
      dtype='object')

In [17]:
assert(df_lag["Draft_Year"].dtype == 'int64')
assert(df_lag["Draft_Round"].dtype == 'int64')
assert(df_lag["Draft_Overall"].dtype == 'int64')

In [18]:
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    # 'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',

    # our metrics
    "yards_per_snap",
    "receptions_per_snap",
    "route_proxy",
    "yards_per_route_run",

    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',

    
    # 'tgt_pfr_rec_last',
    # 'rec_pfr_rec_last',
    # 'td_pfr_rec_last',
    # 'x1d_pfr_rec_last',
    # 'int_pfr_rec_last',
    # 'rat_pfr_rec_last'
]

In [19]:
label = "fantasy_points_ppr"

In [20]:
cols_to_norm = feature_columns + [label]
df_to_norm = df_lag[cols_to_norm].copy()

# Normalize the Dataset

In [21]:
def min_max_scaling(df, cols_to_norm):
    for col in cols_to_norm:
        max_v = df[col].max()
        min_v = df[col].min()
        
        df[f"{col}_norm"] = (df[col] - min_v) / (max_v - min_v)
        
    return df

In [22]:
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [23]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,racr_last,target_share_last,air_yards_share_last,...,tier_last_norm,Draft_Round_last_norm,yards_per_snap_norm,receptions_per_snap_norm,route_proxy_norm,yards_per_route_run_norm,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,fantasy_points_ppr_norm
0,45,59,544.0,589.0,148.0,30.0,27.585154,17.371708,1.604753,1.890366,...,0.266667,0.272727,0.363185,0.363185,0.105137,0.12772,,0.472883,0.512017,0.042897
1,73,138,1174.0,1832.0,268.0,51.0,25.508378,11.858338,4.521201,6.445839,...,0.066667,0.181818,0.350169,0.350169,0.689892,0.414932,,0.771646,0.887235,0.361162
2,64,110,745.0,1219.0,178.0,44.0,24.914914,10.475034,3.743449,4.719369,...,0.2,0.181818,0.481698,0.481698,0.784967,0.55,,0.637488,0.760196,0.49262
3,79,134,1065.0,1413.0,363.0,45.0,39.161685,13.027294,3.988147,5.097152,...,0.066667,0.181818,0.736351,0.736351,0.142826,0.746167,,0.686965,0.745208,0.288515
4,46,73,670.0,719.0,256.0,28.0,22.417599,6.641955,1.75286,2.135309,...,0.266667,0.181818,0.406555,0.406555,0.696991,0.442945,,0.235014,0.662316,0.410517


In [24]:
df_ML = df_norm[df_norm.columns[len(cols_to_norm):]].copy()

In [25]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,racr_last_norm,target_share_last_norm,air_yards_share_last_norm,...,tier_last_norm,Draft_Round_last_norm,yards_per_snap_norm,receptions_per_snap_norm,route_proxy_norm,yards_per_route_run_norm,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,fantasy_points_ppr_norm
0,0.287671,0.267677,0.26615,0.223259,0.170036,0.307692,0.489046,0.316998,0.227557,0.216355,...,0.266667,0.272727,0.363185,0.363185,0.105137,0.12772,,0.472883,0.512017,0.042897
1,0.479452,0.666667,0.591731,0.669419,0.312723,0.538462,0.476458,0.248383,0.702902,0.709592,...,0.066667,0.181818,0.350169,0.350169,0.689892,0.414932,,0.771646,0.887235,0.361162
2,0.417808,0.525253,0.370026,0.44939,0.205707,0.461538,0.472861,0.231167,0.576138,0.522661,...,0.2,0.181818,0.481698,0.481698,0.784967,0.55,,0.637488,0.760196,0.49262
3,0.520548,0.646465,0.535401,0.519024,0.425684,0.472527,0.559215,0.262931,0.616021,0.563565,...,0.066667,0.181818,0.736351,0.736351,0.142826,0.746167,,0.686965,0.745208,0.288515
4,0.294521,0.338384,0.331266,0.269921,0.298454,0.285714,0.457724,0.183463,0.251697,0.242876,...,0.266667,0.181818,0.406555,0.406555,0.696991,0.442945,,0.235014,0.662316,0.410517


In [26]:
for col in df_ML.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm
fantasy_points_ppr_norm


# Create Train, Test, Validation Splits

In [27]:
X = df_ML.iloc[:, :-1]  # features
y = df_ML.iloc[:, -1]   # label

In [28]:
for col in X.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm


In [29]:
y[:5]

0    0.042897
1    0.361162
2    0.492620
3    0.288515
4    0.410517
Name: fantasy_points_ppr_norm, dtype: float64

In [30]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [31]:
print(f"Length of train set: {len(X_train)}")
print(f"Length of test set: {len(X_test)}")
print(f"Length of val set: {len(X_val)}")

Length of train set: 612
Length of test set: 205
Length of val set: 205


# AutoGluon

In [32]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [33]:
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [34]:
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240510_180118"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240510_180118"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [35]:
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.03760269124632814, 'mean_squared_error': -0.0014139623889666828, 'mean_absolute_error': -0.029361678153053822, 'r2': 0.957800327360784, 'pearsonr': 0.9786936355580175, 'median_absolute_error': -0.022375471653533596}


In [36]:
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

992    0.532522
478    0.150984
689    0.072319
31     0.568142
617    0.184117
Name: fantasy_points_ppr_norm, dtype: float32


In [37]:
test_data[label_column]

992    0.511301
478    0.093635
689    0.000000
31     0.545434
617    0.165590
         ...   
451    0.670434
717    0.393450
925    0.382149
559    0.744234
796    0.425046
Name: fantasy_points_ppr_norm, Length: 205, dtype: float64

### Reversing the normalization

In [38]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    return normalized_data * (max_v - min_v) + min_v

In [39]:
min_v = df_to_norm['fantasy_points_ppr_last'].min()
max_v = df_to_norm['fantasy_points_ppr_last'].max()
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [40]:
original_predictions

992    228.284179
478     47.768911
689      7.300000
31     243.036393
617     78.868173
          ...    
451    297.061393
717    177.349170
925    172.464991
559    328.958072
796    191.004935
Name: fantasy_points_ppr_norm, Length: 205, dtype: float64

In [41]:
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [42]:
model_predictions

992    237.455948
478     72.555199
689     38.556309
31     252.851013
617     86.875511
          ...    
451    305.405304
717    198.663330
925    169.958298
559    321.687012
796    169.507141
Name: fantasy_points_ppr_norm, Length: 205, dtype: float32

In [43]:
close_enough = 0
far = 0
for guess, answer in zip(model_predictions, original_predictions):
    diff = abs(answer - guess)
    if diff <= 15:
        close_enough += 1
    elif diff >= 30:
        far += 1

In [44]:
close_enough

139

In [45]:
len(model_predictions)

205

In [46]:
close_enough / len(model_predictions)

0.6780487804878049

In [47]:
far

16

## Further Information

In [48]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.037603,-0.034174,root_mean_squared_error,0.126698,0.067883,6.29541,0.001765,0.000269,0.008332,2,True,9
1,CatBoost,-0.038589,-0.037018,root_mean_squared_error,0.002591,0.001271,1.189082,0.002591,0.001271,1.189082,1,True,4
2,NeuralNetFastAI,-0.04085,-0.036992,root_mean_squared_error,0.009497,0.004636,0.760176,0.009497,0.004636,0.760176,1,True,6
3,NeuralNetTorch,-0.041933,-0.038303,root_mean_squared_error,0.005214,0.003786,2.838973,0.005214,0.003786,2.838973,1,True,8
4,ExtraTreesMSE,-0.041966,-0.037503,root_mean_squared_error,0.039366,0.026773,0.266473,0.039366,0.026773,0.266473,1,True,5
5,RandomForestMSE,-0.042335,-0.038954,root_mean_squared_error,0.060642,0.028915,0.640481,0.060642,0.028915,0.640481,1,True,3
6,XGBoost,-0.057484,-0.04,root_mean_squared_error,0.007623,0.002233,0.591892,0.007623,0.002233,0.591892,1,True,7
7,KNeighborsDist,-0.081643,-0.083067,root_mean_squared_error,0.008046,0.004441,0.170232,0.008046,0.004441,0.170232,1,True,2
8,KNeighborsUnif,-0.082972,-0.083718,root_mean_squared_error,0.009885,0.055335,3.575892,0.009885,0.055335,3.575892,1,True,1


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [49]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['receptions_per_snap_norm']
Computing feature importance via permutation shuffling for 28 features using 205 rows with 5 shuffle sets...
	19.8s	= Expected runtime (3.96s per shuffle set)
	1.21s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
yards_per_route_run_norm,0.161336,0.004908,1.026547e-07,5,0.171442,0.15123
route_proxy_norm,0.070598,0.004053,1.297583e-06,5,0.078943,0.062254
receiving_air_yards_last_norm,0.003645,0.000867,0.0003562492,5,0.005429,0.001861
yards_per_snap_norm,0.002925,0.000901,0.0009566639,5,0.00478,0.00107
air_yards_share_last_norm,0.002729,0.00081,0.0008302998,5,0.004396,0.001062
w8dom_last_norm,0.00187,0.000602,0.001126265,5,0.003109,0.000631
games_last_norm,0.001283,0.000429,0.001301892,5,0.002167,0.000399
fantasy_points_ppr_last_norm,0.001264,0.00068,0.007090489,5,0.002663,-0.000136
Draft_Round_last_norm,0.001251,0.000423,0.001358146,5,0.002122,0.00038
receptions_last_norm,0.001213,0.000528,0.003398499,5,0.002299,0.000126
