# Imports

In [1]:
# pip install autogluon

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [2]:
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

# Filter By Position

In [3]:
df_wr = df.copy().query("position == 'WR'")
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [4]:
# remove our na's
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [5]:
df_wr['Draft_Year'] = df_wr['Draft Year'].astype(str).str.replace('s', '')
df_wr["Draft_Year"] = df_wr["Draft_Year"].astype(int)

df_wr["Draft_Round"] = df_wr["Draft Round"].astype(str).str.replace('Undrafted', '8')
df_wr["Draft_Round"] = df_wr["Draft_Round"].astype(int)

df_wr["Draft_Overall"] = df_wr["Draft Overall"].astype(str).str.replace('Undrafted', '400')
df_wr["Draft_Overall"] = df_wr["Draft_Overall"].astype(int)

# WR Relveant Columns

In [6]:
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec'
]

In [7]:
df_wr = df_wr[wr_cols]

In [8]:
df_wr = df_wr.query("Draft_Round <= 3 and games >= 6 and offense_snaps >= 100")

# Get Lag Version

For having the previous season as features and fantasy points/receiving first downs be the labels.

In [9]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season", 
                  cols_to_merge=["player_id", "season"]
    ):
    """"""
    df_now = df.copy()
    df_last = df.copy()

    rename_dict = {}
    for col in list(df_last.columns[cols_to_filter:]):
        rename_dict[col] = f"{col}_last"

    df_last.rename(columns=rename_dict, inplace=True)
    df_last[col_to_increment] += 1

    df_lag = df_now.merge(df_last, how='inner', on=cols_to_merge)

    return df_lag

In [10]:
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [11]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
348,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

I.e. the columns with last in them, and of numeric type.

In [12]:
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [13]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last'],
      dtype='object')

In [14]:
assert(df_lag["Draft_Year"].dtype == 'int64')
assert(df_lag["Draft_Round"].dtype == 'int64')
assert(df_lag["Draft_Overall"].dtype == 'int64')

In [15]:
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',

    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',
    'tgt_pfr_rec_last',
    'rec_pfr_rec_last',
    'td_pfr_rec_last',
    'x1d_pfr_rec_last',
    'int_pfr_rec_last',
    'rat_pfr_rec_last'
]

In [16]:
label = "fantasy_points_ppr"

In [17]:
cols_to_norm = feature_columns + [label]
df_to_norm = df_lag[cols_to_norm].copy()

# Normalize the Dataset

In [18]:
def min_max_scaling(df, cols_to_norm):
    for col in cols_to_norm:
        max_v = df[col].max()
        min_v = df[col].min()
        
        df[f"{col}_norm"] = (df[col] - min_v) / (max_v - min_v)
        
    return df

In [19]:
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [20]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,receiving_2pt_conversions_last,racr_last,target_share_last,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,73,138,1174.0,1832.0,268.0,51.0,25.508378,0,11.858338,4.521201,...,,0.76834,0.883652,,,,,,,0.359093
1,64,110,745.0,1219.0,178.0,44.0,24.914914,0,10.475034,3.743449,...,,0.632239,0.752577,,,,,,,0.490976
2,79,134,1065.0,1413.0,363.0,45.0,39.161685,0,13.027294,3.988147,...,,0.682432,0.737113,,,,,,,0.28621
3,46,73,670.0,719.0,256.0,28.0,22.417599,0,6.641955,1.75286,...,,0.223938,0.651588,,,,,,,0.408607
4,41,62,573.0,589.0,235.0,27.0,34.112226,0,15.595634,2.282899,...,,0.32722,0.344624,,,,,,,0.21379


In [21]:
df_ML = df_norm[df_norm.columns[len(cols_to_norm):]].copy()

In [22]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,receiving_2pt_conversions_last_norm,racr_last_norm,target_share_last_norm,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,0.472222,0.658031,0.587898,0.659771,0.312723,0.538462,0.476458,0.0,0.248383,0.79987,...,,0.76834,0.883652,,,,,,,0.359093
1,0.409722,0.512953,0.364111,0.433321,0.205707,0.461538,0.472861,0.0,0.231167,0.652329,...,,0.632239,0.752577,,,,,,,0.490976
2,0.513889,0.637306,0.531038,0.504987,0.425684,0.472527,0.559215,0.0,0.262931,0.698749,...,,0.682432,0.737113,,,,,,,0.28621
3,0.284722,0.321244,0.324987,0.248615,0.298454,0.285714,0.457724,0.0,0.183463,0.274711,...,,0.223938,0.651588,,,,,,,0.408607
4,0.25,0.264249,0.274387,0.200591,0.273484,0.274725,0.528609,0.0,0.294895,0.375261,...,,0.32722,0.344624,,,,,,,0.21379


In [23]:
for col in df_ML.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm
fantasy_points_ppr_norm


# Create Train, Test, Validation Splits

In [24]:
X = df_ML.iloc[:, :-1]  # features
y = df_ML.iloc[:, -1]   # label

In [25]:
for col in X.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm


In [26]:
y

0      0.359093
1      0.490976
2      0.286210
3      0.408607
4      0.213790
         ...   
616    0.008792
617    0.107358
618    0.046043
619    0.291300
620    0.217492
Name: fantasy_points_ppr_norm, Length: 621, dtype: float64

In [27]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# AutoGluon

In [28]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [29]:
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [30]:
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240510_004229"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240510_004229"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [31]:
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.14117215699982866, 'mean_squared_error': -0.01992957791198427, 'mean_absolute_error': -0.1149230401735403, 'r2': 0.40777723998101423, 'pearsonr': 0.6462987599003708, 'median_absolute_error': -0.10727315837726831}


In [32]:
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

49     0.321614
584    0.178912
82     0.414351
305    0.282938
109    0.311972
Name: fantasy_points_ppr_norm, dtype: float32


In [33]:
test_data[label_column]

49     0.425035
584    0.160111
82     0.536789
305    0.383850
109    0.364415
         ...   
104    0.506941
114    0.219343
158    0.273022
181    0.321148
531    0.504627
Name: fantasy_points_ppr_norm, Length: 125, dtype: float64

### Reversing the normalization

In [34]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    return normalized_data * (max_v - min_v) + min_v

In [35]:
min_v = df_to_norm['fantasy_points_ppr_last'].min()
max_v = df_to_norm['fantasy_points_ppr_last'].max()
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [36]:
original_predictions

49     192.379917
584     78.515733
82     240.411708
305    174.678760
109    166.325405
          ...    
104    227.583341
114    103.973577
158    127.044748
181    147.729246
531    226.588894
Name: fantasy_points_ppr_norm, Length: 125, dtype: float64

In [37]:
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [38]:
model_predictions

49     147.929504
584     86.596558
82     187.787903
305    131.306656
109    143.785767
          ...    
104    213.251724
114    186.203674
158    166.817520
181    202.362411
531    225.312408
Name: fantasy_points_ppr_norm, Length: 125, dtype: float32

In [39]:
close_enough = 0
far = 0
for guess, answer in zip(model_predictions, original_predictions):
    diff = abs(answer - guess)
    if diff <= 15:
        close_enough += 1
    elif diff >= 30:
        far += 1

In [40]:
close_enough

20

In [41]:
len(model_predictions)

125

In [42]:
close_enough / len(model_predictions)

0.16

In [43]:
far

81

## Further Information

In [44]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,-0.140505,-0.152593,root_mean_squared_error,0.002828,0.001145,0.598773,0.002828,0.001145,0.598773,1,True,4
1,WeightedEnsemble_L2,-0.141172,-0.146412,root_mean_squared_error,0.01659,0.016388,1.7892,0.001106,0.000252,0.007974,2,True,9
2,NeuralNetFastAI,-0.14134,-0.147875,root_mean_squared_error,0.009295,0.005445,0.543073,0.009295,0.005445,0.543073,1,True,6
3,NeuralNetTorch,-0.14277,-0.147059,root_mean_squared_error,0.006189,0.010691,1.238153,0.006189,0.010691,1.238153,1,True,8
4,KNeighborsUnif,-0.143087,-0.174598,root_mean_squared_error,0.007843,0.056605,3.728341,0.007843,0.056605,3.728341,1,True,1
5,KNeighborsDist,-0.144235,-0.175415,root_mean_squared_error,0.004315,0.002737,0.177471,0.004315,0.002737,0.177471,1,True,2
6,ExtraTreesMSE,-0.14481,-0.160103,root_mean_squared_error,0.038809,0.02824,0.24471,0.038809,0.02824,0.24471,1,True,5
7,RandomForestMSE,-0.14574,-0.163612,root_mean_squared_error,0.057026,0.029045,0.45285,0.057026,0.029045,0.45285,1,True,3
8,XGBoost,-0.151244,-0.177336,root_mean_squared_error,0.008714,0.00189,0.528629,0.008714,0.00189,0.528629,1,True,7


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [45]:
predictor.feature_importance(data=test_data)

Computing feature importance via permutation shuffling for 32 features using 125 rows with 5 shuffle sets...
	5.14s	= Expected runtime (1.03s per shuffle set)
	0.38s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
age_last_norm,0.004921,0.002873,0.009308,5,0.010836,-0.000994
receiving_2pt_conversions_last_norm,0.001511,0.001207,0.02442,5,0.003996,-0.000974
td_pfr_rec_last_norm,0.001152,0.00152,0.082689,5,0.004282,-0.001978
receiving_first_downs_last_norm,0.000858,0.000542,0.012033,5,0.001975,-0.000259
offense_snaps_last_norm,0.000848,0.000408,0.004836,5,0.001688,8e-06
catch_percentage_last_norm,0.000813,0.000499,0.010984,5,0.001841,-0.000215
rank_last_norm,0.000645,0.000568,0.031971,5,0.001814,-0.000524
receiving_epa_last_norm,0.000605,0.000242,0.00251,5,0.001102,0.000107
rec_pfr_rec_last_norm,0.000429,0.00049,0.060978,5,0.001437,-0.00058
rat_pfr_rec_last_norm,0.000387,0.000465,0.068194,5,0.001344,-0.000571
