# Imports

In [1]:
# pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
# Load the source DataFrame from a local pickle file (path is relative to this notebook).
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted pipeline.
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

# Filter By Position

In [4]:
# Keep wide receivers only; copy the filtered rows so later edits never touch `df`.
df_wr = df.query("position == 'WR'").copy()
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [5]:
# remove our na's
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [6]:
# Build integer draft features from the raw draft columns.
# Each entry: (new column, raw column, text to replace, replacement text).
# "Undrafted" is coded as round 8 / overall pick 400 so the columns stay numeric.
# NOTE(review): every "s" is stripped from the draft-year text — presumably a suffix marker; confirm.
draft_conversions = [
    ("Draft_Year", "Draft Year", "s", ""),
    ("Draft_Round", "Draft Round", "Undrafted", "8"),
    ("Draft_Overall", "Draft Overall", "Undrafted", "400"),
]
for new_col, raw_col, old_text, new_text in draft_conversions:
    cleaned_text = df_wr[raw_col].astype(str).str.replace(old_text, new_text)
    df_wr[new_col] = cleaned_text.astype(int)

# WR Relevant Columns

In [7]:
# Columns kept for the wide-receiver analysis.
# Order matters: 'player_id' and 'season' must stay first, because create_lag_df is
# called with cols_to_filter=2 and treats the leading columns as un-suffixed merge keys.
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec'
]

In [8]:
# Restrict the frame to the WR-relevant columns, in the order listed in wr_cols.
df_wr = df_wr.loc[:, wr_cols]

In [9]:
# Sample restrictions; every condition must hold for a row to be kept.
wr_filters = [
    "Draft_Round <= 3",      # drafted in rounds 1-3 (undrafted players are coded as round 8)
    "games >= 6",
    "offense_snaps >= 100",
    "tier <= 5",
]
df_wr = df_wr.query(" and ".join(wr_filters))

In [10]:
len(df_wr)

499

# Get Lag Version

Pair each player-season with that player's previous season, so that last season's stats serve as the features and the current season's fantasy points (or receiving first downs) serve as the label.

In [11]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season",
                  cols_to_merge=("player_id", "season")):
    """Pair every row with the matching row from the previous period.

    A second copy of ``df`` is made in which the data columns get a ``_last``
    suffix and ``col_to_increment`` is shifted forward by one. Inner-joining it
    back onto ``df`` puts, for example, a player's 2016 stats (``*_last``) on
    the same row as that player's 2017 stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_to_filter : int, default 3
        Number of leading columns that are never renamed. Columns from this
        position onward receive the ``_last`` suffix, except the merge keys
        and ``col_to_increment``, which always keep their names.
    col_to_increment : str, default "season"
        Column incremented by one on the lagged copy.
    cols_to_merge : str or sequence of str, default ("player_id", "season")
        Key column(s) for the inner join.

    Returns
    -------
    pandas.DataFrame
        One row per key that exists in both the current and the shifted frame.
        Rows with no previous-period counterpart are dropped by the inner join.
    """
    # A bare string is a single key; anything else is normalised to a list,
    # because pandas can interpret a tuple passed to ``on`` as one label.
    if isinstance(cols_to_merge, str):
        merge_keys = [cols_to_merge]
    else:
        merge_keys = list(cols_to_merge)

    # Keys must keep their names on both sides or the join cannot find them.
    protected = set(merge_keys) | {col_to_increment}
    rename_dict = {
        col: f"{col}_last"
        for col in df.columns[cols_to_filter:]
        if col not in protected
    }

    # rename() returns a new frame, so the caller's data is left untouched.
    df_last = df.rename(columns=rename_dict)
    df_last[col_to_increment] = df_last[col_to_increment] + 1

    return df.merge(df_last, how="inner", on=merge_keys)

In [12]:
# cols_to_filter=2: the first two columns of wr_cols (player_id, season) are the merge keys
# and stay un-suffixed; every other column gets a `_last` twin from the previous season.
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [13]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
184,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

That is, the numeric columns whose names end in `_last` (the previous-season values).

In [14]:
# Names of the columns that are not numeric and therefore cannot be used as-is for scaling.
non_numeric_cols = df_lag.select_dtypes(exclude="number").columns

In [15]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last'],
      dtype='object')

In [16]:
# The draft features must still be 64-bit integers after the lag merge.
for draft_col in ("Draft_Year", "Draft_Round", "Draft_Overall"):
    assert df_lag[draft_col].dtype == 'int64'

In [17]:
# Model inputs: previous-season (`_last`) columns of df_lag.
# Every name here must exist in df_lag, i.e. be a wr_cols entry plus the `_last` suffix.
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',

    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',
    'tgt_pfr_rec_last',
    'rec_pfr_rec_last',
    'td_pfr_rec_last',
    'x1d_pfr_rec_last',
    'int_pfr_rec_last',
    'rat_pfr_rec_last'
]

In [18]:
# Prediction target: the un-suffixed (current-season) fantasy_points_ppr column of df_lag.
label = "fantasy_points_ppr"

In [19]:
# Features and target are scaled together; the target is kept last in the list.
cols_to_norm = [*feature_columns, label]
# Work on a copy so df_lag itself is never altered by the scaling step.
df_to_norm = df_lag.loc[:, cols_to_norm].copy()

# Normalize the Dataset

In [20]:
def min_max_scaling(df, cols_to_norm):
    """Append a min-max scaled ``<col>_norm`` column for every requested column.

    Each value is mapped to ``(x - min) / (max - min)`` using the minimum and
    maximum of that column over all rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place (new columns are appended in
        the order of ``cols_to_norm``); pass a copy to keep the original intact.
    cols_to_norm : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now holding the additional ``*_norm`` columns.

    Notes
    -----
    * Missing values stay missing in the scaled column.
    * A constant column (max == min) is mapped to 0.0 rather than dividing by
      zero, which would fill the whole column with NaN.
    * The statistics come from every row passed in. NOTE(review): to avoid
      leaking test-set information, consider scaling with training rows only.
    """
    for col in cols_to_norm:
        min_v = df[col].min()
        value_range = df[col].max() - min_v

        shifted = df[col] - min_v
        if value_range == 0:
            # Multiplying by 0.0 yields 0.0 for real values and keeps NaN as NaN.
            df[f"{col}_norm"] = shifted * 0.0
        else:
            df[f"{col}_norm"] = shifted / value_range

    return df

In [21]:
# Pass a copy: min_max_scaling adds its `*_norm` columns directly to the frame it receives.
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [22]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,receiving_2pt_conversions_last,racr_last,target_share_last,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,73,138,1174.0,1832.0,268.0,51.0,25.508378,0,11.858338,4.521201,...,,0.701493,0.819015,,,,,,,0.147692
1,64,110,745.0,1219.0,178.0,44.0,24.914914,0,10.475034,3.743449,...,,0.526119,0.61512,,,,,,,0.323077
2,79,134,1065.0,1413.0,363.0,45.0,39.161685,0,13.027294,3.988147,...,,0.590796,0.591065,,,,,,,0.050769
3,46,73,670.0,719.0,256.0,28.0,22.417599,0,6.641955,1.75286,...,,0.0,0.458027,,,,,,,0.213538
4,112,163,1598.0,1848.0,548.0,79.0,75.891575,0,14.032147,4.711548,...,,0.782338,0.775487,,,,,,,0.508308


In [23]:
# Keep only the scaled columns: one `<col>_norm` per entry of cols_to_norm, in the same order
# (so the scaled target remains the last column).
norm_col_names = [f"{name}_norm" for name in cols_to_norm]
df_ML = df_norm[norm_col_names].copy()

In [24]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,receiving_2pt_conversions_last_norm,racr_last_norm,target_share_last_norm,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,0.394958,0.595092,0.487346,0.609249,0.221024,0.44,0.476458,0.0,0.093661,0.761987,...,,0.701493,0.819015,,,,,,,0.147692
1,0.319328,0.423313,0.208955,0.349173,0.09973,0.346667,0.472861,0.0,0.072901,0.586518,...,,0.526119,0.61512,,,,,,,0.323077
2,0.445378,0.570552,0.416613,0.431481,0.349057,0.36,0.559215,0.0,0.111204,0.641725,...,,0.590796,0.591065,,,,,,,0.050769
3,0.168067,0.196319,0.160286,0.137039,0.204852,0.133333,0.457724,0.0,0.015377,0.137421,...,,0.0,0.458027,,,,,,,0.213538
4,0.722689,0.748466,0.762492,0.616037,0.598383,0.813333,0.781846,0.0,0.126284,0.804931,...,,0.782338,0.775487,,,,,,,0.508308


In [25]:
# Sanity check: list every fantasy-points column present in df_ML.
fantasy_cols = [name for name in df_ML.columns if "fantasy" in name]
for name in fantasy_cols:
    print(name)

fantasy_points_last_norm
fantasy_points_ppr_last_norm
fantasy_points_ppr_norm


# Create Train, Test, Validation Splits

In [26]:
# The scaled target is the last column of df_ML; everything before it is a feature.
target_pos = df_ML.shape[1] - 1
X = df_ML.iloc[:, :target_pos]
y = df_ML.iloc[:, target_pos]

In [27]:
# Sanity check: the target must not appear among the features — only `_last` columns may print.
fantasy_feature_cols = [name for name in X.columns if "fantasy" in name]
for name in fantasy_feature_cols:
    print(name)

fantasy_points_last_norm
fantasy_points_ppr_last_norm


In [28]:
y

0      0.147692
1      0.323077
2      0.050769
3      0.213538
4      0.508308
         ...   
311    0.184246
312    0.359385
313    0.290154
314    0.303692
315    0.031692
Name: fantasy_points_ppr_norm, Length: 316, dtype: float64

In [29]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# AutoGluon

In [30]:
# AutoGluon expects features and label in one frame, so re-attach each target
# Series (aligned on the shared row index) to its feature block.
train_data = X_train.join(y_train)
val_data = X_val.join(y_val)
test_data = X_test.join(y_test)

In [31]:
# Name of the scaled target column, i.e. `label` plus the `_norm` suffix added by min_max_scaling.
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [33]:
# Build the predictor first, then train it; fit() hands back the fitted predictor.
predictor = TabularPredictor(label=label_column)
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,  # explicit validation set instead of an automatic holdout
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240510_172701"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240510_172701"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [34]:
# Score the fitted predictor on the held-out test rows.
# The error metrics in the printed dict (RMSE, MSE, MAE, median AE) appear with a negative
# sign: AutoGluon reports them in "higher is better" form. All values are in scaled units.
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.17549547628817364, 'mean_squared_error': -0.030798662197612913, 'mean_absolute_error': -0.13301471954117028, 'r2': 0.18931411000656517, 'pearsonr': 0.44393012371176477, 'median_absolute_error': -0.10008043087995969}


In [35]:
# Predict from the test features only (label column removed); the outputs are still on the
# min-max scaled target scale.
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

173    0.500628
33     0.204873
165    0.250482
78     0.259718
93     0.306701
Name: fantasy_points_ppr_norm, dtype: float32


In [36]:
test_data[label_column]

173    0.680000
33     0.461231
165    0.175077
78     0.116000
93     0.097231
         ...   
132    0.011385
253    0.286523
175    0.494954
225    0.228923
223    0.161908
Name: fantasy_points_ppr_norm, Length: 64, dtype: float64

### Reversing the normalization

In [37]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    """Map min-max scaled values back onto their original scale.

    Inverse of ``(x - min_v) / (max_v - min_v)``.

    Parameters
    ----------
    normalized_data : scalar, array or pandas.Series
        Values on the scaled (typically 0-1) range.
    min_v, max_v : number
        Minimum and maximum that were used when the data was scaled.

    Returns
    -------
    Same kind as ``normalized_data``, expressed in original units.
    """
    value_range = max_v - min_v
    restored = normalized_data * value_range + min_v
    return restored

In [38]:
# Undo the scaling with the min/max of the column that was actually scaled into
# `label_column`: the target `label` ("fantasy_points_ppr"), not its previous-season
# feature twin "fantasy_points_ppr_last", whose range may differ.
min_v = df_to_norm[label].min()
max_v = df_to_norm[label].max()
# NOTE: despite the name, `original_predictions` holds the TRUE test targets in original
# fantasy-point units; the name is kept because later cells refer to it.
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [39]:
original_predictions

173    335.50
33     264.40
165    171.40
78     152.20
93     146.10
        ...  
132    118.20
253    207.62
175    275.36
225    188.90
223    167.12
Name: fantasy_points_ppr_norm, Length: 64, dtype: float64

In [40]:
# Convert the model outputs back to fantasy points using the same min_v / max_v as the targets.
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [41]:
model_predictions

173    277.204041
33     181.083801
165    195.906525
78     198.908218
93     214.177826
          ...    
132    194.094147
253    176.349335
175    205.448242
225    180.342804
223    186.408997
Name: fantasy_points_ppr_norm, Length: 64, dtype: float32

In [42]:
# Absolute error per test row in fantasy points, compared position by position.
abs_errors = np.abs(original_predictions.to_numpy() - model_predictions.to_numpy())

# Bucket the errors: within 15 points counts as close, 30 or more counts as far.
close_enough = int((abs_errors <= 15).sum())
far = int((abs_errors >= 30).sum())

In [43]:
close_enough

15

In [44]:
len(model_predictions)

64

In [45]:
close_enough / len(model_predictions)

0.234375

In [46]:
far

33

## Further Information

In [47]:
# Per-model comparison on the test rows; score_test / score_val are the eval metric
# (root_mean_squared_error) shown with a negative sign, so values closer to 0 are better.
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.175495,-0.152439,root_mean_squared_error,0.028553,0.011707,1.418302,0.001413,0.000253,0.007737,2,True,9
1,CatBoost,-0.176859,-0.158214,root_mean_squared_error,0.004272,0.001049,0.390423,0.004272,0.001049,0.390423,1,True,4
2,ExtraTreesMSE,-0.178077,-0.161085,root_mean_squared_error,0.036282,0.027929,0.224676,0.036282,0.027929,0.224676,1,True,5
3,RandomForestMSE,-0.179842,-0.16182,root_mean_squared_error,0.055849,0.026269,0.26404,0.055849,0.026269,0.26404,1,True,3
4,NeuralNetFastAI,-0.183296,-0.160336,root_mean_squared_error,0.014548,0.006308,0.675897,0.014548,0.006308,0.675897,1,True,6
5,KNeighborsUnif,-0.184592,-0.16873,root_mean_squared_error,0.002953,0.003364,0.008771,0.002953,0.003364,0.008771,1,True,1
6,XGBoost,-0.184819,-0.161131,root_mean_squared_error,0.006166,0.001478,0.340911,0.006166,0.001478,0.340911,1,True,7
7,KNeighborsDist,-0.186389,-0.16868,root_mean_squared_error,0.003171,0.001756,0.006023,0.003171,0.001756,0.006023,1,True,2
8,NeuralNetTorch,-0.186977,-0.157868,root_mean_squared_error,0.006426,0.003668,0.393757,0.006426,0.003668,0.393757,1,True,8


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [48]:
# Permutation-shuffling feature importance computed on the test rows.
# With only 64 rows, expect wide p99 intervals — read the ranking as indicative, not definitive.
predictor.feature_importance(data=test_data)

Computing feature importance via permutation shuffling for 32 features using 64 rows with 5 shuffle sets...
	7.25s	= Expected runtime (1.45s per shuffle set)
	0.7s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
age_last_norm,0.003477,0.00271,0.022751,5,0.009057,-0.002103
receptions_last_norm,0.001692,0.003388,0.163284,5,0.008668,-0.005283
receiving_yards_after_catch_last_norm,0.001564,0.002069,0.083116,5,0.005825,-0.002697
int_pfr_rec_last_norm,0.001278,0.000724,0.008453,5,0.002769,-0.000214
air_yards_share_last_norm,0.00079,0.001303,0.123292,5,0.003473,-0.001892
yptmpa_last_norm,0.000735,0.00177,0.202995,5,0.00438,-0.002911
receiving_air_yards_last_norm,0.000733,0.000427,0.009258,5,0.001612,-0.000147
receiving_2pt_conversions_last_norm,0.000668,0.000873,0.081009,5,0.002465,-0.001129
targets_last_norm,0.000662,0.003924,0.362543,5,0.008742,-0.007418
offense_pct_last_norm,0.000568,0.000576,0.045986,5,0.001753,-0.000617
