# Imports

In [1]:
# pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
# Load the pickled player data frame. Going by the file name it combines basic,
# NGS, snap-count and advanced stats for 1999-2023 -- TODO confirm the exact contents.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

# Filter By Position

In [4]:
# Filter first, then copy: only the WR rows are duplicated (not the whole frame),
# and df_wr becomes an independent frame that later cells can add columns to.
df_wr = df.query("position == 'WR'").copy()
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [5]:
# Rows without a draft year cannot be turned into integer draft features; drop them.
df_wr = df_wr.dropna(subset=["Draft Year"])
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [6]:
# Draft year: strip any 's' characters from the text form, then cast to int.
df_wr["Draft_Year"] = (
    df_wr["Draft Year"].astype(str).str.replace("s", "", regex=False).astype(int)
)

# Draft round / overall pick: "Undrafted" is replaced by a sentinel number that
# sorts after every real value (round 8, pick 400) so the columns can be ints.
for raw_name, clean_name, undrafted_value in (
    ("Draft Round", "Draft_Round", "8"),
    ("Draft Overall", "Draft_Overall", "400"),
):
    df_wr[clean_name] = (
        df_wr[raw_name]
        .astype(str)
        .str.replace("Undrafted", undrafted_value, regex=False)
        .astype(int)
    )

# WR Relevant Columns

In [7]:
# Columns kept for the wide-receiver data set.
# The first two entries (player_id, season) are the keys that create_lag_df merges
# on; their position at the front of the list matters because the lag step below is
# called with cols_to_filter=2.
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec'
]

In [8]:
# Keep only the WR-relevant columns, in the order listed in wr_cols.
df_wr = df_wr.loc[:, wr_cols]

# Get Lag Version

Build a lagged data set: each row pairs a player's previous-season stats (the features) with the current season's fantasy points / receiving first downs (the labels).

In [9]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season",
                  cols_to_merge=["player_id", "season"]):
    """Pair every row with the same entity's row from the previous period.

    A copy of ``df`` has its non-key columns renamed to ``<name>_last`` and its
    ``col_to_increment`` value shifted forward by one, then it is inner-merged
    back onto ``df``. Each output row therefore holds the current values next
    to the ``*_last`` values from the preceding period. Rows with no
    preceding period are dropped by the inner merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_to_filter : int
        Number of leading columns that are left un-suffixed. Columns named in
        ``cols_to_merge`` and ``col_to_increment`` are never suffixed,
        wherever they sit in the frame.
    col_to_increment : str
        Column that is advanced by one in the lagged copy.
    cols_to_merge : list of str or str
        Key column(s) for the merge. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by the ``*_last`` columns.
    """
    # A bare string must stay a single key; list("abc") would split it into characters.
    if isinstance(cols_to_merge, str):
        merge_keys = [cols_to_merge]
    else:
        merge_keys = list(cols_to_merge)

    # Renaming a key would make both the increment and the merge fail with a
    # KeyError, so keys are protected regardless of their column position.
    protected = set(merge_keys) | {col_to_increment}
    rename_dict = {
        col: f"{col}_last"
        for col in df.columns[cols_to_filter:]
        if col not in protected
    }

    # rename() returns a new frame, so shifting the period leaves df untouched.
    df_last = df.rename(columns=rename_dict)
    df_last[col_to_increment] = df_last[col_to_increment] + 1

    return df.merge(df_last, how="inner", on=merge_keys)

In [10]:
# cols_to_filter=2 leaves the two leading key columns of wr_cols (player_id, season)
# un-suffixed; every other column gets a "<name>_last" counterpart from the prior season.
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [11]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
2664,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

That is, the numeric columns whose names end in `_last`.

In [12]:
# Names of every column whose dtype is not numeric (these cannot be min-max scaled).
non_numeric_cols = df_lag.select_dtypes(exclude="number").columns

In [13]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last'],
      dtype='object')

In [14]:
# The cleaned draft columns must have survived the merge as 64-bit integers.
for draft_col in ("Draft_Year", "Draft_Round", "Draft_Overall"):
    assert df_lag[draft_col].dtype == 'int64'

In [15]:
# Model inputs: previous-season ("*_last") numeric columns only.
# The first group holds the basic receiving, share, age, rank and draft columns;
# the second group (after the blank line) holds catch percentage, snap counts
# and the "*_pfr_rec" columns.
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'ay_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',
    'Draft_Overall_last',

    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',
    'gs_pfr_rec_last',
    'tgt_pfr_rec_last',
    'rec_pfr_rec_last',
    'yds_pfr_rec_last',
    'td_pfr_rec_last',
    'x1d_pfr_rec_last',
    'ybc_pfr_rec_last',
    'ybc_r_pfr_rec_last',
    'yac_pfr_rec_last',
    'yac_r_pfr_rec_last',
    'adot_pfr_rec_last',
    'brk_tkl_pfr_rec_last',
    'rec_br_pfr_rec_last',
    'drop_pfr_rec_last',
    'drop_percent_pfr_rec_last',
    'int_pfr_rec_last',
    'rat_pfr_rec_last'
]

In [16]:
# Prediction target: the current-season column (no "_last" suffix) in df_lag.
label = "fantasy_points_ppr"

In [17]:
# Everything that gets scaled: all features, with the label as the final column.
cols_to_norm = [*feature_columns, label]
df_to_norm = df_lag.loc[:, cols_to_norm].copy()

# Normalize the Dataset

In [18]:
def min_max_scaling(df, cols_to_norm):
    """Append a min-max scaled ``<col>_norm`` column for every listed column.

    Each new column is ``(value - min) / (max - min)``, using the minimum and
    maximum of that column in ``df``; missing values stay missing.

    A constant column has ``max == min``; dividing by that zero range would
    turn every row into NaN, so such a column is scaled to 0.0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place (the ``_norm`` columns are
        added to it), so pass a copy if the original must be kept.
    cols_to_norm : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``_norm`` columns appended in the
        order of ``cols_to_norm``.
    """
    for col in cols_to_norm:
        min_v = df[col].min()
        span = df[col].max() - min_v

        # Zero range (constant column): divide by 1 so the result is 0.0, not 0/0 = NaN.
        if span == 0:
            span = 1

        df[f"{col}_norm"] = (df[col] - min_v) / span

    return df

In [19]:
# Scale a copy so df_to_norm keeps the raw values (they are needed later to undo the scaling).
# NOTE(review): the min/max are computed over all rows, before the train/validation/test
# split further down, so the scaling statistics include the held-out rows.
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [20]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,receiving_2pt_conversions_last,racr_last,target_share_last,...,yac_pfr_rec_last_norm,yac_r_pfr_rec_last_norm,adot_pfr_rec_last_norm,brk_tkl_pfr_rec_last_norm,rec_br_pfr_rec_last_norm,drop_pfr_rec_last_norm,drop_percent_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,30,60,296.0,0.0,0.0,18.0,-4.798412,0,0.0,2.213902,...,,,,,,,,,,0.14425
1,15,33,232.0,0.0,0.0,13.0,9.82212,0,0.0,1.227592,...,,,,,,,,,,0.074393
2,3,6,26.0,0.0,0.0,2.0,-3.555738,0,0.0,0.15758,...,,,,,,,,,,0.003175
3,16,28,202.0,0.0,0.0,10.0,-6.491849,0,0.0,0.864763,...,,,,,,,,,,0.011794
4,3,5,23.0,0.0,0.0,2.0,0.845925,0,0.0,0.157258,...,,,,,,,,,,0.025176


In [21]:
# Keep only the scaled columns; they were appended in cols_to_norm order, label last.
norm_cols = [f"{col}_norm" for col in cols_to_norm]
df_ML = df_norm[norm_cols].copy()

In [22]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,receiving_2pt_conversions_last_norm,racr_last_norm,target_share_last_norm,...,yac_pfr_rec_last_norm,yac_r_pfr_rec_last_norm,adot_pfr_rec_last_norm,brk_tkl_pfr_rec_last_norm,rec_br_pfr_rec_last_norm,drop_pfr_rec_last_norm,drop_percent_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,0.201342,0.292683,0.152008,0.011845,0.003534,0.193548,0.233803,0.0,0.071005,0.348974,...,,,,,,,,,,0.14425
1,0.100671,0.160976,0.119471,0.011845,0.003534,0.139785,0.304575,0.0,0.071005,0.193503,...,,,,,,,,,,0.074393
2,0.020134,0.029268,0.014743,0.011845,0.003534,0.021505,0.239818,0.0,0.071005,0.024839,...,,,,,,,,,,0.003175
3,0.107383,0.136585,0.10422,0.011845,0.003534,0.107527,0.225605,0.0,0.071005,0.136311,...,,,,,,,,,,0.011794
4,0.020134,0.02439,0.013218,0.011845,0.003534,0.021505,0.261125,0.0,0.071005,0.024788,...,,,,,,,,,,0.025176


In [23]:
# Sanity check: list every fantasy-points column present in the modelling frame.
for col in (name for name in df_ML.columns if "fantasy" in name):
    print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm
fantasy_points_ppr_norm


# Create Train, Test, Validation Splits

In [24]:
# Select the label by name rather than by position, so a change in column order
# can never silently turn a feature into the target.
target_col = f"{label}_norm"
X = df_ML.drop(columns=[target_col])  # features
y = df_ML[target_col]                 # label

In [25]:
# Sanity check: the current-season label must not appear among the features.
for col in (name for name in X.columns if "fantasy" in name):
    print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm


In [26]:
y

0       0.144250
1       0.074393
2       0.003175
3       0.011794
4       0.025176
          ...   
3453    0.124972
3454    0.008165
3455    0.064867
3456    0.305285
3457    0.232933
Name: fantasy_points_ppr_norm, Length: 3458, dtype: float64

In [27]:
SPLIT_SEED = 42
TEST_FRACTION = 0.2               # share of all rows held out for testing
VAL_FRACTION_OF_REMAINDER = 0.25  # 0.25 x 0.8 = 0.2 of all rows

# Step 1: carve off the test set; what is left is train + validation.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Step 2: split the remainder into the actual train and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    random_state=SPLIT_SEED,
)

# AutoGluon

In [28]:
# AutoGluon expects features and label together in one frame per split.
train_data, val_data, test_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [29]:
# Name of the scaled label column (min_max_scaling appends "_norm" to the raw name).
label_column = f"{label}_norm"

## Fit

In [30]:
# Default settings (no preset, no time limit); fit() returns the predictor itself.
predictor = TabularPredictor(label=label_column)
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240510_002857"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240510_002857"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [31]:
# Score the fitted predictor on the held-out test split (still in normalized units).
# NOTE(review): the error metrics print with a negative sign; AutoGluon appears to
# report scores in "higher is better" form -- confirm against the evaluate() docs.
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.13188848629492417, 'mean_squared_error': -0.017394572817166398, 'mean_absolute_error': -0.10494354047461818, 'r2': 0.5327754245287739, 'pearsonr': 0.7308652334501999, 'median_absolute_error': -0.08760047707361113}


In [32]:
# Predict from the features only; the label column is removed first.
test_features = test_data.drop(columns=[label_column])
predictions = predictor.predict(test_features)
print(predictions.head())

1775    0.438851
51      0.390444
194     0.557609
2756    0.064342
2019    0.136600
Name: fantasy_points_ppr_norm, dtype: float32


In [33]:
test_data[label_column]

1775    0.299614
51      0.516444
194     0.737582
2756    0.089136
2019    0.067135
          ...   
2250    0.012701
611     0.115672
1506    0.181901
3173    0.202087
1569    0.025629
Name: fantasy_points_ppr_norm, Length: 692, dtype: float64

### Reversing the normalization

In [34]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    """Map min-max scaled values back to their original units.

    Inverse of ``(x - min_v) / (max_v - min_v)``: ``min_v`` and ``max_v`` must
    be the same bounds that were used when the data was scaled.
    """
    span = max_v - min_v
    rescaled = normalized_data * span
    return rescaled + min_v

In [35]:
# Undo the scaling with the label's own range. The values being converted back were
# normalized from `label` (current-season PPR points), so its min/max must be used,
# not those of the lagged feature 'fantasy_points_ppr_last'.
min_v = df_to_norm[label].min()
max_v = df_to_norm[label].max()

# Despite the name, this holds the true test-set labels converted back to
# original units; it is the ground truth the model predictions are compared with.
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [36]:
original_predictions

1775    129.733468
51      225.632692
194     323.437863
2756     36.643007
2019     26.912647
           ...    
2250      2.837528
611      48.379628
1506     77.671023
3173     86.598880
1569      8.555369
Name: fantasy_points_ppr_norm, Length: 692, dtype: float64

In [37]:
# Convert the model's normalized predictions back with the same min_v / max_v bounds
# that were applied to the true labels, so the two series are comparable.
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [38]:
model_predictions

1775    191.315186
51      169.905426
194     243.839401
2756     25.676994
2019     57.635258
           ...    
2250     45.952198
611      77.149368
1506     38.501255
3173     98.845734
1569     95.208130
Name: fantasy_points_ppr_norm, Length: 692, dtype: float32

In [39]:
# Absolute error per test row, paired by position (same pairing as zipping the two series).
abs_errors = np.abs(original_predictions.to_numpy() - model_predictions.to_numpy())

# Within 15 points counts as "close enough"; 30 or more counts as "far".
# Errors strictly between 15 and 30 fall into neither bucket.
close_enough = int((abs_errors <= 15).sum())
far = int((abs_errors >= 30).sum())

In [40]:
close_enough

129

In [41]:
len(model_predictions)

692

In [42]:
close_enough / len(model_predictions)

0.18641618497109827

In [43]:
far

412

## Further Information

In [44]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.131888,-0.14236,root_mean_squared_error,0.160904,0.102746,5.744626,0.001932,0.000267,0.008601,2,True,9
1,RandomForestMSE,-0.132385,-0.145908,root_mean_squared_error,0.063242,0.039137,2.013027,0.063242,0.039137,2.013027,1,True,3
2,ExtraTreesMSE,-0.132777,-0.145233,root_mean_squared_error,0.065569,0.040601,0.605943,0.065569,0.040601,0.605943,1,True,5
3,CatBoost,-0.132916,-0.144033,root_mean_squared_error,0.00359,0.001584,1.034756,0.00359,0.001584,1.034756,1,True,4
4,XGBoost,-0.133741,-0.147052,root_mean_squared_error,0.008959,0.002693,1.125784,0.008959,0.002693,1.125784,1,True,7
5,NeuralNetTorch,-0.134442,-0.148229,root_mean_squared_error,0.011426,0.009775,3.185748,0.011426,0.009775,3.185748,1,True,8
6,NeuralNetFastAI,-0.136631,-0.144924,root_mean_squared_error,0.017734,0.012085,1.906975,0.017734,0.012085,1.906975,1,True,6
7,KNeighborsDist,-0.145032,-0.15332,root_mean_squared_error,0.008837,0.009072,0.175323,0.008837,0.009072,0.175323,1,True,2
8,KNeighborsUnif,-0.147407,-0.154298,root_mean_squared_error,0.007164,0.064039,3.819023,0.007164,0.064039,3.819023,1,True,1


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [45]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['yac_sh_last_norm']
Computing feature importance via permutation shuffling for 44 features using 692 rows with 5 shuffle sets...
	40.49s	= Expected runtime (8.1s per shuffle set)
	3.7s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
yptmpa_last_norm,0.005389,0.000928,0.000102,5,0.0073,0.003477
age_last_norm,0.003788,0.000961,0.000458,5,0.005767,0.001808
tgt_sh_last_norm,0.003514,0.000762,0.000249,5,0.005082,0.001945
receiving_yards_last_norm,0.003205,0.000947,0.000818,5,0.005155,0.001255
rank_last_norm,0.00275,0.000607,0.000267,5,0.003999,0.001501
fantasy_points_ppr_last_norm,0.001842,0.000454,0.000411,5,0.002778,0.000906
receptions_last_norm,0.001804,0.000533,0.000817,5,0.002901,0.000706
games_last_norm,0.001801,0.00071,0.002384,5,0.003263,0.000339
fantasy_points_last_norm,0.001761,0.000415,0.000345,5,0.002616,0.000906
receiving_first_downs_last_norm,0.00175,0.000613,0.001544,5,0.003011,0.000488
