# Imports

In [1]:
# pip install autogluon

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [2]:
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

# Filter By Position

In [3]:
df_wr = df.copy().query("position == 'WR'")
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5039


In [4]:
# remove our na's
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4809


In [5]:
df_wr['Draft_Year'] = df_wr['Draft Year'].astype(str).str.replace('s', '')
df_wr["Draft_Year"] = df_wr["Draft_Year"].astype(int)

df_wr["Draft_Round"] = df_wr["Draft Round"].astype(str).str.replace('Undrafted', '8')
df_wr["Draft_Round"] = df_wr["Draft_Round"].astype(int)

df_wr["Draft_Overall"] = df_wr["Draft Overall"].astype(str).str.replace('Undrafted', '400')
df_wr["Draft_Overall"] = df_wr["Draft_Overall"].astype(int)

# WR Relveant Columns

In [6]:
wr_cols = [
    'player_id', 'season',         
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'fantasy_points', 'fantasy_points_ppr',
    'games', 'tgt_sh', 'ay_sh', 'yac_sh', 'wopr_y', 'ry_sh', 'rtd_sh',
    'rfd_sh', 'rtdfd_sh', 'dom', 'w8dom', 'yptmpa', 'ppr_sh', 
    'position', 'player_name', 'age', 'team', 'rank', 'tier', 
    'Draft_Year', 'Draft_Round', 'Draft_Overall',
    'Draft Team', 'College', 'avg_cushion', 'avg_separation',
    'avg_intended_air_yards_receiving', 'percent_share_of_intended_air_yards', 
    'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac',
    'avg_yac_above_expectation', 'pfr_player_id', 'offense_snaps', 'offense_pct', 
    'gs_pfr_rec', 'tgt_pfr_rec', 'rec_pfr_rec', 'yds_pfr_rec', 'td_pfr_rec',
    'x1d_pfr_rec', 'ybc_pfr_rec', 'ybc_r_pfr_rec', 'yac_pfr_rec',
    'yac_r_pfr_rec', 'adot_pfr_rec', 'brk_tkl_pfr_rec', 'rec_br_pfr_rec',
    'drop_pfr_rec', 'drop_percent_pfr_rec', 'int_pfr_rec', 'rat_pfr_rec'
]

In [7]:
df_wr = df_wr[wr_cols]

In [8]:
df_wr = df_wr.query("Draft_Round <= 3")

# Get Lag Version

For having the previous season as features and fantasy points/receiving first downs be the labels.

In [9]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season", 
                  cols_to_merge=["player_id", "season"]
    ):
    """"""
    df_now = df.copy()
    df_last = df.copy()

    rename_dict = {}
    for col in list(df_last.columns[cols_to_filter:]):
        rename_dict[col] = f"{col}_last"

    df_last.rename(columns=rename_dict, inplace=True)
    df_last[col_to_increment] += 1

    df_lag = df_now.merge(df_last, how='inner', on=cols_to_merge)

    return df_lag

In [10]:
df_lag = create_lag_df(df_wr.copy(), cols_to_filter=2)

In [11]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
1468,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

I.e. the columns with last in them, and of numeric type.

In [12]:
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [13]:
non_numeric_cols

Index(['player_id', 'position', 'player_name', 'team', 'Draft Team', 'College',
       'pfr_player_id', 'position_last', 'player_name_last', 'team_last',
       'Draft Team_last', 'College_last', 'pfr_player_id_last'],
      dtype='object')

In [14]:
assert(df_lag["Draft_Year"].dtype == 'int64')
assert(df_lag["Draft_Round"].dtype == 'int64')
assert(df_lag["Draft_Overall"].dtype == 'int64')

In [15]:
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'yac_sh_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Round_last',

    'catch_percentage_last',
    'offense_snaps_last',
    'offense_pct_last',
    'tgt_pfr_rec_last',
    'rec_pfr_rec_last',
    'td_pfr_rec_last',
    'x1d_pfr_rec_last',
    'int_pfr_rec_last',
    'rat_pfr_rec_last'
]

In [16]:
label = "fantasy_points_ppr"

In [17]:
cols_to_norm = feature_columns + [label]
df_to_norm = df_lag[cols_to_norm].copy()

# Normalize the Dataset

In [18]:
def min_max_scaling(df, cols_to_norm):
    for col in cols_to_norm:
        max_v = df[col].max()
        min_v = df[col].min()
        
        df[f"{col}_norm"] = (df[col] - min_v) / (max_v - min_v)
        
    return df

In [19]:
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [20]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,receiving_2pt_conversions_last,racr_last,target_share_last,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,30,60,296.0,0.0,0.0,18.0,-4.798412,0,0.0,2.213902,...,,,,,,,,,,0.141524
1,15,33,232.0,0.0,0.0,13.0,9.82212,0,0.0,1.227592,...,,,,,,,,,,0.071445
2,3,6,26.0,0.0,0.0,2.0,-3.555738,0,0.0,0.15758,...,,,,,,,,,,0.0
3,2,7,19.0,0.0,0.0,1.0,-1.031869,0,0.0,0.163173,...,,,,,,,,,,0.018203
4,4,7,42.0,0.0,0.0,2.0,-0.727651,0,0.0,0.230948,...,,,,,,,,,,0.070535


In [21]:
df_ML = df_norm[df_norm.columns[len(cols_to_norm):]].copy()

In [22]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,receiving_2pt_conversions_last_norm,racr_last_norm,target_share_last_norm,...,catch_percentage_last_norm,offense_snaps_last_norm,offense_pct_last_norm,tgt_pfr_rec_last_norm,rec_pfr_rec_last_norm,td_pfr_rec_last_norm,x1d_pfr_rec_last_norm,int_pfr_rec_last_norm,rat_pfr_rec_last_norm,fantasy_points_ppr_norm
0,0.201342,0.292683,0.150713,0.007928,0.001181,0.193548,0.233803,0.0,0.071005,0.366797,...,,,,,,,,,,0.141524
1,0.100671,0.160976,0.118126,0.007928,0.001181,0.139785,0.304575,0.0,0.071005,0.203386,...,,,,,,,,,,0.071445
2,0.020134,0.029268,0.013238,0.007928,0.001181,0.021505,0.239818,0.0,0.071005,0.026108,...,,,,,,,,,,0.0
3,0.013423,0.034146,0.009674,0.007928,0.001181,0.010753,0.252035,0.0,0.071005,0.027034,...,,,,,,,,,,0.018203
4,0.026846,0.034146,0.021385,0.007928,0.001181,0.021505,0.253508,0.0,0.071005,0.038263,...,,,,,,,,,,0.070535


In [23]:
for col in df_ML.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm
fantasy_points_ppr_norm


# Create Train, Test, Validation Splits

In [24]:
X = df_ML.iloc[:, :-1]  # features
y = df_ML.iloc[:, -1]   # label

In [25]:
for col in X.columns:
    if "fantasy" in col:
        print(col)

fantasy_points_last_norm
fantasy_points_ppr_last_norm


In [26]:
y

0       0.141524
1       0.071445
2       0.000000
3       0.018203
4       0.070535
          ...   
1814    0.025256
1815    0.122184
1816    0.061889
1817    0.303072
1818    0.230489
Name: fantasy_points_ppr_norm, Length: 1819, dtype: float64

In [27]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# AutoGluon

In [28]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [29]:
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [30]:
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240510_003817"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240510_003817"
AutoGluon Version:  1.1.0
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.2.0: Wed Nov 15 21:53:34 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T8103
CPU 

## Evaluation

In [31]:
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.1486545606995378, 'mean_squared_error': -0.022098178416772567, 'mean_absolute_error': -0.11844035294060756, 'r2': 0.41657989068678036, 'pearsonr': 0.6477445896578498, 'median_absolute_error': -0.09837172332477245}


In [32]:
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

1509    0.328195
990     0.143766
135     0.100611
408     0.450102
591     0.086604
Name: fantasy_points_ppr_norm, dtype: float32


In [33]:
test_data[label_column]

1509    0.213879
990     0.140159
135     0.021388
408     0.250967
591     0.201365
          ...   
534     0.087600
711     0.554721
432     0.156314
1684    0.448237
170     0.500796
Name: fantasy_points_ppr_norm, Length: 364, dtype: float64

### Reversing the normalization

In [34]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    return normalized_data * (max_v - min_v) + min_v

In [35]:
min_v = df_to_norm['fantasy_points_ppr_last'].min()
max_v = df_to_norm['fantasy_points_ppr_last'].max()
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [36]:
original_predictions

1509     93.371104
990      60.912127
135       8.617110
408     109.700774
591      87.861092
           ...    
534      37.770080
711     243.443777
432      68.025051
1684    196.558589
170     219.700637
Name: fantasy_points_ppr_norm, Length: 364, dtype: float64

In [37]:
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [38]:
model_predictions

1509    143.704224
990      62.500305
135      43.498966
408     197.379761
591      37.331825
           ...    
534      75.649452
711     189.448242
432     113.859993
1684    203.998947
170     136.181412
Name: fantasy_points_ppr_norm, Length: 364, dtype: float32

In [39]:
close_enough = 0
far = 0
for guess, answer in zip(model_predictions, original_predictions):
    diff = abs(answer - guess)
    if diff <= 15:
        close_enough += 1
    elif diff >= 30:
        far += 1

In [40]:
close_enough

69

In [41]:
len(model_predictions)

364

In [42]:
close_enough / len(model_predictions)

0.18956043956043955

In [43]:
far

237

## Further Information

In [44]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.148655,-0.147682,root_mean_squared_error,0.080431,0.043299,3.549593,0.001399,0.000261,0.008193,2,True,9
1,CatBoost,-0.150305,-0.152583,root_mean_squared_error,0.003108,0.001349,1.136072,0.003108,0.001349,1.136072,1,True,4
2,ExtraTreesMSE,-0.150637,-0.150384,root_mean_squared_error,0.05915,0.026447,0.380475,0.05915,0.026447,0.380475,1,True,5
3,RandomForestMSE,-0.152016,-0.151855,root_mean_squared_error,0.107135,0.042018,0.870236,0.107135,0.042018,0.870236,1,True,3
4,NeuralNetFastAI,-0.153686,-0.154315,root_mean_squared_error,0.01211,0.010392,1.154853,0.01211,0.010392,1.154853,1,True,6
5,NeuralNetTorch,-0.154488,-0.151729,root_mean_squared_error,0.007772,0.006199,2.006072,0.007772,0.006199,2.006072,1,True,8
6,XGBoost,-0.156958,-0.156157,root_mean_squared_error,0.008173,0.002191,0.801534,0.008173,0.002191,0.801534,1,True,7
7,KNeighborsDist,-0.157776,-0.16447,root_mean_squared_error,0.007705,0.005509,0.171307,0.007705,0.005509,0.171307,1,True,2
8,KNeighborsUnif,-0.158331,-0.166151,root_mean_squared_error,0.010094,0.057818,3.671206,0.010094,0.057818,3.671206,1,True,1


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [45]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['yac_sh_last_norm']
Computing feature importance via permutation shuffling for 31 features using 364 rows with 5 shuffle sets...
	13.7s	= Expected runtime (2.74s per shuffle set)
	1.19s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
age_last_norm,0.004924,0.000464,9e-06,5,0.005878,0.003969
yptmpa_last_norm,0.003252,0.001626,0.00553,5,0.006599,-9.6e-05
ppr_sh_last_norm,0.001738,0.000627,0.001726,5,0.00303,0.000446
w8dom_last_norm,0.00115,0.000333,0.000754,5,0.001835,0.000465
games_last_norm,0.000837,0.000613,0.018948,5,0.0021,-0.000425
Draft_Round_last_norm,0.000746,0.000469,0.011809,5,0.00171,-0.000219
int_pfr_rec_last_norm,0.000683,0.000163,0.000362,5,0.001019,0.000347
offense_snaps_last_norm,0.000486,0.000203,0.002963,5,0.000905,6.7e-05
tgt_sh_last_norm,0.000428,0.000569,0.083885,5,0.001598,-0.000743
receiving_yards_last_norm,0.000388,0.001114,0.239775,5,0.002681,-0.001905
