# Imports

In [1]:
# One-time setup: uncomment the next line to install AutoGluon into the kernel's environment.
# %pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
# Load the source data frame from a project-local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_1999_2023.pkl")

## DF Columns By Category

| Category                     | Columns                                  |
|------------------------------|------------------------------------------|
| **Passing Statistics**       | completions, attempts, passing_yards, passing_tds, interceptions, sacks, sack_yards, sack_fumbles, sack_fumbles_lost, passing_air_yards, passing_yards_after_catch, passing_first_downs, passing_epa, passing_2pt_conversions, pacr, dakota |
| **Rushing Statistics**       | carries, rushing_yards, rushing_tds, rushing_fumbles, rushing_fumbles_lost, rushing_first_downs, rushing_epa, rushing_2pt_conversions, ry_sh, rtd_sh, rfd_sh, rtdfd_sh |
| **Receiving Statistics**     | receptions, targets, receiving_yards, receiving_tds, receiving_fumbles, receiving_fumbles_lost, receiving_air_yards, receiving_yards_after_catch, receiving_first_downs, receiving_epa, receiving_2pt_conversions, racr, target_share, air_yards_share, wopr_x |
| **Fantasy and Special Teams**| special_teams_tds, fantasy_points, fantasy_points_ppr |
| **General Game Statistics**  | games                                    |
| **Player Information**       | player_id, season, season_type, position, player_name, age, team, rank, tier, Draft Year, Draft No., Draft Round, Draft Pick, Draft Overall, Draft Team, College |
| **Passing Efficiency Metrics**| avg_time_to_throw, avg_completed_air_yards, avg_intended_air_yards, avg_air_yards_differential, aggressiveness, max_completed_air_distance, avg_air_yards_to_sticks, pass_yards, pass_touchdowns, passer_rating, completion_percentage, expected_completion_percentage, completion_percentage_above_expectation, avg_air_distance, max_air_distance, efficiency, percent_attempts_gte_eight_defenders, avg_time_to_los |
| **Rushing Efficiency Metrics**| rush_attempts, rush_yards, expected_rush_yards, rush_yards_over_expected, avg_rush_yards, rush_yards_over_expected_per_att, rush_pct_over_expected, rush_touchdowns |
| **Receiving Efficiency Metrics**| avg_cushion, avg_separation, avg_intended_air_yards_receiving, percent_share_of_intended_air_yards, catch_percentage, yards, rec_touchdowns, avg_yac, avg_expected_yac, avg_yac_above_expectation |


# Filter By Position

In [4]:
# Keep only the wide-receiver rows; .copy() detaches the result from the full frame.
is_wide_receiver = df["position"] == "WR"
df_wr = df[is_wide_receiver].copy()
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5009


In [5]:
# Drop every row whose "Draft Year" is missing.
df_wr = df_wr.dropna(subset=["Draft Year"])
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4779


In [6]:
# Derive integer draft columns under underscore names; the original
# space-named text columns are left untouched.

# Every 's' character is stripped from the draft-year text before parsing.
draft_year_text = df_wr["Draft Year"].astype(str).str.replace("s", "")
df_wr["Draft_Year"] = draft_year_text.astype(int)

# The literal 'Undrafted' is encoded as round 8.
draft_round_text = df_wr["Draft Round"].astype(str).str.replace("Undrafted", "8")
df_wr["Draft_Round"] = draft_round_text.astype(int)

# The literal 'Undrafted' is encoded as overall pick 400.
draft_overall_text = df_wr["Draft Overall"].astype(str).str.replace("Undrafted", "400")
df_wr["Draft_Overall"] = draft_overall_text.astype(int)

# Get Lag Version

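Build a lagged data set in which each row pairs a player's previous-season statistics (the features) with that player's current-season outcome (the label, e.g. fantasy points or receiving first downs).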

In [7]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season",
                  cols_to_merge=None
    ):
    """Join each row of ``df`` with the matching row from the previous period.

    A renamed copy of ``df`` is built in which every column from position
    ``cols_to_filter`` onwards gets a ``_last`` suffix and
    ``col_to_increment`` is increased by one. That copy is inner-joined back
    onto ``df`` on ``cols_to_merge``, so a row for period N ends up with the
    period N-1 values in the ``*_last`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    cols_to_filter : int, default 3
        Number of leading columns that are left un-suffixed.
    col_to_increment : str, default "season"
        Column that is shifted forward by one in the lagged copy.
    cols_to_merge : list of str, optional
        Join keys. Defaults to ``["player_id", "season", "season_type"]``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df`` with its lagged copy; rows with no previous
        period are dropped by the inner join.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if cols_to_merge is None:
        cols_to_merge = ["player_id", "season", "season_type"]
    merge_keys = list(cols_to_merge)

    # Join keys are never suffixed, even when they sit beyond the first
    # ``cols_to_filter`` columns; renaming them would make the merge fail.
    rename_dict = {
        col: f"{col}_last"
        for col in df.columns[cols_to_filter:]
        if col not in merge_keys
    }

    # rename() returns a new frame, so the caller's data is left untouched.
    df_last = df.rename(columns=rename_dict)
    df_last[col_to_increment] = df_last[col_to_increment] + 1

    return df.merge(df_last, how='inner', on=merge_keys)

In [8]:
# create_lag_df never modifies its argument, so the frame is passed directly.
df_lag = create_lag_df(df_wr)

In [9]:
# Spot check: one player-season should show this season's and last season's yards side by side.
is_evans_2017 = (df_lag["player_name"] == "Mike Evans") & (df_lag["season"] == 2017)
df_lag.loc[is_evans_2017, ["player_name", "season", "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
2630,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

That is, the numeric columns whose names contain `_last` (the previous-season values).

In [10]:
# Names of every column that is not of a numeric dtype.
non_numeric_cols = df_lag.select_dtypes(exclude="number").columns

In [11]:
# Display the non-numeric columns; these cannot be used directly as numeric features.
non_numeric_cols

Index(['player_id', 'season_type', 'position', 'player_name', 'team',
       'Draft Year', 'Draft No.', 'Draft Round', 'Draft Pick', 'Draft Overall',
       'Draft Team', 'College', 'position_last', 'player_name_last',
       'team_last', 'Draft Year_last', 'Draft No._last', 'Draft Round_last',
       'Draft Pick_last', 'Draft Overall_last', 'Draft Team_last',
       'College_last'],
      dtype='object')

In [12]:
# The derived draft columns must still be 64-bit integers after the merge.
for draft_col in ("Draft_Year", "Draft_Round", "Draft_Overall"):
    assert df_lag[draft_col].dtype == 'int64'

In [13]:
# List every column whose name contains "_last" (the previous-season values).
df_lag.filter(like="_last").columns.tolist()

['completions_last',
 'attempts_last',
 'passing_yards_last',
 'passing_tds_last',
 'interceptions_last',
 'sacks_last',
 'sack_yards_last',
 'sack_fumbles_last',
 'sack_fumbles_lost_last',
 'passing_air_yards_last',
 'passing_yards_after_catch_last',
 'passing_first_downs_last',
 'passing_epa_last',
 'passing_2pt_conversions_last',
 'pacr_last',
 'dakota_last',
 'carries_last',
 'rushing_yards_last',
 'rushing_tds_last',
 'rushing_fumbles_last',
 'rushing_fumbles_lost_last',
 'rushing_first_downs_last',
 'rushing_epa_last',
 'rushing_2pt_conversions_last',
 'receptions_last',
 'targets_last',
 'receiving_yards_last',
 'receiving_tds_last',
 'receiving_fumbles_last',
 'receiving_fumbles_lost_last',
 'receiving_air_yards_last',
 'receiving_yards_after_catch_last',
 'receiving_first_downs_last',
 'receiving_epa_last',
 'receiving_2pt_conversions_last',
 'racr_last',
 'target_share_last',
 'air_yards_share_last',
 'wopr_x_last',
 'special_teams_tds_last',
 'fantasy_points_last',
 'fantasy

In [14]:
# Model inputs. Every entry is a "_last" column, i.e. a value taken from the
# player's previous-season row produced by create_lag_df.
feature_columns = [
    # Previous-season receiving statistics
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_tds_last',
    'receiving_fumbles_last',
    'receiving_fumbles_lost_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'wopr_x_last',
    # Previous-season fantasy scoring
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    # Previous-season games column
    'games_last',
    # Presumably share-of-team style metrics — TODO confirm definitions
    # (yptmpa: receiving yards per team pass attempt)
    'tgt_sh_last',
    'ay_sh_last',
    'yac_sh_last',
    'wopr_y_last',
    'ry_sh_last',
    'rtd_sh_last',
    'rfd_sh_last',
    'rtdfd_sh_last',
    'dom_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    # Player information, including the integer draft columns derived earlier
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Year_last',
    'Draft_Round_last',
    'Draft_Overall_last',
    # Receiving efficiency metrics
    'avg_cushion_last',
    'avg_separation_last',
    'avg_intended_air_yards_receiving_last',
    'percent_share_of_intended_air_yards_last',
    'catch_percentage_last',
#     'yards_last',  # same as receiving yards - duplicate column
    'rec_touchdowns_last',
    'avg_yac_last',
    'avg_expected_yac_last',
    'avg_yac_above_expectation_last'
]

In [15]:
# Target column: the current-season value (no "_last" suffix), predicted from last season's features.
label = "fantasy_points_ppr"

In [16]:
# Every feature column is normalised; work on a detached copy holding just those columns.
cols_to_norm = feature_columns
df_to_norm = df_lag.loc[:, cols_to_norm].copy()

In [17]:
# Carry the label alongside the features; it is not in cols_to_norm, so it stays unscaled.
df_to_norm[label] = df_lag[label]

# Normalize the Dataset

In [18]:
def min_max_scaling(df, cols_to_norm):
    """Append a min-max scaled ``<col>_norm`` column for each requested column.

    Each new column is ``(value - min) / (max - min)`` computed over the
    whole column, so non-missing values land in ``[0, 1]``. Missing values
    stay missing.

    A column whose non-missing values are all identical has ``max == min``;
    it is mapped to ``0.0`` rather than dividing zero by zero, which would
    turn the entire column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place; pass a copy to keep the
        original unchanged.
    cols_to_norm : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``*_norm`` columns added.
    """
    for col in cols_to_norm:
        min_v = df[col].min()
        max_v = df[col].max()
        value_range = max_v - min_v

        shifted = df[col] - min_v
        if value_range == 0:
            # Constant column: keep the zeros (and any NaN) instead of 0/0.
            df[f"{col}_norm"] = shifted.astype(float)
        else:
            # For an all-NaN column value_range is NaN and the result stays NaN.
            df[f"{col}_norm"] = shifted / value_range

    return df

In [19]:
# Scale on a copy so df_to_norm keeps only the raw columns.
# NOTE(review): min/max are computed over the full data set, before the train/test split
# further down — confirm that this leakage into the test rows is acceptable.
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [20]:
# Preview: raw feature columns first, then the label, then the appended *_norm columns.
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_tds_last,receiving_fumbles_last,receiving_fumbles_lost_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,...,Draft_Overall_last_norm,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm
0,30,60,296.0,1,1.0,0.0,0.0,0.0,18.0,-4.798412,...,0.04,,,,,,,,,
1,15,33,232.0,4,0.0,0.0,0.0,0.0,13.0,9.82212,...,0.04,,,,,,,,,
2,3,6,26.0,0,0.0,0.0,0.0,0.0,2.0,-3.555738,...,0.205,,,,,,,,,
3,16,28,202.0,0,1.0,1.0,0.0,0.0,10.0,-6.491849,...,0.275,,,,,,,,,
4,3,5,23.0,0,0.0,0.0,0.0,0.0,2.0,0.845925,...,0.275,,,,,,,,,


In [21]:
# df_norm already carries the label (it was copied from df_to_norm); this re-assignment keeps it explicit.
df_norm[label] = df_to_norm[label]

In [22]:
# Preview again after re-assigning the label column.
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_tds_last,receiving_fumbles_last,receiving_fumbles_lost_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,...,Draft_Overall_last_norm,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm
0,30,60,296.0,1,1.0,0.0,0.0,0.0,18.0,-4.798412,...,0.04,,,,,,,,,
1,15,33,232.0,4,0.0,0.0,0.0,0.0,13.0,9.82212,...,0.04,,,,,,,,,
2,3,6,26.0,0,0.0,0.0,0.0,0.0,2.0,-3.555738,...,0.205,,,,,,,,,
3,16,28,202.0,0,1.0,1.0,0.0,0.0,10.0,-6.491849,...,0.275,,,,,,,,,
4,3,5,23.0,0,0.0,0.0,0.0,0.0,2.0,0.845925,...,0.275,,,,,,,,,


In [26]:
# Skip the first len(cols_to_norm) columns (the raw features); what remains is the label plus the *_norm columns.
# This relies on the column order of df_norm.
df_norm.columns[len(cols_to_norm):]

Index(['fantasy_points_ppr', 'receptions_last_norm', 'targets_last_norm',
       'receiving_yards_last_norm', 'receiving_tds_last_norm',
       'receiving_fumbles_last_norm', 'receiving_fumbles_lost_last_norm',
       'receiving_air_yards_last_norm',
       'receiving_yards_after_catch_last_norm',
       'receiving_first_downs_last_norm', 'receiving_epa_last_norm',
       'receiving_2pt_conversions_last_norm', 'racr_last_norm',
       'target_share_last_norm', 'air_yards_share_last_norm',
       'wopr_x_last_norm', 'fantasy_points_last_norm',
       'fantasy_points_ppr_last_norm', 'games_last_norm', 'tgt_sh_last_norm',
       'ay_sh_last_norm', 'yac_sh_last_norm', 'wopr_y_last_norm',
       'ry_sh_last_norm', 'rtd_sh_last_norm', 'rfd_sh_last_norm',
       'rtdfd_sh_last_norm', 'dom_last_norm', 'w8dom_last_norm',
       'yptmpa_last_norm', 'ppr_sh_last_norm', 'age_last_norm',
       'rank_last_norm', 'tier_last_norm', 'Draft_Year_last_norm',
       'Draft_Round_last_norm', 'Draft_Overal

In [29]:
# Build the modelling table by column name rather than by position, so it no longer
# depends on the column order of df_norm: label first, then the normalised features.
model_columns = [label] + [f"{col}_norm" for col in cols_to_norm]
df_ML = df_norm[model_columns].copy()

In [30]:
# Preview the modelling table: the label followed by the normalised feature columns.
df_ML.head()

Unnamed: 0,fantasy_points_ppr,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_tds_last_norm,receiving_fumbles_last_norm,receiving_fumbles_lost_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,...,Draft_Overall_last_norm,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm
0,62.2,0.201342,0.292683,0.152008,0.043478,0.2,0.0,0.011845,0.003534,0.193548,...,0.04,,,,,,,,,
1,31.4,0.100671,0.160976,0.119471,0.173913,0.0,0.0,0.011845,0.003534,0.139785,...,0.04,,,,,,,,,
2,0.0,0.020134,0.029268,0.014743,0.0,0.0,0.0,0.011845,0.003534,0.021505,...,0.205,,,,,,,,,
3,3.8,0.107383,0.136585,0.10422,0.0,0.2,0.25,0.011845,0.003534,0.107527,...,0.275,,,,,,,,,
4,9.7,0.020134,0.02439,0.013218,0.0,0.0,0.0,0.011845,0.003534,0.021505,...,0.275,,,,,,,,,


# Create Train, Test, Validation Splits

In [39]:
# Split by column name so the result does not depend on the label being the first column.
X = df_ML.drop(columns=[label])  # features
y = df_ML[label]                 # label

In [41]:
# Two-stage random split into 60% train, 20% validation and 20% test.
SPLIT_SEED = 42
TEST_FRACTION = 0.2
VAL_FRACTION_OF_REMAINDER = 0.25  # 0.25 x 0.8 = 0.2 of the full data set

# Stage 1: hold out the test set; the remainder is train + validation.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

# Stage 2: carve the validation set out of the remainder.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp,
    test_size=VAL_FRACTION_OF_REMAINDER, random_state=SPLIT_SEED)

# AutoGluon

In [42]:
# AutoGluon takes features and label in a single frame, so glue each pair back together.
train_data, val_data, test_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [43]:
# Same target as `label` defined earlier ("fantasy_points_ppr"); reuse it instead of repeating the literal.
label_column = label

## Fit

In [44]:
# Train AutoGluon on the training frame, using the held-out validation frame for tuning.
# No presets or save path are passed, so AutoGluon's defaults apply (see the log below).
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240505_203038"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240505_203038"
AutoGluon Version:  1.1.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2
CPU Count:          24
Memory Avail:    

## Evaluation

In [45]:
# Score the fitted predictor on the untouched test set.
# Error metrics are printed with a negative sign (AutoGluon reports scores so that higher is better).
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -61.87817987644226, 'mean_squared_error': -3828.9091448213435, 'mean_absolute_error': -48.85640369710435, 'r2': 0.5001792643761886, 'pearsonr': 0.7105142383982317, 'median_absolute_error': -40.79430084228515}


In [46]:
# Predict from the feature columns only, then peek at the first few predictions.
test_features = test_data.drop(columns=[label_column])
predictions = predictor.predict(test_features)
print(predictions.head())

790     106.371712
1735     57.866531
2266    130.331406
1052    182.371643
443     175.218491
Name: fantasy_points_ppr, dtype: float32


In [47]:
# Actual label values for the test rows, for comparison with the predictions above.
test_data[label_column]

790      49.3
1735     86.7
2266    225.1
1052    115.3
443      61.9
        ...  
2405     11.8
1253    241.7
2164     57.0
572      26.2
1551    278.6
Name: fantasy_points_ppr, Length: 685, dtype: float64

In [51]:
# Count predictions within 15 points of the actual value ("close enough")
# and predictions off by 30 points or more ("far"); pairs are matched by position.
abs_errors = np.abs(test_data[label_column].to_numpy() - predictions.to_numpy())
close_enough = int((abs_errors <= 15).sum())
far = int((abs_errors >= 30).sum())

In [52]:
# Number of test predictions within 15 points of the actual value.
close_enough

122

In [54]:
# Total number of test predictions.
len(predictions)

685

In [55]:
# Fraction of test predictions within 15 points of the actual value.
close_enough / len(predictions)

0.1781021897810219

In [56]:
# Number of test predictions that miss the actual value by 30 points or more.
far

423

## Further Information

In [57]:
# Per-model comparison: test score next to validation score, plus fit and prediction times.
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-61.87818,-60.037331,root_mean_squared_error,0.082409,0.054358,5.103268,0.001445,0.000225,0.007993,2,True,12
1,ExtraTreesMSE,-61.880947,-60.507898,root_mean_squared_error,0.051904,0.036042,0.33191,0.051904,0.036042,0.33191,1,True,7
2,RandomForestMSE,-62.711948,-61.08062,root_mean_squared_error,0.052823,0.035196,0.57799,0.052823,0.035196,0.57799,1,True,5
3,LightGBM,-62.773786,-61.705229,root_mean_squared_error,0.000967,0.001152,0.453728,0.000967,0.001152,0.453728,1,True,4
4,LightGBMXT,-62.860233,-60.434209,root_mean_squared_error,0.005604,0.001602,0.688485,0.005604,0.001602,0.688485,1,True,3
5,CatBoost,-62.991678,-61.004419,root_mean_squared_error,0.003097,0.001525,0.533871,0.003097,0.001525,0.533871,1,True,6
6,NeuralNetTorch,-63.312083,-62.070689,root_mean_squared_error,0.007329,0.006227,1.482296,0.007329,0.006227,1.482296,1,True,10
7,LightGBMLarge,-63.651824,-63.239259,root_mean_squared_error,0.007099,0.001341,1.382063,0.007099,0.001341,1.382063,1,True,11
8,XGBoost,-63.774146,-62.789007,root_mean_squared_error,0.009267,0.002628,0.523796,0.009267,0.002628,0.523796,1,True,9
9,NeuralNetFastAI,-63.792526,-62.01552,root_mean_squared_error,0.016127,0.010262,2.592584,0.016127,0.010262,2.592584,1,True,8


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [58]:
# Permutation-based feature importance, computed on the test set.
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['yac_sh_last_norm']
Computing feature importance via permutation shuffling for 44 features using 685 rows with 5 shuffle sets...
	19.69s	= Expected runtime (3.94s per shuffle set)
	1.77s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
yptmpa_last_norm,1.919353,0.404464,0.000223,5,2.752151,1.086555
age_last_norm,1.006704,0.155933,6.7e-05,5,1.327773,0.685635
fantasy_points_last_norm,0.930321,0.247861,0.000551,5,1.440669,0.419973
tier_last_norm,0.807577,0.228816,0.000697,5,1.278713,0.336441
fantasy_points_ppr_last_norm,0.769165,0.16619,0.000246,5,1.111352,0.426978
rank_last_norm,0.755206,0.161851,0.000238,5,1.088458,0.421953
receiving_yards_last_norm,0.728733,0.201825,0.000639,5,1.144293,0.313173
receptions_last_norm,0.635826,0.144352,0.000298,5,0.933048,0.338605
receiving_epa_last_norm,0.548011,0.178077,0.001169,5,0.914674,0.181347
ppr_sh_last_norm,0.509916,0.197407,0.002231,5,0.916381,0.103452
