# Imports

In [1]:
# pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
# Load the source DataFrame from a local pickle file (the path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source.
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_1999_2023.pkl")

## DF Columns By Category

| Category                     | Columns                                  |
|------------------------------|------------------------------------------|
| **Passing Statistics**       | completions, attempts, passing_yards, passing_tds, interceptions, sacks, sack_yards, sack_fumbles, sack_fumbles_lost, passing_air_yards, passing_yards_after_catch, passing_first_downs, passing_epa, passing_2pt_conversions, pacr, dakota |
| **Rushing Statistics**       | carries, rushing_yards, rushing_tds, rushing_fumbles, rushing_fumbles_lost, rushing_first_downs, rushing_epa, rushing_2pt_conversions, ry_sh, rtd_sh, rfd_sh, rtdfd_sh |
| **Receiving Statistics**     | receptions, targets, receiving_yards, receiving_tds, receiving_fumbles, receiving_fumbles_lost, receiving_air_yards, receiving_yards_after_catch, receiving_first_downs, receiving_epa, receiving_2pt_conversions, racr, target_share, air_yards_share, wopr_x |
| **Fantasy and Special Teams**| special_teams_tds, fantasy_points, fantasy_points_ppr |
| **General Game Statistics**  | games                                    |
| **Player Information**       | player_id, season, season_type, position, player_name, age, team, rank, tier, Draft Year, Draft No., Draft Round, Draft Pick, Draft Overall, Draft Team, College |
| **Passing Efficiency Metrics**| avg_time_to_throw, avg_completed_air_yards, avg_intended_air_yards, avg_air_yards_differential, aggressiveness, max_completed_air_distance, avg_air_yards_to_sticks, pass_yards, pass_touchdowns, passer_rating, completion_percentage, expected_completion_percentage, completion_percentage_above_expectation, avg_air_distance, max_air_distance, efficiency, percent_attempts_gte_eight_defenders, avg_time_to_los |
| **Rushing Efficiency Metrics**| rush_attempts, rush_yards, expected_rush_yards, rush_yards_over_expected, avg_rush_yards, rush_yards_over_expected_per_att, rush_pct_over_expected, rush_touchdowns |
| **Receiving Efficiency Metrics**| avg_cushion, avg_separation, avg_intended_air_yards_receiving, percent_share_of_intended_air_yards, catch_percentage, yards, rec_touchdowns, avg_yac, avg_expected_yac, avg_yac_above_expectation |


# Filter By Position

In [4]:
# Keep only wide-receiver rows with at least 10 games; the trailing copy makes
# df_wr independent of the full frame.
wr_filter = "position == 'WR' and games >= 10"
df_wr = df.query(wr_filter).copy()
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 2720


In [5]:
# Drop the rows that have no draft year recorded.
# Rebind the result instead of using `inplace=True` so no frame is mutated in place
# across cells, and pass `subset` as a list, which every pandas version accepts
# (older releases required a list-like here).
df_wr = df_wr.dropna(subset=["Draft Year"])
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 2651


In [6]:
# Build integer versions of the raw draft columns.
# Each spec is (new column, raw column, text to replace, replacement text):
# an 's' in the draft year is stripped, and 'Undrafted' becomes round 8 / overall pick 400.
draft_column_specs = [
    ("Draft_Year", "Draft Year", 's', ''),
    ("Draft_Round", "Draft Round", 'Undrafted', '8'),
    ("Draft_Overall", "Draft Overall", 'Undrafted', '400'),
]

for clean_col, raw_col, old_text, new_text in draft_column_specs:
    as_text = df_wr[raw_col].astype(str).str.replace(old_text, new_text)
    df_wr[clean_col] = as_text.astype(int)

# Get Lag Version

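Build a lagged data set: each row pairs a player's season with that same player's previous season, so the previous season's stats (the `_last` columns) serve as features and the current season's fantasy points or receiving first downs serve as labels.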

In [7]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season",
                  cols_to_merge=("player_id", "season", "season_type")):
    """Join every row with the matching row from the previous period.

    A shifted copy of ``df`` is built in which ``col_to_increment`` is increased
    by one and the value columns are renamed to ``<col>_last``. Inner-merging it
    back onto ``df`` on ``cols_to_merge`` yields one row per key that has data
    for both the current and the previous period. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``col_to_increment`` and every merge key.
    cols_to_filter : int, default 3
        Number of leading columns that are never renamed.
    col_to_increment : str, default "season"
        Column that is increased by one in the shifted copy.
    cols_to_merge : str or sequence of str
        Key column(s) used for the inner merge.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by the ``*_last`` columns.

    Notes
    -----
    The merge keys and ``col_to_increment`` are never renamed, wherever they sit
    in the column order. Previously a key located at or after position
    ``cols_to_filter`` was renamed to ``<key>_last`` and the function failed with
    a ``KeyError``.
    """
    # Accept a single key name as well as a sequence; a fresh list also keeps the
    # default argument immutable.
    merge_keys = [cols_to_merge] if isinstance(cols_to_merge, str) else list(cols_to_merge)
    protected = set(merge_keys) | {col_to_increment}

    rename_dict = {
        col: f"{col}_last"
        for col in df.columns[cols_to_filter:]
        if col not in protected
    }

    # `rename` returns a new frame, so the assignment below never touches `df`.
    df_last = df.rename(columns=rename_dict)
    # Shift forward so that period N of df_last lines up with period N + 1 of df.
    df_last[col_to_increment] = df_last[col_to_increment] + 1

    return df.merge(df_last, how='inner', on=merge_keys)

In [8]:
# Pair each WR season with the same player's previous season; the previous-season
# values arrive in columns suffixed with `_last`.
# NOTE: the explicit .copy() is purely defensive — create_lag_df already works on
# copies of its input.
df_lag = create_lag_df(df_wr.copy())

In [9]:
# Spot-check the lag join on a single player-season: for the 2017 row,
# `receiving_yards_last` should be the value from the 2016 season.
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
1372,Mike Evans,2017,1001.0,1321.0


# Select Feature Columns

That is, the numeric columns whose names end in `_last` (the previous-season values).

In [10]:
# Names of all non-numeric columns in the lagged frame (displayed in the next cell
# to see which columns cannot be used directly as numeric features).
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [11]:
non_numeric_cols

Index(['player_id', 'season_type', 'position', 'player_name', 'team',
       'Draft Year', 'Draft No.', 'Draft Round', 'Draft Pick', 'Draft Overall',
       'Draft Team', 'College', 'position_last', 'player_name_last',
       'team_last', 'Draft Year_last', 'Draft No._last', 'Draft Round_last',
       'Draft Pick_last', 'Draft Overall_last', 'Draft Team_last',
       'College_last'],
      dtype='object')

In [12]:
# The cleaned draft columns must still be 64-bit integers after the lag merge.
for draft_col in ("Draft_Year", "Draft_Round", "Draft_Overall"):
    assert df_lag[draft_col].dtype == 'int64'

In [13]:
# List every lagged column; note this is a substring match on '_last', not a suffix match.
[col for col in df_lag.columns if '_last' in col]

['completions_last',
 'attempts_last',
 'passing_yards_last',
 'passing_tds_last',
 'interceptions_last',
 'sacks_last',
 'sack_yards_last',
 'sack_fumbles_last',
 'sack_fumbles_lost_last',
 'passing_air_yards_last',
 'passing_yards_after_catch_last',
 'passing_first_downs_last',
 'passing_epa_last',
 'passing_2pt_conversions_last',
 'pacr_last',
 'dakota_last',
 'carries_last',
 'rushing_yards_last',
 'rushing_tds_last',
 'rushing_fumbles_last',
 'rushing_fumbles_lost_last',
 'rushing_first_downs_last',
 'rushing_epa_last',
 'rushing_2pt_conversions_last',
 'receptions_last',
 'targets_last',
 'receiving_yards_last',
 'receiving_tds_last',
 'receiving_fumbles_last',
 'receiving_fumbles_lost_last',
 'receiving_air_yards_last',
 'receiving_yards_after_catch_last',
 'receiving_first_downs_last',
 'receiving_epa_last',
 'receiving_2pt_conversions_last',
 'racr_last',
 'target_share_last',
 'air_yards_share_last',
 'wopr_x_last',
 'special_teams_tds_last',
 'fantasy_points_last',
 'fantasy

In [14]:
# Model inputs: previous-season (`_last`) columns only, so every feature is known
# before the season being predicted. All of them are min-max normalized below.
feature_columns = [
    # previous-season receiving statistics
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_tds_last',
    'receiving_fumbles_last',
    'receiving_fumbles_lost_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'wopr_x_last',
    # previous-season fantasy output and games played
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    # share-style metrics (yptmpa = receiving yards per team pass attempt)
    'tgt_sh_last',
    'ay_sh_last',
    'yac_sh_last',
    'wopr_y_last',
    'ry_sh_last',
    'rtd_sh_last',
    'rfd_sh_last',
    'rtdfd_sh_last',
    'dom_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    # player information
    'age_last',
    'rank_last',
    'tier_last',
    # cleaned integer draft columns
    'Draft_Year_last',
    'Draft_Round_last',
    'Draft_Overall_last',
    # receiving efficiency metrics
    'avg_cushion_last',
    'avg_separation_last',
    'avg_intended_air_yards_receiving_last',
    'percent_share_of_intended_air_yards_last',
    'catch_percentage_last',
#     'yards_last',  # same as receiving yards - duplicate column
    'rec_touchdowns_last',
    'avg_yac_last',
    'avg_expected_yac_last',
    'avg_yac_above_expectation_last'
]

In [15]:
# Prediction target: the current-season value of this column (the features all come
# from the previous season's `_last` columns).
label = "fantasy_points_ppr"

In [16]:
# Everything that gets normalized: the features followed by the target, taken as an
# independent copy of just those columns.
cols_to_norm = [*feature_columns, label]
df_to_norm = df_lag.loc[:, cols_to_norm].copy()

# Normalize the Dataset

In [17]:
def min_max_scaling(df, cols_to_norm):
    """Append a min-max normalized ``<col>_norm`` column for each requested column.

    Each value is mapped to ``(value - min) / (max - min)`` using the column's own
    minimum and maximum, so the result lies in ``[0, 1]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize. It is modified in place (new columns are added), so
        pass a copy if the original must stay untouched.
    cols_to_norm : iterable of str
        Names of the numeric columns to normalize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one extra ``<col>_norm`` column per input column.

    Notes
    -----
    A constant column has ``max == min``. Dividing by that zero range would turn
    every value into NaN, so the range is treated as 1 and the normalized column
    is all zeros. Missing values stay missing.
    """
    for col in cols_to_norm:
        min_v = df[col].min()
        value_range = df[col].max() - min_v

        # Constant column: avoid 0/0. (A NaN range, e.g. an all-NaN column, is left alone.)
        if value_range == 0:
            value_range = 1

        df[f"{col}_norm"] = (df[col] - min_v) / value_range

    return df

In [18]:
# Add a `<col>_norm` column for every feature and for the label.
# NOTE(review): the min/max used for scaling are computed over the full data set,
# i.e. before the train/validation/test split further down, so the scaling
# statistics include the held-out rows. Consider computing them on the training
# split only.
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [19]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_tds_last,receiving_fumbles_last,receiving_fumbles_lost_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,...,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm,fantasy_points_ppr_norm
0,30,60,296.0,1,1.0,0.0,0.0,0.0,18.0,-4.798412,...,,,,,,,,,,0.113278
1,15,33,232.0,4,0.0,0.0,0.0,0.0,13.0,9.82212,...,,,,,,,,,,0.040893
2,45,72,464.0,2,2.0,1.0,0.0,0.0,27.0,2.518711,...,,,,,,,,,,0.576404
3,100,157,1071.0,8,2.0,1.0,0.0,0.0,56.0,32.32483,...,,,,,,,,,,0.572644
4,97,167,1189.0,6,0.0,0.0,0.0,14.0,54.0,17.804618,...,,,,,,,,,,0.312103


In [20]:
# Keep only the `_norm` columns: min_max_scaling appended them after the raw ones,
# so they are everything past the first len(cols_to_norm) columns.
n_raw_cols = len(cols_to_norm)
df_ML = df_norm.iloc[:, n_raw_cols:].copy()

In [21]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_tds_last_norm,receiving_fumbles_last_norm,receiving_fumbles_lost_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,...,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm,fantasy_points_ppr_norm
0,0.198582,0.282178,0.142857,0.043478,0.2,0.0,0.011845,0.00128,0.175824,0.233803,...,,,,,,,,,,0.113278
1,0.092199,0.148515,0.109969,0.173913,0.0,0.0,0.011845,0.00128,0.120879,0.304575,...,,,,,,,,,,0.040893
2,0.304965,0.341584,0.229188,0.086957,0.4,0.25,0.011845,0.00128,0.274725,0.269222,...,,,,,,,,,,0.576404
3,0.695035,0.762376,0.54111,0.347826,0.4,0.25,0.011845,0.00128,0.593407,0.413503,...,,,,,,,,,,0.572644
4,0.673759,0.811881,0.601747,0.26087,0.0,0.0,0.011845,0.019206,0.571429,0.343216,...,,,,,,,,,,0.312103


# Create Train, Test, Validation Splits

In [22]:
# The normalized label is the last column of df_ML; everything before it is a feature.
label_pos = df_ML.shape[1] - 1
X = df_ML.iloc[:, :label_pos]  # features
y = df_ML.iloc[:, label_pos]   # label

In [23]:
# Same seed for both splits so the partition is reproducible.
split_seed = 42

# Step 1: hold out 20% of the rows as the test set.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)

# Step 2: take 25% of the remaining 80% as the validation set (0.25 x 0.8 = 0.2),
# which leaves a 60/20/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=split_seed
)

# AutoGluon

In [24]:
# AutoGluon takes one frame per split with the label as a column, so glue each
# feature frame back together with its target series.
train_data, val_data, test_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [25]:
# Name of the normalized target column, i.e. `label` plus the '_norm' suffix that
# min_max_scaling appends.
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [26]:
# Train AutoGluon on the training split. No preset, time limit or save path is
# passed, so the library's defaults apply (see the log output below).
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240505_203417"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240505_203417"
AutoGluon Version:  1.1.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2
CPU Count:          24
Memory Avail:    

## Evaluation

In [27]:
# Score the fitted predictor on the held-out test split. All values are on the
# normalized (0-1) label scale.
# NOTE(review): the error metrics print as negative numbers — AutoGluon appears to
# report every metric in "higher is better" form; confirm against the AutoGluon docs.
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.13614752267172275, 'mean_squared_error': -0.01853614792964726, 'mean_absolute_error': -0.108129202037175, 'r2': 0.4425832548669769, 'pearsonr': 0.6681820769629151, 'median_absolute_error': -0.09721696787589304}


In [28]:
# Predict on the test features only (the label column is dropped first); the
# predictions are on the normalized label scale.
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

599     0.350173
1201    0.382190
628     0.229385
1498    0.562376
1263    0.289869
Name: fantasy_points_ppr_norm, dtype: float32


In [29]:
test_data[label_column]

599     0.545241
1201    0.334195
628     0.170858
1498    0.663925
1263    0.083431
          ...   
100     0.017156
274     0.342891
1206    0.379788
101     0.017156
1084    0.494712
Name: fantasy_points_ppr_norm, Length: 346, dtype: float64

### Reversing the normalization

In [30]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    """Undo min-max scaling: map normalized values back onto the [min_v, max_v] scale.

    ``normalized_data`` may be a scalar or anything supporting ``*`` and ``+``
    (e.g. a pandas Series); ``min_v`` and ``max_v`` are the bounds that were used
    when the data was normalized.
    """
    value_span = max_v - min_v
    rescaled = normalized_data * value_span
    return rescaled + min_v

In [31]:
# Range of the raw label before normalization; needed to map normalized values back
# to fantasy points.
min_v = df_to_norm['fantasy_points_ppr'].min()
max_v = df_to_norm['fantasy_points_ppr'].max()
# NOTE: despite its name, `original_predictions` holds the de-normalized *actual*
# test labels (ground truth), not model output.
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [32]:
original_predictions

599     246.0
1201    156.2
628      86.7
1498    296.5
1263     49.5
        ...  
100      21.3
274     159.9
1206    175.6
101      21.3
1084    224.5
Name: fantasy_points_ppr_norm, Length: 346, dtype: float64

In [33]:
# Convert the model's normalized predictions back to the raw fantasy-point scale.
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [34]:
model_predictions

599     162.998642
1201    176.621857
628     111.603531
1498    253.291122
1263    137.339447
           ...    
100     106.162590
274     191.863312
1206    248.349335
101     106.162590
1084    206.933289
Name: fantasy_points_ppr_norm, Length: 346, dtype: float32

In [35]:
# Absolute error, in fantasy points, between each prediction and the actual value
# (paired by position).
abs_errors = [
    abs(answer - guess)
    for guess, answer in zip(model_predictions, original_predictions)
]

# Errors of at most 15 points count as "close enough"; errors of 30 or more count
# as "far". Anything in between is not counted in either bucket.
close_enough = sum(1 for err in abs_errors if err <= 15)
far = sum(1 for err in abs_errors if err >= 30)

In [36]:
close_enough

77

In [37]:
len(model_predictions)

346

In [38]:
close_enough / len(model_predictions)

0.22254335260115607

In [39]:
far

207

## Further Information

In [40]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.136148,-0.134455,root_mean_squared_error,0.099475,0.068375,4.454784,0.001799,0.000216,0.008105,2,True,12
1,RandomForestMSE,-0.136923,-0.135506,root_mean_squared_error,0.03635,0.026694,0.366147,0.03635,0.026694,0.366147,1,True,5
2,LightGBM,-0.137318,-0.138442,root_mean_squared_error,0.000779,0.000984,0.322687,0.000779,0.000984,0.322687,1,True,4
3,ExtraTreesMSE,-0.137992,-0.135541,root_mean_squared_error,0.035116,0.025872,0.324433,0.035116,0.025872,0.324433,1,True,7
4,XGBoost,-0.138721,-0.138697,root_mean_squared_error,0.007812,0.002269,0.445801,0.007812,0.002269,0.445801,1,True,9
5,NeuralNetTorch,-0.138993,-0.139297,root_mean_squared_error,0.006259,0.005565,1.102252,0.006259,0.005565,1.102252,1,True,10
6,LightGBMXT,-0.13937,-0.137932,root_mean_squared_error,0.008836,0.001272,0.320829,0.008836,0.001272,0.320829,1,True,3
7,CatBoost,-0.139613,-0.137109,root_mean_squared_error,0.002794,0.00109,0.415699,0.002794,0.00109,0.415699,1,True,6
8,LightGBMLarge,-0.141764,-0.140649,root_mean_squared_error,0.008788,0.001183,1.096416,0.008788,0.001183,1.096416,1,True,11
9,NeuralNetFastAI,-0.141814,-0.139988,root_mean_squared_error,0.009344,0.006668,1.792347,0.009344,0.006668,1.792347,1,True,8


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [41]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['yac_sh_last_norm']
Computing feature importance via permutation shuffling for 44 features using 346 rows with 5 shuffle sets...
	25.46s	= Expected runtime (5.09s per shuffle set)
	1.44s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
yptmpa_last_norm,0.004645452,0.00099,0.000234,5,0.006685,0.002606
age_last_norm,0.003659532,0.000773,0.000226,5,0.005252,0.002067
ppr_sh_last_norm,0.002925216,0.000773,0.000536,5,0.004518,0.001333
fantasy_points_ppr_last_norm,0.002627676,0.00096,0.001808,5,0.004605,0.00065
fantasy_points_last_norm,0.002003341,0.000718,0.00168,5,0.003481,0.000525
receiving_epa_last_norm,0.00166304,0.000484,0.000771,5,0.002659,0.000667
games_last_norm,0.001596177,0.000711,0.003698,5,0.003061,0.000132
receptions_last_norm,0.0009725095,0.000357,0.001833,5,0.001707,0.000238
rank_last_norm,0.0008472418,0.000319,0.002012,5,0.001504,0.000191
Draft_Overall_last_norm,0.0007935885,0.000412,0.006287,5,0.001642,-5.5e-05
