# Imports

In [1]:
# pip install autogluon

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

# Load Data

In [3]:
# Load the pre-built player-season DataFrame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source; a relative path also ties the notebook to this
# directory layout.
df = pd.read_pickle("../../FantasyData/data-frames/df_basic_ngs_snaps_adv_1999_2023.pkl")

## DF Columns By Category

| Category                     | Columns                                  |
|------------------------------|------------------------------------------|
| **Passing Statistics**       | completions, attempts, passing_yards, passing_tds, interceptions, sacks, sack_yards, sack_fumbles, sack_fumbles_lost, passing_air_yards, passing_yards_after_catch, passing_first_downs, passing_epa, passing_2pt_conversions, pacr, dakota |
| **Rushing Statistics**       | carries, rushing_yards, rushing_tds, rushing_fumbles, rushing_fumbles_lost, rushing_first_downs, rushing_epa, rushing_2pt_conversions, ry_sh, rtd_sh, rfd_sh, rtdfd_sh |
| **Receiving Statistics**     | receptions, targets, receiving_yards, receiving_tds, receiving_fumbles, receiving_fumbles_lost, receiving_air_yards, receiving_yards_after_catch, receiving_first_downs, receiving_epa, receiving_2pt_conversions, racr, target_share, air_yards_share, wopr_x |
| **Fantasy and Special Teams**| special_teams_tds, fantasy_points, fantasy_points_ppr |
| **General Game Statistics**  | games                                    |
| **Player Information**       | player_id, season, season_type, position, player_name, age, team, rank, tier, Draft Year, Draft No., Draft Round, Draft Pick, Draft Overall, Draft Team, College |
| **Passing Efficiency Metrics**| avg_time_to_throw, avg_completed_air_yards, avg_intended_air_yards, avg_air_yards_differential, aggressiveness, max_completed_air_distance, avg_air_yards_to_sticks, pass_yards, pass_touchdowns, passer_rating, completion_percentage, expected_completion_percentage, completion_percentage_above_expectation, avg_air_distance, max_air_distance, efficiency, percent_attempts_gte_eight_defenders, avg_time_to_los |
| **Rushing Efficiency Metrics**| rush_attempts, rush_yards, expected_rush_yards, rush_yards_over_expected, avg_rush_yards, rush_yards_over_expected_per_att, rush_pct_over_expected, rush_touchdowns |
| **Receiving Efficiency Metrics**| avg_cushion, avg_separation, avg_intended_air_yards_receiving, percent_share_of_intended_air_yards, catch_percentage, yards, rec_touchdowns, avg_yac, avg_expected_yac, avg_yac_above_expectation |


# Filter By Position

In [4]:
# Keep only wide receivers. Filtering first and copying the (smaller) result
# avoids duplicating the whole frame just to discard most of it, and leaves
# df_wr as an independent frame that later cells can add columns to.
df_wr = df.query("position == 'WR'").copy()
print(f"Length of wide receiver data set: {len(df_wr)}")

Length of wide receiver data set: 5009


In [5]:
# remove our na's
# Rows without a "Draft Year" value are dropped; this mutates df_wr in place,
# so the frame printed by the previous cell is no longer what df_wr holds.
df_wr.dropna(subset="Draft Year", inplace=True)
print(f"After removing NA draft rows, data set size: {len(df_wr)}")

After removing NA draft rows, data set size: 4779


In [6]:
# Turn the raw draft columns into integer columns with underscore names.
# Each value is stringified, cleaned, then cast to int in a single chain.

# Any 's' characters are stripped from the draft year before the cast
# (presumably a marker on some entries — TODO confirm against the raw data).
df_wr["Draft_Year"] = (
    df_wr["Draft Year"].astype(str).str.replace('s', '').astype(int)
)

# 'Undrafted' is encoded as round 8.
df_wr["Draft_Round"] = (
    df_wr["Draft Round"].astype(str).str.replace('Undrafted', '8').astype(int)
)

# 'Undrafted' is encoded as overall pick 400.
df_wr["Draft_Overall"] = (
    df_wr["Draft Overall"].astype(str).str.replace('Undrafted', '400').astype(int)
)

# Get Lag Version

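Pair each player-season with that player's previous season: the previous season's stats (suffixed `_last`) become the features, and the current season's fantasy points (or receiving first downs) become the label.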

In [7]:
def create_lag_df(df, cols_to_filter=3, col_to_increment="season",
                  cols_to_merge=["player_id", "season", "season_type"]):
    """Join every row to the matching row from the previous period.

    A copy of ``df`` is made in which every column from position
    ``cols_to_filter`` onward is renamed to ``<name>_last`` and
    ``col_to_increment`` is increased by 1. That copy is inner-joined back
    onto ``df`` on ``cols_to_merge``, so each output row carries its own
    values plus the ``*_last`` values from one period earlier. Rows with no
    match one period earlier are dropped by the inner join.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_to_filter : int, default 3
        Number of leading columns that keep their names. The merge keys and
        ``col_to_increment`` must all sit within these leading columns.
    col_to_increment : str, default "season"
        Column shifted forward by one in the lagged copy.
    cols_to_merge : str or list of str
        Key column(s) used for the join.

    Returns
    -------
    pandas.DataFrame
        The inner-joined frame with current and ``*_last`` columns.

    Raises
    ------
    KeyError
        If a merge key or ``col_to_increment`` lies at or beyond position
        ``cols_to_filter`` and would therefore be renamed to ``*_last``.
    """
    # The default list is only read, never mutated, so sharing it across calls
    # is harmless. A bare string is one key, not a sequence of characters.
    if isinstance(cols_to_merge, str):
        merge_keys = [cols_to_merge]
    else:
        merge_keys = list(cols_to_merge)

    lagged_cols = list(df.columns[cols_to_filter:])

    # Fail fast with a clear message: a key that picks up the "_last" suffix
    # can no longer be incremented or joined on, which would otherwise surface
    # as a bare KeyError from deep inside pandas.
    renamed_keys = [
        key for key in [col_to_increment, *merge_keys] if key in lagged_cols
    ]
    if renamed_keys:
        raise KeyError(
            f"{renamed_keys} would be renamed to '*_last'; they must be among "
            f"the first {cols_to_filter} columns of df"
        )

    # rename() returns a new frame, so the increment below never touches df.
    df_last = df.rename(columns={col: f"{col}_last" for col in lagged_cols})
    df_last[col_to_increment] += 1

    return df.merge(df_last, how='inner', on=merge_keys)

In [8]:
# Build the current-season / previous-season frame for wide receivers.
# create_lag_df works on its own copy of the input, so the .copy() here is
# purely defensive.
df_lag = create_lag_df(df_wr.copy())

In [9]:
df_lag.query("player_name == 'Mike Evans' and season == 2017")[["player_name", "season", 
                                                                "receiving_yards", "receiving_yards_last"]]

Unnamed: 0,player_name,season,receiving_yards,receiving_yards_last
2630,Mike Evans,2017,1001.0,1321.0


# Visualize Dataset

In [32]:
# Per-season summary of PPR fantasy points. The result has one row per season
# and columns fantasy_points_ppr_mean / _max / _min / _median / _sum.
df_vis = (
    df_lag
    .groupby(by="season")["fantasy_points_ppr"]
    .agg(["mean", "max", "min", "median", "sum"])
    .add_prefix("fantasy_points_ppr_")
    .reset_index()
)

In [62]:
# Draw every per-season aggregate of PPR points as its own line, save an
# interactive HTML copy, then render it inline.
# NOTE(review): the season sum shares the y-axis with mean/median/min/max and
# may dwarf them — consider a separate figure if the other lines look flat.
fig = px.line(df_vis, x="season", y=["fantasy_points_ppr_mean", "fantasy_points_ppr_max", 
                                     "fantasy_points_ppr_min", "fantasy_points_ppr_median",
                                     "fantasy_points_ppr_sum"
                                    ])
fig.write_html("../../interactive-2.0/appendix/fantasy_point_ppr_agg_by_season.html")
fig.show()

In [48]:
# Work on a copy so the plotting-only "Tier" column never ends up in df_lag.
# "Tier" is the string form of `tier`, presumably so plotly treats it as a
# discrete category rather than a continuous colour scale.
df_vis_2 = df_lag.copy()
df_vis_2["Tier"] = df_vis_2['tier'].astype(str)

In [51]:
# Tier labels '1.0' through '20.0' in ascending numeric order. Used to order
# the "Tier" categories in the scatter plot, since plain string sorting would
# put '10.0' before '2.0'.
cat_order = [f"{tier_number}.0" for tier_number in range(1, 21)]

In [61]:
# One point per row of df_vis_2 (season vs. PPR points), coloured by "Tier";
# category_orders pins the order of the tier categories to cat_order.
fig = px.scatter(df_vis_2, x="season", y=["fantasy_points_ppr"], color="Tier",
                 category_orders={"Tier": cat_order}
                )

# Update layout for clarity.
fig.update_layout(xaxis_title='Season', yaxis_title='Fantasy Points PPR')
fig.write_html("../../interactive-2.0/appendix/fantasy_point_ppr_by_season.html")
fig.show()

# Select Feature Columns

That is, the numeric columns whose names contain `_last` (the previous-season values).

In [10]:
non_numeric_cols = df_lag.select_dtypes(exclude=[np.number]).columns

In [11]:
non_numeric_cols

Index(['player_id', 'season_type', 'position', 'player_name', 'team',
       'Draft Year', 'Draft No.', 'Draft Round', 'Draft Pick', 'Draft Overall',
       'Draft Team', 'College', 'position_last', 'player_name_last',
       'team_last', 'Draft Year_last', 'Draft No._last', 'Draft Round_last',
       'Draft Pick_last', 'Draft Overall_last', 'Draft Team_last',
       'College_last'],
      dtype='object')

In [12]:
# Confirm the cleaned draft columns came through the lag merge as integers.
# Testing for "any integer dtype" rather than == 'int64' keeps the check
# meaningful on platforms where astype(int) produces int32 (e.g. Windows with
# NumPy 1.x), and the message names the offending column.
for _draft_col in ("Draft_Year", "Draft_Round", "Draft_Overall"):
    assert pd.api.types.is_integer_dtype(df_lag[_draft_col]), (
        f"{_draft_col} should be an integer column, got {df_lag[_draft_col].dtype}"
    )

In [13]:
[col for col in df_lag.columns if '_last' in col]

['completions_last',
 'attempts_last',
 'passing_yards_last',
 'passing_tds_last',
 'interceptions_last',
 'sacks_last',
 'sack_yards_last',
 'sack_fumbles_last',
 'sack_fumbles_lost_last',
 'passing_air_yards_last',
 'passing_yards_after_catch_last',
 'passing_first_downs_last',
 'passing_epa_last',
 'passing_2pt_conversions_last',
 'pacr_last',
 'dakota_last',
 'carries_last',
 'rushing_yards_last',
 'rushing_tds_last',
 'rushing_fumbles_last',
 'rushing_fumbles_lost_last',
 'rushing_first_downs_last',
 'rushing_epa_last',
 'rushing_2pt_conversions_last',
 'receptions_last',
 'targets_last',
 'receiving_yards_last',
 'receiving_tds_last',
 'receiving_fumbles_last',
 'receiving_fumbles_lost_last',
 'receiving_air_yards_last',
 'receiving_yards_after_catch_last',
 'receiving_first_downs_last',
 'receiving_epa_last',
 'receiving_2pt_conversions_last',
 'racr_last',
 'target_share_last',
 'air_yards_share_last',
 'wopr_x_last',
 'special_teams_tds_last',
 'fantasy_points_last',
 'fantasy

In [14]:
# Model inputs: previous-season ("_last") columns only, so nothing from the
# season being predicted is used as a feature. Passing- and rushing-volume
# columns listed in the cell above are intentionally not included here.
feature_columns = [
    'receptions_last',
    'targets_last',
    'receiving_yards_last',
    'receiving_tds_last',
    'receiving_fumbles_last',
    'receiving_fumbles_lost_last',
    'receiving_air_yards_last',
    'receiving_yards_after_catch_last',
    'receiving_first_downs_last',
    'receiving_epa_last',
    'receiving_2pt_conversions_last',
    'racr_last',
    'target_share_last',
    'air_yards_share_last',
    'wopr_x_last',
    'fantasy_points_last',
    'fantasy_points_ppr_last',
    'games_last',
    'tgt_sh_last',
    'ay_sh_last',
    'yac_sh_last',
    'wopr_y_last',
    'ry_sh_last',
    'rtd_sh_last',
    'rfd_sh_last',
    'rtdfd_sh_last',
    'dom_last',
    'w8dom_last',
    'yptmpa_last',
    'ppr_sh_last',
    'age_last',
    'rank_last',
    'tier_last',
    'Draft_Year_last',
    'Draft_Round_last',
    'Draft_Overall_last',
    'avg_cushion_last',
    'avg_separation_last',
    'avg_intended_air_yards_receiving_last',
    'percent_share_of_intended_air_yards_last',
    'catch_percentage_last',
#     'yards_last',  # same as receiving yards - duplicate column
    'rec_touchdowns_last',
    'avg_yac_last',
    'avg_expected_yac_last',
    'avg_yac_above_expectation_last'
]

In [15]:
label = "fantasy_points_ppr"

In [16]:
cols_to_norm = feature_columns + [label]
df_to_norm = df_lag[cols_to_norm].copy()

# Normalize the Dataset

In [17]:
def min_max_scaling(df, cols_to_norm):
    """Add a min-max scaled ``<col>_norm`` column for each requested column.

    Each value is mapped to ``(value - min) / (max - min)`` using that
    column's own minimum and maximum, so results lie in [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale. It is modified in place (the new
        columns are appended to it); pass a copy to keep the original intact.
    cols_to_norm : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one extra ``<col>_norm`` per input column.

    Notes
    -----
    A constant column (max == min) is mapped to 0.0 instead of dividing zero
    by zero, which would turn every row into NaN. Missing values stay missing,
    and an all-NaN column stays all-NaN.
    """
    for col in cols_to_norm:
        min_v = df[col].min()
        max_v = df[col].max()
        span = max_v - min_v

        if span == 0:
            # Multiplying by 0.0 yields 0.0 for real values and keeps NaN as NaN.
            df[f"{col}_norm"] = df[col] * 0.0
        else:
            df[f"{col}_norm"] = (df[col] - min_v) / span

    return df

In [18]:
# Scale every feature and the label; the .copy() keeps df_to_norm's raw values
# available for undoing the scaling later.
# NOTE(review): min/max are computed over all rows, i.e. before the
# train/validation/test split further down, so held-out rows influence the
# scaling — confirm this leakage is acceptable.
df_norm = min_max_scaling(df_to_norm.copy(), cols_to_norm)

In [19]:
df_norm.head()

Unnamed: 0,receptions_last,targets_last,receiving_yards_last,receiving_tds_last,receiving_fumbles_last,receiving_fumbles_lost_last,receiving_air_yards_last,receiving_yards_after_catch_last,receiving_first_downs_last,receiving_epa_last,...,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm,fantasy_points_ppr_norm
0,30,60,296.0,1,1.0,0.0,0.0,0.0,18.0,-4.798412,...,,,,,,,,,,0.14425
1,15,33,232.0,4,0.0,0.0,0.0,0.0,13.0,9.82212,...,,,,,,,,,,0.074393
2,3,6,26.0,0,0.0,0.0,0.0,0.0,2.0,-3.555738,...,,,,,,,,,,0.003175
3,16,28,202.0,0,1.0,1.0,0.0,0.0,10.0,-6.491849,...,,,,,,,,,,0.011794
4,3,5,23.0,0,0.0,0.0,0.0,0.0,2.0,0.845925,...,,,,,,,,,,0.025176


In [20]:
# Keep only the normalized columns (features first, label last). Selecting
# them by name instead of by position does not rely on the scaled columns
# sitting after exactly len(cols_to_norm) original columns.
df_ML = df_norm[[f"{col}_norm" for col in cols_to_norm]].copy()

In [21]:
df_ML.head()

Unnamed: 0,receptions_last_norm,targets_last_norm,receiving_yards_last_norm,receiving_tds_last_norm,receiving_fumbles_last_norm,receiving_fumbles_lost_last_norm,receiving_air_yards_last_norm,receiving_yards_after_catch_last_norm,receiving_first_downs_last_norm,receiving_epa_last_norm,...,avg_cushion_last_norm,avg_separation_last_norm,avg_intended_air_yards_receiving_last_norm,percent_share_of_intended_air_yards_last_norm,catch_percentage_last_norm,rec_touchdowns_last_norm,avg_yac_last_norm,avg_expected_yac_last_norm,avg_yac_above_expectation_last_norm,fantasy_points_ppr_norm
0,0.201342,0.292683,0.152008,0.043478,0.2,0.0,0.011845,0.003534,0.193548,0.233803,...,,,,,,,,,,0.14425
1,0.100671,0.160976,0.119471,0.173913,0.0,0.0,0.011845,0.003534,0.139785,0.304575,...,,,,,,,,,,0.074393
2,0.020134,0.029268,0.014743,0.0,0.0,0.0,0.011845,0.003534,0.021505,0.239818,...,,,,,,,,,,0.003175
3,0.107383,0.136585,0.10422,0.0,0.2,0.25,0.011845,0.003534,0.107527,0.225605,...,,,,,,,,,,0.011794
4,0.020134,0.02439,0.013218,0.0,0.0,0.0,0.011845,0.003534,0.021505,0.261125,...,,,,,,,,,,0.025176


# Create Train, Test, Validation Splits

In [22]:
# The label is the last column of df_ML; every other column is a feature.
y = df_ML[df_ML.columns[-1]]    # label
X = df_ML.drop(columns=y.name)  # features

In [23]:
# First, split into train and temporary sets (train + validation, test)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now, split the train_temp into actual train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# AutoGluon

In [24]:
# Re-attach each label series to its feature frame: the predictor below is
# given a single frame per split and finds the label by column name.
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [25]:
label_column = 'fantasy_points_ppr_norm' 

## Fit

In [26]:
# Fit with AutoGluon's defaults: no presets, time limit or output path are
# passed. The label is the normalized PPR score, so reported errors are in
# normalized units, not fantasy points.
predictor = TabularPredictor(label=label_column).fit(
    train_data=train_data,
    tuning_data=val_data  # Optional, only if you want to use a separate validation set
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240503_004218"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240503_004218"
AutoGluon Version:  1.1.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2
CPU Count:          24
Memory Avail:    

## Evaluation

In [27]:
performance = predictor.evaluate(test_data)
print(performance)

{'root_mean_squared_error': -0.14048862886716565, 'mean_squared_error': -0.019737054840976213, 'mean_absolute_error': -0.11084410004385986, 'r2': 0.4991568323464729, 'pearsonr': 0.7097408218677148, 'median_absolute_error': -0.09409943028057638}


In [28]:
predictions = predictor.predict(test_data.drop(columns=[label_column]))
print(predictions.head())

790     0.236862
1735    0.127854
2266    0.297805
1052    0.420660
443     0.408449
Name: fantasy_points_ppr_norm, dtype: float32


In [29]:
test_data[label_column]

790     0.114992
1735    0.199819
2266    0.513722
1052    0.264686
443     0.143570
          ...   
2405    0.029939
1253    0.551372
2164    0.132456
572     0.062599
1551    0.635065
Name: fantasy_points_ppr_norm, Length: 685, dtype: float64

### Reversing the normalization

In [32]:
def reverse_min_max_scaling(normalized_data, min_v, max_v):
    """Undo min-max scaling.

    Maps values that were scaled with ``(x - min_v) / (max_v - min_v)`` back
    onto the original scale. Works for scalars and for array-like objects
    that support arithmetic, such as a pandas Series.
    """
    value_range = max_v - min_v
    stretched = normalized_data * value_range
    return stretched + min_v

In [33]:
# Range of the raw label, taken from the same frame that was normalized, so
# the inverse transform matches the forward one.
min_v = df_to_norm['fantasy_points_ppr'].min()
max_v = df_to_norm['fantasy_points_ppr'].max()
# NOTE: despite its name, `original_predictions` holds the *actual* test-set
# labels converted back to fantasy points, not model output.
original_predictions = reverse_min_max_scaling(test_data[label_column].copy(), min_v, max_v)

In [36]:
original_predictions

790      49.3
1735     86.7
2266    225.1
1052    115.3
443      61.9
        ...  
2405     11.8
1253    241.7
2164     57.0
572      26.2
1551    278.6
Name: fantasy_points_ppr_norm, Length: 685, dtype: float64

In [38]:
model_predictions = reverse_min_max_scaling(predictions.copy(), min_v, max_v)

In [41]:
model_predictions

790     103.032257
1735     54.971004
2266    129.902130
1052    184.068878
443     178.685272
           ...    
2405     31.701941
1253    227.453674
2164     47.625607
572      54.906818
1551    161.881866
Name: fantasy_points_ppr_norm, Length: 685, dtype: float32

In [52]:
# Bucket the absolute error (in fantasy points) of every test prediction:
# at most 15 points off counts as close, 30 or more as far. Errors strictly
# between 15 and 30 are counted in neither bucket.
abs_errors = [
    abs(actual - predicted)
    for predicted, actual in zip(model_predictions, original_predictions)
]
close_enough = sum(1 for err in abs_errors if err <= 15)
far = sum(1 for err in abs_errors if err >= 30)

In [53]:
close_enough

123

In [54]:
len(model_predictions)

685

In [55]:
close_enough / len(model_predictions)

0.17956204379562044

In [56]:
far

422

## Further Information

In [30]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.140489,-0.13628,root_mean_squared_error,0.080013,0.05182,4.511516,0.001466,0.000213,0.008197,2,True,12
1,ExtraTreesMSE,-0.140496,-0.1375,root_mean_squared_error,0.051522,0.035202,0.338903,0.051522,0.035202,0.338903,1,True,7
2,RandomForestMSE,-0.142312,-0.138778,root_mean_squared_error,0.053516,0.024929,0.572448,0.053516,0.024929,0.572448,1,True,5
3,LightGBM,-0.142376,-0.139953,root_mean_squared_error,0.002114,0.001021,0.351715,0.002114,0.001021,0.351715,1,True,4
4,LightGBMXT,-0.142573,-0.13707,root_mean_squared_error,0.008094,0.001517,0.434847,0.008094,0.001517,0.434847,1,True,3
5,CatBoost,-0.142871,-0.138363,root_mean_squared_error,0.003063,0.001277,0.503718,0.003063,0.001277,0.503718,1,True,6
6,NeuralNetTorch,-0.143909,-0.141088,root_mean_squared_error,0.007385,0.00634,1.517601,0.007385,0.00634,1.517601,1,True,10
7,LightGBMLarge,-0.144343,-0.143462,root_mean_squared_error,0.00572,0.001254,1.210714,0.00572,0.001254,1.210714,1,True,11
8,XGBoost,-0.144645,-0.142411,root_mean_squared_error,0.010172,0.002475,0.508627,0.010172,0.002475,0.508627,1,True,9
9,NeuralNetFastAI,-0.144675,-0.140657,root_mean_squared_error,0.011546,0.008549,2.211968,0.011546,0.008549,2.211968,1,True,8


For feature clarification:
- **yptmpa:** receiving yards per team pass attempt

In [31]:
predictor.feature_importance(data=test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['yac_sh_last_norm']
Computing feature importance via permutation shuffling for 44 features using 685 rows with 5 shuffle sets...
	18.11s	= Expected runtime (3.62s per shuffle set)
	1.76s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
yptmpa_last_norm,0.004329,0.000898,0.00021,5,0.006177,0.00248
age_last_norm,0.002264,0.000357,7.2e-05,5,0.002999,0.001528
fantasy_points_last_norm,0.002073,0.000553,0.000556,5,0.003212,0.000933
tier_last_norm,0.001751,0.000477,0.000599,5,0.002733,0.000769
fantasy_points_ppr_last_norm,0.001651,0.000395,0.000365,5,0.002465,0.000838
rank_last_norm,0.001589,0.000355,0.000279,5,0.002319,0.000858
receiving_yards_last_norm,0.001583,0.000471,0.000837,5,0.002552,0.000614
receptions_last_norm,0.001416,0.000318,0.000285,5,0.00207,0.000762
ppr_sh_last_norm,0.001315,0.000471,0.001677,5,0.002285,0.000345
receiving_epa_last_norm,0.001313,0.000426,0.001162,5,0.00219,0.000436
