In [1]:
# logistic regression classification modeling

In [2]:
## ** INPUT REQUIRED **
# add filtering option for backtest or live predictions

In [3]:
# installations
!pip install fpdf




In [4]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from fpdf import FPDF
from datetime import datetime
import re

In [5]:
# Load the raw dataset
wr_df_raw = pd.read_csv("wr_nfl_df_sorted_new_features_final.csv")

In [6]:
# Create working copy
wr_df = wr_df_raw.copy()

In [7]:
# Begin: feature engineering - imputations

In [8]:
# Feature Engineering on fantasypros stats
# imputation and drop unnecessary columns

# Reapply 'drop' logic using 'receiving_drop' and 'targets'
drop_mask = wr_df['drop'].isna() & wr_df['receiving_drop'].notna() & wr_df['targets'].notna() & (wr_df['targets'] > 0)
zero_targets_mask = wr_df['targets'] == 0

# If drop is missing and targets > 0, use receiving_drop
wr_df.loc[drop_mask, 'drop'] = wr_df.loc[drop_mask, 'receiving_drop']

# If targets == 0, set drop = 0
wr_df.loc[zero_targets_mask, 'drop'] = 0

# Failsafe: fill any remaining nulls with 0
wr_df['drop'] = wr_df['drop'].fillna(0)

# Drop confirmed redundant or low-value columns
columns_to_drop = [
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_drop', 'receiving_drop_pct',
    'g', 'rec', 'yds', 'ybc', 'air', 'yac', 'yacon', 'brktkl', 'tgt', 'catchable',
    'rz tgt', '10+ yds', '20+ yds', '30+ yds', '40+ yds', '50+ yds',
    'rec pct_rz', 'y/r_rz', 'tgt pct_rz', 'team_abbr_x', 'result', 'score', 'macro_tier_score_season', 
    'macro_tier_score_missing', 'macro_tier_season', 'over_under', 'O_U'

]

# Preview the actual columns that will be dropped
to_drop_confirmed = [col for col in columns_to_drop if col in wr_df.columns]
print("✅ Columns confirmed to be dropped:", to_drop_confirmed)


# Drop only if columns are present in the current frame
wr_df.drop(columns=[col for col in columns_to_drop if col in wr_df.columns], inplace=True)


✅ Columns confirmed to be dropped: ['receiving_fumbles', 'receiving_fumbles_lost', 'receiving_drop', 'receiving_drop_pct', 'g', 'rec', 'yds', 'ybc', 'air', 'yac', 'yacon', 'brktkl', 'tgt', 'catchable', 'rz tgt', '10+ yds', '20+ yds', '30+ yds', '40+ yds', '50+ yds', 'rec pct_rz', 'y/r_rz', 'tgt pct_rz', 'team_abbr_x', 'result', 'score', 'macro_tier_score_season', 'macro_tier_score_missing', 'macro_tier_season', 'over_under', 'O_U']


In [9]:
# Show the remaining columns after the drop
remaining_columns = wr_df.columns.tolist()
remaining_columns


['season',
 'season_type',
 'week',
 'name',
 'position',
 'recent_team',
 'player_display_name',
 'rost',
 'dk_salary',
 'fd_salary',
 'value_ratio_dk',
 'value_ratio_fd',
 'value_ratio_dk_log',
 'value_ratio_fd_log',
 'value_ratio_dk_log_z',
 'value_ratio_fd_log_z',
 'fpts',
 'pos_avg_fpts',
 'fpts_above_pos_avg',
 'expected_fpts_dk',
 'fpts_diff_dk',
 'hit_value_dk',
 'rolling_fpts_diff_dk',
 'z_fpts_diff_dk',
 'z_value_ratio_dk',
 'expected_fpts_fd',
 'fpts_diff_fd',
 'hit_value_fd',
 'rolling_fpts_diff_fd',
 'z_fpts_diff_fd',
 'z_value_ratio_fd',
 'double_digit_targets',
 'boom_week',
 'bust_week',
 'is_macro_high_tier',
 'is_macro_mid_tier',
 'is_macro_low_tier',
 'opponent_abbr',
 'home',
 'role',
 'spread',
 'is_home_game',
 'Total',
 'receptions',
 'receiving_yards',
 'yards',
 'receiving_yards_after_catch',
 'targets',
 'target_share',
 'target_share_z',
 'catch_percentage',
 'catch_percentage_scaled',
 'catch_percentage_scaled_z',
 'avg_cushion',
 'avg_separation',
 'avg_yac

In [10]:
# csv output check
# Export the dataframe after column drop to CSV
# wr_df.to_csv("step1_after_column_drop.csv", index=False)
# print("CSV export complete: step1_after_column_drop.csv")


In [11]:
# Observed (true recorded stats)
observed_stats = [
    'targets', 'receptions', 'receiving_yards', 'receiving_yards_after_catch',
    'receiving_air_yards', 'receiving_tds', 'rec_touchdowns',
    'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions',
    'fpts', 'catch_percentage', 'avg_cushion', 'avg_separation',
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'avg_intended_air_yards', 'percent_share_of_intended_air_yards',
    'receiving_broken_tackles'
]

# Salary and roster
salary_fields = ['dk_salary', 'fd_salary']
roster_fields = ['rost']

# Metadata (unchanged throughout pipeline)
metadata = [
    'season', 'season_type', 'week', 'name', 'player_display_name',
    'position', 'recent_team', 'opponent_abbr', 'role', 'home', 'is_home_game'
]

# Dynamic categories
booleans_flags = [col for col in wr_df.columns if col.startswith('is_') or col.endswith('_ge_5') or col.endswith('_ge_7') or col.startswith('boom_') or col.startswith('bust_') or col.startswith('double_digit') or col.endswith('over_100_yds')]
performance_bins = [col for col in wr_df.columns if col.endswith('_performance_bin')]
rolling_stats = [col for col in wr_df.columns if any(sub in col for sub in ['_avg', '_lag', '_delta', '_games_played'])]
percentile_stats = [col for col in wr_df.columns if 'percentile' in col]

# Derived = numeric columns not already categorized
categorized_cols = set(observed_stats + salary_fields + roster_fields + metadata + booleans_flags + performance_bins + rolling_stats + percentile_stats)
derived_features = [col for col in wr_df.columns if col not in categorized_cols and wr_df[col].dtype in ['float64', 'int64']]
derived_features.extend(['hit_value_dk', 'hit_value_fd'])


In [12]:
# Confirm shape after cleaning
column_count_after_drop = wr_df.shape[1]

# Rebuild the category_lists dictionary from previous block
category_lists = {
    "observed_stats": observed_stats,
    "salary_fields": salary_fields,
    "roster_fields": roster_fields,
    "metadata": metadata,
    "booleans_flags": booleans_flags,
    "performance_bins": performance_bins,
    "rolling_stats": rolling_stats,
    "derived_features": derived_features,
    "percentile_stats": percentile_stats
}

# Flatten all categorized columns into one set
all_categorized_columns = set().union(*category_lists.values())

# Compare
{
    "column_count_after_drop": column_count_after_drop,
    "total_categorized_columns": len(all_categorized_columns),
    "match": column_count_after_drop == len(all_categorized_columns)
}


{'column_count_after_drop': 214,
 'total_categorized_columns': 214,
 'match': True}

In [13]:
# Reveal uncategorized columns
uncategorized_columns = set(wr_df.columns) - all_categorized_columns
print("🔍 Uncategorized columns:", uncategorized_columns)


🔍 Uncategorized columns: set()


In [14]:
# Define is_active using the refined logic
wr_df['is_active'] = (
    (wr_df['dk_salary'].fillna(0) > 0) |
    (wr_df['fd_salary'].fillna(0) > 0) |
    (wr_df['targets'].fillna(0) > 0) |
    (wr_df['receptions'].fillna(0) > 0) |
    (wr_df['receiving_yards'].fillna(0) > 0) |
    (wr_df['fpts'].fillna(0) > 0)
)

# Count how many players are considered active
active_count = wr_df['is_active'].sum()
total_count = wr_df.shape[0]
inactive_count = total_count - active_count

# Display summary as a DataFrame
summary_df = pd.DataFrame({
    "Total Rows": [total_count],
    "Active Rows": [active_count],
    "Inactive Rows": [inactive_count],
    "Active %": [round(100 * active_count / total_count, 2)],
    "Inactive %": [round(100 * inactive_count / total_count, 2)]
})

summary_df


Unnamed: 0,Total Rows,Active Rows,Inactive Rows,Active %,Inactive %
0,17449,17429,20,99.89,0.11


In [15]:
# Zero out or False-out specified columns for rows where is_active is False.
def apply_default_zeros(df, column_groups):

    for group_name, cols in column_groups.items():
        for col in cols:
            if col in df.columns:
                if df[col].dtype == 'bool':
                    df.loc[~df['is_active'], col] = False
                else:
                    df.loc[~df['is_active'], col] = 0
    return df


In [16]:
## Begin block-by-block imputations

In [17]:
# Impute missing 'player_display_name' using 'name'
wr_df['player_display_name'] = wr_df['player_display_name'].fillna(wr_df['name'])

In [18]:
# Check how many missing values remain in 'player_display_name' after the imputation
missing_display_name = wr_df['player_display_name'].isna().sum()

# Display a few rows where the original imputation was applied (i.e., name and player_display_name were previously not equal)
imputed_rows = wr_df[wr_df['player_display_name'] == wr_df['name']][['name', 'player_display_name']].head(10)

missing_display_name, imputed_rows


(0,
          name player_display_name
 0  A.J. Green          A.J. Green
 1  A.J. Green          A.J. Green
 2  A.J. Green          A.J. Green
 3  A.J. Green          A.J. Green
 4  A.J. Green          A.J. Green
 5  A.J. Green          A.J. Green
 6  A.J. Green          A.J. Green
 7  A.J. Green          A.J. Green
 8  A.J. Green          A.J. Green
 9  A.J. Green          A.J. Green)

In [19]:
# Export the dataframe after imputing 'player_display_name'
# wr_df.to_csv("step2_after_player_display_name_imputation.csv", index=False)
# print("CSV export complete: step2_after_player_display_name_imputation.csv")


In [20]:
# Descriptive Stats - catch_percentage

# Get statistical summary of 'catch_percentage' and its scaled versions
catch_pct_cols = ['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']
catch_pct_stats = wr_df[catch_pct_cols].describe().T

# Add missing value counts
catch_pct_stats['missing_count'] = wr_df[catch_pct_cols].isna().sum()
catch_pct_stats['missing_percent'] = wr_df[catch_pct_cols].isna().mean() * 100

catch_pct_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
catch_percentage,8199.0,63.015331,19.083549,0.0,50.0,62.5,77.8,100.0,9250,53.011634
catch_percentage_scaled,8199.0,0.633138,0.195842,0.0,0.5,0.6,0.8,1.0,9250,53.011634
catch_percentage_scaled_z,8199.0,-0.006403,1.004702,-3.3,-0.7,0.0,0.8,1.9,9250,53.011634


In [21]:
# Check how many rows have missing catch_percentage but have valid receptions and targets

# Check how many valid values we would get using 'receptions' and 'targets'
correct_mask = (
    wr_df['catch_percentage'].isna() &
    wr_df['receptions'].notna() &
    wr_df['targets'].notna() &
    (wr_df['targets'] > 0)
)

correct_count = correct_mask.sum()

# recalculate number of rows fields
correct_count


8964

In [22]:
# fill in catch_percentage receptions / targets

# Apply the corrected calculation
wr_df.loc[correct_mask, 'catch_percentage'] = (
    wr_df.loc[correct_mask, 'receptions'] / wr_df.loc[correct_mask, 'targets']
) * 100

# Recompute the scaled version
wr_df['catch_percentage_scaled'] = wr_df['catch_percentage'] / 100

# Recompute the z-score version
scaled_mean = wr_df['catch_percentage_scaled'].mean(skipna=True)
scaled_std = wr_df['catch_percentage_scaled'].std(skipna=True)
wr_df['catch_percentage_scaled_z'] = (wr_df['catch_percentage_scaled'] - scaled_mean) / scaled_std


In [23]:
# verify catch_percentage results
# Re-check how many missing values remain in the three catch_percentage-related columns
final_missing_summary = wr_df[['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']].isna().sum()
final_missing_summary


catch_percentage             286
catch_percentage_scaled      286
catch_percentage_scaled_z    286
dtype: int64

In [24]:
# check df
# Sample a few rows with valid values to verify the imputation and scaling logic
check_rows = wr_df[
    wr_df['catch_percentage'].notna() &
    wr_df['catch_percentage_scaled'].notna() &
    wr_df['catch_percentage_scaled_z'].notna()
][['receptions', 'targets', 'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']].sample(10)

display(check_rows)


Unnamed: 0,receptions,targets,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z
14609,1,2,50.0,0.5,-0.397307
3407,3,4,75.0,0.75,0.431949
9167,0,1,0.0,0.0,-2.05582
1509,7,10,70.0,0.7,0.266098
16933,5,6,83.3,0.833,0.707262
9411,0,3,0.0,0.0,-2.05582
11916,2,2,100.0,1.0,1.261205
17436,2,6,33.3,0.333,-0.95125
613,1,4,25.0,0.25,-1.226564
13419,7,12,58.3,0.583,-0.121994


In [25]:
# Show 10 sample rows where catch_percentage is still missing
remaining_na_rows = wr_df[wr_df['catch_percentage'].isna()][
    ['name', 'week', 'season', 'receptions', 'targets', 'catch_percentage']
].head(10)

remaining_na_rows


Unnamed: 0,name,week,season,receptions,targets,catch_percentage
81,Alex Erickson,8,2017,0,0,
174,ArDarius Stewart,5,2017,0,0,
177,ArDarius Stewart,13,2017,0,0,
198,Bernard Reedy,6,2017,0,0,
199,Bernard Reedy,7,2017,0,0,
201,Bobo Wilson,13,2017,0,0,
203,Bobo Wilson,17,2017,0,0,
268,Braxton Miller,10,2017,0,0,
333,Chad Williams,14,2017,0,0,
460,Curtis Samuel,6,2017,0,0,


In [26]:
# Feature Engineering - Imputation for remaining missing values
# Define masks for remaining missing values
mask_catch_pct = wr_df['catch_percentage'].isna()

# Impute dummy values
wr_df.loc[mask_catch_pct, 'catch_percentage'] = -10.0
wr_df.loc[mask_catch_pct, 'catch_percentage_scaled'] = -0.1
wr_df.loc[mask_catch_pct, 'catch_percentage_scaled_z'] = -4.0

# Add boolean flags for each
wr_df['is_missing_catch_pct'] = mask_catch_pct.astype(int)
wr_df['is_missing_catch_pct_scaled'] = mask_catch_pct.astype(int)
wr_df['is_missing_catch_pct_z'] = mask_catch_pct.astype(int)


In [27]:
# Verify no missing values
# Re-check how many missing values remain in the catch_percentage feature group
final_check = wr_df[
    ['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']
].isna().sum()

final_check


catch_percentage             0
catch_percentage_scaled      0
catch_percentage_scaled_z    0
dtype: int64

In [28]:
# check df
# Confirm that all dummy rows were flagged and values assigned correctly
check_dummy_rows = wr_df[wr_df['is_missing_catch_pct'] == 1][[
    'receptions', 'targets', 'catch_percentage',
    'catch_percentage_scaled', 'catch_percentage_scaled_z',
    'is_missing_catch_pct', 'is_missing_catch_pct_scaled', 'is_missing_catch_pct_z'
]].sample(10)

display(check_dummy_rows)


Unnamed: 0,receptions,targets,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z,is_missing_catch_pct,is_missing_catch_pct_scaled,is_missing_catch_pct_z
6091,0,0,-10.0,-0.1,-4.0,1,1,1
1865,0,0,-10.0,-0.1,-4.0,1,1,1
7224,0,0,-10.0,-0.1,-4.0,1,1,1
5172,0,0,-10.0,-0.1,-4.0,1,1,1
13808,0,0,-10.0,-0.1,-4.0,1,1,1
8310,0,0,-10.0,-0.1,-4.0,1,1,1
1929,0,0,-10.0,-0.1,-4.0,1,1,1
11535,0,0,-10.0,-0.1,-4.0,1,1,1
3620,0,0,-10.0,-0.1,-4.0,1,1,1
10454,0,0,-10.0,-0.1,-4.0,1,1,1


In [29]:
# Export the dataframe after imputing catch_percentage and related fields
# wr_df.to_csv("step3_after_catch_percentage_imputation.csv", index=False)
# print("CSV export complete: step3_after_catch_percentage_imputation.csv")


In [30]:
# Descriptive Stats - avg_cushion

# Get statistical summary of 'avg_cushion'
cushion_stats = wr_df[['avg_cushion']].describe().T

# Add missing count and percent
cushion_stats['missing_count'] = wr_df['avg_cushion'].isna().sum()
cushion_stats['missing_percent'] = wr_df['avg_cushion'].isna().mean() * 100

cushion_stats


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_cushion,8197.0,6.044809,1.505897,1.9,5.0,6.0,7.1,14.4,9252,53.023096


In [31]:
# Feature Engineering - Imputation for avg_cushion

# Identify missing values
mask_avg_cushion = wr_df['avg_cushion'].isna()

# Impute with dummy value
wr_df.loc[mask_avg_cushion, 'avg_cushion'] = -1.0

# Add boolean flag
wr_df['is_missing_avg_cushion'] = mask_avg_cushion.astype(int)


In [32]:
# Verify no missing values remain in 'avg_cushion'
missing_avg_cushion = wr_df['avg_cushion'].isna().sum()
missing_avg_cushion

0

In [33]:
# check df
check_cushion = wr_df[wr_df['is_missing_avg_cushion'] == 1][['avg_cushion', 'is_missing_avg_cushion']].sample(10)
display(check_cushion)


Unnamed: 0,avg_cushion,is_missing_avg_cushion
8980,-1.0,1
12481,-1.0,1
2294,-1.0,1
1139,-1.0,1
13806,-1.0,1
10498,-1.0,1
10454,-1.0,1
2413,-1.0,1
10610,-1.0,1
5166,-1.0,1


In [34]:
# Export the dataframe after imputing avg_cushion
# wr_df.to_csv("step4_after_avg_cushion_imputation.csv", index=False)
# print("CSV export complete: step4_after_avg_cushion_imputation.csv")


In [35]:
# Descriptive Stats - avg_separation

# check for the next feature: 'avg_separation'
separation_stats = wr_df[['avg_separation']].describe().T
separation_stats['missing_count'] = wr_df['avg_separation'].isna().sum()
separation_stats['missing_percent'] = wr_df['avg_separation'].isna().mean() * 100

separation_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_separation,8199.0,2.862666,0.937429,0.6,2.2,2.8,3.4,8.7,9250,53.011634


In [36]:
# Feature Engineering - Imputation for avg_separation

# Identify missing values
mask_avg_separation = wr_df['avg_separation'].isna()

# Impute with dummy value
wr_df.loc[mask_avg_separation, 'avg_separation'] = -1.0

# Add boolean flag
wr_df['is_missing_avg_separation'] = mask_avg_separation.astype(int)


In [37]:
# Verify that 'avg_separation' has no remaining missing values
missing_avg_separation = wr_df['avg_separation'].isna().sum()
missing_avg_separation

0

In [38]:
# check df
check_separation = wr_df[wr_df['is_missing_avg_separation'] == 1][['avg_separation', 'is_missing_avg_separation']].sample(10)
display(check_separation)


Unnamed: 0,avg_separation,is_missing_avg_separation
9449,-1.0,1
274,-1.0,1
8066,-1.0,1
10587,-1.0,1
14725,-1.0,1
6618,-1.0,1
814,-1.0,1
1113,-1.0,1
13478,-1.0,1
8983,-1.0,1


In [39]:
# Export the dataframe after imputing avg_separation
# wr_df.to_csv("step5_after_avg_separation_imputation.csv", index=False)
# print("CSV export complete: step5_after_avg_separation_imputation.csv")


In [40]:
# Descriptive stats for NGS-related features
next_ngs_features = ['avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation']
next_ngs_stats = wr_df[next_ngs_features].describe().T
next_ngs_stats['missing_count'] = wr_df[next_ngs_features].isna().sum()
next_ngs_stats['missing_percent'] = wr_df[next_ngs_features].isna().mean() * 100

next_ngs_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_yac,8167.0,4.49258,3.456574,-2.6,2.3,3.8,5.9,43.2,9282,53.195026
avg_expected_yac,8158.0,3.916082,2.3139,0.1,2.3,3.5,5.1,19.1,9291,53.246604
avg_yac_above_expectation,8158.0,0.5785,2.335047,-9.9,-0.6,0.2,1.2,37.7,9291,53.246604


In [41]:
# Feature Engineering - apply imputation for YAC-related Next Gen Stats

# Identify missing masks for each column
mask_yac = wr_df['avg_yac'].isna()
mask_exp_yac = wr_df['avg_expected_yac'].isna()
mask_yac_diff = wr_df['avg_yac_above_expectation'].isna()

# Apply smart imputation values
wr_df.loc[mask_yac, 'avg_yac'] = -5.0
wr_df.loc[mask_exp_yac, 'avg_expected_yac'] = -1.0
wr_df.loc[mask_yac_diff, 'avg_yac_above_expectation'] = -10.0

# Add boolean flags
wr_df['is_missing_avg_yac'] = mask_yac.astype(int)
wr_df['is_missing_avg_expected_yac'] = mask_exp_yac.astype(int)
wr_df['is_missing_avg_yac_above_expectation'] = mask_yac_diff.astype(int)


In [42]:
# Final verification: confirm no missing values remain in the YAC-related fields
yac_verification = wr_df[
    ['avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation']
].isna().sum()

yac_verification


avg_yac                      0
avg_expected_yac             0
avg_yac_above_expectation    0
dtype: int64

In [43]:
# check df
check_yac = wr_df[wr_df['is_missing_avg_yac'] == 1][[
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'is_missing_avg_yac', 'is_missing_avg_expected_yac', 'is_missing_avg_yac_above_expectation'
]].sample(10)
display(check_yac)


Unnamed: 0,avg_yac,avg_expected_yac,avg_yac_above_expectation,is_missing_avg_yac,is_missing_avg_expected_yac,is_missing_avg_yac_above_expectation
9829,-5.0,-1.0,-10.0,1,1,1
7218,-5.0,-1.0,-10.0,1,1,1
2791,-5.0,-1.0,-10.0,1,1,1
15845,-5.0,-1.0,-10.0,1,1,1
7776,-5.0,-1.0,-10.0,1,1,1
15498,-5.0,-1.0,-10.0,1,1,1
8552,-5.0,-1.0,-10.0,1,1,1
15146,-5.0,-1.0,-10.0,1,1,1
3072,-5.0,-1.0,-10.0,1,1,1
144,-5.0,-1.0,-10.0,1,1,1


In [44]:
# Export the dataframe after imputing YAC-related fields
# wr_df.to_csv("step6_after_yac_imputation.csv", index=False)
# print("CSV export complete: step6_after_yac_imputation.csv")


In [45]:
# Define important and secondary NGS features for evaluation
important_ngs = [
    'receiving_broken_tackles', 'receiving_air_yards',
    'receiving_epa', 'receiving_2pt_conversions'
]

potential_duplicates = [
    'receiving_first_downs', 'receiving_tds', 'rec_touchdowns'
]

all_ngs = important_ngs + potential_duplicates

# Generate descriptive statistics and missing value summary
ngs_stats = wr_df[all_ngs].describe().T
ngs_stats['missing_count'] = wr_df[all_ngs].isna().sum()
ngs_stats['missing_percent'] = wr_df[all_ngs].isna().mean() * 100

ngs_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
receiving_broken_tackles,14675.0,0.164634,0.480238,0.0,0.0,0.0,0.0,7.0,2774,15.897759
receiving_air_yards,17449.0,52.960399,45.542857,-32.0,17.0,43.0,78.0,334.0,0,0.0
receiving_epa,17163.0,1.139678,3.896251,-23.1,-1.0,0.8,3.2,23.6,286,1.639062
receiving_2pt_conversions,17449.0,0.01192,0.108531,0.0,0.0,0.0,0.0,1.0,0,0.0
receiving_first_downs,17449.0,1.882973,1.778176,0.0,1.0,1.0,3.0,14.0,0,0.0
receiving_tds,17449.0,0.237836,0.499024,0.0,0.0,0.0,0.0,4.0,0,0.0
rec_touchdowns,8199.0,0.377607,0.605227,0.0,0.0,0.0,1.0,4.0,9250,53.011634


In [46]:
# rec_touchdowns = receiving_tds if missing

# Identify rows where rec_touchdowns is missing but receiving_tds is available
mask_rec_td_missing = wr_df['rec_touchdowns'].isna() & wr_df['receiving_tds'].notna()

# Fill missing rec_touchdowns using receiving_tds
wr_df.loc[mask_rec_td_missing, 'rec_touchdowns'] = wr_df.loc[mask_rec_td_missing, 'receiving_tds']


In [47]:
# Export the dataframe after imputing rec_touchdowns using receiving_tds
# wr_df.to_csv("step7_after_rec_touchdowns_imputation.csv", index=False)
# print("CSV export complete: step7_after_rec_touchdowns_imputation.csv")


In [48]:
# Feature Engineering - imputation for receiving_air_yards 

# Identify masks for missing values
mask_air_yards = wr_df['receiving_air_yards'].isna()
mask_epa = wr_df['receiving_epa'].isna()

# Apply smart imputation logic for air yards
# If receptions or targets are 0, set air yards to 0
zero_air_mask = (
    (wr_df['receptions'] == 0) | (wr_df['targets'] == 0)
) & mask_air_yards

# For remaining missing air yards (actual NGS gaps), use -10.0
missing_air_yards_mask = mask_air_yards & ~zero_air_mask

# Apply both types of imputations
wr_df.loc[zero_air_mask, 'receiving_air_yards'] = 0
wr_df.loc[missing_air_yards_mask, 'receiving_air_yards'] = -10.0

# Add flag for imputed (non-zero) air yards only
wr_df['is_missing_receiving_air_yards'] = missing_air_yards_mask.astype(int)

In [49]:
# Export the dataframe after imputing receiving_air_yards
# wr_df.to_csv("step8a_after_receiving_air_yards_imputation.csv", index=False)
# print("CSV export complete: step8_after_receiving_air_yards_imputation.csv")


In [50]:
# Feature Engineering - imputation receiving_epa

# Identify missing values
mask_epa = wr_df['receiving_epa'].isna()

# Impute with a distinct dummy value (-30.0 falls well below the observed min of -23.1)
wr_df.loc[mask_epa, 'receiving_epa'] = -30.0

# Add boolean flag for rows that were imputed
wr_df['is_missing_receiving_epa'] = mask_epa.astype(int)

# Verify no missing values remain
print("Remaining missing in receiving_epa:", wr_df['receiving_epa'].isna().sum())


Remaining missing in receiving_epa: 0


In [51]:
# Export the dataframe after imputing receiving_epa
# wr_df.to_csv("step9_after_receiving_epa_imputation.csv", index=False)
# print("CSV export complete: step9_after_receiving_epa_imputation.csv")


In [52]:
# Imputation for receiving_broken_tackles

# Identify missing values
mask_broken_tackles = wr_df['receiving_broken_tackles'].isna()

# Impute with dummy value (sentinel)
wr_df.loc[mask_broken_tackles, 'receiving_broken_tackles'] = -1

# Add boolean flag — always add, even if currently no missing
wr_df['is_missing_receiving_broken_tackles'] = mask_broken_tackles.astype(int)


In [53]:
# Confirm no missing values remain in 'receiving_broken_tackles'
missing_broken_tackles = wr_df['receiving_broken_tackles'].isna().sum()
missing_broken_tackles

0

In [54]:
# Export the dataframe after imputing receiving_broken_tackles
# wr_df.to_csv("step10_after_receiving_broken_tackles_imputation.csv", index=False)
# print("CSV export complete: step10_after_receiving_broken_tackles_imputation.csv")


In [55]:
# verify imputations for NGS stats
# Gather all columns that were either imputed or created as flags
imputation_columns = [
    
    # Catch percentage trio + flags
    'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z',
    'is_missing_catch_pct', 'is_missing_catch_pct_scaled', 'is_missing_catch_pct_z',
    
    # Spatial stats + flags
    'avg_cushion', 'is_missing_avg_cushion',
    'avg_separation', 'is_missing_avg_separation',
    
    # YAC stats + flags
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'is_missing_avg_yac', 'is_missing_avg_expected_yac', 'is_missing_avg_yac_above_expectation',
    
    # Other receiving stats + flags
    'receiving_air_yards', 'is_missing_receiving_air_yards',
    'receiving_epa', 'is_missing_receiving_epa',
    'receiving_2pt_conversions', 'rec_touchdowns', 'receiving_tds',

    # broken tackles
    'receiving_broken_tackles',
    'is_missing_receiving_broken_tackles'
]

# Extract sample rows for visual verification
imputed_preview = wr_df[imputation_columns].head(10)

# Summary of missing values in all imputed columns
missing_summary = wr_df[imputation_columns].isna().sum().sort_values(ascending=False)

missing_summary


catch_percentage                        0
is_missing_avg_yac                      0
receiving_broken_tackles                0
receiving_tds                           0
rec_touchdowns                          0
receiving_2pt_conversions               0
is_missing_receiving_epa                0
receiving_epa                           0
is_missing_receiving_air_yards          0
receiving_air_yards                     0
is_missing_avg_yac_above_expectation    0
is_missing_avg_expected_yac             0
avg_yac_above_expectation               0
catch_percentage_scaled                 0
avg_expected_yac                        0
avg_yac                                 0
is_missing_avg_separation               0
avg_separation                          0
is_missing_avg_cushion                  0
avg_cushion                             0
is_missing_catch_pct_z                  0
is_missing_catch_pct_scaled             0
is_missing_catch_pct                    0
catch_percentage_scaled_z         

In [56]:
# check df
# Combine all boolean imputation flags
flags = [col for col in wr_df.columns if col.startswith('is_missing_')]

# Select rows where at least one flag is triggered
imputed_rows = wr_df[wr_df[flags].sum(axis=1) > 0]

# Display selected columns from imputed rows
cols_to_check = [
    'name', 'week', 'season'
] + [col for col in imputation_columns if not col.startswith('is_missing_')] + flags

# Show a sample for visual inspection
display(imputed_rows[cols_to_check].sample(10))


Unnamed: 0,name,week,season,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z,avg_cushion,avg_separation,avg_yac,avg_expected_yac,...,is_missing_catch_pct_scaled,is_missing_catch_pct_z,is_missing_avg_cushion,is_missing_avg_separation,is_missing_avg_yac,is_missing_avg_expected_yac,is_missing_avg_yac_above_expectation,is_missing_receiving_air_yards,is_missing_receiving_epa,is_missing_receiving_broken_tackles
13540,Demario Douglas,4,2023,66.666667,0.666667,0.15553,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
4965,Isaiah McKenzie,3,2019,100.0,1.0,1.261205,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
2485,Cole Beasley,9,2018,75.0,0.75,0.431949,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
15502,Cedrick Wilson,13,2024,100.0,1.0,1.261205,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
8796,Collin Johnson,5,2021,0.0,0.0,-2.05582,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
14795,Robert Woods,6,2023,33.333333,0.333333,-0.950145,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
13227,Chris Moore,9,2023,50.0,0.5,-0.397307,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
3306,Larry Fitzgerald,12,2018,100.0,1.0,1.261205,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
15205,Zach Pascal,8,2023,0.0,0.0,-2.05582,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
16649,Mack Hollins,18,2024,75.0,0.75,0.431949,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0


In [57]:
# Feature Engineering - Imputation for the value ratio group
# Identify columns for imputation
value_ratio_cols = [
    'value_ratio_dk', 'value_ratio_fd',
    'value_ratio_dk_log', 'value_ratio_fd_log',
    'value_ratio_dk_log_z'
]

# Impute value_ratio and log variants with 0.0
wr_df['value_ratio_dk'] = wr_df['value_ratio_dk'].fillna(0.0)
wr_df['value_ratio_fd'] = wr_df['value_ratio_fd'].fillna(0.0)
wr_df['value_ratio_dk_log'] = wr_df['value_ratio_dk_log'].fillna(0.0)
wr_df['value_ratio_fd_log'] = wr_df['value_ratio_fd_log'].fillna(0.0)

# Impute z-score variant with an extreme low and flag
z_mask = wr_df['value_ratio_dk_log_z'].isna()
wr_df.loc[z_mask, 'value_ratio_dk_log_z'] = -4.0
wr_df['is_missing_value_ratio_dk_log_z'] = z_mask.astype(int)


In [58]:
# Spot-check value ratio columns
wr_df[
    [
        'value_ratio_dk', 'value_ratio_fd',
        'value_ratio_dk_log', 'value_ratio_fd_log',
        'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
    ]
].sample(10)

# And confirm missing values
wr_df[
    [
        'value_ratio_dk', 'value_ratio_fd',
        'value_ratio_dk_log', 'value_ratio_fd_log',
        'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
    ]
].isna().sum()


value_ratio_dk                     0
value_ratio_fd                     0
value_ratio_dk_log                 0
value_ratio_fd_log                 0
value_ratio_dk_log_z               0
is_missing_value_ratio_dk_log_z    0
dtype: int64

In [59]:
# Check the actual value types and examples
print(wr_df['value_ratio_dk'].unique()[:10])
print(wr_df['value_ratio_dk'].dtype)

# Count how many entries are actually zero, empty string, or 'nan'
print((wr_df['value_ratio_dk'] == '').sum())  # empty string
print((wr_df['value_ratio_dk'] == 'nan').sum())  # string 'nan'


[0.]
float64
0
0


In [60]:
# How many rows had original missing values before fill?
# Check how many were flagged
wr_df['is_missing_value_ratio_dk_log_z'].sum()


2833

In [61]:
# Inspect actual value distribution and types
print(wr_df['value_ratio_dk'].value_counts(dropna=False).head(10))

# See how many are truly NaN (np.nan)
print("NaN count:", wr_df['value_ratio_dk'].isna().sum())

# See how many are empty strings
print("Empty string count:", (wr_df['value_ratio_dk'] == '').sum())

# See dtype
print("Data type:", wr_df['value_ratio_dk'].dtype)


value_ratio_dk
0.0    17449
Name: count, dtype: int64
NaN count: 0
Empty string count: 0
Data type: float64


In [62]:
## dataframe: correct columns ##

# Drop or overwrite old calculations to ensure clean slate
cols_to_reset = [
    'value_ratio_dk', 'value_ratio_dk_log', 'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
]
wr_df.drop(columns=[col for col in cols_to_reset if col in wr_df.columns], inplace=True)


In [63]:
## dataframe: correct columns ##

# Recalculate safely only for valid rows
mask_valid_dk = (wr_df['dk_salary'].notna()) & (wr_df['dk_salary'] != 0)
wr_df.loc[mask_valid_dk, 'value_ratio_dk'] = wr_df.loc[mask_valid_dk, 'fpts'] / (wr_df.loc[mask_valid_dk, 'dk_salary'] / 1000)


In [64]:
## dataframe: correct columns ##

# Use log1p and clip to avoid errors on invalid/missing values
wr_df['value_ratio_dk_log'] = np.log1p(wr_df['value_ratio_dk'].clip(lower=0))


In [65]:
## dataframe: correct columns ##

# Use mean and std on the log-transformed version
mean_log = wr_df['value_ratio_dk_log'].mean(skipna=True)
std_log = wr_df['value_ratio_dk_log'].std(skipna=True)
wr_df['value_ratio_dk_log_z'] = (wr_df['value_ratio_dk_log'] - mean_log) / std_log


In [66]:
## dataframe: correct columns ##

z_mask = wr_df['value_ratio_dk_log_z'].isna()
wr_df.loc[z_mask, 'value_ratio_dk_log_z'] = -4.0
wr_df['is_missing_value_ratio_dk_log_z'] = z_mask.astype(int)


In [67]:
# Export the dataframe after recalculating value_ratio_dk and related features
# wr_df.to_csv("step11_after_value_ratio_dk_recalculation.csv", index=False)
# print("CSV export complete: step11_after_value_ratio_dk_recalculation.csv")


In [68]:
## dataframe: correct columns ##

cols_to_reset = [
    'value_ratio_fd', 'value_ratio_fd_log', 'value_ratio_fd_log_z', 'is_missing_value_ratio_fd_log_z'
]
wr_df.drop(columns=[col for col in cols_to_reset if col in wr_df.columns], inplace=True)


In [69]:
## dataframe: correct columns ##

mask_valid_fd = (wr_df['fd_salary'].notna()) & (wr_df['fd_salary'] != 0)
wr_df.loc[mask_valid_fd, 'value_ratio_fd'] = wr_df.loc[mask_valid_fd, 'fpts'] / (wr_df.loc[mask_valid_fd, 'fd_salary'] / 1000)


In [70]:
## dataframe: correct columns ##
wr_df['value_ratio_fd_log'] = np.log1p(wr_df['value_ratio_fd'].clip(lower=0))


In [71]:
## dataframe: correct columns ##

mean_log_fd = wr_df['value_ratio_fd_log'].mean(skipna=True)
std_log_fd = wr_df['value_ratio_fd_log'].std(skipna=True)
wr_df['value_ratio_fd_log_z'] = (wr_df['value_ratio_fd_log'] - mean_log_fd) / std_log_fd

In [72]:
# Recreate the missing mask and fallback imputation
z_mask_fd = wr_df['value_ratio_fd_log_z'].isna()
wr_df.loc[z_mask_fd, 'value_ratio_fd_log_z'] = -4.0
wr_df['is_missing_value_ratio_fd_log_z'] = z_mask_fd.astype(int)


In [73]:
# Export the dataframe after recalculating value_ratio_fd and related features
# wr_df.to_csv("step12_fixed_value_ratio_fd_flags.csv", index=False)


In [74]:
check_value_ratio_z = wr_df[wr_df['is_missing_value_ratio_dk_log_z'] == 1][
    ['value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z',
     'value_ratio_dk', 'value_ratio_fd', 'value_ratio_dk_log', 'value_ratio_fd_log']
]

if check_value_ratio_z.shape[0] > 0:
    display(check_value_ratio_z.sample(10))
else:
    print("✅ No rows were imputed with -4.0 for value_ratio_dk_log_z — all values were originally valid.")


Unnamed: 0,value_ratio_dk_log_z,is_missing_value_ratio_dk_log_z,value_ratio_dk,value_ratio_fd,value_ratio_dk_log,value_ratio_fd_log
4428,-4.0,1,,,,
988,-4.0,1,,,,
10210,-4.0,1,,,,
6076,-4.0,1,,,,
7991,-4.0,1,,,,
7336,-4.0,1,,,,
1032,-4.0,1,,,,
13786,-4.0,1,,,,
1938,-4.0,1,,,,
10035,-4.0,1,,,,


In [75]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")

# Total number of columns with missing values
print(f"\n🟡 Total columns with missing values: {len(missing_summary)}")

tgt_rz                                   : 16330
rec_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_delta                           : 10664
fpts_7wk_avg_z                           : 10268
fpts_7wk_avg                             : 10268
rec_air_yards_7wk_avg_clipped            : 9961
tgt_7wk_avg                              : 9961
tgt_7wk_avg_z                            : 9961
rec_7wk_avg                              : 9961
rec_air_yards_7wk_delta                  : 9961
rec_yds_7wk_avg                          : 9961
rec_air_yards_7wk_avg                    : 9961
rec_yds_7wk_delta                        : 9961
tgt_7wk_delta                            : 9961
rec_7wk_avg_z                            : 9961
rec_7wk_delta                            : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_yds_7wk_avg_z               

In [76]:
# columns already imputed with remaining a NaNs - will impute with 0
value_ratio_cols = [
    'value_ratio_dk', 'value_ratio_fd',
    'value_ratio_dk_log', 'value_ratio_fd_log',
    'value_ratio_dk_log_z', 'value_ratio_fd_log_z'
]

for col in value_ratio_cols:
    print(f"{col:<25}: {wr_df[col].isna().sum()} missing")


value_ratio_dk           : 2833 missing
value_ratio_fd           : 2833 missing
value_ratio_dk_log       : 2833 missing
value_ratio_fd_log       : 2833 missing
value_ratio_dk_log_z     : 0 missing
value_ratio_fd_log_z     : 0 missing


In [77]:
# columns already imputed with remaining a NaNs - will impute with 0

# Fill missing base and log columns for modeling compatibility
wr_df['value_ratio_dk'] = wr_df['value_ratio_dk'].fillna(0.0)
wr_df['value_ratio_fd'] = wr_df['value_ratio_fd'].fillna(0.0)
wr_df['value_ratio_dk_log'] = wr_df['value_ratio_dk_log'].fillna(0.0)
wr_df['value_ratio_fd_log'] = wr_df['value_ratio_fd_log'].fillna(0.0)


In [78]:
# columns already imputed with remaining a NaNs - will impute with 0

# Confirm all cleared
print(wr_df[['value_ratio_dk', 'value_ratio_fd', 'value_ratio_dk_log', 'value_ratio_fd_log']].isna().sum())


value_ratio_dk        0
value_ratio_fd        0
value_ratio_dk_log    0
value_ratio_fd_log    0
dtype: int64


In [79]:
# columns already imputed with remaining a NaNs - will impute with 0

# Columns to fill with 0.0 that were already imputed but still contain NaNs
# Exclude value_ratio_dk and value_ratio_fd (and their variants)
columns_to_fill = [
    'avg_cushion', 'avg_separation',
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'receiving_air_yards', 'receiving_epa',
    'rec_touchdowns',
    'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z'
]

# Apply fillna(0.0) to those columns
wr_df[columns_to_fill] = wr_df[columns_to_fill].fillna(0.0)

# Confirm cleanup
nan_summary_post_fill = wr_df[columns_to_fill].isna().sum()
nan_summary_post_fill

avg_cushion                  0
avg_separation               0
avg_yac                      0
avg_expected_yac             0
avg_yac_above_expectation    0
receiving_air_yards          0
receiving_epa                0
rec_touchdowns               0
catch_percentage             0
catch_percentage_scaled      0
catch_percentage_scaled_z    0
dtype: int64

In [80]:
# Save to CSV
# output_path = "step13_after_1st_round_imputations_zero_fill.csv"
# wr_df.to_csv(output_path, index=False)

In [81]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

rec_rz                                   : 16330
tgt_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_delta                           : 10664
fpts_7wk_avg                             : 10268
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg                    : 9961
rec_yds_7wk_avg                          : 9961
tgt_7wk_delta                            : 9961
rec_7wk_avg                              : 9961
rec_air_yards_7wk_avg_clipped            : 9961
tgt_7wk_avg                              : 9961
rec_7wk_delta                            : 9961
rec_yds_7wk_delta                        : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_delta                  : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                   

In [82]:
# feature engineering - rolling averages

# === 1. Setup refined base mapping ===
refined_base_mapping = {
    'receptions': 'rec_',
    'targets': 'tgt_',
    'receiving_yards': 'rec_yds_',
    'fpts': 'fpts_',
    'receiving_air_yards': 'rec_air_yards_'
}

# === 2. Define stricter regex to match ONLY valid rolling columns ===
rolling_avg_pattern = re.compile(r'^.*_\d+wk_(avg|delta|z)$')

# === 3. Exclude known non-numeric or categorical suffixes ===
non_numeric_suffixes = ('_bin', '_tier', '_clipped', '_flag')

# === 4. Identify safe rolling average columns ===
rolling_avg_cols = [
    col for col in wr_df.columns
    if rolling_avg_pattern.search(col)
    and not any(col.endswith(suffix) for suffix in non_numeric_suffixes)
]

# === 5. Match rolling columns to base stats ===
rolling_to_base_pairs = []
for base_stat, base_prefix in refined_base_mapping.items():
    for col in rolling_avg_cols:
        if col.startswith(base_prefix):
            rolling_to_base_pairs.append((col, base_stat))

# === 6. Apply group-wise expanding mean imputation ===
wr_df.sort_values(by=['name', 'season', 'week'], inplace=True)

for rolling_col, base_stat in rolling_to_base_pairs:
    base_cols_matching = [
        col for col in wr_df.columns
        if col.startswith(refined_base_mapping[base_stat])
        and np.issubdtype(wr_df[col].dtype, np.number)
    ]

    for base_col in base_cols_matching:
        if base_col in wr_df.columns and rolling_col in wr_df.columns:
            progressive_avg = (
                wr_df.groupby(['name', 'season'])[base_col]
                .transform(lambda x: x.expanding().mean())
            )
            wr_df[rolling_col] = wr_df[rolling_col].fillna(progressive_avg)

# === 7. Optional: Visual summary ===
print("\n📊 Summary of remaining nulls in rolling average columns:")
missing_summary_rolling = wr_df[rolling_avg_cols].isna().sum().sort_values(ascending=False)
display(missing_summary_rolling)

print("\n👀 Sample rows where fallback expanding mean likely applied:")
rolling_imputed_rows = wr_df[wr_df[rolling_avg_cols].isna().sum(axis=1) == 0]
display(rolling_imputed_rows[['name', 'season', 'week'] + rolling_avg_cols].sample(10))



📊 Summary of remaining nulls in rolling average columns:


tgt_3wk_avg                0
tgt_5wk_avg                0
fpts_5wk_delta             0
fpts_3wk_delta             0
rec_air_yards_7wk_delta    0
rec_air_yards_5wk_delta    0
rec_air_yards_3wk_delta    0
rec_yds_7wk_delta          0
rec_yds_5wk_delta          0
rec_yds_3wk_delta          0
rec_7wk_delta              0
rec_5wk_delta              0
rec_3wk_delta              0
tgt_7wk_delta              0
tgt_5wk_delta              0
tgt_3wk_delta              0
fpts_7wk_avg               0
fpts_5wk_avg               0
fpts_3wk_avg               0
rec_air_yards_7wk_avg      0
rec_air_yards_5wk_avg      0
rec_air_yards_3wk_avg      0
rec_yds_7wk_avg            0
rec_yds_5wk_avg            0
rec_yds_3wk_avg            0
rec_7wk_avg                0
rec_5wk_avg                0
rec_3wk_avg                0
tgt_7wk_avg                0
fpts_7wk_delta             0
dtype: int64


👀 Sample rows where fallback expanding mean likely applied:


Unnamed: 0,name,season,week,tgt_3wk_avg,tgt_5wk_avg,tgt_7wk_avg,rec_3wk_avg,rec_5wk_avg,rec_7wk_avg,rec_yds_3wk_avg,...,rec_7wk_delta,rec_yds_3wk_delta,rec_yds_5wk_delta,rec_yds_7wk_delta,rec_air_yards_3wk_delta,rec_air_yards_5wk_delta,rec_air_yards_7wk_delta,fpts_3wk_delta,fpts_5wk_delta,fpts_7wk_delta
8274,Willie Snead,2020,5,2.0,2.0,2.0,2.0,0.2,0.2,17.7,...,0.2,-17.7,0.2,0.2,5.3,0.2,0.2,-1.8,-2.46,-2.46
14547,Nelson Agholor,2023,7,3.3,4.0,1.0,2.3,3.2,0.333333,36.0,...,0.333333,-24.0,-30.0,0.333333,-32.3,-40.8,0.333333,3.6,1.8,0.316667
15696,Darius Slayton,2024,18,2.0,3.6,3.3,1.0,1.4,1.4,23.0,...,-1.4,-23.0,-20.8,-21.9,-16.7,-43.0,-37.9,-4.5,-3.4,-3.3
9482,Jauan Jennings,2021,4,2.0,1.0,1.0,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.3,-1.3,-1.3
16732,Marvin Mims,2024,8,2.3,2.0,1.3,1.0,1.0,0.0,8.3,...,0.0,-0.3,-0.8,0.0,-10.7,-7.6,0.0,-0.6,-0.4,-4.442857
9062,Dee Eskridge,2021,15,2.3,2.0,1.583333,1.7,1.4,0.166667,15.0,...,0.166667,-15.0,-9.4,0.166667,119.3,116.8,0.166667,-3.5,-2.6,-3.15
14801,Robert Woods,2023,16,3.7,4.4,4.9,2.0,2.4,2.3,22.7,...,-0.3,-8.7,-11.4,-9.3,24.0,19.0,16.6,-0.2,-0.4,-1.1
7588,Marquez Callaway,2020,17,5.3,4.6,3.583333,4.3,3.6,0.0,37.3,...,0.0,13.7,18.6,0.0,-6.7,0.4,0.0,1.4,1.9,-2.716667
176,ArDarius Stewart,2017,10,1.0,2.4,2.4,0.7,0.0,0.0,8.7,...,0.0,-8.7,0.0,0.0,-7.7,0.0,0.0,-1.1,-4.22,-4.22
11085,Curtis Samuel,2022,12,3.3,4.4,5.0,2.0,2.8,3.1,34.3,...,-3.1,-34.3,-41.2,-39.1,-32.3,-34.6,-39.7,-7.7,-7.2,-5.8


In [83]:
# Check for any remaining missing values in rolling columns
rolling_cols = [col for col in wr_df.columns if rolling_avg_pattern.search(col) or col.endswith(('_3wk_avg', '_5wk_avg', '_7wk_avg'))]
missing_summary_rolling = wr_df[rolling_cols].isna().sum().sort_values(ascending=False)

print("\n📊 Null values in rolling average columns:")
display(missing_summary_rolling)

# Show a few rows with 0.0 values (likely imputed fallback)
rolling_imputed_rows = wr_df[wr_df[rolling_cols].eq(0.0).any(axis=1)]
print("\n🔍 Sample rows where at least one rolling column was imputed with 0.0:")
display(rolling_imputed_rows[['name', 'season', 'week'] + rolling_cols].sample(10))



📊 Null values in rolling average columns:


tgt_3wk_avg                0
tgt_5wk_avg                0
fpts_5wk_delta             0
fpts_3wk_delta             0
rec_air_yards_7wk_delta    0
rec_air_yards_5wk_delta    0
rec_air_yards_3wk_delta    0
rec_yds_7wk_delta          0
rec_yds_5wk_delta          0
rec_yds_3wk_delta          0
rec_7wk_delta              0
rec_5wk_delta              0
rec_3wk_delta              0
tgt_7wk_delta              0
tgt_5wk_delta              0
tgt_3wk_delta              0
fpts_7wk_avg               0
fpts_5wk_avg               0
fpts_3wk_avg               0
rec_air_yards_7wk_avg      0
rec_air_yards_5wk_avg      0
rec_air_yards_3wk_avg      0
rec_yds_7wk_avg            0
rec_yds_5wk_avg            0
rec_yds_3wk_avg            0
rec_7wk_avg                0
rec_5wk_avg                0
rec_3wk_avg                0
tgt_7wk_avg                0
fpts_7wk_delta             0
dtype: int64


🔍 Sample rows where at least one rolling column was imputed with 0.0:


Unnamed: 0,name,season,week,tgt_3wk_avg,tgt_5wk_avg,tgt_7wk_avg,rec_3wk_avg,rec_5wk_avg,rec_7wk_avg,rec_yds_3wk_avg,...,rec_7wk_delta,rec_yds_3wk_delta,rec_yds_5wk_delta,rec_yds_7wk_delta,rec_air_yards_3wk_delta,rec_air_yards_5wk_delta,rec_air_yards_7wk_delta,fpts_3wk_delta,fpts_5wk_delta,fpts_7wk_delta
11173,Darius Slayton,2022,5,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.45,-0.45,-0.45
4682,Davante Adams,2019,4,7.0,6.125,6.125,5.0,0.0,0.0,66.0,...,0.0,114.0,0.0,0.0,62.3,0.0,0.0,11.4,3.5,3.5
14064,Jonathan Mingo,2023,5,6.3,4.7,4.7,2.7,0.0,0.0,21.3,...,0.0,26.7,0.0,0.0,-24.3,0.0,0.0,2.7,-2.7,-2.7
17439,Zay Flowers,2024,15,7.0,6.6,6.6,3.3,3.8,3.9,58.3,...,2.1,-5.3,-14.2,-13.0,-47.3,-24.6,-29.1,-2.4,-5.0,-4.1
5539,Michael Walker,2019,14,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.9,-4.9,-4.9
7446,Keelan Cole,2020,5,6.0,4.8,4.8,4.7,0.6,0.6,49.0,...,0.6,-24.0,0.6,0.6,73.0,0.6,0.6,1.8,2.02,2.02
6095,Vyncint Smith,2019,9,2.0,0.0,0.0,1.0,0.0,0.0,5.0,...,0.0,15.0,0.0,0.0,11.0,0.0,0.0,1.1,-2.96,-2.96
4540,Curtis Samuel,2019,13,6.0,7.0,6.7,3.0,3.2,3.3,23.0,...,0.7,42.0,29.2,26.7,46.7,15.8,20.0,6.9,5.5,4.1
3364,Marvin Hall,2018,4,1.0,0.666667,0.666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.166667,-5.166667,-5.166667
9684,Keenan Allen,2021,8,8.3,9.0,9.257143,6.0,6.0,0.285714,53.7,...,0.285714,23.3,13.2,0.285714,30.7,24.8,0.285714,8.3,6.1,2.985714


In [84]:
# Export the dataframe after recalculating value_ratio_dk and related features
# wr_df.to_csv("step14_after_rolling_imputations.csv", index=False)
# print("CSV export complete: step14_after_rolling_imputations.csv")


In [85]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\n🟡 Total columns with missing values: {len(missing_summary)}")

tgt_rz                                   : 16330
rec_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_avg_clipped            : 9961
yards                                    : 9277
avg_cushion_performance_bin              : 9252
catch_percentage_performance_bin         : 9250
avg_separation_performance_bin           : 9250
avg_intended_air_yards_performance_bin   : 9250
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards_performance_bin : 9250
value_ratio_dk_log_performance_bin       : 2833
z_value_ratio_fd       

In [86]:
# feature engineering - redzone imputations

# Red Zone Feature Imputation
red_zone_cols = ['tgt_rz', 'rec_rz']

# Fill missing values with 0.0
wr_df[red_zone_cols] = wr_df[red_zone_cols].fillna(0.0)

# Optional: add flags to trace what was imputed
for col in red_zone_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Verify cleanup
print(wr_df[red_zone_cols].isna().sum())


tgt_rz    0
rec_rz    0
dtype: int64


In [87]:
# feature engineering - rolling averages 

# Define all multi-week average columns to impute
multiweek_avg_cols = [
    'rec_3wk_avg', 'rec_5wk_avg', 'rec_7wk_avg',
    'tgt_3wk_avg', 'tgt_5wk_avg', 'tgt_7wk_avg',
    'fpts_3wk_avg', 'fpts_5wk_avg', 'fpts_7wk_avg',
    'rec_air_yards_3wk_avg', 'rec_air_yards_5wk_avg', 'rec_air_yards_7wk_avg',
    'rec_yds_3wk_avg', 'rec_yds_5wk_avg', 'rec_yds_7wk_avg'
]

# Fill NaNs with 0.0
wr_df[multiweek_avg_cols] = wr_df[multiweek_avg_cols].fillna(0.0)

# Add missingness flags
for col in multiweek_avg_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Verify cleanup
print(wr_df[multiweek_avg_cols].isna().sum())

rec_3wk_avg              0
rec_5wk_avg              0
rec_7wk_avg              0
tgt_3wk_avg              0
tgt_5wk_avg              0
tgt_7wk_avg              0
fpts_3wk_avg             0
fpts_5wk_avg             0
fpts_7wk_avg             0
rec_air_yards_3wk_avg    0
rec_air_yards_5wk_avg    0
rec_air_yards_7wk_avg    0
rec_yds_3wk_avg          0
rec_yds_5wk_avg          0
rec_yds_7wk_avg          0
dtype: int64


In [88]:
# Export CSV after Step 15
# wr_df.to_csv("step15_after_multiweek_avg_imputation.csv", index=False)
# print("CSV export complete: step15_after_multiweek_avg_imputation.csv")

In [89]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg_clipped            : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
yards                                    : 9277
avg_cushion_performance_bin              : 9252
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards_performance_bin : 9250
percent_share_of_intended_air_yards      : 9250
avg_separation_performance_bin           : 9250
avg_intended_air_yards_performance_bin   : 9250
catch_percentage_performance_bin         : 9250
z_value_ratio_fd                         : 2833
value_ratio_dk_log_performance_bin       : 2833
z_fpts_diff_fd                           : 2833
value_ratio_fd_log_perfor

In [90]:
# Feature Engineering - Performance Bin Imputations

# Define performance bin columns
performance_bin_cols = [
    'avg_cushion_performance_bin',
    'catch_percentage_performance_bin',
    'avg_separation_performance_bin',
    'avg_intended_air_yards_performance_bin',
    'percent_share_of_intended_air_yards_performance_bin',
    'value_ratio_dk_log_performance_bin',
    'value_ratio_fd_log_performance_bin',
    'fpts_performance_bin',
    'target_share_performance_bin',
]

# Impute missing values with -1 (sentinel)
wr_df[performance_bin_cols] = wr_df[performance_bin_cols].fillna(-1)

# Add imputation flags
for col in performance_bin_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].eq(-1).astype(int)

# Confirm no missing values remain in those columns
print("\nMissing values after performance bin imputation:")
print(wr_df[performance_bin_cols].isna().sum())


Missing values after performance bin imputation:
avg_cushion_performance_bin                            0
catch_percentage_performance_bin                       0
avg_separation_performance_bin                         0
avg_intended_air_yards_performance_bin                 0
percent_share_of_intended_air_yards_performance_bin    0
value_ratio_dk_log_performance_bin                     0
value_ratio_fd_log_performance_bin                     0
fpts_performance_bin                                   0
target_share_performance_bin                           0
dtype: int64


  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)


In [91]:
# Export updated dataframe
# wr_df.to_csv("step16_after_performance_bin_imputation.csv", index=False)
# print("\nCSV export complete: step16_after_performance_bin_imputation.csv")

In [92]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\n🟡 Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_avg_clipped            : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
z_value_ratio_fd                         : 2833
z_value_ratio_dk                         : 2833
fpts_diff_dk                             : 2833
z_fpts_diff_dk                           : 2833
z_fpts_diff_fd                           : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
expected_fpts_dk                         : 2193
expected_fpts_fd                    

In [93]:
# Feature Engineering - Z-Score Imputations

# Define all z-score columns for imputation
z_score_cols = [
    'z_value_ratio_fd', 'z_value_ratio_dk',
    'z_fpts_diff_fd', 'z_fpts_diff_dk'
]

# Impute missing z-score values with -4.0
wr_df[z_score_cols] = wr_df[z_score_cols].fillna(-4.0)

# Add imputation flags for each z-score column
for col in z_score_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)

# Confirm no remaining missing values
print("\nMissing values after z-score imputation:")
print(wr_df[z_score_cols].isna().sum())



Missing values after z-score imputation:
z_value_ratio_fd    0
z_value_ratio_dk    0
z_fpts_diff_fd      0
z_fpts_diff_dk      0
dtype: int64


  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)


In [94]:
# Export after z-score imputation
# wr_df.to_csv("step17_after_z_score_imputation.csv", index=False)
# print("CSV export complete: step17_after_z_score_imputation.csv")


In [95]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\n🟡 Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
tgt_7wk_avg_z                            : 9961
rec_air_yards_7wk_avg_clipped            : 9961
rec_yds_7wk_avg_z                        : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
fd_salary                                : 2193
expected_fpts_fd                         : 2193
expected_fpts_dk                         : 2193
dk_salary                                : 2193
fpts_lag_1                               : 2075
rec_yds_lag_1                       

In [96]:
# Feature Engineering - Lag Feature Imputations

# Define lag columns to impute
lag_cols = [
    'fpts_lag_1', 'tgt_lag_1', 'rec_lag_1',
    'rec_yds_lag_1', 'rec_air_yards_lag_1'
]

# Impute missing values with 0.0
wr_df[lag_cols] = wr_df[lag_cols].fillna(0.0)

# Add flags to trace what was imputed
for col in lag_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after lag feature imputation:")
print(wr_df[lag_cols].isna().sum())



Missing values after lag feature imputation:
fpts_lag_1             0
tgt_lag_1              0
rec_lag_1              0
rec_yds_lag_1          0
rec_air_yards_lag_1    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [97]:
# Export after imputing lag features
# wr_df.to_csv("step18_after_lag_feature_imputation.csv", index=False)
# print("CSV export complete: step18_after_lag_feature_imputation.csv")


In [98]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg_clipped            : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_fd                             : 2833
fpts_diff_dk                             : 2833
receiving_rat                            : 2774
expected_fpts_fd                         : 2193
expected_fpts_dk                         : 2193
dk_salary                                : 2193
fd_salary                                : 2193
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                

In [99]:
# Feature Engineering - rec, fpts, avg Feature Imputations

# Define next group of columns for zero imputation
next_zero_impute_cols = [
    'rec_3wk', 'rec_5wk', 'rec_7wk',
    'fpts_7wk_avg_z',
    'rec_air_yards_7wk_avg_clipped', 'rec_air_yards_7wk_avg_clipped_z',
    'rec_7wk_avg_z', 'tgt_7wk_avg_z', 'rec_yds_7wk_avg_z'
]

# Impute with 0.0
wr_df[next_zero_impute_cols] = wr_df[next_zero_impute_cols].fillna(0.0)

# Add is_missing flags
for col in next_zero_impute_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)


# Confirm cleanup
print("\nMissing values after feature imputation:")
print(wr_df[flag_col].isna().sum())


Missing values after feature imputation:
0


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [100]:
# Export after z-score imputation
# wr_df.to_csv("step19_after_7wk_and_air_yard_clip_zero_imputation.csv")
# print("CSV export complete: step19_after_7wk_and_air_yard_clip_zero_imputation")

In [101]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
fd_salary                                : 2193
expected_fpts_dk                         : 2193
expected_fpts_fd                         : 2193
dk_salary                                : 2193
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
rost                                     : 780
fpts_above_pos_avg                       : 780
fpts                                     : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share_z                           : 28

In [102]:
# Feature Engineering - dk and fd salary and expected fpts

# Define columns to impute
salary_and_expected_cols = [
    'dk_salary', 'fd_salary',
    'expected_fpts_dk', 'expected_fpts_fd'
]

# Fill with 0.0
wr_df[salary_and_expected_cols] = wr_df[salary_and_expected_cols].fillna(0.0)

# Add flags for traceability
for col in salary_and_expected_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after salary/expected fpts imputation:")
print(wr_df[salary_and_expected_cols].isna().sum())



Missing values after salary/expected fpts imputation:
dk_salary           0
fd_salary           0
expected_fpts_dk    0
expected_fpts_fd    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [103]:
# Export after salary and expected fpts imputation
# wr_df.to_csv("step20_after_salary_expected_fpts_zero_imputation.csv", index=False)
# print("CSV export complete: step20_after_salary_expected_fpts_zero_imputation")


In [104]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

yards                                    : 9277
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards      : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
fpts                                     : 780
fpts_above_pos_avg                       : 780
rost                                     : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share_z                           : 286
air_yards_share                          : 286
target_share                             : 286
wopr                                     : 286
fpts_3wk                                 : 119
sp

In [105]:
# Feature Engineering - Air Yard Metrics Imputation

# Define columns to impute
air_yard_cols = [
    'avg_intended_air_yards',
    'percent_share_of_intended_air_yards'
]

# Impute missing with 0.0
wr_df[air_yard_cols] = wr_df[air_yard_cols].fillna(0.0)

# Add is_missing flags
for col in air_yard_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after air yard metrics imputation:")
print(wr_df[air_yard_cols].isna().sum())


Missing values after air yard metrics imputation:
avg_intended_air_yards                 0
percent_share_of_intended_air_yards    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [106]:
# Export CSV
# wr_df.to_csv("step21_after_air_yard_metrics_zero_imputation.csv", index=False)
# print("CSV export complete: step21_after_air_yard_metrics_zero_imputation")

In [107]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\n🟡 Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
fpts                                     : 780
rost                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
air_yards_share                          : 286
wopr                                     : 286
target_share_z                           : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7

In [108]:
# Feature Engineering - Value Ratio DK N-week Imputation

# Define columns
value_ratio_dk_nwk_cols = [
    'value_ratio_dk_3wk', 
    'value_ratio_dk_5wk', 
    'value_ratio_dk_7wk'
]

# Impute with 0.0
wr_df[value_ratio_dk_nwk_cols] = wr_df[value_ratio_dk_nwk_cols].fillna(0.0)

# Add trace flags
for col in value_ratio_dk_nwk_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after value_ratio_dk_nwk imputation:")
print(wr_df[value_ratio_dk_nwk_cols].isna().sum())



Missing values after value_ratio_dk_nwk imputation:
value_ratio_dk_3wk    0
value_ratio_dk_5wk    0
value_ratio_dk_7wk    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [109]:
# 📁 Export after value_ratio_dk_nwk imputation
# wr_df.to_csv("step22_after_value_ratio_dk_nwk_zero_imputation.csv", index=False)
# print("CSV export complete: step22_after_value_ratio_dk_nwk_zero_imputation")


In [110]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

yards                                    : 9277
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
rolling_fpts_diff_fd                     : 1543
rolling_fpts_diff_dk                     : 1543
fpts                                     : 780
rost                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
target_share_z                           : 286
air_yards_share                          : 286
wopr                                     : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7wk                                 : 19

Total columns with missing values: 19


In [111]:
# Feature Engineering - Differential & Rolling Performance Metrics

next_group_cols = [
    'fpts_diff_dk', 'fpts_diff_fd',
    'rolling_fpts_diff_dk', 'rolling_fpts_diff_fd',
    'receiving_rat'  # same logic: 0 implies no receptions or rate unrecorded
]

# Impute with 0.0
wr_df[next_group_cols] = wr_df[next_group_cols].fillna(0.0)

# Add is_missing flags
for col in next_group_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after diff/rolling/rat imputation:")
print(wr_df[next_group_cols].isna().sum())



Missing values after diff/rolling/rat imputation:
fpts_diff_dk            0
fpts_diff_fd            0
rolling_fpts_diff_dk    0
rolling_fpts_diff_fd    0
receiving_rat           0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [112]:
# Export after diff/rolling/receiving_rate imputation
# wr_df.to_csv("step23_after_diff_rolling_rat_zero_imputation.csv", index=False)
# print("CSV export complete: step23_after_diff_rolling_rat_zero_imputation")


In [113]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

yards                                    : 9277
rost                                     : 780
fpts                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
target_share_z                           : 286
air_yards_share                          : 286
wopr                                     : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7wk                                 : 19

Total columns with missing values: 14


In [114]:
# Feature Engineering - more zero imputations
# 🧮 Feature Engineering - Basic Performance Metrics Imputation

# Define the columns to impute
basic_zero_cols = [
    'fpts', 'fpts_above_pos_avg', 'pos_avg_fpts',
    'spread', 'fpts_3wk', 'fpts_5wk', 'fpts_7wk'
]

# Apply 0.0 fill
wr_df[basic_zero_cols] = wr_df[basic_zero_cols].fillna(0.0)

# Add trace flags
for col in basic_zero_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after basic metrics imputation:")
print(wr_df[basic_zero_cols].isna().sum())



Missing values after basic metrics imputation:
fpts                  0
fpts_above_pos_avg    0
pos_avg_fpts          0
spread                0
fpts_3wk              0
fpts_5wk              0
fpts_7wk              0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [115]:
# Export after basic metrics imputation
# wr_df.to_csv("step24_after_basic_metrics_zero_imputation.csv", index=False)
# print("CSV export complete: step24_after_basic_metrics_zero_imputation")


In [116]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")

yards                                    : 9277
rost                                     : 780
racr                                     : 328
target_share                             : 286
target_share_z                           : 286
air_yards_share                          : 286
wopr                                     : 286

Total columns with missing values: 7


In [117]:
# Feature Engineering - format 'rost' into a float if not already done

# Check dtype and convert if needed
if wr_df['rost'].dtype == 'object':
    wr_df['rost'] = pd.to_numeric(wr_df['rost'], errors='coerce')


In [118]:
# Feature Engineering - more zero imputations
# Final block: positional share & related stats
final_zero_cols = [
    'yards', 'rost', 'racr',
    'target_share', 'target_share_z', 'air_yards_share', 'wopr'
]

# Fill with 0.0
wr_df[final_zero_cols] = wr_df[final_zero_cols].fillna(0.0)

# Add missing flags
for col in final_zero_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\nMissing values after final group imputation:")
print(wr_df[final_zero_cols].isna().sum())



Missing values after final group imputation:
yards              0
rost               0
racr               0
target_share       0
target_share_z     0
air_yards_share    0
wopr               0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [119]:
# Export after final imputations
# wr_df.to_csv("step25_after_final_metrics_zero_imputation.csv", index=False)
# print("CSV export complete: step25_after_final_metrics_zero_imputation")


In [120]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nTotal columns with missing values: {len(missing_summary)}")


Total columns with missing values: 0


In [121]:
# End: feature engineering - imputations

In [122]:
### *** New DataFrame *** ###

# Rename the dataframe
wr_df_stats_features_imputations = wr_df.copy()

# Export the renamed dataframe to CSV
wr_df_stats_features_imputations.to_csv("wr_df_stats_features_imputations.csv", index=False)
print("CSV export complete: wr_df_stats_features_imputations.csv")


CSV export complete: wr_df_stats_features_imputations.csv
