In [None]:
# logistic regression classification modeling

In [1]:
## ** INPUT REQUIRED **
# add filtering option for backtest or live predictions

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from fpdf import FPDF
from datetime import datetime
import re

In [5]:
# Load the raw dataset
wr_df_raw = pd.read_csv("wr_nfl_df_sorted_new_features_final.csv")

In [8]:
# Create working copy
wr_df = wr_df_raw.copy()

In [10]:
# Begin: feature engineering - imputations

In [12]:
# Feature Engineering on fantasypros stats
# imputation and drop unnecessary columns

# Reapply 'drop' logic using 'receiving_drop' and 'targets'
drop_mask = wr_df['drop'].isna() & wr_df['receiving_drop'].notna() & wr_df['targets'].notna() & (wr_df['targets'] > 0)
zero_targets_mask = wr_df['targets'] == 0

# If drop is missing and targets > 0, use receiving_drop
wr_df.loc[drop_mask, 'drop'] = wr_df.loc[drop_mask, 'receiving_drop']

# If targets == 0, set drop = 0
wr_df.loc[zero_targets_mask, 'drop'] = 0

# Failsafe: fill any remaining nulls with 0
wr_df['drop'] = wr_df['drop'].fillna(0)

# Drop confirmed redundant or low-value columns
columns_to_drop = [
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_drop', 'receiving_drop_pct',
    'g', 'rec', 'yds', 'ybc', 'air', 'yac', 'yacon', 'brktkl', 'tgt', 'catchable',
    'rz tgt', '10+ yds', '20+ yds', '30+ yds', '40+ yds', '50+ yds',
    'rec pct_rz', 'y/r_rz', 'tgt pct_rz', 'team_abbr_x', 'result', 'score', 'macro_tier_score_season', 
    'macro_tier_score_missing', 'macro_tier_season', 'over_under', 'O_U'

]

# Preview the actual columns that will be dropped
to_drop_confirmed = [col for col in columns_to_drop if col in wr_df.columns]
print("‚úÖ Columns confirmed to be dropped:", to_drop_confirmed)


# Drop only if columns are present in the current frame
wr_df.drop(columns=[col for col in columns_to_drop if col in wr_df.columns], inplace=True)


‚úÖ Columns confirmed to be dropped: ['receiving_fumbles', 'receiving_fumbles_lost', 'receiving_drop', 'receiving_drop_pct', 'g', 'rec', 'yds', 'ybc', 'air', 'yac', 'yacon', 'brktkl', 'tgt', 'catchable', 'rz tgt', '10+ yds', '20+ yds', '30+ yds', '40+ yds', '50+ yds', 'rec pct_rz', 'y/r_rz', 'tgt pct_rz', 'team_abbr_x', 'result', 'score', 'macro_tier_score_season', 'macro_tier_score_missing', 'macro_tier_season', 'over_under', 'O_U']


In [15]:
# Show the remaining columns after the drop
remaining_columns = wr_df.columns.tolist()
remaining_columns


['season',
 'season_type',
 'week',
 'name',
 'position',
 'recent_team',
 'player_display_name',
 'rost',
 'dk_salary',
 'fd_salary',
 'value_ratio_dk',
 'value_ratio_fd',
 'value_ratio_dk_log',
 'value_ratio_fd_log',
 'value_ratio_dk_log_z',
 'value_ratio_fd_log_z',
 'fpts',
 'pos_avg_fpts',
 'fpts_above_pos_avg',
 'expected_fpts_dk',
 'fpts_diff_dk',
 'hit_value_dk',
 'rolling_fpts_diff_dk',
 'z_fpts_diff_dk',
 'z_value_ratio_dk',
 'expected_fpts_fd',
 'fpts_diff_fd',
 'hit_value_fd',
 'rolling_fpts_diff_fd',
 'z_fpts_diff_fd',
 'z_value_ratio_fd',
 'double_digit_targets',
 'boom_week',
 'bust_week',
 'is_macro_high_tier',
 'is_macro_mid_tier',
 'is_macro_low_tier',
 'opponent_abbr',
 'home',
 'role',
 'spread',
 'is_home_game',
 'Total',
 'receptions',
 'receiving_yards',
 'yards',
 'receiving_yards_after_catch',
 'targets',
 'target_share',
 'target_share_z',
 'catch_percentage',
 'catch_percentage_scaled',
 'catch_percentage_scaled_z',
 'avg_cushion',
 'avg_separation',
 'avg_yac

In [17]:
# csv output check
# Export the dataframe after column drop to CSV
wr_df.to_csv("step1_after_column_drop.csv", index=False)
print("‚úÖ CSV export complete: step1_after_column_drop.csv")


‚úÖ CSV export complete: step1_after_column_drop.csv


In [19]:
# Observed (true recorded stats)
observed_stats = [
    'targets', 'receptions', 'receiving_yards', 'receiving_yards_after_catch',
    'receiving_air_yards', 'receiving_tds', 'rec_touchdowns',
    'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions',
    'fpts', 'catch_percentage', 'avg_cushion', 'avg_separation',
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'avg_intended_air_yards', 'percent_share_of_intended_air_yards',
    'receiving_broken_tackles'
]

# Salary and roster
salary_fields = ['dk_salary', 'fd_salary']
roster_fields = ['rost']

# Metadata (unchanged throughout pipeline)
metadata = [
    'season', 'season_type', 'week', 'name', 'player_display_name',
    'position', 'recent_team', 'opponent_abbr', 'role', 'home', 'is_home_game'
]

# Dynamic categories
booleans_flags = [col for col in wr_df.columns if col.startswith('is_') or col.endswith('_ge_5') or col.endswith('_ge_7') or col.startswith('boom_') or col.startswith('bust_') or col.startswith('double_digit') or col.endswith('over_100_yds')]
performance_bins = [col for col in wr_df.columns if col.endswith('_performance_bin')]
rolling_stats = [col for col in wr_df.columns if any(sub in col for sub in ['_avg', '_lag', '_delta', '_games_played'])]
percentile_stats = [col for col in wr_df.columns if 'percentile' in col]

# Derived = numeric columns not already categorized
categorized_cols = set(observed_stats + salary_fields + roster_fields + metadata + booleans_flags + performance_bins + rolling_stats + percentile_stats)
derived_features = [col for col in wr_df.columns if col not in categorized_cols and wr_df[col].dtype in ['float64', 'int64']]
derived_features.extend(['hit_value_dk', 'hit_value_fd'])


In [26]:
# Confirm shape after cleaning
column_count_after_drop = wr_df.shape[1]

# Rebuild the category_lists dictionary from previous block
category_lists = {
    "observed_stats": observed_stats,
    "salary_fields": salary_fields,
    "roster_fields": roster_fields,
    "metadata": metadata,
    "booleans_flags": booleans_flags,
    "performance_bins": performance_bins,
    "rolling_stats": rolling_stats,
    "derived_features": derived_features,
    "percentile_stats": percentile_stats
}

# Flatten all categorized columns into one set
all_categorized_columns = set().union(*category_lists.values())

# Compare
{
    "column_count_after_drop": column_count_after_drop,
    "total_categorized_columns": len(all_categorized_columns),
    "match": column_count_after_drop == len(all_categorized_columns)
}


{'column_count_after_drop': 214,
 'total_categorized_columns': 214,
 'match': True}

In [28]:
# Reveal uncategorized columns
uncategorized_columns = set(wr_df.columns) - all_categorized_columns
print("üîç Uncategorized columns:", uncategorized_columns)


üîç Uncategorized columns: set()


In [31]:
# Define is_active using the refined logic
wr_df['is_active'] = (
    (wr_df['dk_salary'].fillna(0) > 0) |
    (wr_df['fd_salary'].fillna(0) > 0) |
    (wr_df['targets'].fillna(0) > 0) |
    (wr_df['receptions'].fillna(0) > 0) |
    (wr_df['receiving_yards'].fillna(0) > 0) |
    (wr_df['fpts'].fillna(0) > 0)
)

# Count how many players are considered active
active_count = wr_df['is_active'].sum()
total_count = wr_df.shape[0]
inactive_count = total_count - active_count

# Display summary as a DataFrame
summary_df = pd.DataFrame({
    "Total Rows": [total_count],
    "Active Rows": [active_count],
    "Inactive Rows": [inactive_count],
    "Active %": [round(100 * active_count / total_count, 2)],
    "Inactive %": [round(100 * inactive_count / total_count, 2)]
})

summary_df


Unnamed: 0,Total Rows,Active Rows,Inactive Rows,Active %,Inactive %
0,17449,17429,20,99.89,0.11


In [34]:
# Zero out or False-out specified columns for rows where is_active is False.
def apply_default_zeros(df, column_groups):

    for group_name, cols in column_groups.items():
        for col in cols:
            if col in df.columns:
                if df[col].dtype == 'bool':
                    df.loc[~df['is_active'], col] = False
                else:
                    df.loc[~df['is_active'], col] = 0
    return df


In [36]:
## Begin block-by-block imputations

In [37]:
# Impute missing 'player_display_name' using 'name'
wr_df['player_display_name'] = wr_df['player_display_name'].fillna(wr_df['name'])

In [40]:
# Check how many missing values remain in 'player_display_name' after the imputation
missing_display_name = wr_df['player_display_name'].isna().sum()

# Display a few rows where the original imputation was applied (i.e., name and player_display_name were previously not equal)
imputed_rows = wr_df[wr_df['player_display_name'] == wr_df['name']][['name', 'player_display_name']].head(10)

missing_display_name, imputed_rows


(0,
          name player_display_name
 0  A.J. Green          A.J. Green
 1  A.J. Green          A.J. Green
 2  A.J. Green          A.J. Green
 3  A.J. Green          A.J. Green
 4  A.J. Green          A.J. Green
 5  A.J. Green          A.J. Green
 6  A.J. Green          A.J. Green
 7  A.J. Green          A.J. Green
 8  A.J. Green          A.J. Green
 9  A.J. Green          A.J. Green)

In [43]:
# Export the dataframe after imputing 'player_display_name'
wr_df.to_csv("step2_after_player_display_name_imputation.csv", index=False)
print("‚úÖ CSV export complete: step2_after_player_display_name_imputation.csv")


‚úÖ CSV export complete: step2_after_player_display_name_imputation.csv


In [46]:
# Descriptive Stats - catch_percentage

# Get statistical summary of 'catch_percentage' and its scaled versions
catch_pct_cols = ['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']
catch_pct_stats = wr_df[catch_pct_cols].describe().T

# Add missing value counts
catch_pct_stats['missing_count'] = wr_df[catch_pct_cols].isna().sum()
catch_pct_stats['missing_percent'] = wr_df[catch_pct_cols].isna().mean() * 100

catch_pct_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
catch_percentage,8199.0,63.015331,19.083549,0.0,50.0,62.5,77.8,100.0,9250,53.011634
catch_percentage_scaled,8199.0,0.633138,0.195842,0.0,0.5,0.6,0.8,1.0,9250,53.011634
catch_percentage_scaled_z,8199.0,-0.006403,1.004702,-3.3,-0.7,0.0,0.8,1.9,9250,53.011634


In [50]:
# Check how many rows have missing catch_percentage but have valid receptions and targets

# Check how many valid values we would get using 'receptions' and 'targets'
correct_mask = (
    wr_df['catch_percentage'].isna() &
    wr_df['receptions'].notna() &
    wr_df['targets'].notna() &
    (wr_df['targets'] > 0)
)

correct_count = correct_mask.sum()

# recalculate number of rows fields
correct_count


8964

In [53]:
# fill in catch_percentage receptions / targets

# Apply the corrected calculation
wr_df.loc[correct_mask, 'catch_percentage'] = (
    wr_df.loc[correct_mask, 'receptions'] / wr_df.loc[correct_mask, 'targets']
) * 100

# Recompute the scaled version
wr_df['catch_percentage_scaled'] = wr_df['catch_percentage'] / 100

# Recompute the z-score version
scaled_mean = wr_df['catch_percentage_scaled'].mean(skipna=True)
scaled_std = wr_df['catch_percentage_scaled'].std(skipna=True)
wr_df['catch_percentage_scaled_z'] = (wr_df['catch_percentage_scaled'] - scaled_mean) / scaled_std


In [56]:
# verify catch_percentage results
# Re-check how many missing values remain in the three catch_percentage-related columns
final_missing_summary = wr_df[['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']].isna().sum()
final_missing_summary


catch_percentage             286
catch_percentage_scaled      286
catch_percentage_scaled_z    286
dtype: int64

In [58]:
# check df
# Sample a few rows with valid values to verify the imputation and scaling logic
check_rows = wr_df[
    wr_df['catch_percentage'].notna() &
    wr_df['catch_percentage_scaled'].notna() &
    wr_df['catch_percentage_scaled_z'].notna()
][['receptions', 'targets', 'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']].sample(10)

display(check_rows)


Unnamed: 0,receptions,targets,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z
5917,0,2,0.0,0.0,-2.05582
16682,1,2,50.0,0.5,-0.397307
7442,5,5,100.0,1.0,1.261205
5059,2,6,33.3,0.333,-0.95125
11055,1,1,100.0,1.0,1.261205
12587,4,4,100.0,1.0,1.261205
3542,8,11,72.7,0.727,0.355657
11884,2,3,66.666667,0.666667,0.15553
3609,1,3,33.333333,0.333333,-0.950145
4592,7,9,77.8,0.778,0.524826


In [60]:
# Show 10 sample rows where catch_percentage is still missing
remaining_na_rows = wr_df[wr_df['catch_percentage'].isna()][
    ['name', 'week', 'season', 'receptions', 'targets', 'catch_percentage']
].head(10)

remaining_na_rows


Unnamed: 0,name,week,season,receptions,targets,catch_percentage
81,Alex Erickson,8,2017,0,0,
174,ArDarius Stewart,5,2017,0,0,
177,ArDarius Stewart,13,2017,0,0,
198,Bernard Reedy,6,2017,0,0,
199,Bernard Reedy,7,2017,0,0,
201,Bobo Wilson,13,2017,0,0,
203,Bobo Wilson,17,2017,0,0,
268,Braxton Miller,10,2017,0,0,
333,Chad Williams,14,2017,0,0,
460,Curtis Samuel,6,2017,0,0,


In [62]:
# Feature Engineering - Imputation for remaining missing values
# Define masks for remaining missing values
mask_catch_pct = wr_df['catch_percentage'].isna()

# Impute dummy values
wr_df.loc[mask_catch_pct, 'catch_percentage'] = -10.0
wr_df.loc[mask_catch_pct, 'catch_percentage_scaled'] = -0.1
wr_df.loc[mask_catch_pct, 'catch_percentage_scaled_z'] = -4.0

# Add boolean flags for each
wr_df['is_missing_catch_pct'] = mask_catch_pct.astype(int)
wr_df['is_missing_catch_pct_scaled'] = mask_catch_pct.astype(int)
wr_df['is_missing_catch_pct_z'] = mask_catch_pct.astype(int)


In [64]:
# Verify no missing values
# Re-check how many missing values remain in the catch_percentage feature group
final_check = wr_df[
    ['catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z']
].isna().sum()

final_check


catch_percentage             0
catch_percentage_scaled      0
catch_percentage_scaled_z    0
dtype: int64

In [66]:
# check df
# Confirm that all dummy rows were flagged and values assigned correctly
check_dummy_rows = wr_df[wr_df['is_missing_catch_pct'] == 1][[
    'receptions', 'targets', 'catch_percentage',
    'catch_percentage_scaled', 'catch_percentage_scaled_z',
    'is_missing_catch_pct', 'is_missing_catch_pct_scaled', 'is_missing_catch_pct_z'
]].sample(10)

display(check_dummy_rows)


Unnamed: 0,receptions,targets,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z,is_missing_catch_pct,is_missing_catch_pct_scaled,is_missing_catch_pct_z
16559,0,0,-10.0,-0.1,-4.0,1,1,1
11085,0,0,-10.0,-0.1,-4.0,1,1,1
6091,0,0,-10.0,-0.1,-4.0,1,1,1
9720,0,0,-10.0,-0.1,-4.0,1,1,1
12485,0,0,-10.0,-0.1,-4.0,1,1,1
5495,0,0,-10.0,-0.1,-4.0,1,1,1
7995,0,0,-10.0,-0.1,-4.0,1,1,1
5024,0,0,-10.0,-0.1,-4.0,1,1,1
14367,0,0,-10.0,-0.1,-4.0,1,1,1
10454,0,0,-10.0,-0.1,-4.0,1,1,1


In [70]:
# Export the dataframe after imputing catch_percentage and related fields
wr_df.to_csv("step3_after_catch_percentage_imputation.csv", index=False)
print("‚úÖ CSV export complete: step3_after_catch_percentage_imputation.csv")


‚úÖ CSV export complete: step3_after_catch_percentage_imputation.csv


In [73]:
# Descriptive Stats - avg_cushion

# Get statistical summary of 'avg_cushion'
cushion_stats = wr_df[['avg_cushion']].describe().T

# Add missing count and percent
cushion_stats['missing_count'] = wr_df['avg_cushion'].isna().sum()
cushion_stats['missing_percent'] = wr_df['avg_cushion'].isna().mean() * 100

cushion_stats


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_cushion,8197.0,6.044809,1.505897,1.9,5.0,6.0,7.1,14.4,9252,53.023096


In [76]:
# Feature Engineering - Imputation for avg_cushion

# Identify missing values
mask_avg_cushion = wr_df['avg_cushion'].isna()

# Impute with dummy value
wr_df.loc[mask_avg_cushion, 'avg_cushion'] = -1.0

# Add boolean flag
wr_df['is_missing_avg_cushion'] = mask_avg_cushion.astype(int)


In [79]:
# Verify no missing values remain in 'avg_cushion'
missing_avg_cushion = wr_df['avg_cushion'].isna().sum()
missing_avg_cushion

0

In [82]:
# check df
check_cushion = wr_df[wr_df['is_missing_avg_cushion'] == 1][['avg_cushion', 'is_missing_avg_cushion']].sample(10)
display(check_cushion)


Unnamed: 0,avg_cushion,is_missing_avg_cushion
14795,-1.0,1
10937,-1.0,1
1270,-1.0,1
3286,-1.0,1
4838,-1.0,1
4860,-1.0,1
6693,-1.0,1
13795,-1.0,1
8606,-1.0,1
8333,-1.0,1


In [86]:
# Export the dataframe after imputing avg_cushion
wr_df.to_csv("step4_after_avg_cushion_imputation.csv", index=False)
print("‚úÖ CSV export complete: step4_after_avg_cushion_imputation.csv")


‚úÖ CSV export complete: step4_after_avg_cushion_imputation.csv


In [89]:
# Descriptive Stats - avg_separation

# check for the next feature: 'avg_separation'
separation_stats = wr_df[['avg_separation']].describe().T
separation_stats['missing_count'] = wr_df['avg_separation'].isna().sum()
separation_stats['missing_percent'] = wr_df['avg_separation'].isna().mean() * 100

separation_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_separation,8199.0,2.862666,0.937429,0.6,2.2,2.8,3.4,8.7,9250,53.011634


In [92]:
# Feature Engineering - Imputation for avg_separation

# Identify missing values
mask_avg_separation = wr_df['avg_separation'].isna()

# Impute with dummy value
wr_df.loc[mask_avg_separation, 'avg_separation'] = -1.0

# Add boolean flag
wr_df['is_missing_avg_separation'] = mask_avg_separation.astype(int)


In [94]:
# Verify that 'avg_separation' has no remaining missing values
missing_avg_separation = wr_df['avg_separation'].isna().sum()
missing_avg_separation

0

In [96]:
# check df
check_separation = wr_df[wr_df['is_missing_avg_separation'] == 1][['avg_separation', 'is_missing_avg_separation']].sample(10)
display(check_separation)


Unnamed: 0,avg_separation,is_missing_avg_separation
4781,-1.0,1
17282,-1.0,1
12321,-1.0,1
11570,-1.0,1
3373,-1.0,1
16782,-1.0,1
7818,-1.0,1
11984,-1.0,1
8067,-1.0,1
4255,-1.0,1


In [98]:
# Export the dataframe after imputing avg_separation
wr_df.to_csv("step5_after_avg_separation_imputation.csv", index=False)
print("‚úÖ CSV export complete: step5_after_avg_separation_imputation.csv")


‚úÖ CSV export complete: step5_after_avg_separation_imputation.csv


In [100]:
# Descriptive stats for NGS-related features
next_ngs_features = ['avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation']
next_ngs_stats = wr_df[next_ngs_features].describe().T
next_ngs_stats['missing_count'] = wr_df[next_ngs_features].isna().sum()
next_ngs_stats['missing_percent'] = wr_df[next_ngs_features].isna().mean() * 100

next_ngs_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
avg_yac,8167.0,4.49258,3.456574,-2.6,2.3,3.8,5.9,43.2,9282,53.195026
avg_expected_yac,8158.0,3.916082,2.3139,0.1,2.3,3.5,5.1,19.1,9291,53.246604
avg_yac_above_expectation,8158.0,0.5785,2.335047,-9.9,-0.6,0.2,1.2,37.7,9291,53.246604


In [103]:
# Feature Engineering - apply imputation for YAC-related Next Gen Stats

# Identify missing masks for each column
mask_yac = wr_df['avg_yac'].isna()
mask_exp_yac = wr_df['avg_expected_yac'].isna()
mask_yac_diff = wr_df['avg_yac_above_expectation'].isna()

# Apply smart imputation values
wr_df.loc[mask_yac, 'avg_yac'] = -5.0
wr_df.loc[mask_exp_yac, 'avg_expected_yac'] = -1.0
wr_df.loc[mask_yac_diff, 'avg_yac_above_expectation'] = -10.0

# Add boolean flags
wr_df['is_missing_avg_yac'] = mask_yac.astype(int)
wr_df['is_missing_avg_expected_yac'] = mask_exp_yac.astype(int)
wr_df['is_missing_avg_yac_above_expectation'] = mask_yac_diff.astype(int)


In [106]:
# Final verification: confirm no missing values remain in the YAC-related fields
yac_verification = wr_df[
    ['avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation']
].isna().sum()

yac_verification


avg_yac                      0
avg_expected_yac             0
avg_yac_above_expectation    0
dtype: int64

In [109]:
# check df
check_yac = wr_df[wr_df['is_missing_avg_yac'] == 1][[
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'is_missing_avg_yac', 'is_missing_avg_expected_yac', 'is_missing_avg_yac_above_expectation'
]].sample(10)
display(check_yac)


Unnamed: 0,avg_yac,avg_expected_yac,avg_yac_above_expectation,is_missing_avg_yac,is_missing_avg_expected_yac,is_missing_avg_yac_above_expectation
5567,-5.0,-1.0,-10.0,1,1,1
12119,-5.0,-1.0,-10.0,1,1,1
7155,-5.0,-1.0,-10.0,1,1,1
10091,-5.0,-1.0,-10.0,1,1,1
14646,-5.0,-1.0,-10.0,1,1,1
9186,-5.0,-1.0,-10.0,1,1,1
10380,-5.0,-1.0,-10.0,1,1,1
106,-5.0,-1.0,-10.0,1,1,1
5229,-5.0,-1.0,-10.0,1,1,1
11247,-5.0,-1.0,-10.0,1,1,1


In [112]:
# Export the dataframe after imputing YAC-related fields
wr_df.to_csv("step6_after_yac_imputation.csv", index=False)
print("‚úÖ CSV export complete: step6_after_yac_imputation.csv")


‚úÖ CSV export complete: step6_after_yac_imputation.csv


In [114]:
# Define important and secondary NGS features for evaluation
important_ngs = [
    'receiving_broken_tackles', 'receiving_air_yards',
    'receiving_epa', 'receiving_2pt_conversions'
]

potential_duplicates = [
    'receiving_first_downs', 'receiving_tds', 'rec_touchdowns'
]

all_ngs = important_ngs + potential_duplicates

# Generate descriptive statistics and missing value summary
ngs_stats = wr_df[all_ngs].describe().T
ngs_stats['missing_count'] = wr_df[all_ngs].isna().sum()
ngs_stats['missing_percent'] = wr_df[all_ngs].isna().mean() * 100

ngs_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_count,missing_percent
receiving_broken_tackles,14675.0,0.164634,0.480238,0.0,0.0,0.0,0.0,7.0,2774,15.897759
receiving_air_yards,17449.0,52.960399,45.542857,-32.0,17.0,43.0,78.0,334.0,0,0.0
receiving_epa,17163.0,1.139678,3.896251,-23.1,-1.0,0.8,3.2,23.6,286,1.639062
receiving_2pt_conversions,17449.0,0.01192,0.108531,0.0,0.0,0.0,0.0,1.0,0,0.0
receiving_first_downs,17449.0,1.882973,1.778176,0.0,1.0,1.0,3.0,14.0,0,0.0
receiving_tds,17449.0,0.237836,0.499024,0.0,0.0,0.0,0.0,4.0,0,0.0
rec_touchdowns,8199.0,0.377607,0.605227,0.0,0.0,0.0,1.0,4.0,9250,53.011634


In [117]:
# rec_touchdowns = receiving_tds if missing

# Identify rows where rec_touchdowns is missing but receiving_tds is available
mask_rec_td_missing = wr_df['rec_touchdowns'].isna() & wr_df['receiving_tds'].notna()

# Fill missing rec_touchdowns using receiving_tds
wr_df.loc[mask_rec_td_missing, 'rec_touchdowns'] = wr_df.loc[mask_rec_td_missing, 'receiving_tds']


In [119]:
# Export the dataframe after imputing rec_touchdowns using receiving_tds
wr_df.to_csv("step7_after_rec_touchdowns_imputation.csv", index=False)
print("‚úÖ CSV export complete: step7_after_rec_touchdowns_imputation.csv")


‚úÖ CSV export complete: step7_after_rec_touchdowns_imputation.csv


In [121]:
# Feature Engineering - imputation for receiving_air_yards 

# Identify masks for missing values
mask_air_yards = wr_df['receiving_air_yards'].isna()
mask_epa = wr_df['receiving_epa'].isna()

# Apply smart imputation logic for air yards
# If receptions or targets are 0, set air yards to 0
zero_air_mask = (
    (wr_df['receptions'] == 0) | (wr_df['targets'] == 0)
) & mask_air_yards

# For remaining missing air yards (actual NGS gaps), use -10.0
missing_air_yards_mask = mask_air_yards & ~zero_air_mask

# Apply both types of imputations
wr_df.loc[zero_air_mask, 'receiving_air_yards'] = 0
wr_df.loc[missing_air_yards_mask, 'receiving_air_yards'] = -10.0

# Add flag for imputed (non-zero) air yards only
wr_df['is_missing_receiving_air_yards'] = missing_air_yards_mask.astype(int)

In [128]:
# Export the dataframe after imputing receiving_air_yards
wr_df.to_csv("step8a_after_receiving_air_yards_imputation.csv", index=False)
print("‚úÖ CSV export complete: step8_after_receiving_air_yards_imputation.csv")


‚úÖ CSV export complete: step8_after_receiving_air_yards_imputation.csv


In [130]:
# Feature Engineering - imputation receiving_epa

# Identify missing values
mask_epa = wr_df['receiving_epa'].isna()

# Impute with a distinct dummy value (-30.0 falls well below the observed min of -23.1)
wr_df.loc[mask_epa, 'receiving_epa'] = -30.0

# Add boolean flag for rows that were imputed
wr_df['is_missing_receiving_epa'] = mask_epa.astype(int)

# Verify no missing values remain
print("Remaining missing in receiving_epa:", wr_df['receiving_epa'].isna().sum())


Remaining missing in receiving_epa: 0


In [132]:
# Export the dataframe after imputing receiving_epa
wr_df.to_csv("step9_after_receiving_epa_imputation.csv", index=False)
print("‚úÖ CSV export complete: step9_after_receiving_epa_imputation.csv")


‚úÖ CSV export complete: step9_after_receiving_epa_imputation.csv


In [135]:
# Imputation for receiving_broken_tackles

# Identify missing values
mask_broken_tackles = wr_df['receiving_broken_tackles'].isna()

# Impute with dummy value (sentinel)
wr_df.loc[mask_broken_tackles, 'receiving_broken_tackles'] = -1

# Add boolean flag ‚Äî always add, even if currently no missing
wr_df['is_missing_receiving_broken_tackles'] = mask_broken_tackles.astype(int)


In [139]:
# Confirm no missing values remain in 'receiving_broken_tackles'
missing_broken_tackles = wr_df['receiving_broken_tackles'].isna().sum()
missing_broken_tackles

0

In [141]:
# Export the dataframe after imputing receiving_broken_tackles
wr_df.to_csv("step10_after_receiving_broken_tackles_imputation.csv", index=False)
print("‚úÖ CSV export complete: step10_after_receiving_broken_tackles_imputation.csv")


‚úÖ CSV export complete: step10_after_receiving_broken_tackles_imputation.csv


In [145]:
# verify imputations for NGS stats
# Gather all columns that were either imputed or created as flags
imputation_columns = [
    
    # Catch percentage trio + flags
    'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z',
    'is_missing_catch_pct', 'is_missing_catch_pct_scaled', 'is_missing_catch_pct_z',
    
    # Spatial stats + flags
    'avg_cushion', 'is_missing_avg_cushion',
    'avg_separation', 'is_missing_avg_separation',
    
    # YAC stats + flags
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'is_missing_avg_yac', 'is_missing_avg_expected_yac', 'is_missing_avg_yac_above_expectation',
    
    # Other receiving stats + flags
    'receiving_air_yards', 'is_missing_receiving_air_yards',
    'receiving_epa', 'is_missing_receiving_epa',
    'receiving_2pt_conversions', 'rec_touchdowns', 'receiving_tds',

    # broken tackles
    'receiving_broken_tackles',
    'is_missing_receiving_broken_tackles'
]

# Extract sample rows for visual verification
imputed_preview = wr_df[imputation_columns].head(10)

# Summary of missing values in all imputed columns
missing_summary = wr_df[imputation_columns].isna().sum().sort_values(ascending=False)

missing_summary


catch_percentage                        0
is_missing_avg_yac                      0
receiving_broken_tackles                0
receiving_tds                           0
rec_touchdowns                          0
receiving_2pt_conversions               0
is_missing_receiving_epa                0
receiving_epa                           0
is_missing_receiving_air_yards          0
receiving_air_yards                     0
is_missing_avg_yac_above_expectation    0
is_missing_avg_expected_yac             0
avg_yac_above_expectation               0
catch_percentage_scaled                 0
avg_expected_yac                        0
avg_yac                                 0
is_missing_avg_separation               0
avg_separation                          0
is_missing_avg_cushion                  0
avg_cushion                             0
is_missing_catch_pct_z                  0
is_missing_catch_pct_scaled             0
is_missing_catch_pct                    0
catch_percentage_scaled_z         

In [147]:
# check df
# Combine all boolean imputation flags
flags = [col for col in wr_df.columns if col.startswith('is_missing_')]

# Select rows where at least one flag is triggered
imputed_rows = wr_df[wr_df[flags].sum(axis=1) > 0]

# Display selected columns from imputed rows
cols_to_check = [
    'name', 'week', 'season'
] + [col for col in imputation_columns if not col.startswith('is_missing_')] + flags

# Show a sample for visual inspection
display(imputed_rows[cols_to_check].sample(10))


Unnamed: 0,name,week,season,catch_percentage,catch_percentage_scaled,catch_percentage_scaled_z,avg_cushion,avg_separation,avg_yac,avg_expected_yac,...,is_missing_catch_pct_scaled,is_missing_catch_pct_z,is_missing_avg_cushion,is_missing_avg_separation,is_missing_avg_yac,is_missing_avg_expected_yac,is_missing_avg_yac_above_expectation,is_missing_receiving_air_yards,is_missing_receiving_epa,is_missing_receiving_broken_tackles
12741,Tyler Boyd,16,2022,75.0,0.75,0.431949,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
1906,Travis Rudolph,17,2017,25.0,0.25,-1.226564,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,1
5928,Taylor Gabriel,11,2019,50.0,0.5,-0.397307,7.5,2.6,3.3,3.4,...,0,0,0,0,0,0,0,0,0,1
14258,Kayshon Boutte,12,2023,50.0,0.5,-0.397307,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
9061,Dee Eskridge,14,2021,0.0,0.0,-2.05582,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
14339,Lil'Jordan Humphrey,1,2023,100.0,1.0,1.261205,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
5237,Kalif Raymond,9,2019,66.666667,0.666667,0.15553,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
16100,Jalen Brooks,2,2024,33.333333,0.333333,-0.950145,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
8238,Tyron Johnson,13,2020,100.0,1.0,1.261205,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0
8510,Ben Skowronek,14,2021,0.0,0.0,-2.05582,-1.0,-1.0,-5.0,-1.0,...,0,0,1,1,1,1,1,0,0,0


In [149]:
# Feature Engineering - Imputation for the value ratio group
# Identify columns for imputation
value_ratio_cols = [
    'value_ratio_dk', 'value_ratio_fd',
    'value_ratio_dk_log', 'value_ratio_fd_log',
    'value_ratio_dk_log_z'
]

# Impute value_ratio and log variants with 0.0
wr_df['value_ratio_dk'] = wr_df['value_ratio_dk'].fillna(0.0)
wr_df['value_ratio_fd'] = wr_df['value_ratio_fd'].fillna(0.0)
wr_df['value_ratio_dk_log'] = wr_df['value_ratio_dk_log'].fillna(0.0)
wr_df['value_ratio_fd_log'] = wr_df['value_ratio_fd_log'].fillna(0.0)

# Impute z-score variant with an extreme low and flag
z_mask = wr_df['value_ratio_dk_log_z'].isna()
wr_df.loc[z_mask, 'value_ratio_dk_log_z'] = -4.0
wr_df['is_missing_value_ratio_dk_log_z'] = z_mask.astype(int)


In [151]:
# Spot-check value ratio columns
wr_df[
    [
        'value_ratio_dk', 'value_ratio_fd',
        'value_ratio_dk_log', 'value_ratio_fd_log',
        'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
    ]
].sample(10)

# And confirm missing values
wr_df[
    [
        'value_ratio_dk', 'value_ratio_fd',
        'value_ratio_dk_log', 'value_ratio_fd_log',
        'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
    ]
].isna().sum()


value_ratio_dk                     0
value_ratio_fd                     0
value_ratio_dk_log                 0
value_ratio_fd_log                 0
value_ratio_dk_log_z               0
is_missing_value_ratio_dk_log_z    0
dtype: int64

In [156]:
# Check the actual value types and examples
print(wr_df['value_ratio_dk'].unique()[:10])
print(wr_df['value_ratio_dk'].dtype)

# Count how many entries are actually zero, empty string, or 'nan'
print((wr_df['value_ratio_dk'] == '').sum())  # empty string
print((wr_df['value_ratio_dk'] == 'nan').sum())  # string 'nan'


[0.]
float64
0
0


In [158]:
# How many rows had original missing values before fill?
# Check how many were flagged
wr_df['is_missing_value_ratio_dk_log_z'].sum()


2833

In [160]:
# Inspect actual value distribution and types
print(wr_df['value_ratio_dk'].value_counts(dropna=False).head(10))

# See how many are truly NaN (np.nan)
print("NaN count:", wr_df['value_ratio_dk'].isna().sum())

# See how many are empty strings
print("Empty string count:", (wr_df['value_ratio_dk'] == '').sum())

# See dtype
print("Data type:", wr_df['value_ratio_dk'].dtype)


value_ratio_dk
0.0    17449
Name: count, dtype: int64
NaN count: 0
Empty string count: 0
Data type: float64


In [162]:
## dataframe: correct columns ##

# Drop or overwrite old calculations to ensure clean slate
cols_to_reset = [
    'value_ratio_dk', 'value_ratio_dk_log', 'value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z'
]
wr_df.drop(columns=[col for col in cols_to_reset if col in wr_df.columns], inplace=True)


In [164]:
## dataframe: correct columns ##

# Recalculate safely only for valid rows
mask_valid_dk = (wr_df['dk_salary'].notna()) & (wr_df['dk_salary'] != 0)
wr_df.loc[mask_valid_dk, 'value_ratio_dk'] = wr_df.loc[mask_valid_dk, 'fpts'] / (wr_df.loc[mask_valid_dk, 'dk_salary'] / 1000)


In [167]:
## dataframe: correct columns ##

# Use log1p and clip to avoid errors on invalid/missing values
wr_df['value_ratio_dk_log'] = np.log1p(wr_df['value_ratio_dk'].clip(lower=0))


In [169]:
## dataframe: correct columns ##

# Use mean and std on the log-transformed version
mean_log = wr_df['value_ratio_dk_log'].mean(skipna=True)
std_log = wr_df['value_ratio_dk_log'].std(skipna=True)
wr_df['value_ratio_dk_log_z'] = (wr_df['value_ratio_dk_log'] - mean_log) / std_log


In [171]:
## dataframe: correct columns ##

z_mask = wr_df['value_ratio_dk_log_z'].isna()
wr_df.loc[z_mask, 'value_ratio_dk_log_z'] = -4.0
wr_df['is_missing_value_ratio_dk_log_z'] = z_mask.astype(int)


In [173]:
# Export the dataframe after recalculating value_ratio_dk and related features
wr_df.to_csv("step11_after_value_ratio_dk_recalculation.csv", index=False)
print("‚úÖ CSV export complete: step11_after_value_ratio_dk_recalculation.csv")


‚úÖ CSV export complete: step11_after_value_ratio_dk_recalculation.csv


In [175]:
## dataframe: correct columns ##

cols_to_reset = [
    'value_ratio_fd', 'value_ratio_fd_log', 'value_ratio_fd_log_z', 'is_missing_value_ratio_fd_log_z'
]
wr_df.drop(columns=[col for col in cols_to_reset if col in wr_df.columns], inplace=True)


In [179]:
## dataframe: correct columns ##

mask_valid_fd = (wr_df['fd_salary'].notna()) & (wr_df['fd_salary'] != 0)
wr_df.loc[mask_valid_fd, 'value_ratio_fd'] = wr_df.loc[mask_valid_fd, 'fpts'] / (wr_df.loc[mask_valid_fd, 'fd_salary'] / 1000)


In [181]:
## dataframe: correct columns ##
wr_df['value_ratio_fd_log'] = np.log1p(wr_df['value_ratio_fd'].clip(lower=0))


In [183]:
## dataframe: correct columns ##

mean_log_fd = wr_df['value_ratio_fd_log'].mean(skipna=True)
std_log_fd = wr_df['value_ratio_fd_log'].std(skipna=True)
wr_df['value_ratio_fd_log_z'] = (wr_df['value_ratio_fd_log'] - mean_log_fd) / std_log_fd

In [185]:
# Recreate the missing mask and fallback imputation
z_mask_fd = wr_df['value_ratio_fd_log_z'].isna()
wr_df.loc[z_mask_fd, 'value_ratio_fd_log_z'] = -4.0
wr_df['is_missing_value_ratio_fd_log_z'] = z_mask_fd.astype(int)


In [186]:
# Export the dataframe after recalculating value_ratio_fd and related features
wr_df.to_csv("step12_fixed_value_ratio_fd_flags.csv", index=False)


In [188]:
check_value_ratio_z = wr_df[wr_df['is_missing_value_ratio_dk_log_z'] == 1][
    ['value_ratio_dk_log_z', 'is_missing_value_ratio_dk_log_z',
     'value_ratio_dk', 'value_ratio_fd', 'value_ratio_dk_log', 'value_ratio_fd_log']
]

if check_value_ratio_z.shape[0] > 0:
    display(check_value_ratio_z.sample(10))
else:
    print("‚úÖ No rows were imputed with -4.0 for value_ratio_dk_log_z ‚Äî all values were originally valid.")


Unnamed: 0,value_ratio_dk_log_z,is_missing_value_ratio_dk_log_z,value_ratio_dk,value_ratio_fd,value_ratio_dk_log,value_ratio_fd_log
9857,-4.0,1,,,,
13702,-4.0,1,,,,
12426,-4.0,1,,,,
13782,-4.0,1,,,,
2284,-4.0,1,,,,
9776,-4.0,1,,,,
6819,-4.0,1,,,,
10040,-4.0,1,,,,
12066,-4.0,1,,,,
7090,-4.0,1,,,,


In [191]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")

# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

tgt_rz                                   : 16330
rec_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_delta                           : 10664
fpts_7wk_avg_z                           : 10268
fpts_7wk_avg                             : 10268
rec_air_yards_7wk_avg_clipped            : 9961
tgt_7wk_avg                              : 9961
tgt_7wk_avg_z                            : 9961
rec_7wk_avg                              : 9961
rec_air_yards_7wk_delta                  : 9961
rec_yds_7wk_avg                          : 9961
rec_air_yards_7wk_avg                    : 9961
rec_yds_7wk_delta                        : 9961
tgt_7wk_delta                            : 9961
rec_7wk_avg_z                            : 9961
rec_7wk_delta                            : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_yds_7wk_avg_z               

In [194]:
# columns already imputed with remaining a NaNs - will impute with 0
value_ratio_cols = [
    'value_ratio_dk', 'value_ratio_fd',
    'value_ratio_dk_log', 'value_ratio_fd_log',
    'value_ratio_dk_log_z', 'value_ratio_fd_log_z'
]

for col in value_ratio_cols:
    print(f"{col:<25}: {wr_df[col].isna().sum()} missing")


value_ratio_dk           : 2833 missing
value_ratio_fd           : 2833 missing
value_ratio_dk_log       : 2833 missing
value_ratio_fd_log       : 2833 missing
value_ratio_dk_log_z     : 0 missing
value_ratio_fd_log_z     : 0 missing


In [196]:
# columns already imputed with remaining a NaNs - will impute with 0

# Fill missing base and log columns for modeling compatibility
wr_df['value_ratio_dk'] = wr_df['value_ratio_dk'].fillna(0.0)
wr_df['value_ratio_fd'] = wr_df['value_ratio_fd'].fillna(0.0)
wr_df['value_ratio_dk_log'] = wr_df['value_ratio_dk_log'].fillna(0.0)
wr_df['value_ratio_fd_log'] = wr_df['value_ratio_fd_log'].fillna(0.0)


In [199]:
# columns already imputed with remaining a NaNs - will impute with 0

# Confirm all cleared
print(wr_df[['value_ratio_dk', 'value_ratio_fd', 'value_ratio_dk_log', 'value_ratio_fd_log']].isna().sum())


value_ratio_dk        0
value_ratio_fd        0
value_ratio_dk_log    0
value_ratio_fd_log    0
dtype: int64


In [201]:
# columns already imputed with remaining a NaNs - will impute with 0

# Columns to fill with 0.0 that were already imputed but still contain NaNs
# Exclude value_ratio_dk and value_ratio_fd (and their variants)
columns_to_fill = [
    'avg_cushion', 'avg_separation',
    'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation',
    'receiving_air_yards', 'receiving_epa',
    'rec_touchdowns',
    'catch_percentage', 'catch_percentage_scaled', 'catch_percentage_scaled_z'
]

# Apply fillna(0.0) to those columns
wr_df[columns_to_fill] = wr_df[columns_to_fill].fillna(0.0)

# Confirm cleanup
nan_summary_post_fill = wr_df[columns_to_fill].isna().sum()
nan_summary_post_fill

avg_cushion                  0
avg_separation               0
avg_yac                      0
avg_expected_yac             0
avg_yac_above_expectation    0
receiving_air_yards          0
receiving_epa                0
rec_touchdowns               0
catch_percentage             0
catch_percentage_scaled      0
catch_percentage_scaled_z    0
dtype: int64

In [203]:
# Save to CSV
output_path = "step13_after_1st_round_imputations_zero_fill.csv"
wr_df.to_csv(output_path, index=False)

In [205]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

rec_rz                                   : 16330
tgt_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_delta                           : 10664
fpts_7wk_avg                             : 10268
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg                    : 9961
rec_yds_7wk_avg                          : 9961
tgt_7wk_delta                            : 9961
rec_7wk_avg                              : 9961
rec_air_yards_7wk_avg_clipped            : 9961
tgt_7wk_avg                              : 9961
rec_7wk_delta                            : 9961
rec_yds_7wk_delta                        : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_delta                  : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                   

In [208]:
# feature engineering - rolling averages

# === 1. Setup refined base mapping ===
refined_base_mapping = {
    'receptions': 'rec_',
    'targets': 'tgt_',
    'receiving_yards': 'rec_yds_',
    'fpts': 'fpts_',
    'receiving_air_yards': 'rec_air_yards_'
}

# === 2. Define stricter regex to match ONLY valid rolling columns ===
rolling_avg_pattern = re.compile(r'^.*_\d+wk_(avg|delta|z)$')

# === 3. Exclude known non-numeric or categorical suffixes ===
non_numeric_suffixes = ('_bin', '_tier', '_clipped', '_flag')

# === 4. Identify safe rolling average columns ===
rolling_avg_cols = [
    col for col in wr_df.columns
    if rolling_avg_pattern.search(col)
    and not any(col.endswith(suffix) for suffix in non_numeric_suffixes)
]

# === 5. Match rolling columns to base stats ===
rolling_to_base_pairs = []
for base_stat, base_prefix in refined_base_mapping.items():
    for col in rolling_avg_cols:
        if col.startswith(base_prefix):
            rolling_to_base_pairs.append((col, base_stat))

# === 6. Apply group-wise expanding mean imputation ===
wr_df.sort_values(by=['name', 'season', 'week'], inplace=True)

for rolling_col, base_stat in rolling_to_base_pairs:
    base_cols_matching = [
        col for col in wr_df.columns
        if col.startswith(refined_base_mapping[base_stat])
        and np.issubdtype(wr_df[col].dtype, np.number)
    ]

    for base_col in base_cols_matching:
        if base_col in wr_df.columns and rolling_col in wr_df.columns:
            progressive_avg = (
                wr_df.groupby(['name', 'season'])[base_col]
                .transform(lambda x: x.expanding().mean())
            )
            wr_df[rolling_col] = wr_df[rolling_col].fillna(progressive_avg)

# === 7. Optional: Visual summary ===
print("\nüìä Summary of remaining nulls in rolling average columns:")
missing_summary_rolling = wr_df[rolling_avg_cols].isna().sum().sort_values(ascending=False)
display(missing_summary_rolling)

print("\nüëÄ Sample rows where fallback expanding mean likely applied:")
rolling_imputed_rows = wr_df[wr_df[rolling_avg_cols].isna().sum(axis=1) == 0]
display(rolling_imputed_rows[['name', 'season', 'week'] + rolling_avg_cols].sample(10))



üìä Summary of remaining nulls in rolling average columns:


tgt_3wk_avg                0
tgt_5wk_avg                0
fpts_5wk_delta             0
fpts_3wk_delta             0
rec_air_yards_7wk_delta    0
rec_air_yards_5wk_delta    0
rec_air_yards_3wk_delta    0
rec_yds_7wk_delta          0
rec_yds_5wk_delta          0
rec_yds_3wk_delta          0
rec_7wk_delta              0
rec_5wk_delta              0
rec_3wk_delta              0
tgt_7wk_delta              0
tgt_5wk_delta              0
tgt_3wk_delta              0
fpts_7wk_avg               0
fpts_5wk_avg               0
fpts_3wk_avg               0
rec_air_yards_7wk_avg      0
rec_air_yards_5wk_avg      0
rec_air_yards_3wk_avg      0
rec_yds_7wk_avg            0
rec_yds_5wk_avg            0
rec_yds_3wk_avg            0
rec_7wk_avg                0
rec_5wk_avg                0
rec_3wk_avg                0
tgt_7wk_avg                0
fpts_7wk_delta             0
dtype: int64


üëÄ Sample rows where fallback expanding mean likely applied:


Unnamed: 0,name,season,week,tgt_3wk_avg,tgt_5wk_avg,tgt_7wk_avg,rec_3wk_avg,rec_5wk_avg,rec_7wk_avg,rec_yds_3wk_avg,...,rec_7wk_delta,rec_yds_3wk_delta,rec_yds_5wk_delta,rec_yds_7wk_delta,rec_air_yards_3wk_delta,rec_air_yards_5wk_delta,rec_air_yards_7wk_delta,fpts_3wk_delta,fpts_5wk_delta,fpts_7wk_delta
15225,Zay Flowers,2023,20,6.0,7.2,6.6,4.3,4.8,4.6,61.7,...,-0.6,-20.7,-13.0,-14.1,-3.3,-16.4,-16.9,2.85625,2.85625,2.85625
13401,Darnell Mooney,2023,4,1.0,1.0,1.0,0.333333,0.333333,0.333333,0.333333,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.133333,0.133333,0.133333
7249,Jamison Crowder,2020,9,11.0,1.0,1.0,7.3,0.6,0.6,89.3,...,0.6,-63.3,0.6,0.6,-71.3,0.6,0.6,-2.3,5.72,5.72
8675,Chase Claypool,2021,3,7.0,4.333333,4.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.766667,1.766667,1.766667
11262,DeAndre Hopkins,2022,8,14.0,7.5,7.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,10.75,10.75,10.75
2575,D.J. Moore,2018,13,7.3,6.0,5.7,6.3,5.0,4.6,89.3,...,-0.6,-45.3,-30.8,-22.0,81.7,80.4,84.4,-4.8,-3.8,-1.8
14847,Ronnie Bell,2023,16,1.0,1.2,1.0,0.3,0.8,0.333333,6.7,...,0.333333,5.3,1.8,0.333333,-2.7,-1.0,0.333333,6.5,5.0,-2.316667
5123,John Brown,2019,11,8.7,7.4,7.6,4.7,4.8,4.7,69.0,...,4.3,68.0,64.0,67.7,37.0,56.4,55.1,18.8,17.2,17.9
1147,Kenny Britt,2017,13,2.3,4.0,4.6,2.0,2.2,2.1,31.7,...,-0.1,-21.7,-20.8,-20.0,-7.3,-26.0,-35.0,-4.2,-3.3,-3.7
9619,K.J. Osborn,2021,1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8,0.8


In [212]:
# Check for any remaining missing values in rolling columns
rolling_cols = [col for col in wr_df.columns if rolling_avg_pattern.search(col) or col.endswith(('_3wk_avg', '_5wk_avg', '_7wk_avg'))]
missing_summary_rolling = wr_df[rolling_cols].isna().sum().sort_values(ascending=False)

print("\nüìä Null values in rolling average columns:")
display(missing_summary_rolling)

# Show a few rows with 0.0 values (likely imputed fallback)
rolling_imputed_rows = wr_df[wr_df[rolling_cols].eq(0.0).any(axis=1)]
print("\nüîç Sample rows where at least one rolling column was imputed with 0.0:")
display(rolling_imputed_rows[['name', 'season', 'week'] + rolling_cols].sample(10))



üìä Null values in rolling average columns:


tgt_3wk_avg                0
tgt_5wk_avg                0
fpts_5wk_delta             0
fpts_3wk_delta             0
rec_air_yards_7wk_delta    0
rec_air_yards_5wk_delta    0
rec_air_yards_3wk_delta    0
rec_yds_7wk_delta          0
rec_yds_5wk_delta          0
rec_yds_3wk_delta          0
rec_7wk_delta              0
rec_5wk_delta              0
rec_3wk_delta              0
tgt_7wk_delta              0
tgt_5wk_delta              0
tgt_3wk_delta              0
fpts_7wk_avg               0
fpts_5wk_avg               0
fpts_3wk_avg               0
rec_air_yards_7wk_avg      0
rec_air_yards_5wk_avg      0
rec_air_yards_3wk_avg      0
rec_yds_7wk_avg            0
rec_yds_5wk_avg            0
rec_yds_3wk_avg            0
rec_7wk_avg                0
rec_5wk_avg                0
rec_3wk_avg                0
tgt_7wk_avg                0
fpts_7wk_delta             0
dtype: int64


üîç Sample rows where at least one rolling column was imputed with 0.0:


Unnamed: 0,name,season,week,tgt_3wk_avg,tgt_5wk_avg,tgt_7wk_avg,rec_3wk_avg,rec_5wk_avg,rec_7wk_avg,rec_yds_3wk_avg,...,rec_7wk_delta,rec_yds_3wk_delta,rec_yds_5wk_delta,rec_yds_7wk_delta,rec_air_yards_3wk_delta,rec_air_yards_5wk_delta,rec_air_yards_7wk_delta,fpts_3wk_delta,fpts_5wk_delta,fpts_7wk_delta
6284,Andy Isabella,2020,9,2.7,3.0,2.0,1.0,1.8,0.285714,12.3,...,0.285714,-2.3,-7.4,0.285714,-26.7,-18.4,0.285714,-0.2,-3.0,-2.128571
15573,Collin Johnson,2024,16,1.0,0.5,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.9,-5.9,-5.9
12265,Nico Collins,2022,3,6.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.566667,-1.566667,-1.566667
12669,Trent Sherfield,2022,6,3.0,2.6,1.95,2.0,1.8,0.0,21.7,...,0.0,-15.7,-10.4,0.0,-15.7,-4.0,0.0,-1.6,-1.0,-3.883333
16317,John Metchie,2024,18,4.7,4.0,4.0,3.0,2.6,2.6,27.0,...,-0.6,-3.0,0.0,-3.7,-10.3,-2.6,-6.9,-0.3,0.0,-1.2
2704,DeVante Parker,2018,9,4.3,2.325,2.325,2.7,0.0,0.0,58.0,...,0.0,-50.0,0.0,0.0,-75.0,0.0,0.0,-5.0,-1.5,-1.5
3622,Richie James,2018,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.3,-5.3,-5.3
2583,DaeSean Hamilton,2018,12,2.0,1.75,1.75,1.3,0.0,0.0,16.0,...,0.0,-3.0,0.0,0.0,6.7,0.0,0.0,-0.3,-4.625,-4.625
977,Josh Reynolds,2017,11,1.0,0.666667,0.666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.3,-4.3,-4.3
16074,Jake Bobo,2024,4,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.9,-2.9,-2.9


In [215]:
# Export the dataframe after recalculating value_ratio_dk and related features
wr_df.to_csv("step14_after_rolling_imputations.csv", index=False)
print("‚úÖ CSV export complete: step14_after_rolling_imputations.csv")


‚úÖ CSV export complete: step14_after_rolling_imputations.csv


In [219]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

tgt_rz                                   : 16330
rec_rz                                   : 16330
rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_avg_clipped            : 9961
yards                                    : 9277
avg_cushion_performance_bin              : 9252
catch_percentage_performance_bin         : 9250
avg_separation_performance_bin           : 9250
avg_intended_air_yards_performance_bin   : 9250
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards_performance_bin : 9250
value_ratio_dk_log_performance_bin       : 2833
z_value_ratio_fd       

In [228]:
# feature engineering - redzone imputations

# Red Zone Feature Imputation
red_zone_cols = ['tgt_rz', 'rec_rz']

# Fill missing values with 0.0
wr_df[red_zone_cols] = wr_df[red_zone_cols].fillna(0.0)

# Optional: add flags to trace what was imputed
for col in red_zone_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Verify cleanup
print(wr_df[red_zone_cols].isna().sum())


tgt_rz    0
rec_rz    0
dtype: int64


In [230]:
# feature engineering - rolling averages 

# Define all multi-week average columns to impute
multiweek_avg_cols = [
    'rec_3wk_avg', 'rec_5wk_avg', 'rec_7wk_avg',
    'tgt_3wk_avg', 'tgt_5wk_avg', 'tgt_7wk_avg',
    'fpts_3wk_avg', 'fpts_5wk_avg', 'fpts_7wk_avg',
    'rec_air_yards_3wk_avg', 'rec_air_yards_5wk_avg', 'rec_air_yards_7wk_avg',
    'rec_yds_3wk_avg', 'rec_yds_5wk_avg', 'rec_yds_7wk_avg'
]

# Fill NaNs with 0.0
wr_df[multiweek_avg_cols] = wr_df[multiweek_avg_cols].fillna(0.0)

# Add missingness flags
for col in multiweek_avg_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Verify cleanup
print(wr_df[multiweek_avg_cols].isna().sum())

rec_3wk_avg              0
rec_5wk_avg              0
rec_7wk_avg              0
tgt_3wk_avg              0
tgt_5wk_avg              0
tgt_7wk_avg              0
fpts_3wk_avg             0
fpts_5wk_avg             0
fpts_7wk_avg             0
rec_air_yards_3wk_avg    0
rec_air_yards_5wk_avg    0
rec_air_yards_7wk_avg    0
rec_yds_3wk_avg          0
rec_yds_5wk_avg          0
rec_yds_7wk_avg          0
dtype: int64


In [234]:
# ‚úÖ Export CSV after Step 15
wr_df.to_csv("step15_after_multiweek_avg_imputation.csv", index=False)
print("‚úÖ CSV export complete: step15_after_multiweek_avg_imputation.csv")

‚úÖ CSV export complete: step15_after_multiweek_avg_imputation.csv


In [236]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg_clipped            : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
yards                                    : 9277
avg_cushion_performance_bin              : 9252
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards_performance_bin : 9250
percent_share_of_intended_air_yards      : 9250
avg_separation_performance_bin           : 9250
avg_intended_air_yards_performance_bin   : 9250
catch_percentage_performance_bin         : 9250
z_value_ratio_fd                         : 2833
value_ratio_dk_log_performance_bin       : 2833
z_fpts_diff_fd                           : 2833
value_ratio_fd_log_perfor

In [240]:
# Feature Engineering - Performance Bin Imputations

# Define performance bin columns
performance_bin_cols = [
    'avg_cushion_performance_bin',
    'catch_percentage_performance_bin',
    'avg_separation_performance_bin',
    'avg_intended_air_yards_performance_bin',
    'percent_share_of_intended_air_yards_performance_bin',
    'value_ratio_dk_log_performance_bin',
    'value_ratio_fd_log_performance_bin',
    'fpts_performance_bin',
    'target_share_performance_bin',
]

# Impute missing values with -1 (sentinel)
wr_df[performance_bin_cols] = wr_df[performance_bin_cols].fillna(-1)

# Add imputation flags
for col in performance_bin_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].eq(-1).astype(int)

# Confirm no missing values remain in those columns
print("\n‚úÖ Missing values after performance bin imputation:")
print(wr_df[performance_bin_cols].isna().sum())


‚úÖ Missing values after performance bin imputation:
avg_cushion_performance_bin                            0
catch_percentage_performance_bin                       0
avg_separation_performance_bin                         0
avg_intended_air_yards_performance_bin                 0
percent_share_of_intended_air_yards_performance_bin    0
value_ratio_dk_log_performance_bin                     0
value_ratio_fd_log_performance_bin                     0
fpts_performance_bin                                   0
target_share_performance_bin                           0
dtype: int64


  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-1).astype(int)


In [244]:
# Export updated dataframe
wr_df.to_csv("step16_after_performance_bin_imputation.csv", index=False)
print("\nüìÅ CSV export complete: step16_after_performance_bin_imputation.csv")


üìÅ CSV export complete: step16_after_performance_bin_imputation.csv


In [246]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
rec_air_yards_7wk_avg_clipped            : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
z_value_ratio_fd                         : 2833
z_value_ratio_dk                         : 2833
fpts_diff_dk                             : 2833
z_fpts_diff_dk                           : 2833
z_fpts_diff_fd                           : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
expected_fpts_dk                         : 2193
expected_fpts_fd                    

In [249]:
# Feature Engineering - Z-Score Imputations

# Define all z-score columns for imputation
z_score_cols = [
    'z_value_ratio_fd', 'z_value_ratio_dk',
    'z_fpts_diff_fd', 'z_fpts_diff_dk'
]

# Impute missing z-score values with -4.0
wr_df[z_score_cols] = wr_df[z_score_cols].fillna(-4.0)

# Add imputation flags for each z-score column
for col in z_score_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)

# ‚úÖ Confirm no remaining missing values
print("\n‚úÖ Missing values after z-score imputation:")
print(wr_df[z_score_cols].isna().sum())



‚úÖ Missing values after z-score imputation:
z_value_ratio_fd    0
z_value_ratio_dk    0
z_fpts_diff_fd      0
z_fpts_diff_dk      0
dtype: int64


  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)
  wr_df[flag_col] = wr_df[col].eq(-4.0).astype(int)


In [253]:
# Export after z-score imputation
wr_df.to_csv("step17_after_z_score_imputation.csv", index=False)
print("‚úÖ CSV export complete: step17_after_z_score_imputation.csv")


‚úÖ CSV export complete: step17_after_z_score_imputation.csv


In [259]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_7wk_avg_z                            : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
tgt_7wk_avg_z                            : 9961
rec_air_yards_7wk_avg_clipped            : 9961
rec_yds_7wk_avg_z                        : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
fd_salary                                : 2193
expected_fpts_fd                         : 2193
expected_fpts_dk                         : 2193
dk_salary                                : 2193
fpts_lag_1                               : 2075
rec_yds_lag_1                       

In [263]:
# Feature Engineering - Lag Feature Imputations

# Define lag columns to impute
lag_cols = [
    'fpts_lag_1', 'tgt_lag_1', 'rec_lag_1',
    'rec_yds_lag_1', 'rec_air_yards_lag_1'
]

# Impute missing values with 0.0
wr_df[lag_cols] = wr_df[lag_cols].fillna(0.0)

# Add flags to trace what was imputed
for col in lag_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# ‚úÖ Confirm cleanup
print("\n‚úÖ Missing values after lag feature imputation:")
print(wr_df[lag_cols].isna().sum())



‚úÖ Missing values after lag feature imputation:
fpts_lag_1             0
tgt_lag_1              0
rec_lag_1              0
rec_yds_lag_1          0
rec_air_yards_lag_1    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [265]:
# Export after imputing lag features
wr_df.to_csv("step18_after_lag_feature_imputation.csv", index=False)
print("‚úÖ CSV export complete: step18_after_lag_feature_imputation.csv")


‚úÖ CSV export complete: step18_after_lag_feature_imputation.csv


In [267]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

rec_3wk                                  : 14678
rec_5wk                                  : 13571
rec_7wk                                  : 12894
fpts_7wk_avg_z                           : 10268
rec_air_yards_7wk_avg_clipped            : 9961
rec_7wk_avg_z                            : 9961
tgt_7wk_avg_z                            : 9961
rec_yds_7wk_avg_z                        : 9961
rec_air_yards_7wk_avg_clipped_z          : 9961
yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_fd                             : 2833
fpts_diff_dk                             : 2833
receiving_rat                            : 2774
expected_fpts_fd                         : 2193
expected_fpts_dk                         : 2193
dk_salary                                : 2193
fd_salary                                : 2193
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                

In [269]:
# Feature Engineering - rec, fpts, avg Feature Imputations

# Define next group of columns for zero imputation
next_zero_impute_cols = [
    'rec_3wk', 'rec_5wk', 'rec_7wk',
    'fpts_7wk_avg_z',
    'rec_air_yards_7wk_avg_clipped', 'rec_air_yards_7wk_avg_clipped_z',
    'rec_7wk_avg_z', 'tgt_7wk_avg_z', 'rec_yds_7wk_avg_z'
]

# Impute with 0.0
wr_df[next_zero_impute_cols] = wr_df[next_zero_impute_cols].fillna(0.0)

# Add is_missing flags
for col in next_zero_impute_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)


# ‚úÖ Confirm cleanup
print("\n‚úÖ Missing values after feature imputation:")
print(wr_df[flag_col].isna().sum())


‚úÖ Missing values after feature imputation:
0


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [271]:
# Export after z-score imputation
wr_df.to_csv("step19_after_7wk_and_air_yard_clip_zero_imputation.csv")
print("‚úÖ CSV export complete: step19_after_7wk_and_air_yard_clip_zero_imputation")

‚úÖ CSV export complete: step19_after_7wk_and_air_yard_clip_zero_imputation


In [273]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
percent_share_of_intended_air_yards      : 9250
avg_intended_air_yards                   : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
fd_salary                                : 2193
expected_fpts_dk                         : 2193
expected_fpts_fd                         : 2193
dk_salary                                : 2193
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
rost                                     : 780
fpts_above_pos_avg                       : 780
fpts                                     : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share_z                           : 28

In [275]:
# Feature Engineering - dk and fd salary and expected fpts

# Define columns to impute
salary_and_expected_cols = [
    'dk_salary', 'fd_salary',
    'expected_fpts_dk', 'expected_fpts_fd'
]

# Fill with 0.0
wr_df[salary_and_expected_cols] = wr_df[salary_and_expected_cols].fillna(0.0)

# Add flags for traceability
for col in salary_and_expected_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\n‚úÖ Missing values after salary/expected fpts imputation:")
print(wr_df[salary_and_expected_cols].isna().sum())



‚úÖ Missing values after salary/expected fpts imputation:
dk_salary           0
fd_salary           0
expected_fpts_dk    0
expected_fpts_fd    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [277]:
# Export after salary and expected fpts imputation
wr_df.to_csv("step20_after_salary_expected_fpts_zero_imputation.csv", index=False)
print("‚úÖ CSV export complete: step20_after_salary_expected_fpts_zero_imputation")


‚úÖ CSV export complete: step20_after_salary_expected_fpts_zero_imputation


In [279]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
avg_intended_air_yards                   : 9250
percent_share_of_intended_air_yards      : 9250
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
fpts                                     : 780
fpts_above_pos_avg                       : 780
rost                                     : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share_z                           : 286
air_yards_share                          : 286
target_share                             : 286
wopr                                     : 286
fpts_3wk                                 : 119
sp

In [304]:
# Feature Engineering - Air Yard Metrics Imputation

# Define columns to impute
air_yard_cols = [
    'avg_intended_air_yards',
    'percent_share_of_intended_air_yards'
]

# Impute missing with 0.0
wr_df[air_yard_cols] = wr_df[air_yard_cols].fillna(0.0)

# Add is_missing flags
for col in air_yard_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\n‚úÖ Missing values after air yard metrics imputation:")
print(wr_df[air_yard_cols].isna().sum())


‚úÖ Missing values after air yard metrics imputation:
avg_intended_air_yards                 0
percent_share_of_intended_air_yards    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [307]:
# Export CSV
wr_df.to_csv("step21_after_air_yard_metrics_zero_imputation.csv", index=False)
print("‚úÖ CSV export complete: step21_after_air_yard_metrics_zero_imputation")

‚úÖ CSV export complete: step21_after_air_yard_metrics_zero_imputation


In [310]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
value_ratio_dk_3wk                       : 1633
rolling_fpts_diff_dk                     : 1543
rolling_fpts_diff_fd                     : 1543
value_ratio_dk_5wk                       : 1496
value_ratio_dk_7wk                       : 1484
fpts                                     : 780
rost                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
air_yards_share                          : 286
wopr                                     : 286
target_share_z                           : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7

In [312]:
# Feature Engineering - Value Ratio DK N-week Imputation

# Define columns
value_ratio_dk_nwk_cols = [
    'value_ratio_dk_3wk', 
    'value_ratio_dk_5wk', 
    'value_ratio_dk_7wk'
]

# Impute with 0.0
wr_df[value_ratio_dk_nwk_cols] = wr_df[value_ratio_dk_nwk_cols].fillna(0.0)

# Add trace flags
for col in value_ratio_dk_nwk_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# ‚úÖ Confirm cleanup
print("\n‚úÖ Missing values after value_ratio_dk_nwk imputation:")
print(wr_df[value_ratio_dk_nwk_cols].isna().sum())



‚úÖ Missing values after value_ratio_dk_nwk imputation:
value_ratio_dk_3wk    0
value_ratio_dk_5wk    0
value_ratio_dk_7wk    0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [315]:
# üìÅ Export after value_ratio_dk_nwk imputation
wr_df.to_csv("step22_after_value_ratio_dk_nwk_zero_imputation.csv", index=False)
print("‚úÖ CSV export complete: step22_after_value_ratio_dk_nwk_zero_imputation")


‚úÖ CSV export complete: step22_after_value_ratio_dk_nwk_zero_imputation


In [318]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
fpts_diff_dk                             : 2833
fpts_diff_fd                             : 2833
receiving_rat                            : 2774
rolling_fpts_diff_fd                     : 1543
rolling_fpts_diff_dk                     : 1543
fpts                                     : 780
rost                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
target_share_z                           : 286
air_yards_share                          : 286
wopr                                     : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7wk                                 : 19

üü° Total columns with missing values: 19


In [320]:
# Differential & Rolling Performance Metrics

next_group_cols = [
    'fpts_diff_dk', 'fpts_diff_fd',
    'rolling_fpts_diff_dk', 'rolling_fpts_diff_fd',
    'receiving_rat'  # same logic: 0 implies no receptions or rate unrecorded
]

# Impute with 0.0
wr_df[next_group_cols] = wr_df[next_group_cols].fillna(0.0)

# Add is_missing flags
for col in next_group_cols:
    flag_col = f'is_missing_{col}'
    wr_df[flag_col] = wr_df[col].isna().astype(int)

# Confirm cleanup
print("\n‚úÖ Missing values after diff/rolling/rat imputation:")
print(wr_df[next_group_cols].isna().sum())



‚úÖ Missing values after diff/rolling/rat imputation:
fpts_diff_dk            0
fpts_diff_fd            0
rolling_fpts_diff_dk    0
rolling_fpts_diff_fd    0
receiving_rat           0
dtype: int64


  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)
  wr_df[flag_col] = wr_df[col].isna().astype(int)


In [322]:
# ‚úÖ Export after diff/rolling/receiving_rate imputation
wr_df.to_csv("step23_after_diff_rolling_rat_zero_imputation.csv", index=False)
print("‚úÖ CSV export complete: step23_after_diff_rolling_rat_zero_imputation")


‚úÖ CSV export complete: step23_after_diff_rolling_rat_zero_imputation


In [325]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

yards                                    : 9277
rost                                     : 780
fpts                                     : 780
fpts_above_pos_avg                       : 780
pos_avg_fpts                             : 763
racr                                     : 328
target_share                             : 286
target_share_z                           : 286
air_yards_share                          : 286
wopr                                     : 286
fpts_3wk                                 : 119
spread                                   : 39
fpts_5wk                                 : 19
fpts_7wk                                 : 19

üü° Total columns with missing values: 14


In [None]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

In [None]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

In [None]:
# Dataframe Spot Check #
missing_summary = wr_df.isna().sum()
missing_summary = wr_df.isna().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

for col, count in missing_summary.items():
    print(f"{col:<40} : {count}")
    
# Total number of columns with missing values
print(f"\nüü° Total columns with missing values: {len(missing_summary)}")

In [None]:
# End: feature engineering - imputations

In [None]:
# --- Inspect the hit_value_dk target column ---
print("üîç Value Counts:")
print(df["hit_value_dk"].value_counts(dropna=False))

print("\nüîç Data Type:", df["hit_value_dk"].dtype)
print("‚ùì Missing Values:", df["hit_value_dk"].isnull().sum())

proportion = df["hit_value_dk"].mean()
print(f"\n‚úÖ Proportion of hits (1's): {proportion:.2%}")

In [None]:
### Begin: Determine Targets ###

In [None]:
# check potential targets
targets_to_check = [
    "hit_value_dk",
    "hit_value_fd",
    "rec_ge_7",
    "fpts_performance_bin",
    "value_ratio_dk_log_performance_bin"
]

for col in targets_to_check:
    print(f"\nüìä {col}")
    print("Value Counts:\n", df[col].value_counts(dropna=False))
    print("Data Type:", df[col].dtype)
    print("Missing Values:", df[col].isnull().sum())

    if df[col].dropna().nunique() == 2 and df[col].dtype != 'object':
        prop = df[col].mean()
        print(f"Proportion of 1's: {prop:.2%}")


In [None]:
# Simulates value return tiers using a 3-tier system based on scaled value ratios.

# Parameters:
# - df: DataFrame containing 'fpts' and salary columns
# - lower: Lower threshold (float), e.g., 0.5
# - upper: Upper threshold (float), e.g., 2.0
# - platform: 'dk' or 'fd' to determine salary column

# Returns:
# - Simulated tier Series with labels: underperformed, good_return, elite_return

def simulate_value_tiers(df, lower, upper, platform="dk"):
  
    if platform not in ["dk", "fd"]:
        raise ValueError("Platform must be 'dk' or 'fd'")

    salary_col = "dk_salary" if platform == "dk" else "fd_salary"
    value_ratio_scaled = df["fpts"] / (df[salary_col] / 1000)

    # Apply 3-tier binning
    bins = [-np.inf, lower, upper, np.inf]
    labels = ["underperformed", "good_return", "elite_return"]

    simulated_tiers = pd.cut(
        value_ratio_scaled,
        bins=bins,
        labels=labels,
        include_lowest=True
    )

    # Print output
    print(f"\nüìä Tier Distribution ({platform.upper()}):")
    print(simulated_tiers.value_counts(dropna=False))

    print(f"\nüìà Tier Proportions ({platform.upper()}):")
    print(simulated_tiers.value_counts(normalize=True, dropna=False).apply(lambda x: f"{x:.2%}"))

    return simulated_tiers


In [None]:
# Define threshold boundaries
lower = 0.5
upper = 2.0

# Run DK and FD simulations using shared thresholds
simulate_value_tiers(df, lower=lower, upper=upper, platform="dk")
simulate_value_tiers(df, lower=lower, upper=upper, platform="fd")


In [None]:
# Define threshold boundaries
lower = 1.0
upper = 2.0

# Run DK and FD simulations using shared thresholds
simulate_value_tiers(df, lower=lower, upper=upper, platform="dk")
simulate_value_tiers(df, lower=lower, upper=upper, platform="fd")


In [None]:
### End: Determine Targets ###

In [None]:
### Begin: experimental logistic regression modeling  ###

In [None]:
# Create  Binary Target
dk_tiers = simulate_value_tiers(df, lower=1.0, upper=2.0, platform="dk")
is_elite_return_dk = dk_tiers == "elite_return"


# Select Feature Columns
features = [
    "targets",
    "receptions",
    "rolling_fpts_diff_dk",
    "value_ratio_dk_log",
    "z_fpts_diff_dk",
    "tgt_ge_7"
]


In [None]:
# subset features from df into a temporary X variable
X = df[features].copy()
y = is_elite_return_dk.copy()

In [None]:
# setup the train-test split
# train on seasons < 2023
# validation on season 2023
# final test on 2024


# Mask for seasons
mask_2024 = df["season"] == 2024
mask_2023 = df["season"] == 2023
mask_pre_2023 = df["season"] < 2023

# Full null check mask
mask_all_valid = X.notnull().all(axis=1) & y.notnull()

# Training set: Pre-2023
mask_train = mask_all_valid & mask_pre_2023
X_train = X[mask_train]
y_train = y[mask_train]

# Validation set: 2023
mask_val = mask_all_valid & mask_2023
X_val = X[mask_val]
y_val = y[mask_val]

# Final test set: 2024
mask_test = mask_all_valid & mask_2024
X_test_final = X[mask_test]
y_test_final = y[mask_test]

# Summary
print(f"Training samples:         {len(X_train)}")
print(f"Validation (2023):        {len(X_val)}")
print(f"Final Test (2024):        {len(X_test_final)}")

print("\nClass Distribution:")
print("Train:\n", y_train.value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))
print("Val:\n", y_val.value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))
print("Test:\n", y_test_final.value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))


In [None]:
# Import and Initialize the Model

# Create a logistic regression model object
# Use balanced class weights to account for slight imbalance
logreg_model = LogisticRegression(
    penalty=None,               # No regularization for now (keep math pure)
    solver='lbfgs',             # Robust optimizer
    class_weight='balanced',    # Compensate for 21/79 class ratio
    max_iter=1000,              # Extra room for convergence
    random_state=42
)


In [None]:
# Fit the model on training data
logreg_model.fit(X_train, y_train)


In [None]:
# View and Interpret the Model Coefficients
# View learned coefficients with their corresponding feature names
coefficients = pd.Series(
    logreg_model.coef_[0],
    index=X_train.columns
).sort_values(ascending=False)

print("üîé Logistic Regression Coefficients:")
print(coefficients)


In [None]:
# Correlation Matrix Among Features
# Spot collinearity and detect proxy behavior

# Only use the features from the model
feature_cols = [
    "value_ratio_dk_log",
    "z_fpts_diff_dk",
    "receptions",
    "targets",
    "rolling_fpts_diff_dk",
    "tgt_ge_7"
]

# Compute correlation matrix (drop NaNs just in case)
corr_matrix = df[feature_cols].corr()

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix of Logistic Regression Features")
plt.show()


In [None]:
# Apply regression to determine the following:
# How much z_fpts_diff_dk explains the variation in value_ratio_dk_log

# Drop rows with missing values in either variable
mask = df["value_ratio_dk_log"].notnull() & df["z_fpts_diff_dk"].notnull()
x = df.loc[mask, "z_fpts_diff_dk"]
y = df.loc[mask, "value_ratio_dk_log"]

# Add constant for intercept
x_with_const = sm.add_constant(x)

# Fit linear regression
model = sm.OLS(y, x_with_const).fit()

# View results
print(model.summary())


In [None]:
# Drop NaNs for both features
mask = df["receptions"].notnull() & df["targets"].notnull()
x = df.loc[mask, "targets"]
y = df.loc[mask, "receptions"]

# Add constant for intercept
x_with_const = sm.add_constant(x)

# Fit linear regression model
model_targets_to_rec = sm.OLS(y, x_with_const).fit()

# Display summary
print(model_targets_to_rec.summary())


In [None]:
# Revised feature list
features_revised = [
    "value_ratio_dk_log",
    "receptions",
    "rolling_fpts_diff_dk",
    "tgt_ge_7"
]

# Compute correlation matrix (drop NaNs)
corr_matrix_revised = df[features_revised].dropna().corr()

# Plot heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(corr_matrix_revised, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix (Revised Features)")
plt.show()


In [None]:
# explore more features
candidate_features = [
    "catch_percentage",
    "catch_percentage_scaled",
    "catch_percentage_scaled_z",
    "fpts_3wk_avg",
    "avg_cushion",
    "avg_separation",
    "avg_intended_air_yards",
    "percent_share_of_intended_air_yards"
]

# Count non-null values
df[candidate_features].notnull().sum().sort_values(ascending=False)


In [None]:
# Revised features list
features_final = [
    "value_ratio_dk_log",
    "receptions",
    "rolling_fpts_diff_dk",
    "tgt_ge_7",
    "fpts_3wk_avg"
]


In [None]:
# Correlation check on final feature set
df[features_final].dropna().corr()

plt.figure(figsize=(6, 5))
sns.heatmap(df[features_final].dropna().corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix ‚Äì Final 5 Features")
plt.show()


In [None]:
# Create Train / Validation / Test Sets
# Create season masks
mask_2024 = df["season"] == 2024
mask_2023 = df["season"] == 2023
mask_pre_2023 = df["season"] < 2023

# Create modeling mask (no missing values)
valid_mask = df[features_final].notnull().all(axis=1) & y.notnull()

# Create X and y matrices
X = df[features_final]

X_train = X[valid_mask & mask_pre_2023]
y_train = y[valid_mask & mask_pre_2023]

X_val = X[valid_mask & mask_2023]
y_val = y[valid_mask & mask_2023]

X_test = X[valid_mask & mask_2024]
y_test = y[valid_mask & mask_2024]


In [None]:
# rerun the logistic mode based on modifications to the features

# Re-initialize the logistic model (no changes needed here)
logreg_model_final = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)

# Fit model on finalized training data
logreg_model_final.fit(X_train, y_train)


In [None]:
# This pulls from the correct model object and feature set
pd.Series(
    logreg_model_final.coef_[0],
    index=features_final
).sort_values(ascending=False)


In [None]:
# Revised model and features

# # --- Define Final Feature Sets ---
features_simplified = [
    "receptions",
    "rolling_fpts_diff_dk",
    "fpts_3wk_avg"
]

# # --- Define Final Feature Sets ---
# features_simplified = [
#     "targets",
#     "rolling_fpts_diff_dk",
#     "fpts_3wk_avg"
# ]

features_with_tgt = features_simplified + ["tgt_ge_7"]

# add targets back into the training set
X_train["targets"] = df.loc[X_train.index, "targets"]

# --- Prepare Clean Subsets (Drop Rows with Missing Values) ---
X_train_simple = X_train[features_simplified].dropna()
y_train_simple = y_train.loc[X_train_simple.index]

X_train_with_tgt = X_train[features_with_tgt].dropna()
y_train_with_tgt = y_train.loc[X_train_with_tgt.index]

# --- Initialize and Fit Logistic Models ---
logreg_simple = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)
logreg_with_tgt = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)

logreg_simple.fit(X_train_simple, y_train_simple)
logreg_with_tgt.fit(X_train_with_tgt, y_train_with_tgt)

# --- View Coefficients ---
print("üîπ Simplified Model Coefficients:")
print(pd.Series(logreg_simple.coef_[0], index=features_simplified))

print("\nüîπ With tgt_ge_7 Included:")
print(pd.Series(logreg_with_tgt.coef_[0], index=features_with_tgt))


In [None]:
# --- Define minimum thresholds ---
min_targets = 15
min_receptions = 15

# --- Filter main dataframe before model prep ---
df_filtered = df[
    (df['targets'] >= min_targets) &
    (df['receptions'] >= min_receptions)
]

# --- Update train splits using filtered indices ---
X_train_filtered = X_train.loc[df_filtered.index.intersection(X_train.index)]
y_train_filtered = y_train.loc[X_train_filtered.index]

# --- Drop missing values for selected features ---
# features_final = ["targets", "rolling_fpts_diff_dk", "fpts_3wk_avg"]
features_receptions = ["receptions", "rolling_fpts_diff_dk", "fpts_3wk_avg"]

# --- Drop rows with NaNs in selected features ---
X_train_filtered = X_train_filtered[features_final].dropna()
y_train_filtered = y_train_filtered.loc[X_train_filtered.index]

# --- Refit model ---
logreg_filtered = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)
logreg_filtered.fit(X_train_filtered, y_train_filtered)

# --- View updated coefficients ---
print("üîç Coefficients with Fringe Players Excluded:")
print(pd.Series(logreg_filtered.coef_[0], index=features_final))


In [None]:
# --- Filter full dataframe ---
df_filtered = df[
    (df['targets'] >= 15) &
    (df['receptions'] >= 15)
].copy()

# --- Select only relevant features ---
features_receptions = ["receptions", "rolling_fpts_diff_dk", "fpts_3wk_avg"]

# --- Rebuild training input/output from scratch ---
X_train_filtered = df_filtered.loc[
    df_filtered.index.intersection(X_train.index),
    features_receptions
].dropna()

y_train_filtered = y_train.loc[X_train_filtered.index]

# --- Fit logistic regression ---
logreg_receptions = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)
logreg_receptions.fit(X_train_filtered, y_train_filtered)

# --- Output coefficients ---
print("üîç Final Coefficients using 'receptions' only:")
print(pd.Series(logreg_receptions.coef_[0], index=features_receptions))


In [None]:
# *** Initial Logistical Regression Model ***
# - Not Suitable because it contains Data Leakage *** ###
# Model needs adjusting - x- inputs should be from previous rows

# --- Configuration ---
receptions_threshold = 1
targets_threshold = 1
features_final = ["receptions", "rolling_fpts_diff_dk", "fpts_3wk_avg"]

# --- Filter dataset for meaningful usage (per week) ---
df_filtered = df[
    (df["receptions"] >= receptions_threshold) &
    (df["targets"] >= targets_threshold)
].copy()

# --- TRAIN / VALIDATION SPLIT ---
mask_train = df_filtered["season"] < 2023
mask_val = df_filtered["season"] == 2023

X_train = df_filtered.loc[mask_train, features_final].dropna()
y_train = df_filtered.loc[X_train.index, "hit_value_dk"].astype(int)

X_val = df_filtered.loc[mask_val, features_final].dropna()
y_val = df_filtered.loc[X_val.index, "hit_value_dk"].astype(int)

# --- Fit logistic regression model ---
logreg_model = LogisticRegression(
    penalty=None,
    solver='lbfgs',
    class_weight='balanced',
    max_iter=10000,
    random_state=42
)
logreg_model.fit(X_train, y_train)

# --- Evaluate on validation set ---
y_val_pred = logreg_model.predict(X_val)
y_val_proba = logreg_model.predict_proba(X_val)[:, 1]

# --- Print results ---
print("‚úÖ Classification Report (2023 Validation Set):")
print(classification_report(y_val, y_val_pred, zero_division=0))

print("üìä Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

print("üéØ ROC AUC Score:", round(roc_auc_score(y_val, y_val_proba), 3))

print("\nüìà Logistic Regression Coefficients:")
print(pd.Series(logreg_model.coef_[0], index=features_final))


In [None]:
### End: experimental logistic regression modeling  ###

In [None]:
### Begin: value-based logistic regression classifier model  ###

In [None]:
# Load the value-engineered dataset
wr_df_updated = pd.read_csv("wr_nfl_df_sorted_new_features_final.csv")

In [None]:
# Define value-based feature list for modeling
value_based_features = [
    'O_U', 'Total', 'dk_salary',
    'value_ratio_dk', 'value_ratio_dk_3wk', 'value_ratio_dk_5wk', 'value_ratio_dk_7wk',
    'value_ratio_dk_75th_percentile_1wk', 'value_ratio_dk_75th_percentile_3wk',
    'value_ratio_dk_75th_percentile_5wk', 'value_ratio_dk_75th_percentile_7wk',
    'value_ratio_dk_90th_percentile_3wk', 'value_ratio_dk_90th_percentile_5wk',
    'value_ratio_dk_90th_percentile_7wk', 'value_ratio_dk_95th_percentile_3wk',
    'value_ratio_dk_95th_percentile_5wk', 'value_ratio_dk_95th_percentile_7wk',
    'z_value_ratio_dk', 'value_ratio_fd', 'z_value_ratio_fd',
    'tgt_ge_5', 'tgt_ge_7', 'rec_ge_5', 'rec_ge_7',
    'target_share_ge_20', 'target_share_ge_30',
    'over_100_yds'
]

In [None]:
# Construct feature matrix and target vector
X_value = wr_df_updated[value_based_features].copy()
y_value = wr_df_updated['hit_value_dk'].copy()

# Drop rows with missing values
X_value_clean = X_value.dropna()

# Drop non-numeric column(s) for correlation only
X_corr = X_value_clean.drop(columns=['O_U'])

# Compute correlation matrix
corr_matrix = X_corr.corr()

# Plot correlation heatmap
plt.figure(figsize=(18, 12))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.5
)
plt.title("Correlation Matrix: Value-Based Features", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# upated feature list

# Define cleaned value-based feature list for modeling
value_based_features = [
    'Total', 'dk_salary',
    'value_ratio_dk', 'value_ratio_dk_3wk', 'value_ratio_dk_5wk', 'value_ratio_dk_7wk',
    'value_ratio_dk_90th_percentile_3wk', 'value_ratio_dk_90th_percentile_5wk',
    'value_ratio_dk_90th_percentile_7wk',
    'z_value_ratio_dk'
]


In [None]:

# Construct X and y
X_value = wr_df_updated[value_based_features].copy()
y_value = wr_df_updated['hit_value_dk'].copy()

# Drop rows with missing values for correlation matrix
X_value_clean = X_value.dropna()

# Compute and plot the updated correlation matrix
corr_matrix_updated = X_value_clean.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix_updated,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    linewidths=0.5
)
plt.title("Updated Correlation Matrix: Cleaned Value-Based Features", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Get the shape of the train, validate, test data

# Extract feature matrix and target vector
X_all = wr_df_updated[value_based_features].copy()
y_all = wr_df_updated['hit_value_dk'].copy()

# Include the season column to guide the split
season_series = wr_df_updated['season']

# Drop rows with missing values in X
X_all_clean = X_all.dropna()
y_all_clean = y_all.loc[X_all_clean.index]
season_clean = season_series.loc[X_all_clean.index]

# Create masks for each dataset
train_mask = season_clean < 2023
val_mask = season_clean == 2023
test_mask = season_clean == 2024

# Apply the masks
X_train = X_all_clean[train_mask]
y_train = y_all_clean[train_mask]

X_val = X_all_clean[val_mask]
y_val = y_all_clean[val_mask]

X_test = X_all_clean[test_mask]
y_test = y_all_clean[test_mask]

# Return the shapes of each split to confirm
X_train.shape, X_val.shape, X_test.shape


In [None]:
# train the model

# Initialize and fit logistic regression model on training data only
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Output training set score (accuracy) for quick verification
train_accuracy = model.score(X_train, y_train)
train_accuracy


In [None]:
# feature impportance
# Extract feature names and coefficients
coefficients = pd.Series(model.coef_[0], index=X_train.columns)

# Sort by absolute value for importance
coeff_sorted = coefficients.reindex(coefficients.abs().sort_values(ascending=False).index)

coeff_sorted


In [None]:
# updated features

# Remove 'z_value_ratio_dk' from the feature list
features_no_zscore = [
    'Total', 'dk_salary',
    'value_ratio_dk', 'value_ratio_dk_3wk', 'value_ratio_dk_5wk', 'value_ratio_dk_7wk',
    'value_ratio_dk_90th_percentile_3wk', 'value_ratio_dk_90th_percentile_5wk',
    'value_ratio_dk_90th_percentile_7wk'
]

# Create new training feature matrix
X_train_nz = X_train[features_no_zscore]
X_val_nz = X_val[features_no_zscore]

# Refit logistic regression model
model_nz = LogisticRegression(max_iter=1000)
model_nz.fit(X_train_nz, y_train)

# Extract and sort coefficients again
coefficients_nz = pd.Series(model_nz.coef_[0], index=X_train_nz.columns)
coeff_sorted_nz = coefficients_nz.reindex(coefficients_nz.abs().sort_values(ascending=False).index)

coeff_sorted_nz


In [None]:
# Define final feature list excluding dk_salary and value_ratio_dk
features_cleaned = [
    'Total',
    'value_ratio_dk_3wk', 'value_ratio_dk_5wk', 'value_ratio_dk_7wk',
    'value_ratio_dk_90th_percentile_3wk', 'value_ratio_dk_90th_percentile_5wk',
    'value_ratio_dk_90th_percentile_7wk'
]

# Extract cleaned X and y
X_all = wr_df_updated[features_cleaned].copy()
y_all = wr_df_updated['hit_value_dk'].copy()
season_series = wr_df_updated['season']

# Drop NaNs
X_all_clean = X_all.dropna()
y_all_clean = y_all.loc[X_all_clean.index]
season_clean = season_series.loc[X_all_clean.index]

# Create season-based splits
train_mask = season_clean < 2023
val_mask = season_clean == 2023

X_train = X_all_clean[train_mask]
y_train = y_all_clean[train_mask]

X_val = X_all_clean[val_mask]
y_val = y_all_clean[val_mask]

# Fit model on updated training set
model_cleaned = LogisticRegression(max_iter=1000)
model_cleaned.fit(X_train, y_train)

# Predict and evaluate on 2023 validation set
y_val_pred = model_cleaned.predict(X_val)
y_val_prob = model_cleaned.predict_proba(X_val)[:, 1]


val_report = classification_report(y_val, y_val_pred, output_dict=True)
val_roc_auc = roc_auc_score(y_val, y_val_prob)

val_report, val_roc_auc


In [None]:
# Format the classification report dictionary for better readability
def format_classification_report(report_dict, auc_score):
    # Extract main classes
    rows = []
    for label in ['False', 'True', 'accuracy', 'macro avg', 'weighted avg']:
        if label == 'accuracy':
            rows.append(['accuracy', '', '', f"{report_dict['accuracy']:.3f}", ''])
        else:
            row = report_dict[label]
            rows.append([
                label,
                f"{row['precision']:.3f}",
                f"{row['recall']:.3f}",
                f"{row['f1-score']:.3f}",
                f"{row['support']:.0f}"
            ])
    # Append AUC
    rows.append(['roc_auc', '', '', f"{auc_score:.3f}", ''])

    # Create a DataFrame for display
    return pd.DataFrame(rows, columns=['Metric', 'Precision', 'Recall', 'F1-Score', 'Support'])

# Apply formatting
formatted_val_report = format_classification_report(val_report, val_roc_auc)
formatted_val_report


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Apply the same cleaned feature set to 2024 test data
test_mask = season_clean == 2024
X_test = X_all_clean[test_mask]
y_test = y_all_clean[test_mask]

# Predict and evaluate on 2024 test set
y_test_pred = model_cleaned.predict(X_test)
y_test_prob = model_cleaned.predict_proba(X_test)[:, 1]

# Compute evaluation metrics
test_report = classification_report(y_test, y_test_pred, output_dict=True)
test_roc_auc = roc_auc_score(y_test, y_test_prob)

# Format the output
def format_classification_report(report_dict, auc_score):
    rows = []
    for label in ['False', 'True', 'accuracy', 'macro avg', 'weighted avg']:
        if label == 'accuracy':
            rows.append(['accuracy', '', '', f"{report_dict['accuracy']:.3f}", ''])
        else:
            row = report_dict[label]
            rows.append([
                label,
                f"{row['precision']:.3f}",
                f"{row['recall']:.3f}",
                f"{row['f1-score']:.3f}",
                f"{row['support']:.0f}"
            ])
    rows.append(['roc_auc', '', '', f"{auc_score:.3f}", ''])
    return pd.DataFrame(rows, columns=['Metric', 'Precision', 'Recall', 'F1-Score', 'Support'])

formatted_test_report = format_classification_report(test_report, test_roc_auc)
formatted_test_report


In [None]:
# Create a PDF summary of training, validation, and test results
class ModelSummaryPDF(FPDF):
    def header(self):
        self.set_font("Arial", "B", 14)
        self.cell(0, 10, "WR Hit Value Classification Model Summary", ln=True, align="C")
        self.ln(5)

    def add_section(self, title, dataframe):
        self.set_font("Arial", "B", 12)
        self.cell(0, 10, title, ln=True)
        self.set_font("Arial", "", 10)
        self.ln(2)
        col_widths = [30, 25, 25, 25, 25]
        for i, col in enumerate(dataframe.columns):
            self.cell(col_widths[i], 8, col, border=1)
        self.ln()
        for _, row in dataframe.iterrows():
            for i, item in enumerate(row):
                self.cell(col_widths[i], 8, str(item), border=1)
            self.ln()
        self.ln(5)

# Format 2023 and 2024 reports again using the existing function
formatted_val_report = format_classification_report(val_report, val_roc_auc)
formatted_test_report = format_classification_report(test_report, test_roc_auc)

# Create and populate PDF
pdf = ModelSummaryPDF()
pdf.add_page()
pdf.add_section("Validation Set (2023)", formatted_val_report)
pdf.add_section("Test Set (2024)", formatted_test_report)

# Save PDF
pdf_output_path = "wr_value_model_summary.pdf"
pdf.output(pdf_output_path)

pdf_output_path


In [None]:
# *** Value-based salary model: fully validated, documented, and reproducible  ***
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Load and define features ---
df = pd.read_csv("wr_nfl_df_sorted_new_features_final.csv")

value_based_features = [
    'Total',
    'value_ratio_dk_3wk', 'value_ratio_dk_5wk', 'value_ratio_dk_7wk',
    'value_ratio_dk_90th_percentile_3wk', 'value_ratio_dk_90th_percentile_5wk',
    'value_ratio_dk_90th_percentile_7wk'
]

X_all = df[value_based_features].copy()
y_all = df['hit_value_dk'].copy()
season = df['season']

# --- Step 2: Drop rows with missing values ---
X_all_clean = X_all.dropna()
y_all_clean = y_all.loc[X_all_clean.index]
season_clean = season.loc[X_all_clean.index]

# --- Step 3: Create season-based splits ---
train_mask = season_clean < 2023
val_mask = season_clean == 2023
test_mask = season_clean == 2024

X_train = X_all_clean[train_mask]
y_train = y_all_clean[train_mask]

X_val = X_all_clean[val_mask]
y_val = y_all_clean[val_mask]

X_test = X_all_clean[test_mask]
y_test = y_all_clean[test_mask]

# --- Step 4: Fit model on training data ---
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# --- Step 5: Evaluate on validation set ---
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]

val_report = classification_report(y_val, y_val_pred, output_dict=True)
val_roc_auc = roc_auc_score(y_val, y_val_prob)

# --- Step 6: Evaluate on test set ---
y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

test_report = classification_report(y_test, y_test_pred, output_dict=True)
test_roc_auc = roc_auc_score(y_test, y_test_prob)

# --- Step 7: Format reports (optional) ---
def format_classification_report(report_dict, auc_score):
    rows = []
    for label in ['False', 'True', 'accuracy', 'macro avg', 'weighted avg']:
        if label == 'accuracy':
            rows.append(['accuracy', '', '', f"{report_dict['accuracy']:.3f}", ''])
        else:
            row = report_dict[label]
            rows.append([
                label,
                f"{row['precision']:.3f}",
                f"{row['recall']:.3f}",
                f"{row['f1-score']:.3f}",
                f"{row['support']:.0f}"
            ])
    rows.append(['roc_auc', '', '', f"{auc_score:.3f}", ''])
    return pd.DataFrame(rows, columns=['Metric', 'Precision', 'Recall', 'F1-Score', 'Support'])

val_report_df = format_classification_report(val_report, val_roc_auc)
test_report_df = format_classification_report(test_report, test_roc_auc)

# --- Step 8: Optional: Plot correlation matrix ---
plt.figure(figsize=(10, 8))
sns.heatmap(X_train.corr(), annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title("Correlation Matrix: Value-Based Features (Train Set)")
plt.tight_layout()
plt.show()

# --- Output reports ---
print("Validation Report (2023):")
print(val_report_df.to_string(index=False))

print("\nTest Report (2024):")
print(test_report_df.to_string(index=False))


In [None]:
### End: value-based logistic regression classifier model ###

In [None]:
### Begin: performance-based logistic regression classifier models ###

In [None]:
#

# Load the dataset
df = pd.read_csv("wr_nfl_df_sorted_new_features_final.csv")

# Get full column list
all_columns = df.columns.tolist()

# Display columns that contain 'rec_', 'tgt_', 'rec_yds_', or 'rec_air_yards_'
performance_raw_like = sorted([
    col for col in all_columns
    if col.startswith(('rec_', 'tgt_', 'rec_yds_', 'rec_air_yards_'))
])

# view list (optional)
# performance_raw_like


In [None]:
# Define inclusion criteria for safe performance-based features
include_keywords = ['_avg', '_lag_', '_delta', '_z', '_percentile']
exclude_keywords = ['fpts', 'hit_', 'value_', 'salary', 'position', 'over_', 'ge_', 'rec_touchdowns', 'Total', 'O_U']

# Build candidate feature list
initial_perf_features = [
    col for col in df.columns
    if any(kw in col for kw in include_keywords)
    and not any(kw in col for kw in exclude_keywords)
]

# Sort alphabetically for review
initial_perf_features = sorted(initial_perf_features)

# view list
# initial_perf_features


In [None]:
# Subset the dataframe using the selected features
X_perf = df[initial_perf_features].copy()

# Drop rows with missing values
X_perf_clean = X_perf.dropna()

# Compute correlation matrix
corr_matrix = X_perf_clean.corr()

# Plot correlation heatmap
plt.figure(figsize=(18, 14))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
plt.title("Correlation Matrix - Performance-Based Features", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Final approved feature list
performance_features = [
    'rec_3wk_avg', 'rec_yds_5wk_avg', 'rec_air_yards_7wk_avg', 'tgt_3wk_avg',
    'tgt_3wk_delta', 'rec_3wk_delta', 'rec_yds_3wk_delta',
    'rec_lag_1', 'tgt_lag_1', 'rec_yds_lag_1', 'rec_air_yards_lag_1',
    'targets_75th_percentile_3wk', 'rec_75th_percentile_3wk', 'receiving_yards_75th_percentile_3wk',
    'rec_7wk_avg_z', 'rec_air_yards_7wk_avg_clipped_z', 'target_share_z'
]

# Subset and drop NA for clean correlation
X_perf_refined = df[performance_features].dropna()

# Correlation matrix
corr_matrix_final = X_perf_refined.corr()

# Plot
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix_final, annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title("Final Correlation Matrix - Refined Performance Features", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# performance-based model using fpts>= a defined threshold
# view the shape of the training, validation, and test sets

# Step 1: Define the fixed threshold target
fpts_threshold = 10
df['hit_perf_fixed'] = (df['fpts'] >= fpts_threshold).astype(int)

# Step 2: Prepare feature matrix (X) and target vector (y)
X = df[performance_features].copy()
y = df['hit_perf_fixed'].copy()

# Step 3: Add season column for splitting
df_season = df[['season']].copy()
X['season'] = df_season
y.index = df_season.index  # Align indices

# Step 4: Apply train/validate/test split by season
X_train = X[X['season'] < 2023].drop(columns='season')
y_train = y[X['season'] < 2023]

X_val = X[X['season'] == 2023].drop(columns='season')
y_val = y[X['season'] == 2023]

X_test = X[X['season'] == 2024].drop(columns='season')
y_test = y[X['season'] == 2024]

# Confirm shape
(X_train.shape, y_train.shape), (X_val.shape, y_val.shape), (X_test.shape, y_test.shape)


In [None]:
# performance-based model using fpts>= a defined threshold
# train the model - feature analysis

# Drop missing values from training set only
X_train_clean = X_train.dropna()
y_train_clean = y_train.loc[X_train_clean.index]

# Fit logistic regression on cleaned training set
logreg_perf = LogisticRegression(max_iter=1000, solver='liblinear')
logreg_perf.fit(X_train_clean, y_train_clean)

# Get and display feature importances
coefficients = pd.Series(logreg_perf.coef_[0], index=X_train_clean.columns).sort_values(key=abs, ascending=False)
coefficients


In [None]:
# performance-based model using fpts>= a defined threshold
# Revise the features list based on results
# Step 1: Define revised feature list (remove lag features)
performance_features_revised = [
    'rec_3wk_avg', 'rec_yds_5wk_avg', 'rec_air_yards_7wk_avg', 'tgt_3wk_avg',
    'tgt_3wk_delta', 'rec_3wk_delta', 'rec_yds_3wk_delta',
    'targets_75th_percentile_3wk', 'rec_75th_percentile_3wk', 'receiving_yards_75th_percentile_3wk',
    'rec_7wk_avg_z', 'rec_air_yards_7wk_avg_clipped_z', 'target_share_z'
]

# Step 2: Subset data and drop NAs for correlation
X_perf_revised = df[performance_features_revised].dropna()

# Step 3: Compute and display correlation matrix
corr_matrix_revised = X_perf_revised.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix_revised, annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title("Correlation Matrix - Revised Performance Features (No Lag)", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# performance-based model using fpts>= a defined threshold
# Step 4: Prepare revised feature matrix (X) and target vector (y)
X = df[performance_features_revised].copy()
y = df['hit_perf_fixed'].copy()

# Add season column for splitting
X['season'] = df['season']
y.index = df.index

# Apply train/val/test split by season
X_train = X[X['season'] < 2023].drop(columns='season')
y_train = y[X['season'] < 2023]

X_val = X[X['season'] == 2023].drop(columns='season')
y_val = y[X['season'] == 2023]

X_test = X[X['season'] == 2024].drop(columns='season')
y_test = y[X['season'] == 2024]

# Drop missing values in training set
X_train_clean = X_train.dropna()
y_train_clean = y_train.loc[X_train_clean.index]

# Check shapes after dropping NA
X_train_clean.shape, y_train_clean.shape


In [None]:
# performance-based model using fpts>= a defined threshold
# Retrain logistic regression with revised feature set
logreg_perf_revised = LogisticRegression(max_iter=1000, solver='liblinear')
logreg_perf_revised.fit(X_train_clean, y_train_clean)

# Display sorted coefficients
coefficients_revised = pd.Series(
    logreg_perf_revised.coef_[0], index=X_train_clean.columns
).sort_values(key=abs, ascending=False)

coefficients_revised


In [None]:
# performance-based model using fpts>= a defined threshold
# Validate the model

# Drop missing values in validation set
X_val_clean = X_val.dropna()
y_val_clean = y_val.loc[X_val_clean.index]

# Predict on validation set
y_val_pred = logreg_perf_revised.predict(X_val_clean)
y_val_proba = logreg_perf_revised.predict_proba(X_val_clean)[:, 1]

# Classification report and ROC AUC
val_report = classification_report(y_val_clean, y_val_pred, output_dict=True)
val_auc = roc_auc_score(y_val_clean, y_val_proba)

val_report, val_auc


In [None]:
# performance-based model using fpts>= a defined threshold
# Validate the model

# Pretty print a classification report and ROC AUC score
def print_classification_summary(report_dict, roc_auc_value):
  
    df = pd.DataFrame(report_dict).transpose().round(3)
    
    # Ensure 'support' is always an integer for readability
    if 'support' in df.columns:
        df['support'] = df['support'].astype(int)
    
    # Add ROC AUC row to the bottom
    df.loc["ROC AUC"] = ["", "", "", round(roc_auc_value, 3)]
    
    # Convert to string and print
    print(df.to_string(index=True))

print("\n--- 2023 VALIDATION RESULTS (fpts>= defined threshold ) ---")
print_classification_summary(val_report, val_auc)


In [None]:
# performance-based model using fpts>= a defined threshold
# Test the model

# Drop missing values in test set
X_test_clean = X_test.dropna()
y_test_clean = y_test.loc[X_test_clean.index]

# Predict on test set
y_test_pred = logreg_perf_revised.predict(X_test_clean)
y_test_proba = logreg_perf_revised.predict_proba(X_test_clean)[:, 1]

# Classification report and ROC AUC
test_report = classification_report(y_test_clean, y_test_pred, output_dict=True)
test_auc = roc_auc_score(y_test_clean, y_test_proba)

# Format the report for readability
test_report_df = pd.DataFrame(test_report).transpose()
test_report_df['support'] = test_report_df['support'].astype(int)
test_report_df_rounded = test_report_df.round(3)
test_report_df_rounded.loc["ROC AUC"] = ["", "", "", test_auc]

# Print readable output
# Print output
print("\n--- 2024 TEST RESULTS (fpts>= defined threshold ) ---")
print(test_report_df_rounded.to_string(index=True))
print(f"\nROC AUC: {test_auc:.3f}")


In [None]:
print_classification_summary(test_report, test_auc)


In [None]:
# performance-based model using a percentile-based threshold 
# create the target

# --- Define the percentile threshold for the target (adjustable) ---
performance_percentile_threshold = 0.85  # This can be changed to 0.75, 0.90, etc.

# --- Filter training data for seasons < 2023 ---
training_fpts = df[df['season'] < 2023]['fpts']

# --- Calculate the fantasy point value at the desired percentile ---
fpts_percentile_value = training_fpts.quantile(performance_percentile_threshold)

# --- Create binary target column based on this threshold ---
df['hit_perf_percentile'] = (df['fpts'] >= fpts_percentile_value).astype(int)

# --- Output the threshold value ---
fpts_percentile_value


In [None]:
# performance-based model using a percentile-based threshold 
# get the shape of the training data

# Rebuild feature matrix (X) and updated target (y) after redefining percentile target
X = df[performance_features].copy()
y = df['hit_perf_percentile'].copy()

# Add season for splitting
X['season'] = df['season']
y.index = df.index

# Train: seasons < 2023
X_train = X[X['season'] < 2023].drop(columns='season').dropna()
y_train = y.loc[X_train.index]

# Validation: season == 2023
X_val = X[X['season'] == 2023].drop(columns='season').dropna()
y_val = y.loc[X_val.index]

# Test: season == 2024
X_test = X[X['season'] == 2024].drop(columns='season').dropna()
y_test = y.loc[X_test.index]

# Display the shape of all datasets for verification
{
    "Train Features": X_train.shape,
    "Train Target": y_train.shape,
    "Validation Features": X_val.shape,
    "Validation Target": y_val.shape,
    "Test Features": X_test.shape,
    "Test Target": y_test.shape
}


In [None]:
# performance-based model using a percentile-based threshold 
# train the model - show the features and coefficients

# Train logistic regression model on the 85th percentile target
logreg_perf_percentile = LogisticRegression(max_iter=1000, solver='liblinear')
logreg_perf_percentile.fit(X_train, y_train)

# Display coefficients for interpretability
coefficients_percentile = pd.Series(
    logreg_perf_percentile.coef_[0],
    index=X_train.columns
).sort_values(key=abs, ascending=False)

coefficients_percentile


In [None]:
# performance-based model using a percentile-based threshold 
# validate the model

# Predict on validation set (2023)
y_val_pred = logreg_perf_percentile.predict(X_val)
y_val_proba = logreg_perf_percentile.predict_proba(X_val)[:, 1]

# Evaluate
val_report_percentile = classification_report(y_val, y_val_pred, output_dict=True)
val_auc_percentile = roc_auc_score(y_val, y_val_proba)

# Format for readable output
val_df_percentile = pd.DataFrame(val_report_percentile).transpose().round(3)
val_df_percentile['support'] = val_df_percentile['support'].astype(int)
val_df_percentile.loc["ROC AUC"] = ["", "", "", round(val_auc_percentile, 3)]

# Print output
print("\n--- 2023 VALIDATION RESULTS (85th Percentile Target) ---")
print(val_df_percentile.to_string(index=True))


In [None]:
print_classification_summary(val_report_percentile, val_auc_percentile)

In [None]:
# performance-based model using a percentile-based threshold 
# test the model

# Predict on 2024 test set
y_test_pred = logreg_perf_percentile.predict(X_test)
y_test_proba = logreg_perf_percentile.predict_proba(X_test)[:, 1]

# Evaluate
test_report_percentile = classification_report(y_test, y_test_pred, output_dict=True)
test_auc_percentile = roc_auc_score(y_test, y_test_proba)

# Format output
test_df_percentile = pd.DataFrame(test_report_percentile).transpose().round(3)
test_df_percentile['support'] = test_df_percentile['support'].astype(int)
test_df_percentile.loc["ROC AUC"] = ["", "", "", round(test_auc_percentile, 3)]

# Print test results
print("\n--- 2024 TEST RESULTS (85th Percentile Target) ---")
print(test_df_percentile.to_string(index=True))


In [None]:
### End: performance-based logistic regression classifier model ###

In [None]:
### Begin: Prediction Dataframes  ###

In [None]:
# --- Your helper functions ---
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)
    return ((current_date - season_start_date).days // 7) + 1

def get_year_range(current_year, current_week, start_year=2017):
    return list(range(start_year, current_year + 1)) if current_week <= 18 else list(range(start_year, current_year))


In [None]:
# List and print all columns from the loaded dataframe
df_columns = df.columns.tolist()

print(f"üß† Total Columns: {len(df_columns)}\n")
for col in df_columns:
    print(col)


In [None]:
# ‚úÖ Baseline metadata columns
baseline_cols = [
    "season", "season_type", "week", "name", "position", "recent_team",
    "player_display_name", "team_abbr_x", "rost", "dk_salary", "fd_salary"
]

# ‚úÖ Target columns
target_cols = [
    "hit_value_dk",         # Value-based threshold
    "hit_perf_fixed",       # FPTS ‚â• 10 (formerly hit_perf_fpts10)
    "hit_perf_percentile"   # Top 15% WR performance
]

# ‚úÖ Optional display columns (for dashboard or inspection)
optional_display_cols = [
    "fpts", "value_ratio_dk", "fpts_3wk_avg", 
    "targets", "receptions", "receiving_yards", 
    "receiving_air_yards", "target_share", "catch_percentage"
]

# ‚úÖ Feature columns for logistic regression (hit_value_dk)
feature_cols = [
    "dk_salary", "rost", "fpts_3wk_avg", "value_ratio_dk",
    "targets", "receptions", "receiving_yards", 
    "receiving_air_yards", "target_share", "catch_percentage"
]


In [None]:
# rename df to wr
wr_df = df.copy()

In [None]:
# Value-Based Prediction Dataframe

# column cleaning
for col in feature_cols:
    non_numeric = wr_df[col][~wr_df[col].apply(lambda x: isinstance(x, (int, float)))]
    if not non_numeric.empty:
        print(f"\nüö® Column: {col}")
        print(non_numeric.unique()[:5])


In [None]:
# Value-Based Prediction Dataframe
# column cleaning and data formatting

# Patch: recompute value_ratio_dk if missing or all zero
if df["value_ratio_dk"].sum() == 0:
    df["value_ratio_dk"] = df["fpts"] / df["dk_salary"]

# scale the ratio to get integers and not tiny decimals
df["value_ratio_dk"] = df["fpts"] / (df["dk_salary"] / 1000)
df["value_ratio_dk"] = df["value_ratio_dk"].replace([np.inf, -np.inf], np.nan)


In [None]:
# Value-Based Prediction Dataframe

# check to ensure cleaning was successful
df["value_ratio_dk"].describe()
df["value_ratio_dk"].value_counts().head()


In [None]:
## ** INPUT REQUIRED **
# add filtering option for backtest or live predictions

# filter option for backtest or live predictions
backtest_mode = True  # Set to False for in-season use

# Define current season context
current_year = datetime.now().year
current_week = get_current_week()
years = get_year_range(current_year, current_week)

# Filter the main df
if backtest_mode:
    df = df[df["season"].isin(years)].copy()
else:
    df = df[df["season"] == current_year].copy()



In [None]:
# *** Create New Dataframe: Value-Based Prediction Dataframe ***

# --- Step 1: Filter dataset ---
wr_df = wr_df[wr_df["position"] == "WR"]
wr_df = wr_df.dropna(subset=["hit_value_dk"])


# --- Step 2: Filter dataset ---
percent_cols = ["catch_percentage", "target_share", "rost"]

for col in percent_cols:
    if col in wr_df.columns:
        wr_df[col] = (
            wr_df[col]
            .astype(str)
            .str.strip()
            .str.replace('%', '', regex=False)
        )
        wr_df[col] = pd.to_numeric(wr_df[col], errors='coerce')


# ‚úÖ Now drop rows with any remaining NA in features
wr_df = wr_df.dropna(subset=feature_cols)


X = wr_df[feature_cols].copy()
y = wr_df["hit_value_dk"]

# --- Step 3: Standardize features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Step 4: Train logistic regression ---
model = LogisticRegression(max_iter=1000)
model.fit(X_scaled, y)

# --- Step 5: Generate predictions ---
pred_probs = model.predict_proba(X_scaled)[:, 1]
pred_classes = model.predict(X_scaled)

# --- Step 6: Add predictions to dataframe ---
wr_df["pred_prob_value"] = pred_probs
wr_df["pred_class_value"] = pred_classes

# --- Step 7: Build prediction dataframe ---
prediction_df_value = wr_df[
    baseline_cols +
    target_cols[:1] +  # Only 'hit_value_dk'
    ["pred_prob_value", "pred_class_value"] +
    optional_display_cols
].copy()


In [None]:
# Quick inspection of prediction_df_value
print(f"‚úÖ Rows: {len(prediction_df_value)} | Columns: {prediction_df_value.shape[1]}")
display(prediction_df_value.head())


In [None]:
# Verify distribution of new value-based prediction dataframe ***

# Distribution of predicted probabilities
prediction_df_value["pred_prob_value"].describe()

# Value counts for predicted vs. actual
print("üìä Predicted Classes:")
print(prediction_df_value["pred_class_value"].value_counts())

print("\nüéØ Actual Outcomes (hit_value_dk):")
print(prediction_df_value["hit_value_dk"].value_counts())


In [None]:
# *** New Dataframe: performance-based prediction dataframe fpts>= threshold ***

# --- Step 1: Filter dataset for FPTS >= 10 target ---
wr_df = df.copy()
wr_df = wr_df[wr_df["position"] == "WR"]
wr_df = wr_df.dropna(subset=["hit_perf_fixed"])

# --- Step 2: Fix percentage columns ---
percent_cols = ["catch_percentage", "target_share", "rost"]
for col in percent_cols:
    if col in wr_df.columns:
        wr_df[col] = (
            wr_df[col]
            .astype(str)
            .str.strip()
            .str.replace('%', '', regex=False)
        )
        wr_df[col] = pd.to_numeric(wr_df[col], errors='coerce')

# --- Step 3: Drop rows with missing feature values ---
wr_df = wr_df.dropna(subset=feature_cols)

# --- Step 4: Build model inputs ---
X = wr_df[feature_cols].copy()
y = wr_df["hit_perf_fixed"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression(max_iter=1000)
model.fit(X_scaled, y)

# --- Step 5: Predictions ---
wr_df["pred_prob_fpts10"] = model.predict_proba(X_scaled)[:, 1]
wr_df["pred_class_fpts10"] = model.predict(X_scaled)

# --- Step 6: Build prediction dataframe ---
prediction_df_fpts10 = wr_df[
    baseline_cols +
    target_cols[1:2] +  # Only 'hit_perf_fixed'
    ["pred_prob_fpts10", "pred_class_fpts10"] +
    optional_display_cols
].copy()


In [None]:
# performance-based prediction dataframe fpts>= threshold 
# check the distribution and df

# Size and preview
print(f"‚úÖ Rows: {len(prediction_df_fpts10)} | Columns: {prediction_df_fpts10.shape[1]}")
display(prediction_df_fpts10.head())

# Distribution of predicted classes
print("\nüìä Predicted Classes (FPTS10):")
print(prediction_df_fpts10["pred_class_fpts10"].value_counts())

# Distribution of actual outcomes
print("\nüéØ Actual Outcomes (hit_perf_fixed):")
print(prediction_df_fpts10["hit_perf_fixed"].value_counts())

# Optional: Look at prediction probabilities
print("\nüìà Probability Distribution:")
print(prediction_df_fpts10["pred_prob_fpts10"].describe())


In [None]:
# *** New Dataframe: performance-based prediction dataframe percentage>= threshold ***
# --- Step 1: Filter dataset for 85th percentile target ---
wr_df = df.copy()
wr_df = wr_df[wr_df["position"] == "WR"]
wr_df = wr_df.dropna(subset=["hit_perf_percentile"])

# --- Step 2: Fix percentage columns ---
percent_cols = ["catch_percentage", "target_share", "rost"]
for col in percent_cols:
    if col in wr_df.columns:
        wr_df[col] = (
            wr_df[col]
            .astype(str)
            .str.strip()
            .str.replace('%', '', regex=False)
        )
        wr_df[col] = pd.to_numeric(wr_df[col], errors='coerce')

# --- Step 3: Drop NA in features ---
wr_df = wr_df.dropna(subset=feature_cols)

# --- Step 4: Prepare model inputs ---
X = wr_df[feature_cols].copy()
y = wr_df["hit_perf_percentile"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression(max_iter=1000)
model.fit(X_scaled, y)

# --- Step 5: Predictions ---
wr_df["pred_prob_85pct"] = model.predict_proba(X_scaled)[:, 1]
wr_df["pred_class_85pct"] = model.predict(X_scaled)

# --- Step 6: Build prediction dataframe ---
prediction_df_85pct = wr_df[
    baseline_cols +
    target_cols[2:] +  # Only 'hit_perf_percentile'
    ["pred_prob_85pct", "pred_class_85pct"] +
    optional_display_cols
].copy()


In [None]:
# performance-based prediction dataframe percentage>= threshold 
# check the distribution and df
# ‚úÖ Basic preview
print(f"‚úÖ Rows: {len(prediction_df_85pct)} | Columns: {prediction_df_85pct.shape[1]}")
display(prediction_df_85pct.head())

# üìä Predicted class distribution
print("\nüìä Predicted Classes (85th Percentile):")
print(prediction_df_85pct["pred_class_85pct"].value_counts())

# üéØ Actual class distribution
print("\nüéØ Actual Outcomes (hit_perf_percentile):")
print(prediction_df_85pct["hit_perf_percentile"].value_counts())

# üìà Probability distribution
print("\nüìà Probability Distribution:")
print(prediction_df_85pct["pred_prob_85pct"].describe())



In [None]:
# prepare to merge the prediction dataframes
# adjust column order
final_column_order = (
    baseline_cols +                        # Core metadata
    ["hit_value_dk", "hit_perf_fixed", "hit_perf_percentile"] +  # Backtest-only truth values
    ["pred_prob_value", "pred_class_value",
     "pred_prob_fpts10", "pred_class_fpts10",
     "pred_prob_85pct", "pred_class_85pct"] +                   # All model predictions
    optional_display_cols                  # Contextual stats for dashboard/visuals
)


In [None]:
# *** New Merged Prediction Dataframes and csv files *** 
# --- Merge all three predictions on identifying columns ---
merge_keys = ["season", "week", "name", "recent_team"]

merged_df = (
    prediction_df_value
    .merge(prediction_df_fpts10, on=merge_keys, suffixes=("", "_f10"))
    .merge(prediction_df_85pct, on=merge_keys, suffixes=("", "_p85"))
)

# --- Rename final output for backtesting ---
wr_logit_predictions_all_models_backtest = merged_df[
    final_column_order  # defined earlier to control output structure
].copy()


In [None]:
# ** New csv file of the merged prediction dataframes **


# --- Calculate year range and export filename ---
current_year = datetime.now().year
current_week = get_current_week()
years = get_year_range(current_year, current_week)

csv_filename = f"wr_logit_predictions_all_models_{years[0]}_{years[-1]}.csv"

# --- Export to CSV ---
wr_logit_predictions_all_models_backtest.to_csv(csv_filename, index=False)
print(f"‚úÖ Backtest prediction CSV saved: {csv_filename}")


In [None]:
### End: Prediction Dataframes ###

In [None]:
# Begin: Simulate Monte Carlo Trials (Basic Version)

In [None]:
# Run Monte Carlo simulations for player-week predictions.
def run_monte_carlo_simulation(df, player_col="name", season_col="season", week_col="week",
                                prob_col="pred_prob_value", n_simulations=1000):
    results = []

    for (player, season), group in df.groupby([player_col, season_col]):
        probs = group.sort_values(week_col)[prob_col].values
        weeks = group.sort_values(week_col)[week_col].values

        # Bernoulli trials: n_simulations x n_weeks
        sim_matrix = np.random.rand(n_simulations, len(probs)) < probs
        total_hits = sim_matrix.sum(axis=1)

        results.append({
            "player": player,
            "season": season,
            "weeks_played": len(weeks),
            "avg_hits": total_hits.mean(),
            "min_hits": total_hits.min(),
            "max_hits": total_hits.max(),
            "std_hits": total_hits.std(),
            "p_hit_all_weeks": np.mean(total_hits == len(probs)),
            "p_hit_half_or_more": np.mean(total_hits >= (len(probs) // 2)),
        })

    return pd.DataFrame(results)


In [None]:
# Monte Carlo for hit value
monte_carlo_results_value = run_monte_carlo_simulation(prediction_df_value)
print(f"‚úÖ Monte Carlo complete for {len(monte_carlo_results_value)} players")
display(monte_carlo_results_value.head())
monte_carlo_results_value.to_csv("wr_monte_carlo_value.csv", index=False)


In [None]:
# Monte Carlo for hit_perf_fpts
monte_carlo_results_fpts10 = run_monte_carlo_simulation(
    prediction_df_fpts10,
    prob_col="pred_prob_fpts10"
)

print(f"‚úÖ Monte Carlo complete for FPTS10: {len(monte_carlo_results_fpts10)} players")
display(monte_carlo_results_fpts10.head())

# Optional export
monte_carlo_results_fpts10.to_csv("wr_monte_carlo_fpts.csv", index=False)


In [None]:
# Monte Carlo for hit_perf_percentile
monte_carlo_results_85pct = run_monte_carlo_simulation(
    prediction_df_85pct,
    prob_col="pred_prob_85pct"
)

print(f"‚úÖ Monte Carlo complete for 85th Percentile: {len(monte_carlo_results_85pct)} players")
display(monte_carlo_results_85pct.head())

# Optional export
monte_carlo_results_85pct.to_csv("wr_monte_carlo_pct.csv", index=False)


In [None]:
# Inspect unique player seasons for monte carlo simulations

# For validation only ‚Äî avoid relying on UI load section
logit_df = pd.read_csv("wr_logit_predictions_all_models_2017_2024.csv")
n_player_seasons = logit_df[['name', 'season']].drop_duplicates()

# output should match
mc_value_df = pd.read_csv("wr_monte_carlo_value.csv")

print(f"üîç Total unique player-seasons in predictions: {len(n_player_seasons)}")
print(f"üìâ Rows in monte_carlo_results_value: {len(mc_value_df)}")


In [None]:
# End: Simulate Monte Carlo Trials (Basic Version)

In [None]:
### Begin: User Interface ###

In [None]:
!pip install fuzzywuzzy[speedup]


In [None]:
from fuzzywuzzy import process
import pandas as pd
import numpy as np
from IPython.display import display
import xlsxwriter
from fpdf import FPDF

In [None]:
# Load logistic regression results
logit_df = pd.read_csv("wr_logit_predictions_all_models_2017_2024.csv")

mc_dict = {
    "monte_carlo_value": pd.read_csv("wr_monte_carlo_value.csv").rename(columns={"player": "name"}),
    "monte_carlo_fpts10": pd.read_csv("wr_monte_carlo_fpts.csv").rename(columns={"player": "name"}),
    "monte_carlo_85pct": pd.read_csv("wr_monte_carlo_pct.csv").rename(columns={"player": "name"})
}


# # Load Monte Carlo data into dictionary (you already have this)
# mc_dict = {
#     "monte_carlo_value": pd.read_csv("wr_monte_carlo_value.csv"),
#     "monte_carlo_fpts10": pd.read_csv("wr_monte_carlo_fpts.csv"),
#     "monte_carlo_85pct": pd.read_csv("wr_monte_carlo_pct.csv")
# }


In [None]:
column_groups = {
    "fpts": ["fpts", "value_ratio_dk", "value_ratio_fd"],
    "touches_athleticism": ["targets", "receptions", "receiving_yards", "receiving_yards_after_catch"],
    "efficiency": ["catch_percentage", "target_share", "receiving_air_yards"],
    "separation": ["avg_cushion", "avg_separation"],
    "zscore_fpts": ["fpts_zscore", "value_ratio_dk_zscore", "value_ratio_fd_zscore"],
    "rolling_avgs": ["fpts_3wk_avg", "receptions_3wk_avg", "targets_3wk_avg"],
}


In [None]:
def run_prediction_lookup_ui_menu(logit_df, mc_dict):
    """
    Unified interface to look up player-season predictions with fuzzy name matching,
    and export options to Excel or PDF.
    """

    print("=== Logit + Monte Carlo Prediction Lookup ===")
    print("Type 'exit' at any prompt to quit.\n")

    while True:
        name_input = input("Enter player name (e.g., A.J. Brown): ").strip().lower()
        if name_input == "exit":
            break

        # Fuzzy match
        all_names = logit_df['name'].dropna().unique()
        best_match, score = process.extractOne(name_input, all_names)
        if score < 80:
            print(f"‚ùå No good match found. Closest was '{best_match}' (score: {score}). Try again.\n")
            continue
        matched_name = best_match
        print(f"üîç Best match: {matched_name} (score: {score})")

        season_input = input("Enter season (e.g., 2024): ").strip()
        if season_input == "exit":
            break
        if not season_input.isdigit():
            print("‚ö†Ô∏è Invalid season. Try again.\n")
            continue
        season_input = int(season_input)

        mode_input = input("Select mode ('logit', 'mc', or 'all'): ").strip().lower()
        if mode_input == "exit":
            break
        if mode_input not in ["logit", "mc", "all"]:
            print("‚ö†Ô∏è Invalid mode. Choose from 'logit', 'mc', or 'all'.\n")
            continue

        # Filter logit data
        player_logit = logit_df[
            (logit_df['name'] == matched_name) &
            (logit_df['season'] == season_input)
        ]

        # Filter Monte Carlo data
        player_mc = {}
        for key, df in mc_dict.items():
            match = df[
                (df['name'] == matched_name) &
                (df['season'] == season_input)
            ]
            player_mc[key] = match

        # No data found
        if player_logit.empty and all(df.empty for df in player_mc.values()):
            print("‚ùå No data found for that player and season.\n")
            continue

        # Display logit
        if mode_input in ["logit", "all"] and not player_logit.empty:
            print("\n--- Logistic Regression Prediction ---")
            display(player_logit)

        # Display Monte Carlo
        if mode_input in ["mc", "all"]:
            print("\n--- Monte Carlo Forecasts ---")
            for key, df in player_mc.items():
                if not df.empty:
                    print(f"\nüìä {key.replace('_', ' ').title()}")
                    display(df)
                else:
                    print(f"‚ö†Ô∏è No data in {key} for this player-season.")

        # Export options
        export = input("Export results? (excel/pdf/none): ").strip().lower()
        if export == "exit":
            break

        # Excel Export
        if export == "excel":
            filename = f"{matched_name.replace(' ', '_')}_{season_input}_predictions.xlsx"
            writer = pd.ExcelWriter(filename, engine="xlsxwriter")
            if not player_logit.empty:
                player_logit.to_excel(writer, sheet_name="Logit", index=False)
            for key, df in player_mc.items():
                if not df.empty:
                    sheet = key[:31]  # Excel sheet name limit
                    df.to_excel(writer, sheet_name=sheet, index=False)
            writer.close()
            print(f"‚úÖ Exported to Excel: {filename}")

        # PDF Export
        elif export == "pdf":
            filename = f"{matched_name.replace(' ', '_')}_{season_input}_predictions.pdf"
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Arial", size=12)
            pdf.cell(200, 10, txt=f"{matched_name} - {season_input} Predictions", ln=True)

            if not player_logit.empty:
                pdf.set_font("Arial", "B", 12)
                pdf.cell(200, 10, txt="--- Logistic Regression ---", ln=True)
                pdf.set_font("Arial", size=11)
                for col in player_logit.columns:
                    val = str(player_logit.iloc[0][col])
                    pdf.cell(200, 8, txt=f"{col}: {val}", ln=True)

            for key, df in player_mc.items():
                if not df.empty:
                    pdf.set_font("Arial", "B", 12)
                    pdf.cell(200, 10, txt=f"--- {key.replace('_', ' ').title()} ---", ln=True)
                    pdf.set_font("Arial", size=11)
                    for col in df.columns:
                        val = str(df.iloc[0][col])
                        pdf.cell(200, 8, txt=f"{col}: {val}", ln=True)

            pdf.output(filename)
            print(f"‚úÖ Exported to PDF: {filename}")

        elif export not in ["none", ""]:
            print("‚ö†Ô∏è Invalid export option. Skipped export.")

        print("\n‚úì Lookup complete.\n")


In [None]:
def run_multi_player_visuals(df=wr_df, column_groups=column_groups):
    """
    Compare up to 3 players across a selected stat category.
    Supports z-score normalization and works across seasons.
    """

    print("=== Multi-Player Comparison ===")
    
    players = []
    for i in range(3):
        name = input(f"Enter player {i+1} name (or press Enter to skip): ").strip()
        if name:
            players.append(name)
    
    if not players:
        print("‚ùå No players entered.")
        return

    season_input = input("Enter season (e.g., 2023): ").strip()
    if not season_input.isdigit():
        print("‚ö†Ô∏è Invalid season.")
        return
    season_input = int(season_input)

    stat_group = input("Enter stat group (e.g., fpts, touches_athleticism): ").strip()
    if stat_group not in column_groups:
        print("‚ö†Ô∏è Invalid stat group.")
        return

    use_zscore = input("Use z-score mode? (yes/no): ").strip().lower() == "yes"
    stat_cols = column_groups[stat_group]

    # Set up plots
    import matplotlib.pyplot as plt
    fig, axes = plt.subplots(len(stat_cols), len(players), figsize=(5 * len(players), 4 * len(stat_cols)))
    if len(stat_cols) == 1:
        axes = [axes]
    if len(players) == 1:
        axes = [[ax] for ax in axes]

    for col_idx, stat in enumerate(stat_cols):
        for player_idx, player in enumerate(players):
            df_player = df[
                (df['name'].str.lower() == player.lower()) &
                (df['season'] == season_input)
            ]
            if df_player.empty:
                axes[col_idx][player_idx].set_title(f"{player} ‚Äî No Data")
                axes[col_idx][player_idx].axis("off")
                continue

            plot_data = df_player.copy()
            if use_zscore:
                mean = plot_data[stat].mean()
                std = plot_data[stat].std()
                plot_data[stat] = (plot_data[stat] - mean) / std

            axes[col_idx][player_idx].plot(plot_data['week'], plot_data[stat], marker='o')
            axes[col_idx][player_idx].set_title(f"{player} ‚Äî {stat}")
            axes[col_idx][player_idx].set_xlabel("Week")
            axes[col_idx][player_idx].set_ylabel("Z-Score" if use_zscore else stat)

    plt.tight_layout()
    plt.show()


In [None]:
def run_single_player_visuals(df=wr_df, column_groups=column_groups):
    """
    Show weekly trendlines for a single player across a stat group.
    """

    print("=== Single Player Trendline Visualization ===")
    
    name_input = input("Enter player name: ").strip().lower()
    season_input = input("Enter season (e.g., 2024): ").strip()
    stat_group = input("Enter stat group (e.g., fpts, touches_athleticism): ").strip()
    use_zscore = input("Use z-score mode? (yes/no): ").strip().lower() == "yes"

    if stat_group not in column_groups:
        print("‚ùå Invalid stat group.")
        return

    season_input = int(season_input)
    stat_cols = column_groups[stat_group]

    player_df = df[
        (df['name'].str.lower() == name_input) &
        (df['season'] == season_input)
    ]

    if player_df.empty:
        print("‚ùå No data found for that player/season.")
        return

    import matplotlib.pyplot as plt
    fig, axes = plt.subplots(len(stat_cols), 1, figsize=(8, 4 * len(stat_cols)))

    if len(stat_cols) == 1:
        axes = [axes]

    for idx, col in enumerate(stat_cols):
        y_data = player_df[col]
        if use_zscore:
            y_data = (y_data - y_data.mean()) / y_data.std()

        axes[idx].plot(player_df['week'], y_data, marker='o')
        axes[idx].set_title(f"{name_input.title()} ‚Äî {col} ({'Z-Score' if use_zscore else 'Raw'})")
        axes[idx].set_xlabel("Week")
        axes[idx].set_ylabel(col)

    plt.tight_layout()
    plt.show()


In [None]:
def run_player_dashboard_summary(logit_df, mc_dict):
    """
    Smart interactive dashboard tool for in-season or off-season.
    Uses current year/week defaults unless overridden.
    """
    from fuzzywuzzy import process
    from datetime import datetime

    print("\n=== Player Performance Dashboard ===")
    print("Type 'exit' at any prompt to cancel.\n")

    # --- Set current context ---
    current_year = datetime.now().year
    current_week = get_current_week()
    years = get_year_range(current_year, current_week)
    
    # Smart season + week defaults
    if 1 <= current_week <= 18:
        default_season = years[-1]
        default_week = current_week
    else:
        # If offseason, use most recent complete season
        default_season = years[-1]  # not years[-2]
        default_week = 18


    # --- Player Input ---
    player_input = input("Enter player name: ").strip().lower()
    if player_input == "exit":
        return

    all_names = logit_df['name'].dropna().unique()
    best_match, score = process.extractOne(player_input, all_names)
    if score < 80:
        print(f"‚ùå No good match found. Closest was '{best_match}' (score: {score})")
        return
    matched_name = best_match
    print(f"üîç Best match: {matched_name} (score: {score})")

    # --- Season Input ---
    season_input = input(f"Enter season (default: {default_season}): ").strip()
    if season_input == "exit":
        return
    season = int(season_input) if season_input.isdigit() else default_season

    # --- Week Input ---
    week_input = input(f"Enter week (default: {default_week}): ").strip()
    if week_input == "exit":
        return
    week = int(week_input) if week_input.isdigit() else default_week

    # --- Lookup Logit Row ---
    logit_row = logit_df[
        (logit_df['name'] == matched_name) &
        (logit_df['season'] == season) &
        (logit_df['week'] == week)
    ]

    if logit_row.empty:
        print("‚ùå No logistic prediction found for that player and week.")
        return
    logit_row = logit_row.iloc[0]

    # --- Get Monte Carlo Rows ---
    mc_summary = {}
    for key, df in mc_dict.items():
        df_player = df[
            (df['name'].str.lower() == matched_name.lower()) &
            (df['season'] == season)
        ]
        mc_summary[key] = df_player.iloc[0] if not df_player.empty else None

    # --- Display Summary ---
    print("\nüìä === Player Dashboard Summary ===")
    print(f"Player: {matched_name} | Week: {week} | Season: {season}")
    print(f"DK Salary: ${int(logit_row['dk_salary'])}")
    print(f"Actual FPTS (Week {week}): {logit_row.get('fpts', '‚Äî')}")

    print("\nüî¢ Logistic Regression Predictions:")
    def classify(prob, threshold=0.5):
        if prob >= 0.85: return f"{prob:.2f} üîµ High"
        if prob >= threshold: return f"{prob:.2f} üü° Medium"
        return f"{prob:.2f} üî¥ Low"

    print(f" - P(Hit DK Value):     {classify(logit_row['pred_prob_value'])} ‚Äî Predicted: {'‚úÖ' if logit_row['pred_class_value'] else '‚ùå'}")
    print(f" - P(FPTS ‚â• 10):        {classify(logit_row['pred_prob_fpts10'])} ‚Äî Predicted: {'‚úÖ' if logit_row['pred_class_fpts10'] else '‚ùå'}")
    print(f" - P(Elite Tier 85%):   {classify(logit_row['pred_prob_85pct'])} ‚Äî Predicted: {'‚úÖ' if logit_row['pred_class_85pct'] else '‚ùå'}")

    print("\nüîÆ Monte Carlo Forecasts (Rest of Season):")
    for key in mc_dict.keys():
        mc_row = mc_summary[key]
        label = key.replace("monte_carlo_", "").upper()
        if mc_row is not None:
            print(f"\n--- {label} ---")
            print(f" Avg Hits:         {mc_row['avg_hits']:.2f}")
            print(f" Min‚ÄìMax Hits:     {mc_row['min_hits']} ‚Äì {mc_row['max_hits']}")
            print(f" Std Dev:          {mc_row['std_hits']:.2f}")
            print(f" P(Hit All Weeks): {mc_row['p_hit_all_weeks']:.2f}")
            print(f" P(Hit ‚â• Half):    {mc_row['p_hit_half_or_more']:.2f}")
        else:
            print(f"\n--- {label} ---\nNo Monte Carlo data available.")

    print("\n‚úÖ Summary Complete.\n")


In [None]:
def run_main_nfl_model_tools_menu():
    """
    Interactive menu to run NFL model tools from one interface.
    """

    while True:
        print("\n=== Main NFL Model Tools Menu ===")
        print("1. üß† Player Performance Dashboard")
        print("2. üîç Lookup Predictions (Logit + Monte Carlo)")
        print("3. üìà Single Player Trendline Visualization")
        print("4. üë• Multi-Player Comparison")
        print("5. üìä Season Summary Reports")
        print("0. ‚ùå Exit")

        choice = input("Enter your choice: ").strip()

        if choice == "1":
            run_player_dashboard_summary(logit_df, mc_dict)
        elif choice == "2":
            run_prediction_lookup_ui_menu(logit_df, mc_dict)
        elif choice == "3":
            run_single_player_visuals()
        elif choice == "4":
            run_multi_player_visuals()
        elif choice == "5":
            run_season_visuals()
        elif choice == "0":
            print("üëã Exiting. See you next time!")
            break
        else:
            print("‚ö†Ô∏è Invalid option. Please enter a number from 0 to 5.")


In [None]:
### ** USER INPUT ** ###
run_main_nfl_model_tools_menu()


In [None]:
### End: User Interface ###