In [351]:
%pip install imbalanced-learn xgboost shap scipy scikit-learn pandas numpy autogluon ctgan sentence-transformers mlflow

StatementMeta(pocsparkpool, 51, 70, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.





In [352]:
# import sys
# import json
# import os

# def load_athlete_input():
#     # Look for real, user-supplied arguments (not -f or .json connection files)
#     cli_args = [arg for arg in sys.argv[1:] if not (arg.startswith('-f') or arg.endswith('.json'))]
#     if cli_args:
#         arg = cli_args[0]
#         try:
#             athlete_input = json.loads(arg)
#             print("Loaded athlete input from JSON string.")
#         except json.JSONDecodeError:
#             if os.path.isfile(arg):
#                 try:
#                     with open(arg, "r") as f:
#                         athlete_input = json.load(f)
#                     print(f"Loaded athlete input from file: {arg}")
#                 except Exception as e:
#                     print(f"Failed to parse JSON file: {e}")
#                     athlete_input = None
#             else:
#                 print(f"Input is neither a valid JSON string nor a file: {arg}")
#                 athlete_input = None
#     else:
#         # Default for notebook/test/dev
#         athlete_input = {
#             'Senior_Yds': 1123, 'Senior_Avg': 17.3, 'Senior_Rec': 65, 'Senior_TD': 12, 'Senior_Rush_Yds': 100,
#             'Height_Inches': 71, 'Weight_Lbs': 180, 'Forty_Yard_Dash': 4.40, 'Vertical_Jump': 39, 'Shuttle': 4.05,
#             'Broad_Jump': 125, 'State': 'TX', 'position': 'WR', 'grad_year': 2025
#         }
#         print("No valid user input detected – using default test athlete.")
#     if not isinstance(athlete_input, dict):
#         raise ValueError("No valid athlete input found (check input or Synapse pipeline config).")
#     return athlete_input

# athlete_input = load_athlete_input()


StatementMeta(pocsparkpool, 51, 72, Finished, Available, Finished)

In [353]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
from scipy.stats import mstats, percentileofscore 
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
np.random.seed(42)

# CRITICAL: Add XGBoost safeguard function to prevent duplicate column errors
def xgboost_safeguard(X_train, X_test, step_name="Model Training"):
    """
    Comprehensive safeguard function to ensure XGBoost compatibility
    Removes duplicate columns and validates data before training
    """
    print(f"\n🛡️  XGBoost Safeguard - {step_name}")
    print("-" * 50)
    
    # Step 1: Check for duplicate column names
    train_duplicates = X_train.columns.duplicated()
    test_duplicates = X_test.columns.duplicated()
    
    if train_duplicates.any():
        duplicate_cols = X_train.columns[train_duplicates].unique()
        print(f"⚠️  Found {len(duplicate_cols)} duplicate columns in training data: {list(duplicate_cols)}")
        X_train = X_train.loc[:, ~X_train.columns.duplicated(keep='first')]
        print(f"✅ Removed duplicates from training data: {X_train.shape}")
    
    if test_duplicates.any():
        duplicate_cols = X_test.columns[test_duplicates].unique()
        print(f"⚠️  Found {len(duplicate_cols)} duplicate columns in test data: {list(duplicate_cols)}")
        X_test = X_test.loc[:, ~X_test.columns.duplicated(keep='first')]
        print(f"✅ Removed duplicates from test data: {X_test.shape}")
    
    # Step 2: Ensure both datasets have the same columns
    train_cols = set(X_train.columns)
    test_cols = set(X_test.columns)
    
    if train_cols != test_cols:
        print(f"⚠️  Column mismatch detected")
        print(f"   Training columns: {len(train_cols)}")
        print(f"   Test columns: {len(test_cols)}")
        
        # Use intersection of columns
        common_cols = list(train_cols & test_cols)
        print(f"   Using {len(common_cols)} common columns")
        
        X_train = X_train[common_cols]
        X_test = X_test[common_cols]
    
    # Step 3: Validate data types for XGBoost compatibility
    invalid_dtypes = []
    for col in X_train.columns:
        if X_train[col].dtype not in ['int64', 'float64', 'int32', 'float32', 'bool', 'int8', 'float16']:
            invalid_dtypes.append((col, X_train[col].dtype))
    
    if invalid_dtypes:
        print(f"⚠️  Found {len(invalid_dtypes)} columns with invalid dtypes:")
        for col, dtype in invalid_dtypes[:5]:  # Show first 5
            print(f"   {col}: {dtype}")
        
        # Convert to numeric
        for col, _ in invalid_dtypes:
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce').fillna(0)
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)
        print(f"✅ Converted invalid dtypes to numeric")
    
    # Step 4: Check for infinite values
    train_inf = np.isinf(X_train).any().any()
    test_inf = np.isinf(X_test).any().any()
    
    if train_inf or test_inf:
        print(f"⚠️  Found infinite values - replacing with NaN then 0")
        X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)
        print(f"✅ Replaced infinite values")
    
    # Step 5: Final validation - absolutely no duplicates
    final_train_dups = X_train.columns.duplicated().any()
    final_test_dups = X_test.columns.duplicated().any()
    
    if final_train_dups or final_test_dups:
        print(f"🚨 CRITICAL: Still have duplicates after safeguards!")
        # Nuclear option: rename all columns to generic names
        n_cols = len(X_train.columns)
        new_col_names = [f"feature_{i:03d}" for i in range(n_cols)]
        X_train.columns = new_col_names
        X_test.columns = new_col_names
        print(f"🔧 Applied nuclear fix: renamed all columns to generic names")
    
    print(f"✅ XGBoost Safeguard Complete")
    print(f"   Training data: {X_train.shape}")
    print(f"   Test data: {X_test.shape}")
    print(f"   Data types: {X_train.dtypes.value_counts().to_dict()}")
    
    return X_train, X_test

# Modular functions for easy upgrades
def augment_data(X, y):
    """ADASYN for now, but easy to swap with GAN/ctgan/SMOTE."""
    try:
        adasyn = ADASYN(random_state=42, sampling_strategy='auto')
        X_aug, y_aug = adasyn.fit_resample(X, y)
        print(f"ADASYN: {X.shape} -> {X_aug.shape}")
        return X_aug, y_aug
    except ValueError as e:
        print(f"ADASYN failed ({e}), using original data")
        return X, y

def train_model(X, y):
    """XGBClassifier for now, easy to swap with AutoGluon/CatBoost/Ordinal."""
    # Apply XGBoost safeguard before training
    if len(X.shape) == 2:  # Only apply to feature matrices
        # Create dummy test set for validation (will be ignored)
        X_dummy = X.iloc[:5].copy() if hasattr(X, 'iloc') else X[:5].copy()
        X, X_dummy = xgboost_safeguard(X, X_dummy, "Pre-Training Validation")
    
    model = XGBClassifier(
        n_estimators=100, 
        max_depth=3, 
        learning_rate=0.1, 
        eval_metric='mlogloss', 
        random_state=42
    )
    model.fit(X, y)
    return model

StatementMeta(pocsparkpool, 51, 73, Finished, Available, Finished)

In [354]:
# Updated Cell 3: Enhanced Data Acquisition with Intelligent Benchmark Imputation

def load_base_csv_enhanced(position):
    """Enhanced data loading with intelligent missing data handling"""
    # Use abfss for Synapse/ADLS Gen2 (recommended with Linked Service)
    paths = {
        'qb': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/221 QB FINAL - Sheet1.csv',
        'rb': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/RB list 1 - Sheet1.csv',
        'wr': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/wr final - Sheet1.csv',
        'db': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/db.csv',
        'lb': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/lb.csv',
        'te': 'abfss://data@recruitrevealstorage2026.dfs.core.windows.net/te.csv'
    }
    path = paths.get(position, paths['qb'])
    try:
        df_spark = spark.read.csv(path, header=True, inferSchema=True)
        df = df_spark.toPandas()
        df.columns = df.columns.str.strip().str.lower()
        print(f"Loaded {len(df)} rows for {position.upper()}")
        if 'division' in df.columns:
            print(f"Unique divisions for {position.upper()}: {df['division'].unique()}")
        else:
            print(f"No 'division' column found for {position.upper()}")
    except Exception as e:
        print(f"Load failed for {position}: {e}")
        df = pd.DataFrame(columns=[
            'name','division','state','height_inches','weight_lbs','senior_yds','senior_avg','senior_rec',
            'senior_td','junior_yds','junior_avg','junior_rec','junior_td','senior_ypg','senior_tds',
            'senior_comp_pct','senior_ypc','senior_rush_yds','grad_year'
        ])
        df.columns = df.columns.str.strip().str.lower()
    
    df.columns = df.columns.str.strip().str.lower()

    # Normalize division spelling everywhere!
    if 'division' in df.columns:
        df['division'] = (
            df['division'].astype(str)
            .str.strip()
            .str.upper()
            .str.replace('POWER5', 'POWER 5', regex=False)
            .str.replace('FBS', 'POWER 5', regex=False)
            .str.replace('D3/NAIA', 'D3', regex=False)
            .str.replace('NAIA', 'NAIA', regex=False)
        )
    
    # Always provide 'position' for downstream logic
    df['position'] = position.lower()
    return df

# High School Football Recruiting Guidelines - Combine Benchmarks by Position/Division
# Based on industry standards and recruiting data
COMBINE_BENCHMARKS = {
    'qb': {
        'POWER 5': {'forty_yard_dash': (4.6, 4.9), 'vertical_jump': (30, 34), 'shuttle': (4.3, 4.6), 'broad_jump': (108, 118)},
        'FCS': {'forty_yard_dash': (4.7, 5.0), 'vertical_jump': (28, 32), 'shuttle': (4.4, 4.7), 'broad_jump': (102, 112)},
        'D2': {'forty_yard_dash': (4.8, 5.1), 'vertical_jump': (26, 30), 'shuttle': (4.5, 4.8), 'broad_jump': (96, 106)},
        'D3': {'forty_yard_dash': (4.9, 5.3), 'vertical_jump': (24, 28), 'shuttle': (4.6, 4.9), 'broad_jump': (90, 100)},
        'NAIA': {'forty_yard_dash': (4.8, 5.2), 'vertical_jump': (25, 29), 'shuttle': (4.5, 4.8), 'broad_jump': (92, 102)}
    },
    'rb': {
        'POWER 5': {'forty_yard_dash': (4.2, 4.5), 'vertical_jump': (34, 38), 'shuttle': (4.0, 4.3), 'broad_jump': (120, 130)},
        'FCS': {'forty_yard_dash': (4.3, 4.6), 'vertical_jump': (32, 36), 'shuttle': (4.1, 4.4), 'broad_jump': (110, 120)},
        'D2': {'forty_yard_dash': (4.4, 4.7), 'vertical_jump': (30, 34), 'shuttle': (4.2, 4.5), 'broad_jump': (100, 110)},
        'D3': {'forty_yard_dash': (4.5, 4.8), 'vertical_jump': (28, 32), 'shuttle': (4.3, 4.6), 'broad_jump': (95, 105)},
        'NAIA': {'forty_yard_dash': (4.4, 4.7), 'vertical_jump': (29, 33), 'shuttle': (4.2, 4.5), 'broad_jump': (98, 108)}
    },
    'wr': {
        'POWER 5': {'forty_yard_dash': (4.4, 4.7), 'vertical_jump': (34, 38), 'shuttle': (4.1, 4.4), 'broad_jump': (120, 130)},
        'FCS': {'forty_yard_dash': (4.5, 4.8), 'vertical_jump': (33, 37), 'shuttle': (4.2, 4.5), 'broad_jump': (110, 120)},
        'D2': {'forty_yard_dash': (4.6, 4.9), 'vertical_jump': (31, 35), 'shuttle': (4.3, 4.6), 'broad_jump': (100, 110)},
        'D3': {'forty_yard_dash': (4.7, 5.0), 'vertical_jump': (29, 33), 'shuttle': (4.4, 4.7), 'broad_jump': (95, 105)},
        'NAIA': {'forty_yard_dash': (4.6, 4.9), 'vertical_jump': (30, 34), 'shuttle': (4.3, 4.6), 'broad_jump': (98, 108)}
    }
}

def intelligent_combine_imputation(df, position):
    """Intelligent imputation using benchmark ranges with Bayesian-inspired priors"""
    df = df.copy()
    position = position.lower()
    
    # Normalize division for lookup
    df['division_lookup'] = df['division'].str.upper()
    
    combine_metrics = ['forty_yard_dash', 'vertical_jump', 'shuttle', 'broad_jump']
    position_benchmarks = COMBINE_BENCHMARKS.get(position, COMBINE_BENCHMARKS['qb'])
    
    imputation_log = []
    
    for metric in combine_metrics:
        if metric not in df.columns:
            df[metric] = np.nan
            df[f'{metric}_imputed'] = True
            imputation_log.append(f"Created missing column {metric}")
        else:
            df[f'{metric}_imputed'] = df[metric].isna()
        
        missing_mask = df[metric].isna()
        if missing_mask.any():
            missing_count = missing_mask.sum()
            imputation_log.append(f"Imputing {missing_count} missing {metric} values")
            
            # Impute based on division-specific benchmarks
            for division in df['division_lookup'].unique():
                if pd.isna(division):
                    continue
                    
                div_mask = (df['division_lookup'] == division) & missing_mask
                if not div_mask.any():
                    continue
                
                # Get benchmark range for this position/division
                if division in position_benchmarks:
                    min_val, max_val = position_benchmarks[division][metric]
                else:
                    # Fallback to D3 benchmarks if division not found
                    min_val, max_val = position_benchmarks['D3'][metric]
                
                # Bayesian-inspired imputation: use normal distribution centered on range midpoint
                mean_val = (min_val + max_val) / 2
                std_val = (max_val - min_val) / 4  # Assume 95% of values within range
                
                # Generate values and clip to realistic range
                n_samples = div_mask.sum()
                imputed_values = np.random.normal(mean_val, std_val, n_samples)
                imputed_values = np.clip(imputed_values, min_val * 0.9, max_val * 1.1)
                
                df.loc[div_mask, metric] = imputed_values
                imputation_log.append(f"  {division}: {n_samples} values from N({mean_val:.2f}, {std_val:.2f})")
    
    print(f"Combine imputation for {position.upper()}:")
    for log_entry in imputation_log:
        print(f"  {log_entry}")
    
    return df

def enrich_data_enhanced(df, position, year=2025):
    """Enhanced data enrichment with more balanced synthetic samples"""
    enrich_data = {
        'qb': [
            # Power 5 samples
            {'name': 'Elite Power 5 QB', 'height_inches': 75, 'weight_lbs': 215, 'senior_ypg': 285, 'senior_tds': 28, 'senior_comp_pct': 68, 'state': 'TX', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 32, 'shuttle': 4.4, 'broad_jump': 112},
            {'name': 'Good Power 5 QB', 'height_inches': 73, 'weight_lbs': 205, 'senior_ypg': 255, 'senior_tds': 24, 'senior_comp_pct': 64, 'state': 'CA', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 31, 'shuttle': 4.5, 'broad_jump': 110},
            # FCS samples
            {'name': 'Elite FCS QB', 'height_inches': 73, 'weight_lbs': 200, 'senior_ypg': 225, 'senior_tds': 22, 'senior_comp_pct': 62, 'state': 'FL', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 30, 'shuttle': 4.5, 'broad_jump': 108},
            {'name': 'Good FCS QB', 'height_inches': 72, 'weight_lbs': 195, 'senior_ypg': 200, 'senior_tds': 18, 'senior_comp_pct': 58, 'state': 'GA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 29, 'shuttle': 4.6, 'broad_jump': 105},
            {'name': 'Solid FCS QB', 'height_inches': 71, 'weight_lbs': 190, 'senior_ypg': 180, 'senior_tds': 16, 'senior_comp_pct': 55, 'state': 'NC', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 28, 'shuttle': 4.6, 'broad_jump': 103},
            # D2 samples
            {'name': 'Elite D2 QB', 'height_inches': 71, 'weight_lbs': 190, 'senior_ypg': 165, 'senior_tds': 16, 'senior_comp_pct': 58, 'state': 'OH', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 28, 'shuttle': 4.6, 'broad_jump': 101},
            {'name': 'Good D2 QB', 'height_inches': 70, 'weight_lbs': 185, 'senior_ypg': 145, 'senior_tds': 14, 'senior_comp_pct': 54, 'state': 'MI', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 5.0, 'vertical_jump': 27, 'shuttle': 4.7, 'broad_jump': 98},
            # D3 samples
            {'name': 'Elite D3 QB', 'height_inches': 70, 'weight_lbs': 180, 'senior_ypg': 125, 'senior_tds': 12, 'senior_comp_pct': 52, 'state': 'IL', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 5.1, 'vertical_jump': 26, 'shuttle': 4.7, 'broad_jump': 95},
            {'name': 'Good D3 QB', 'height_inches': 69, 'weight_lbs': 175, 'senior_ypg': 105, 'senior_tds': 10, 'senior_comp_pct': 48, 'state': 'PA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 5.2, 'vertical_jump': 25, 'shuttle': 4.8, 'broad_jump': 92},
            # NAIA samples
            {'name': 'Elite NAIA QB', 'height_inches': 70, 'weight_lbs': 185, 'senior_ypg': 135, 'senior_tds': 13, 'senior_comp_pct': 55, 'state': 'KS', 'division': 'NAIA', 'grad_year': 2025, 'forty_yard_dash': 5.0, 'vertical_jump': 27, 'shuttle': 4.6, 'broad_jump': 97}
        ],
        'rb': [
            # Power 5 samples
            {'name': 'Elite Power 5 RB', 'height_inches': 70, 'weight_lbs': 205, 'senior_ypg': 145, 'senior_tds': 18, 'senior_ypc': 5.8, 'state': 'TX', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.35, 'vertical_jump': 36, 'shuttle': 4.1, 'broad_jump': 125},
            {'name': 'Good Power 5 RB', 'height_inches': 69, 'weight_lbs': 195, 'senior_ypg': 125, 'senior_tds': 15, 'senior_ypc': 5.2, 'state': 'FL', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.4, 'vertical_jump': 35, 'shuttle': 4.2, 'broad_jump': 122},
            # FCS samples
            {'name': 'Elite FCS RB', 'height_inches': 69, 'weight_lbs': 190, 'senior_ypg': 115, 'senior_tds': 14, 'senior_ypc': 4.8, 'state': 'CA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.4, 'vertical_jump': 34, 'shuttle': 4.2, 'broad_jump': 115},
            {'name': 'Good FCS RB', 'height_inches': 68, 'weight_lbs': 185, 'senior_ypg': 95, 'senior_tds': 12, 'senior_ypc': 4.4, 'state': 'GA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.5, 'vertical_jump': 33, 'shuttle': 4.3, 'broad_jump': 112},
            {'name': 'Solid FCS RB', 'height_inches': 67, 'weight_lbs': 180, 'senior_ypg': 85, 'senior_tds': 10, 'senior_ypc': 4.1, 'state': 'NC', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.6, 'vertical_jump': 32, 'shuttle': 4.4, 'broad_jump': 110},
            # D2 samples
            {'name': 'Elite D2 RB', 'height_inches': 68, 'weight_lbs': 180, 'senior_ypg': 85, 'senior_tds': 11, 'senior_ypc': 4.2, 'state': 'OH', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.5, 'vertical_jump': 32, 'shuttle': 4.3, 'broad_jump': 105},
            {'name': 'Good D2 RB', 'height_inches': 67, 'weight_lbs': 175, 'senior_ypg': 75, 'senior_tds': 9, 'senior_ypc': 3.9, 'state': 'MI', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.6, 'vertical_jump': 31, 'shuttle': 4.4, 'broad_jump': 102},
            # D3 samples
            {'name': 'Elite D3 RB', 'height_inches': 67, 'weight_lbs': 170, 'senior_ypg': 65, 'senior_tds': 8, 'senior_ypc': 3.6, 'state': 'IL', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.6, 'vertical_jump': 30, 'shuttle': 4.4, 'broad_jump': 98},
            {'name': 'Good D3 RB', 'height_inches': 66, 'weight_lbs': 165, 'senior_ypg': 55, 'senior_tds': 7, 'senior_ypc': 3.3, 'state': 'PA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 29, 'shuttle': 4.5, 'broad_jump': 96},
            # NAIA samples
            {'name': 'Elite NAIA RB', 'height_inches': 67, 'weight_lbs': 175, 'senior_ypg': 75, 'senior_tds': 9, 'senior_ypc': 3.8, 'state': 'KS', 'division': 'NAIA', 'grad_year': 2025, 'forty_yard_dash': 4.5, 'vertical_jump': 31, 'shuttle': 4.3, 'broad_jump': 103}
        ],
        'wr': [
            # Power 5 samples
            {'name': 'Elite Power 5 WR', 'height_inches': 72, 'weight_lbs': 185, 'senior_yds': 1100, 'senior_avg': 18.5, 'senior_rec': 60, 'senior_td': 14, 'state': 'TX', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.45, 'vertical_jump': 36, 'shuttle': 4.2, 'broad_jump': 125},
            {'name': 'Good Power 5 WR', 'height_inches': 71, 'weight_lbs': 180, 'senior_yds': 950, 'senior_avg': 16.8, 'senior_rec': 55, 'senior_td': 12, 'state': 'FL', 'division': 'POWER 5', 'grad_year': 2025, 'forty_yard_dash': 4.5, 'vertical_jump': 35, 'shuttle': 4.3, 'broad_jump': 122},
            # FCS samples
            {'name': 'Elite FCS WR', 'height_inches': 71, 'weight_lbs': 175, 'senior_yds': 850, 'senior_avg': 16.0, 'senior_rec': 52, 'senior_td': 10, 'state': 'CA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.6, 'vertical_jump': 35, 'shuttle': 4.3, 'broad_jump': 115},
            {'name': 'Good FCS WR', 'height_inches': 70, 'weight_lbs': 170, 'senior_yds': 750, 'senior_avg': 15.2, 'senior_rec': 48, 'senior_td': 8, 'state': 'GA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 34, 'shuttle': 4.4, 'broad_jump': 112},
            {'name': 'Solid FCS WR', 'height_inches': 69, 'weight_lbs': 165, 'senior_yds': 650, 'senior_avg': 14.5, 'senior_rec': 44, 'senior_td': 7, 'state': 'NC', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 33, 'shuttle': 4.4, 'broad_jump': 110},
            # D2 samples
            {'name': 'Elite D2 WR', 'height_inches': 70, 'weight_lbs': 170, 'senior_yds': 600, 'senior_avg': 14.0, 'senior_rec': 42, 'senior_td': 7, 'state': 'OH', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 33, 'shuttle': 4.4, 'broad_jump': 105},
            {'name': 'Good D2 WR', 'height_inches': 69, 'weight_lbs': 165, 'senior_yds': 520, 'senior_avg': 13.2, 'senior_rec': 38, 'senior_td': 6, 'state': 'MI', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 32, 'shuttle': 4.5, 'broad_jump': 102},
            # D3 samples
            {'name': 'Elite D3 WR', 'height_inches': 69, 'weight_lbs': 165, 'senior_yds': 450, 'senior_avg': 12.5, 'senior_rec': 35, 'senior_td': 5, 'state': 'IL', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 31, 'shuttle': 4.5, 'broad_jump': 98},
            {'name': 'Good D3 WR', 'height_inches': 68, 'weight_lbs': 160, 'senior_yds': 380, 'senior_avg': 11.8, 'senior_rec': 32, 'senior_td': 4, 'state': 'PA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 30, 'shuttle': 4.6, 'broad_jump': 96},
            # NAIA samples
            {'name': 'Elite NAIA WR', 'height_inches': 69, 'weight_lbs': 165, 'senior_yds': 500, 'senior_avg': 13.0, 'senior_rec': 38, 'senior_td': 6, 'state': 'KS', 'division': 'NAIA', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 32, 'shuttle': 4.4, 'broad_jump': 100}
        ]
    }
    
    position = position.lower()
    enrich_df = pd.DataFrame(enrich_data.get(position, []))
    if not enrich_df.empty:
        enrich_df.columns = enrich_df.columns.str.strip().str.lower()
        df = pd.concat([df, enrich_df], ignore_index=True)
        print(f"Added {len(enrich_df)} enhanced synthetic samples for {position.upper()}")
    
    # Add hoops_vert feature for multi-sport athletes
    df['hoops_vert'] = df.get('vertical_jump', 32)
    
    return df

StatementMeta(pocsparkpool, 51, 74, Finished, Available, Finished)

In [355]:
# Cell 3: Enrich Data (adds baseline FCS/D2/D3 to all positions)
def enrich_data(df, position, year=2025):
    enrich_data = {
        'qb': [
            {'name': 'Sample FCS QB', 'height_inches': 72, 'weight_lbs': 195, 'senior_ypg': 180, 'senior_tds': 20, 'senior_comp_pct': 60, 'state': 'CA', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 28, 'shuttle': 4.5, 'broad_jump': 105},
            {'name': 'Sample D2 QB', 'height_inches': 71, 'weight_lbs': 185, 'senior_ypg': 140, 'senior_tds': 15, 'senior_comp_pct': 55, 'state': 'FL', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 26, 'shuttle': 4.6, 'broad_jump': 100},
            {'name': 'Sample D3 QB', 'height_inches': 70, 'weight_lbs': 175, 'senior_ypg': 100, 'senior_tds': 10, 'senior_comp_pct': 50, 'state': 'GA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 5.0, 'vertical_jump': 24, 'shuttle': 4.7, 'broad_jump': 95}
        ],
        'rb': [
            {'name': 'Sample FCS RB', 'height_inches': 68, 'weight_lbs': 185, 'senior_ypg': 110, 'senior_tds': 15, 'senior_ypc': 4.5, 'state': 'TX', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 30, 'shuttle': 4.4, 'broad_jump': 110},
            {'name': 'Sample D2 RB', 'height_inches': 67, 'weight_lbs': 175, 'senior_ypg': 90, 'senior_tds': 10, 'senior_ypc': 4.0, 'state': 'FL', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 28, 'shuttle': 4.5, 'broad_jump': 105},
            {'name': 'Sample D3 RB', 'height_inches': 66, 'weight_lbs': 165, 'senior_ypg': 70, 'senior_tds': 8, 'senior_ypc': 3.5, 'state': 'CA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.9, 'vertical_jump': 26, 'shuttle': 4.6, 'broad_jump': 100}
        ],
        'wr': [
            {'name': 'Sample FCS WR', 'height_inches': 70, 'weight_lbs': 175, 'senior_yds': 800, 'senior_avg': 15, 'senior_rec': 50, 'senior_td': 8, 'state': 'TX', 'division': 'FCS', 'grad_year': 2025, 'forty_yard_dash': 4.6, 'vertical_jump': 32, 'shuttle': 4.4, 'broad_jump': 110},
            {'name': 'Sample D2 WR', 'height_inches': 69, 'weight_lbs': 165, 'senior_yds': 600, 'senior_avg': 13, 'senior_rec': 40, 'senior_td': 6, 'state': 'FL', 'division': 'D2', 'grad_year': 2025, 'forty_yard_dash': 4.7, 'vertical_jump': 30, 'shuttle': 4.5, 'broad_jump': 105},
            {'name': 'Sample D3 WR', 'height_inches': 68, 'weight_lbs': 160, 'senior_yds': 400, 'senior_avg': 11, 'senior_rec': 30, 'senior_td': 4, 'state': 'CA', 'division': 'D3', 'grad_year': 2025, 'forty_yard_dash': 4.8, 'vertical_jump': 28, 'shuttle': 4.6, 'broad_jump': 100}
        ]
    }
    position = position.lower()
    enrich_df = pd.DataFrame(enrich_data.get(position, []))
    enrich_df.columns = enrich_df.columns.str.strip().str.lower()
    df = pd.concat([df, enrich_df], ignore_index=True)
    df['hoops_vert'] = df.get('vertical_jump', 32)
    return df


StatementMeta(pocsparkpool, 51, 75, Finished, Available, Finished)

In [356]:
# Updated Cell 4: Enhanced Preprocessing with Intelligent Imputation, Embeddings, and Advanced Features

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

def create_state_embeddings(df):
    """Create state embeddings for talent hotbed representation"""
    # DUPLICATE PREVENTION: Check if state features already exist
    if 'state_talent_score' in df.columns:
        print("    State embeddings already exist - skipping creation to prevent duplicates")
        return df
    
    # Define state talent tiers based on recruiting density and college production
    state_tiers = {
        'TX': 'tier_1',  # Elite talent hotbeds
        'FL': 'tier_1',
        'CA': 'tier_1', 
        'GA': 'tier_1',
        
        'OH': 'tier_2',  # Strong talent states
        'PA': 'tier_2',
        'NC': 'tier_2',
        'VA': 'tier_2',
        'MI': 'tier_2',
        'IL': 'tier_2',
        'LA': 'tier_2',
        'AL': 'tier_2',
        'TN': 'tier_2',
        'SC': 'tier_2',
        'AZ': 'tier_2',
        'NJ': 'tier_2',
        'MD': 'tier_2',
        
        'IN': 'tier_3',  # Moderate talent states
        'MO': 'tier_3',
        'WI': 'tier_3',
        'MN': 'tier_3',
        'IA': 'tier_3',
        'KY': 'tier_3',
        'OK': 'tier_3',
        'AR': 'tier_3',
        'MS': 'tier_3',
        'KS': 'tier_3',
        'CO': 'tier_3',
        'OR': 'tier_3',
        'WA': 'tier_3',
        'CT': 'tier_3',
        'NV': 'tier_3',
        'UT': 'tier_3'
    }
    
    # Create state embeddings using simple numeric encoding (avoid object columns)
    df['state_talent_score'] = df['state'].str.upper().map({
        'TX': 4, 'FL': 4, 'CA': 4, 'GA': 4,  # Elite
        'OH': 3, 'PA': 3, 'NC': 3, 'VA': 3, 'MI': 3, 'IL': 3, 'LA': 3, 'AL': 3, 'TN': 3, 'SC': 3, 'AZ': 3, 'NJ': 3, 'MD': 3,  # Strong
        'IN': 2, 'MO': 2, 'WI': 2, 'MN': 2, 'IA': 2, 'KY': 2, 'OK': 2, 'AR': 2, 'MS': 2, 'KS': 2, 'CO': 2, 'OR': 2, 'WA': 2, 'CT': 2, 'NV': 2, 'UT': 2  # Moderate
    }).fillna(1).astype(int)  # Default for other states, ensure int type
    
    # Create binary indicators for state tiers (avoid object columns)
    df['state_tier_1'] = (df['state_talent_score'] == 4).astype(int)  # Elite states
    df['state_tier_2'] = (df['state_talent_score'] == 3).astype(int)  # Strong states
    df['state_tier_3'] = (df['state_talent_score'] == 2).astype(int)  # Moderate states
    df['state_tier_4'] = (df['state_talent_score'] == 1).astype(int)  # Other states
    
    return df

def enhanced_feature_engineering(df, position):
    """Enhanced feature engineering with interaction terms and advanced metrics"""
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower()

    # DUPLICATE PREVENTION: Check if enhanced features already exist
    if 'state_eff' in df.columns or 'bmi_ypg' in df.columns:
        print(f"    Enhanced features already exist - removing duplicates and reprocessing")
        # Remove existing enhanced features to prevent conflicts
        enhanced_cols = ['state_eff', 'bmi_ypg', 'height_traj', 'speed_power_ratio', 'combine_confidence']
        for col in enhanced_cols:
            if col in df.columns:
                df = df.drop(columns=[col])

    # Ensure essential columns exist with intelligent defaults
    essential_cols = ['height_inches', 'weight_lbs', 'position', 'division', 'state']
    for col in essential_cols:
        if col not in df.columns:
            if col == 'height_inches':
                df[col] = 70
            elif col == 'weight_lbs':
                df[col] = 180
            elif col == 'position':
                df[col] = position
            elif col == 'division':
                df[col] = 'D3'
            elif col == 'state':
                df[col] = 'ZZ'

    # Apply intelligent combine imputation
    df = intelligent_combine_imputation(df, position)
    
    # Create state embeddings (now returns only numeric columns)
    df = create_state_embeddings(df)
    
    # Position-aware engineered features
    df['games'] = 12
    if 'senior_rec' in df.columns:
        wr_mask = df['position'].str.lower() == 'wr'
        df.loc[wr_mask, 'games'] = df.loc[wr_mask, 'senior_rec'].replace(0, np.nan).fillna(12).clip(8, 15)
    if 'senior_yds' in df.columns and 'senior_ypg' in df.columns:
        rb_qb_mask = df['position'].str.lower().isin(['rb', 'qb'])
        with np.errstate(divide='ignore', invalid='ignore'):
            games_calc = df.loc[rb_qb_mask, 'senior_yds'] / df.loc[rb_qb_mask, 'senior_ypg']
            games_calc = games_calc.replace([np.inf, -np.inf], np.nan).fillna(12).clip(8, 15)
            df.loc[rb_qb_mask, 'games'] = games_calc

    # Enhanced all_purpose_game calculation for RBs
    rb_mask = df['position'].str.lower() == 'rb'
    if 'senior_yds' in df.columns:
        if 'senior_rec_yds' in df.columns:
            df.loc[rb_mask, 'all_purpose_game'] = (
                df.loc[rb_mask, 'senior_yds'] + df.loc[rb_mask, 'senior_rec_yds']
            ) / df.loc[rb_mask, 'games']
        else:
            df.loc[rb_mask, 'all_purpose_game'] = df.loc[rb_mask, 'senior_yds'] / df.loc[rb_mask, 'games']
    else:
        df['all_purpose_game'] = df.get('ypg', 0) + df.get('rec_ypg', 0)

    # Derived per-game stats
    df['rec_ypg'] = 0.0
    df['ypg'] = 0.0
    df['tds_game'] = 0.0
    df['td_game'] = 0.0
    
    if 'senior_yds' in df.columns:
        wr_mask = df['position'].str.lower() == 'wr'
        df.loc[wr_mask, 'rec_ypg'] = df.loc[wr_mask, 'senior_yds'] / df.loc[wr_mask, 'games']
        rb_qb_mask = df['position'].str.lower().isin(['rb', 'qb'])
        df.loc[rb_qb_mask, 'ypg'] = df.loc[rb_qb_mask, 'senior_yds'] / df.loc[rb_qb_mask, 'games']
    if 'senior_td' in df.columns:
        wr_mask = df['position'].str.lower() == 'wr'
        df.loc[wr_mask, 'tds_game'] = df.loc[wr_mask, 'senior_td'] / df.loc[wr_mask, 'games']
        rb_qb_mask = df['position'].str.lower().isin(['rb', 'qb'])
        df.loc[rb_qb_mask, 'td_game'] = df.loc[rb_qb_mask, 'senior_td'] / df.loc[rb_qb_mask, 'games']

    # Trajectory calculation
    if 'senior_ypg' in df.columns and 'junior_ypg' in df.columns:
        df['trajectory'] = np.maximum(df['senior_ypg'] - df['junior_ypg'], 0)
    else:
        df['trajectory'] = 0.0

    # Core engineered features (ensure numeric types)
    df['bmi'] = ((df['weight_lbs'] / (df['height_inches'] ** 2)) * 703).astype(float)
    df['eff_ratio'] = (df.get('senior_tds', 0) / (df.get('senior_ypg', 1) + 1e-6)).astype(float)
    df['ath_power'] = (df.get('vertical_jump', 0) * df.get('broad_jump', 0)).astype(float)
    df['is_strong_state'] = df['state'].str.upper().isin(['TX', 'FL', 'CA', 'GA']).astype(int)

    # ENHANCED INTERACTION FEATURES (ensure numeric types)
    
    # BMI × YPG (power efficiency)
    primary_ypg = df.get('senior_ypg', df.get('ypg', df.get('rec_ypg', 0)))
    df['bmi_ypg'] = (df['bmi'] * primary_ypg).astype(float)
    
    # Height × Trajectory (growth potential with size)
    df['height_traj'] = (df['height_inches'] * df['trajectory']).astype(float)
    
    # State efficiency (talent hotbed × efficiency)
    df['state_eff'] = (df['state_talent_score'] * df['eff_ratio']).astype(float)
    
    # Speed-power ratio (athleticism efficiency)
    df['speed_power_ratio'] = (df['ath_power'] / (df['forty_yard_dash'] + 1e-6)).astype(float)
    
    # Position-specific interaction features
    if position.lower() == 'qb':
        # Completion percentage × YPG (accuracy under volume)
        df['comp_ypg'] = (df.get('senior_comp_pct', 60) * primary_ypg / 100).astype(float)
        # Height × Completion % (pocket presence)
        df['height_comp'] = (df['height_inches'] * df.get('senior_comp_pct', 60)).astype(float)
    elif position.lower() == 'rb':
        # YPC × Speed (breakaway ability)
        df['ypc_speed'] = (df.get('senior_ypc', 0) * (5.0 - df.get('forty_yard_dash', 4.8))).astype(float)
        # Weight × YPC (power running ability)
        df['weight_ypc'] = (df['weight_lbs'] * df.get('senior_ypc', 0)).astype(float)
    elif position.lower() == 'wr':
        # Catch radius (height × vertical)
        df['catch_radius'] = (df['height_inches'] * df.get('vertical_jump', 0)).astype(float)
        # Speed × YAC (big play ability)
        df['speed_yac'] = ((5.0 - df.get('forty_yard_dash', 4.8)) * df.get('senior_avg', 0)).astype(float)

    # Combine confidence scores (0-1 based on real vs imputed data)
    combine_cols = ['forty_yard_dash', 'vertical_jump', 'shuttle', 'broad_jump']
    imputed_cols = [f'{col}_imputed' for col in combine_cols if f'{col}_imputed' in df.columns]
    
    if imputed_cols:
        df['combine_confidence'] = (1.0 - (df[imputed_cols].sum(axis=1) / len(imputed_cols))).astype(float)
    else:
        df['combine_confidence'] = 1.0

    # Trajectory z-score by position
    df['trajectory_z'] = 0.0
    for pos in df['position'].unique():
        mask = df['position'] == pos
        if mask.sum() > 1:
            mean_traj = df.loc[mask, 'trajectory'].mean()
            std_traj = df.loc[mask, 'trajectory'].std()
            if std_traj > 0:
                df.loc[mask, 'trajectory_z'] = ((df.loc[mask, 'trajectory'] - mean_traj) / std_traj).astype(float)

    # Create position dummies (ensure int type)
    position_dummies = pd.get_dummies(df['position'].str.lower(), prefix='pos', dtype=int)
    for pos in ['qb', 'rb', 'wr']:
        if f'pos_{pos}' not in position_dummies.columns:
            position_dummies[f'pos_{pos}'] = 0
    df = pd.concat([df, position_dummies], axis=1)

    # CRITICAL: Remove duplicate columns after all feature engineering
    print(f"    Before duplicate removal: {df.shape}")
    df = df.loc[:, ~df.columns.duplicated(keep='first')]
    print(f"    After duplicate removal: {df.shape}")
    
    # Ensure all numeric columns have proper dtypes for XGBoost
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # Convert boolean columns to int
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)

    print(f"Enhanced feature engineering completed for {position.upper()}")
    print(f"  - Applied intelligent combine imputation")
    print(f"  - Created state embeddings and talent scores")
    print(f"  - Generated interaction features: bmi_ypg, height_traj, state_eff, speed_power_ratio")
    print(f"  - Added position-specific features")
    print(f"  - Calculated combine confidence scores")
    print(f"  - Applied duplicate column removal")
    print(f"  - Ensured all columns are XGBoost-compatible (numeric types only)")
    
    return df

def preprocess_with_winsorization(df, position):
    """Compatibility wrapper for enhanced feature engineering"""
    return enhanced_feature_engineering(df, position)

def winsorize_and_scale(train_df, test_df, numeric_features):
    """Legacy wrapper for advanced winsorization and scaling"""
    return advanced_winsorize_and_scale(train_df, test_df, numeric_features)

def advanced_winsorize_and_scale(train_df, test_df, numeric_features):
    """Advanced winsorization and scaling with percentile features"""
    train_processed = train_df.copy()
    test_processed = test_df.copy()
    
    winsorization_log = []
    
    for feature in numeric_features:
        if feature in train_df.columns and train_df[feature].dtype in ['int64', 'float64']:
            # Winsorize on training data (1st-99th percentile)
            feature_values = train_df[feature].dropna()
            
            if len(feature_values) > 0:
                p1, p99 = np.percentile(feature_values, [1, 99])
                
                # Apply winsorization to both train and test
                train_processed[feature] = np.clip(train_df[feature], p1, p99)
                test_processed[feature] = np.clip(test_df[feature], p1, p99)
                
                # Percentile scaling based on training data
                train_values = train_processed[feature].dropna()
                if len(train_values) > 0:
                    # Create percentile features
                    train_processed[f'{feature}_pctile'] = train_processed[feature].apply(
                        lambda x: np.percentile(train_values, 100 * (train_values <= x).mean()) if pd.notnull(x) else 50
                    ).astype(float)
                    test_processed[f'{feature}_pctile'] = test_processed[feature].apply(
                        lambda x: np.percentile(train_values, 100 * (train_values <= x).mean()) if pd.notnull(x) else 50
                    ).astype(float)
                    
                    winsorization_log.append(f"{feature}: [{p1:.2f}, {p99:.2f}]")
    
    print(f"Advanced winsorization applied to {len(winsorization_log)} features")
    for log_entry in winsorization_log[:5]:  # Show first 5
        print(f"  {log_entry}")
    if len(winsorization_log) > 5:
        print(f"  ... and {len(winsorization_log) - 5} more features")
    
    # DUPLICATE PREVENTION: Remove any duplicate columns created during winsorization
    print("Removing duplicates after winsorization...")
    train_processed = train_processed.loc[:, ~train_processed.columns.duplicated(keep='first')]
    test_processed = test_processed.loc[:, ~test_processed.columns.duplicated(keep='first')]
    
    return train_processed, test_processed

# Legacy support for older function names
def load_base_csv(position):
    """Legacy wrapper for enhanced data loading"""
    return load_base_csv_enhanced(position)

def enrich_data(df, position, year=2025):
    """Legacy wrapper for enhanced data enrichment"""
    return enrich_data_enhanced(df, position, year)

StatementMeta(pocsparkpool, 51, 76, Finished, Available, Finished)

In [357]:
# Cell 5: Tiers & Tier Base Assignments (robust to lower-case columns)
tiers_qb = {
    'Power 5': {'base': 90, 'ypg_min': 250, 'height_min': 74, 'height_max': 78, 'weight_min': 200, 'weight_max': 240,
                '40_min': 4.6, '40_max': 4.9, 'vertical_min': 30, 'vertical_max': 34, 'broad_min': 108, 'shuttle_max': 4.5},
    'FCS': {'base': 70, 'ypg_min': 200, 'height_min': 72, 'height_max': 76, 'weight_min': 190, 'weight_max': 220,
            '40_min': 4.7, '40_max': 5.0, 'vertical_min': 28, 'vertical_max': 32, 'broad_min': 102, 'shuttle_max': 4.6},
    'D2': {'base': 50, 'ypg_min': 150, 'height_min': 71, 'height_max': 74, 'weight_min': 180, 'weight_max': 210,
           '40_min': 4.8, '40_max': 5.1, 'vertical_min': 26, 'vertical_max': 30, 'broad_min': 96, 'shuttle_max': 4.7},
    'D3/NAIA': {'base': 30, 'ypg_min': 0, 'height_min': 70, 'height_max': 999, 'weight_min': 170, 'weight_max': 999,
                '40_min': 4.9, '40_max': 999, 'vertical_min': 24, 'vertical_max': 999, 'broad_min': 90, 'shuttle_max': 999}
}
tiers_rb = {
    'Power 5': {'base': 90, 'ypg_min': 150, 'height_min': 69, 'height_max': 74, 'weight_min': 190, 'weight_max': 230,
                '40_min': 4.2, '40_max': 4.4, 'vertical_min': 34, 'vertical_max': 36, 'broad_min': 120, 'shuttle_max': 4.2},
    'FCS': {'base': 70, 'ypg_min': 120, 'height_min': 68, 'height_max': 73, 'weight_min': 180, 'weight_max': 220,
            '40_min': 4.3, '40_max': 4.5, 'vertical_min': 32, 'vertical_max': 34, 'broad_min': 110, 'shuttle_max': 4.3},
    'D2': {'base': 50, 'ypg_min': 90, 'height_min': 67, 'height_max': 72, 'weight_min': 170, 'weight_max': 210,
           '40_min': 4.4, '40_max': 4.6, 'vertical_min': 31, 'vertical_max': 33, 'broad_min': 100, 'shuttle_max': 4.4},
    'D3/NAIA': {'base': 30, 'ypg_min': 0, 'height_min': 66, 'height_max': 999, 'weight_min': 160, 'weight_max': 999,
                '40_min': 4.5, '40_max': 4.7, 'vertical_min': 30, 'vertical_max': 32, 'broad_min': 90, 'shuttle_max': 4.5}
}
tiers_wr = {
    'Power 5': {'base': 90, 'rec_ypg_min': 100, 'height_min': 71, 'height_max': 75, 'weight_min': 180, 'weight_max': 210,
                '40_min': 4.4, '40_max': 4.6, 'vertical_min': 34, 'vertical_max': 36, 'broad_min': 120, 'shuttle_max': 4.3},
    'FCS': {'base': 70, 'rec_ypg_min': 80, 'height_min': 70, 'height_max': 74, 'weight_min': 170, 'weight_max': 200,
            '40_min': 4.5, '40_max': 4.7, 'vertical_min': 32, 'vertical_max': 35, 'broad_min': 110, 'shuttle_max': 4.4},
    'D2': {'base': 50, 'rec_ypg_min': 60, 'height_min': 69, 'height_max': 73, 'weight_min': 165, 'weight_max': 195,
           '40_min': 4.6, '40_max': 4.8, 'vertical_min': 30, 'vertical_max': 33, 'broad_min': 100, 'shuttle_max': 4.5},
    'D3/NAIA': {'base': 30, 'rec_ypg_min': 0, 'height_min': 68, 'height_max': 999, 'weight_min': 160, 'weight_max': 999,
                '40_min': 4.7, '40_max': 5.0, 'vertical_min': 28, 'vertical_max': 31, 'broad_min': 90, 'shuttle_max': 4.6}
}

tiers = {'qb': tiers_qb, 'rb': tiers_rb, 'wr': tiers_wr}

def safe_get(row, key, default):
    """Safely get value from row, handling None values."""
    value = row.get(key, default)
    return default if value is None else value

def assign_tier_base(row, position):
    tiers_pos = tiers.get(position, tiers['qb'])
    for name, rules in sorted(tiers_pos.items(), key=lambda x: x[1]['base'], reverse=True):
        checks = []
        if position == 'wr':
            checks.append(safe_get(row, 'rec_ypg', 0) >= rules.get('rec_ypg_min', 0))
        elif position == 'qb':
            checks.append(safe_get(row, 'senior_ypg', 0) >= rules['ypg_min'])
        elif position == 'rb':
            checks.append(safe_get(row, 'ypg', 0) >= rules['ypg_min'])
        checks += [
            rules['height_min'] <= safe_get(row, 'height_inches', 0) <= rules['height_max'],
            rules['weight_min'] <= safe_get(row, 'weight_lbs', 0) <= rules['weight_max'],
            rules['40_min'] <= safe_get(row, 'forty_yard_dash', 5.0) <= rules['40_max'],
            (rules['vertical_min'] - 1) <= safe_get(row, 'vertical_jump', 0) <= (rules['vertical_max'] + 1),
            safe_get(row, 'shuttle', 5.0) <= rules['shuttle_max'],
            safe_get(row, 'broad_jump', 0) >= rules['broad_min']
        ]
        if sum(checks) >= len(checks) * 0.6:
            return rules['base'], name
    return tiers_pos['D3/NAIA']['base'], 'D3/NAIA'

StatementMeta(pocsparkpool, 51, 77, Finished, Available, Finished)

In [358]:
# Cell 6: Meta-Score: Performance, Versatility, Athleticism, Bonus, Rule Score

def safe_percentileofscore(series, value):
    """Safely compute percentile score, handling missing columns or empty series."""
    if series is None or len(series.dropna()) == 0:
        return 0
    return percentileofscore(series.dropna(), value if value is not None else 0)

def safe_get(row, key, default):
    """Safely get value from row, handling None values."""
    value = row.get(key, default)
    return default if value is None else value

def compute_performance(df, row, position):
    if position == 'qb':
        ypg_pct = safe_percentileofscore(df.get('senior_ypg'), safe_get(row, 'senior_ypg', 0))
        td_pct = safe_percentileofscore(df.get('senior_tds'), safe_get(row, 'senior_tds', 0))
        comp_pct = safe_percentileofscore(df.get('senior_comp_pct'), safe_get(row, 'senior_comp_pct', 0))
        traj_pct = safe_percentileofscore(df.get('trajectory'), safe_get(row, 'trajectory', 0))
        return (0.4 * ypg_pct + 0.3 * td_pct + 0.2 * comp_pct + 0.1 * traj_pct + 0.1 * safe_get(row, 'trajectory_z', 0)) * 0.35
    elif position == 'rb':
        ypg_pct = safe_percentileofscore(df.get('ypg'), safe_get(row, 'ypg', 0))
        td_pct = safe_percentileofscore(df.get('td_game'), safe_get(row, 'td_game', 0))
        ypc_pct = safe_percentileofscore(df.get('senior_ypc'), safe_get(row, 'senior_ypc', 0))
        rec_pct = safe_percentileofscore(df.get('senior_rec'), safe_get(row, 'senior_rec', 0))
        return (0.4 * ypg_pct + 0.3 * td_pct + 0.2 * ypc_pct + 0.1 * rec_pct + 0.1 * safe_get(row, 'eff_ratio', 0)) * 0.35
    elif position == 'wr':
        ypg_pct = safe_percentileofscore(df.get('rec_ypg'), safe_get(row, 'rec_ypg', 0))
        td_pct = safe_percentileofscore(df.get('tds_game'), safe_get(row, 'tds_game', 0))
        ypc_pct = safe_percentileofscore(df.get('senior_avg'), safe_get(row, 'senior_avg', 0))
        rec_pct = safe_percentileofscore(df.get('senior_rec'), safe_get(row, 'senior_rec', 0))
        return (0.4 * ypg_pct + 0.3 * td_pct + 0.2 * ypc_pct + 0.1 * rec_pct + 0.1 * safe_get(row, 'eff_ratio', 0)) * 0.35
    return 0

def compute_versatility(df, row, position):
    if position == 'qb':
        comp_pct = safe_percentileofscore(df.get('senior_comp_pct'), safe_get(row, 'senior_comp_pct', 0))
        speed_pct = 100 - safe_percentileofscore(df.get('forty_yard_dash'), safe_get(row, 'forty_yard_dash', 5.0))
        return (0.5 * comp_pct + 0.5 * speed_pct) * 0.35
    elif position == 'rb':
        ypc_pct = safe_percentileofscore(df.get('senior_ypc'), safe_get(row, 'senior_ypc', 0))
        rec_pct = safe_percentileofscore(df.get('senior_rec'), safe_get(row, 'senior_rec', 0))
        ap_pct = safe_percentileofscore(df.get('all_purpose_game'), safe_get(row, 'all_purpose_game', 0))
        return (0.4 * ypc_pct + 0.3 * rec_pct + 0.3 * ap_pct) * 0.4
    elif position == 'wr':
        ypc_pct = safe_percentileofscore(df.get('senior_avg'), safe_get(row, 'senior_avg', 0))
        rec_pct = safe_percentileofscore(df.get('senior_rec'), safe_get(row, 'senior_rec', 0))
        rush_pct = safe_percentileofscore(df.get('senior_rush_yds'), safe_get(row, 'senior_rush_yds', 0))
        return (0.5 * ypc_pct + 0.3 * rec_pct + 0.2 * rush_pct) * 0.4
    return 0

def compute_athleticism(df, row, position):
    f_pct = 100 - safe_percentileofscore(df.get('forty_yard_dash'), safe_get(row, 'forty_yard_dash', 5.0))
    v_pct = safe_percentileofscore(df.get('vertical_jump'), safe_get(row, 'vertical_jump', 0))
    s_pct = 100 - safe_percentileofscore(df.get('shuttle'), safe_get(row, 'shuttle', 5.0))
    b_pct = safe_percentileofscore(df.get('broad_jump'), safe_get(row, 'broad_jump', 0))
    return (0.3 * f_pct + 0.3 * v_pct + 0.2 * s_pct + 0.2 * b_pct) * 0.25

def compute_bonus(row, position):
    b = 0
    th_40 = 4.7 if position == 'qb' else 4.5
    th_sh = 4.4 if position == 'qb' else 4.3
    if safe_get(row, 'forty_yard_dash', np.nan) < th_40: b += 10
    if safe_get(row, 'shuttle', np.nan) < th_sh: b += 5
    if safe_get(row, 'trajectory_z', 0) > 1: b += 5
    if safe_get(row, 'is_strong_state', 0): b += 3
    if safe_get(row, 'hoops_vert', 0) > 35: b += 4
    pctile_cols = [c for c in row.index if '_pos_pctile' in c]
    if sum(safe_get(row, c, 0) > 0.9 for c in pctile_cols) >= 3: b += 7
    return b

    
def compute_rule_score(df, position):
    # Drop all-NaN rows and those missing 'position'
    df = df.dropna(how='all')
    df = df[df['position'].notnull()]
    results = []
    tiers_used = []
    for idx, row in df.iterrows():
        if not isinstance(row, pd.Series):
            continue
        pos = str(row.get('position', position)).lower()
        base, tier_name = assign_tier_base(row, pos)
        bonus = compute_bonus(row, pos)
        perf = compute_performance(df, row, pos)
        vers = compute_versatility(df, row, pos)
        ath = compute_athleticism(df, row, pos)
        multiplier = safe_get(row, 'multiplier', 1.0)
        score = (base * 0.6 + (perf + vers + ath) * 0.4) * (1 + bonus / 100) * multiplier
        score = np.clip(score, 0, 100)
        results.append(score)
        tiers_used.append(tier_name)
    df = df.copy()
    df['rule_score'] = results
    df['rule_score_tier'] = tiers_used
    return df

StatementMeta(pocsparkpool, 51, 78, Finished, Available, Finished)

In [359]:
# Cell 7: Enhanced Pipeline Usage Example with New Functions

# 1. Load Data with Enhanced Functions
df_qb = load_base_csv_enhanced('qb')
df_rb = load_base_csv_enhanced('rb')
df_wr = load_base_csv_enhanced('wr')

# 2. Enrich data for balanced division representation
df_qb = enrich_data_enhanced(df_qb, 'qb')
df_rb = enrich_data_enhanced(df_rb, 'rb')
df_wr = enrich_data_enhanced(df_wr, 'wr')

# 3. Concatenate all positions for multi-position modeling
combined_df = pd.concat([df_qb, df_rb, df_wr], ignore_index=True)

# 4. Enhanced division normalization
combined_df['division_normalized'] = combined_df['division'].str.strip().str.upper()
division_map = {'POWER 5': 3, 'FCS': 2, 'D2': 1, 'D3': 0, 'NAIA': 0}
combined_df['division_num'] = combined_df['division_normalized'].map(division_map).fillna(-1).astype(int)

# Fix any unmapped divisions
unmapped_mask = combined_df['division_num'] == -1
if unmapped_mask.any():
    print(f"Fixing {unmapped_mask.sum()} unmapped division values...")
    combined_df.loc[combined_df['division_normalized'].str.contains('POWER|P5|FBS', na=False), 'division_num'] = 3
    combined_df.loc[combined_df['division_normalized'].str.contains('FCS', na=False), 'division_num'] = 2
    combined_df.loc[combined_df['division_normalized'].str.contains('D2|DIV 2|DIVISION 2', na=False), 'division_num'] = 1
    combined_df.loc[combined_df['division_normalized'].str.contains('D3|DIV 3|DIVISION 3|NAIA', na=False), 'division_num'] = 0

print("Enhanced class distribution:")
print(combined_df['division_num'].value_counts().sort_index())

# 5. Apply enhanced feature engineering
combined_df = enhanced_feature_engineering(combined_df, 'multi')

# 6. Compute rule score
combined_df = compute_rule_score(combined_df, 'multi')

# 7. Output: ready for accuracy evaluation and model training!
print(f"\nDataset ready with {len(combined_df)} total samples and enhanced features")
combined_df[['name', 'position', 'division_normalized', 'division_num', 'rule_score', 'combine_confidence']].head()

StatementMeta(pocsparkpool, 51, 79, Finished, Available, Finished)

Loaded 220 rows for QB
Unique divisions for QB: ['Power 5' 'FCS' 'D3' 'D2' 'NAIA']
Loaded 194 rows for RB
Unique divisions for RB: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Loaded 158 rows for WR
Unique divisions for WR: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Added 10 enhanced synthetic samples for QB
Added 10 enhanced synthetic samples for RB
Added 10 enhanced synthetic samples for WR
Enhanced class distribution:
division_num
0    150
1    105
2    107
3    240
Name: count, dtype: int64
Combine imputation for MULTI:
  Imputing 525 missing forty_yard_dash values
    POWER 5: 201 values from N(4.75, 0.08)
    FCS: 85 values from N(4.85, 0.07)
    D3: 61 values from N(5.10, 0.10)
    D2: 99 values from N(4.95, 0.07)
    NAIA: 79 values from N(5.00, 0.10)
  Imputing 525 missing vertical_jump values
    POWER 5: 201 values from N(32.00, 1.00)
    FCS: 85 values from N(30.00, 1.00)
    D3: 61 values from N(26.00, 1.00)
    D2: 99 values from N(28.00, 1.00)
    NAIA: 79 values from N(27.00, 1.00)
  Imp

Unnamed: 0,name,position,division_normalized,division_num,rule_score,combine_confidence
0,,qb,POWER 5,3,70.016632,0.0
1,,qb,POWER 5,3,74.422153,0.0
2,,qb,POWER 5,3,60.169521,0.0
3,,qb,POWER 5,3,55.080066,0.0
4,,qb,POWER 5,3,75.674207,0.0


In [360]:
from xgboost import XGBClassifier

def train_and_evaluate(train_df, features):
    X = train_df[features].fillna(0)
    y = train_df['Division_Num']
    model = XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        eval_metric='mlogloss',
        random_state=42
    )
    model.fit(X, y)
    return model


StatementMeta(pocsparkpool, 51, 80, Finished, Available, Finished)

In [361]:
# COMPREHENSIVE ACCURACY BOOST - FIXED DUPLICATE HANDLING
# Target: 80%+ exact accuracy with proper duplicate prevention

import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("COMPREHENSIVE ACCURACY BOOST WITH DUPLICATE PREVENTION")
print("="*80)

# STEP 1: Load and combine data using enhanced functions
print("\nSTEP 1: ENHANCED DATA LOADING")
print("-" * 60)

positions = ['qb', 'rb', 'wr']
combined_df = pd.DataFrame()

for pos in positions:
    df = load_base_csv_enhanced(pos)
    df = enrich_data_enhanced(df, pos)
    df['position'] = pos.lower()
    combined_df = pd.concat([combined_df, df], ignore_index=True)
    # Remove duplicates after each concatenation
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated(keep='first')]

# Division normalization
combined_df['division_normalized'] = combined_df['division'].str.strip().str.upper()
division_map = {'POWER 5': 3, 'FCS': 2, 'D2': 1, 'D3': 0, 'NAIA': 0}
combined_df['division_num'] = combined_df['division_normalized'].map(division_map).fillna(-1).astype(int)

print(f"Total samples: {len(combined_df)}")
print(f"Class distribution: {dict(combined_df['division_num'].value_counts().sort_index())}")

# STEP 2: Train/Test Split
print("\nSTEP 2: TRAIN/TEST SPLIT")
print("-" * 60)

try:
    train_df, test_df = train_test_split(
        combined_df, 
        test_size=0.15, 
        stratify=combined_df['division_num'], 
        random_state=42
    )
    print(f"✓ Stratified split: Train={len(train_df)}, Test={len(test_df)}")
    use_full_dataset = False
except:
    print("⚠ Stratified split failed - using full dataset")
    train_df = test_df = combined_df.copy()
    use_full_dataset = True

# STEP 3: Enhanced Feature Engineering with AGGRESSIVE duplicate prevention
print("\nSTEP 3: ENHANCED FEATURE ENGINEERING WITH DUPLICATE PREVENTION")
print("-" * 60)

def comprehensive_duplicate_removal(df, step_name=""):
    """Aggressively remove all duplicate columns at every step"""
    print(f"  {step_name} - Before: {df.shape}")
    
    # Method 1: Remove exact duplicate column names
    df = df.loc[:, ~df.columns.duplicated(keep='first')]
    
    # Method 2: Check for any remaining duplicates and handle them
    duplicate_cols = df.columns[df.columns.duplicated()].unique()
    if len(duplicate_cols) > 0:
        print(f"    Found {len(duplicate_cols)} remaining duplicates: {list(duplicate_cols)}")
        for dup_col in duplicate_cols:
            # Keep only the first occurrence
            dup_indices = df.columns.get_loc(dup_col)
            if hasattr(dup_indices, '__iter__'):
                # Multiple occurrences - drop all but first
                cols_to_drop = [df.columns[i] for i in dup_indices[1:]]
                df = df.drop(columns=cols_to_drop)
    
    print(f"  {step_name} - After: {df.shape}")
    
    # Final verification
    if df.columns.duplicated().any():
        print(f"    ERROR: Still have duplicates!")
        remaining_dups = df.columns[df.columns.duplicated()].unique()
        print(f"    Remaining: {list(remaining_dups)}")
        # Nuclear option: rename duplicates
        df.columns = [f"{col}_{i}" if df.columns.tolist().count(col) > 1 and df.columns.tolist()[:i+1].count(col) > 1 
                     else col for i, col in enumerate(df.columns)]
    
    return df

# Apply enhanced feature engineering
train_df = enhanced_feature_engineering(train_df, 'multi')
test_df = enhanced_feature_engineering(test_df, 'multi')

# AGGRESSIVE duplicate removal after feature engineering
train_df = comprehensive_duplicate_removal(train_df, "Train after feature engineering")
test_df = comprehensive_duplicate_removal(test_df, "Test after feature engineering")

# Ensure division_num is preserved
for df_name, df_ in [('Train', train_df), ('Test', test_df)]:
    if 'division_num' not in df_.columns:
        df_['division_num'] = df_['division_normalized'].map(division_map).fillna(-1).astype(int)

# STEP 4: Feature Selection with duplicate checking
print("\nSTEP 4: FEATURE SELECTION WITH DUPLICATE CHECKING")
print("-" * 60)

# Define comprehensive feature set 
base_features = [
    'senior_ypg', 'senior_tds', 'senior_comp_pct', 'senior_ypc', 'senior_yds', 
    'senior_avg', 'senior_rec', 'senior_td', 'senior_rush_yds', 'rec_ypg', 
    'ypg', 'tds_game', 'td_game', 'trajectory', 'height_inches', 'weight_lbs', 
    'forty_yard_dash', 'vertical_jump', 'shuttle', 'broad_jump', 'bmi', 
    'eff_ratio', 'ath_power', 'trajectory_z', 'is_strong_state', 'all_purpose_game',
    'bmi_ypg', 'height_traj', 'state_eff', 'speed_power_ratio', 'state_talent_score',
    'combine_confidence'
]

# Add engineered features
position_features = [col for col in train_df.columns if col.startswith('pos_')]
state_features = [col for col in train_df.columns if col.startswith('state_tier_')]
interaction_features = [col for col in train_df.columns if any(x in col for x in ['comp_ypg', 'height_comp', 'ypc_speed', 'weight_ypc', 'catch_radius', 'speed_yac'])]

# Combine and filter features
all_features = base_features + position_features + state_features + interaction_features
features = []
for col in all_features:
    if col in train_df.columns:
        if train_df[col].dtype in ['int64', 'float64', 'int32', 'float32', 'bool']:
            features.append(col)

# Remove any potential duplicates from feature list itself
features = list(dict.fromkeys(features))  # Preserves order while removing duplicates

print(f"Selected {len(features)} unique features")

# Compute rule scores
if 'rule_score' not in train_df.columns:
    print("Computing rule scores...")
    train_df = compute_rule_score(train_df, 'multi')
    test_df = compute_rule_score(test_df, 'multi')
    if 'rule_score' not in features:
        features.append('rule_score')

# STEP 5: Data preparation with FINAL duplicate check
print("\nSTEP 5: FINAL DATA PREPARATION")
print("-" * 60)

# Create training matrices
X_train = train_df[features].fillna(0)
y_train = train_df['division_num'].values

X_test = test_df[features].fillna(0)
y_test = test_df['division_num'].values

# CRITICAL: Apply XGBoost safeguard before any training
print("\n🛡️ APPLYING XGBOOST SAFEGUARDS")
X_train, X_test = xgboost_safeguard(X_train, X_test, "Final Data Preparation")

print(f"Final training shapes: X_train={X_train.shape}, X_test={X_test.shape}")

# STEP 6: Class balancing
print("\nSTEP 6: CLASS BALANCING")
print("-" * 60)

print(f"Original class distribution: {dict(pd.Series(y_train).value_counts().sort_index())}")

try:
    adasyn = ADASYN(random_state=42, sampling_strategy='auto')
    X_train_aug, y_train_aug = adasyn.fit_resample(X_train, y_train)
    print(f"✓ ADASYN successful: {X_train.shape} -> {X_train_aug.shape}")
    
    aug_counts = pd.Series(y_train_aug).value_counts().sort_index()
    print(f"Balanced distribution: {dict(aug_counts)}")
except Exception as e:
    print(f"⚠ ADASYN failed: {e}")
    X_train_aug, y_train_aug = X_train, y_train

# STEP 7: Enhanced XGBoost Training
print("\nSTEP 7: ENHANCED XGBOOST TRAINING")
print("-" * 60)

print("Training enhanced XGBoost model...")
enhanced_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    eval_metric='mlogloss',
    random_state=42
)

enhanced_model.fit(X_train_aug, y_train_aug)
print("✓ XGBoost training completed successfully!")

# STEP 8: Evaluation
print("\nSTEP 8: COMPREHENSIVE EVALUATION")
print("=" * 80)

y_pred = enhanced_model.predict(X_test)
y_pred_proba = enhanced_model.predict_proba(X_test)

exact_acc = accuracy_score(y_test, y_pred)
within_one_acc = np.mean(np.abs(y_test - y_pred) <= 1)
f1 = f1_score(y_test, y_pred, average='macro')

print(f"FINAL ENHANCED RESULTS:")
print(f"Exact Accuracy: {exact_acc*100:.2f}%")
print(f"Within-One-Division: {within_one_acc*100:.2f}%")
print(f"F1 Score (Macro): {f1*100:.2f}%")

if use_full_dataset:
    print("⚠ NOTE: Results on same data used for training (potential overfitting)")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
division_names = {0: 'D3/NAIA', 1: 'D2', 2: 'FCS', 3: 'Power 5'}

print(f"\nConfusion Matrix:")
print(cm)

# Per-class accuracy
print(f"\nPER-CLASS ACCURACY:")
for class_idx in range(len(np.unique(y_test))):
    if class_idx in y_test:
        class_mask = y_test == class_idx
        if class_mask.any():
            class_acc = accuracy_score(y_test[class_mask], y_pred[class_mask])
            class_within_one = np.mean(np.abs(y_test[class_mask] - y_pred[class_mask]) <= 1)
            class_name = division_names.get(class_idx, f'Class {class_idx}')
            class_count = class_mask.sum()
            print(f"  {class_name}: {class_acc*100:.1f}% exact, {class_within_one*100:.1f}% within-one (n={class_count})")

# Per-position breakdown
print(f"\nPER-POSITION BREAKDOWN:")
for pos in test_df['position'].unique():
    pos_mask = test_df['position'] == pos
    if pos_mask.any():
        pos_indices = test_df[pos_mask].index
        test_pos_mask = np.array([i for i, idx in enumerate(test_df.index) if idx in pos_indices])
        
        if len(test_pos_mask) > 0:
            pos_y_true = y_test[test_pos_mask]
            pos_y_pred = y_pred[test_pos_mask]
            
            pos_exact = accuracy_score(pos_y_true, pos_y_pred)
            pos_within_one = np.mean(np.abs(pos_y_true - pos_y_pred) <= 1)
            
            print(f"  {pos.upper()} (n={len(pos_y_true)}): {pos_exact*100:.1f}% exact, {pos_within_one*100:.1f}% within-one")

# Feature importance
if hasattr(enhanced_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': X_train_aug.columns,
        'importance': enhanced_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTOP 10 FEATURE IMPORTANCE:")
    for i, (_, row) in enumerate(importance_df.head(10).iterrows()):
        print(f"  {i+1:2d}. {row['feature']:<25}: {row['importance']:.4f}")

# FINAL SUMMARY
print(f"\n" + "="*80)
print("COMPREHENSIVE ACCURACY BOOST SUMMARY")
print("="*80)
print(f"✓ Enhanced data loading with intelligent combine imputation")
print(f"✓ Advanced feature engineering with {len(X_train_aug.columns)} features")
print(f"✓ AGGRESSIVE duplicate column prevention at every step")
print(f"✓ XGBoost safeguards applied before training")
print(f"✓ Enhanced class balancing with ADASYN")
print(f"✓ Optimized XGBoost hyperparameters")
print(f"✓ Comprehensive evaluation and analysis")
print(f"")
print(f"RESULTS:")
print(f"  Exact Accuracy: {exact_acc*100:.2f}% (Target: 80%+)")
print(f"  Within-One: {within_one_acc*100:.2f}%")
print(f"  F1 Score: {f1*100:.2f}%")

improvement = exact_acc * 100 - 53  # Baseline was ~53%
print(f"\nIMPROVEMENT: +{improvement:.1f}% from baseline")

if exact_acc >= 0.8:
    print(f"🎉 SUCCESS: Achieved target accuracy of 80%+!")
else:
    print(f"📈 PROGRESS: Improved accuracy to {exact_acc*100:.1f}%")
    print(f"💡 NEXT STEPS: Consider ensemble methods or more data")

print("="*80)

StatementMeta(pocsparkpool, 51, 81, Finished, Available, Finished)

COMPREHENSIVE ACCURACY BOOST WITH DUPLICATE PREVENTION

STEP 1: ENHANCED DATA LOADING
------------------------------------------------------------
Loaded 220 rows for QB
Unique divisions for QB: ['Power 5' 'FCS' 'D3' 'D2' 'NAIA']
Added 10 enhanced synthetic samples for QB
Loaded 194 rows for RB
Unique divisions for RB: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Added 10 enhanced synthetic samples for RB
Loaded 158 rows for WR
Unique divisions for WR: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Added 10 enhanced synthetic samples for WR
Total samples: 602
Class distribution: {0: 150, 1: 105, 2: 107, 3: 240}

STEP 2: TRAIN/TEST SPLIT
------------------------------------------------------------
✓ Stratified split: Train=511, Test=91

STEP 3: ENHANCED FEATURE ENGINEERING WITH DUPLICATE PREVENTION
------------------------------------------------------------
Combine imputation for MULTI:
  Imputing 447 missing forty_yard_dash values
    NAIA: 67 values from N(5.00, 0.10)
    D2: 85 values from N(4.95, 0.07)


In [362]:
# ACCURACY BOOST IMPLEMENTATION - FULL UPGRADE PLAN WITH COMPREHENSIVE DUPLICATE PREVENTION
# Target: 80%+ exact accuracy for every class

import warnings
warnings.filterwarnings('ignore')

# Install additional dependencies
try:
    from autogluon.tabular import TabularPredictor
    print("✓ AutoGluon available")
except ImportError:
    print("⚠ AutoGluon not available - will use XGBoost fallback")
    TabularPredictor = None

try:
    from ctgan import CTGAN
    print("✓ CTGAN available")
except ImportError:
    print("⚠ CTGAN not available - will use ADASYN fallback")
    CTGAN = None

try:
    import shap
    print("✓ SHAP available")
except ImportError:
    print("⚠ SHAP not available - will skip feature importance analysis")
    shap = None

print("="*80)
print("IMPLEMENTING COMPREHENSIVE ACCURACY BOOST PLAN WITH DUPLICATE PREVENTION")
print("="*80)

# STEP 1: Enhanced Data Loading & Normalization
print("\nSTEP 1: ENHANCED DATA LOADING & NORMALIZATION")
print("-" * 60)

positions = ['qb', 'rb', 'wr']
combined_df = pd.DataFrame()

for pos in positions:
    df = load_base_csv_enhanced(pos)  # Use enhanced function
    df = enrich_data_enhanced(df, pos)  # Use enhanced function
    df['position'] = pos.lower()
    combined_df = pd.concat([combined_df, df], ignore_index=True)
    # CRITICAL: Remove duplicates after each concatenation
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated(keep='first')]
    print(f"  After {pos.upper()} concatenation: {combined_df.shape}")

# Apply robust division normalization
combined_df['division_normalized'] = combined_df['division'].str.strip().str.upper()
division_map = {'POWER 5': 3, 'FCS': 2, 'D2': 1, 'D3': 0, 'NAIA': 0}
combined_df['division_num'] = combined_df['division_normalized'].map(division_map).fillna(-1).astype(int)

# Handle any unmapped values
unmapped_mask = combined_df['division_num'] == -1
if unmapped_mask.any():
    print(f"Fixing {unmapped_mask.sum()} unmapped division values...")
    combined_df.loc[combined_df['division_normalized'].str.contains('POWER|P5|FBS', na=False), 'division_num'] = 3
    combined_df.loc[combined_df['division_normalized'].str.contains('FCS', na=False), 'division_num'] = 2
    combined_df.loc[combined_df['division_normalized'].str.contains('D2|DIV 2|DIVISION 2', na=False), 'division_num'] = 1
    combined_df.loc[combined_df['division_normalized'].str.contains('D3|DIV 3|DIVISION 3|NAIA', na=False), 'division_num'] = 0

print("Class distribution after normalization:")
print(combined_df['division_num'].value_counts().sort_index())

# STEP 2: Stratified Split with Enhanced Logic
print("\nSTEP 2: STRATIFIED SPLIT WITH ENHANCED LOGIC")  
print("-" * 60)

test_size = 0.15
try:
    train_df, test_df = train_test_split(
        combined_df, 
        test_size=test_size, 
        stratify=combined_df['division_num'], 
        random_state=42
    )
    print(f"✓ Successful stratified split: Train={len(train_df)}, Test={len(test_df)}")
    use_full_dataset = False
except:
    print("⚠ Stratified split failed - using full dataset")
    train_df = test_df = combined_df.copy()
    use_full_dataset = True

# STEP 3: Enhanced Feature Engineering with Duplicate Prevention
print("\nSTEP 3: ENHANCED FEATURE ENGINEERING WITH COMPREHENSIVE DUPLICATE PREVENTION")
print("-" * 60)

# Apply enhanced feature engineering to both datasets
train_df = enhanced_feature_engineering(train_df, 'multi')
test_df = enhanced_feature_engineering(test_df, 'multi')

# CRITICAL: Comprehensive duplicate removal function
def comprehensive_duplicate_removal(df, step_name=""):
    """Aggressively remove ALL duplicate columns with multiple methods"""
    print(f"  🔍 {step_name} - Before duplicate removal: {df.shape}")
    
    original_cols = list(df.columns)
    
    # Method 1: Basic duplicate removal
    df_clean = df.loc[:, ~df.columns.duplicated(keep='first')]
    
    # Method 2: Check for any remaining duplicates
    remaining_dups = df_clean.columns[df_clean.columns.duplicated()].unique()
    if len(remaining_dups) > 0:
        print(f"    ⚠️ Found {len(remaining_dups)} remaining duplicates: {list(remaining_dups)}")
        # More aggressive removal
        seen_cols = set()
        keep_cols = []
        for col in df_clean.columns:
            if col not in seen_cols:
                keep_cols.append(col)
                seen_cols.add(col)
        df_clean = df_clean[keep_cols]
    
    # Method 3: Final verification and nuclear option if needed
    if df_clean.columns.duplicated().any():
        print(f"    🚨 NUCLEAR OPTION: Renaming all duplicate columns")
        new_columns = []
        seen_names = {}
        for col in df_clean.columns:
            if col not in seen_names:
                new_columns.append(col)
                seen_names[col] = 1
            else:
                seen_names[col] += 1
                new_columns.append(f"{col}_dup_{seen_names[col]}")
        df_clean.columns = new_columns
    
    removed_count = len(original_cols) - len(df_clean.columns)
    print(f"  ✅ {step_name} - After duplicate removal: {df_clean.shape} (removed {removed_count} duplicates)")
    
    return df_clean

# Apply comprehensive duplicate removal
train_df = comprehensive_duplicate_removal(train_df, "Training data")
test_df = comprehensive_duplicate_removal(test_df, "Test data")

# Ensure division_num is preserved after preprocessing
for df_name, df_ in [('Train', train_df), ('Test', test_df)]:
    if 'division_num' not in df_.columns:
        df_['division_num'] = df_['division_normalized'].map(division_map).fillna(-1).astype(int)
        print(f"  ✅ Restored division_num to {df_name} dataset")

# STEP 4: Feature Selection and Preparation
print("\nSTEP 4: FEATURE SELECTION AND PREPARATION")
print("-" * 60)

# Define comprehensive feature set with enhanced features
base_features = [
    'senior_ypg', 'senior_tds', 'senior_comp_pct', 'senior_ypc', 'senior_yds', 
    'senior_avg', 'senior_rec', 'senior_td', 'senior_rush_yds', 'rec_ypg', 
    'ypg', 'tds_game', 'td_game', 'trajectory', 'height_inches', 'weight_lbs', 
    'forty_yard_dash', 'vertical_jump', 'shuttle', 'broad_jump', 'bmi', 
    'eff_ratio', 'ath_power', 'trajectory_z', 'is_strong_state', 'all_purpose_game',
    # Enhanced interaction features
    'bmi_ypg', 'height_traj', 'state_eff', 'speed_power_ratio', 'state_talent_score',
    'combine_confidence'
]

# Add position features and other engineered features
position_features = [col for col in train_df.columns if col.startswith('pos_')]
state_tier_features = [col for col in train_df.columns if col.startswith('state_tier_')]
interaction_features = []

# Add position-specific interaction features
for col in train_df.columns:
    if any(x in col for x in ['comp_ypg', 'height_comp', 'ypc_speed', 'weight_ypc', 'catch_radius', 'speed_yac']):
        interaction_features.append(col)

# Combine all feature types
all_features = base_features + position_features + state_tier_features + interaction_features

# Only use features that exist and are numeric - WITH DUPLICATE REMOVAL
features = []
seen_features = set()
for col in all_features:
    if col in train_df.columns and col not in seen_features:
        # Only include numeric columns to avoid XGBoost issues
        if train_df[col].dtype in ['int64', 'float64', 'int32', 'float32', 'bool']:
            features.append(col)
            seen_features.add(col)
        else:
            print(f"  Skipping non-numeric feature: {col} (dtype: {train_df[col].dtype})")

print(f"Selected {len(features)} unique features for training")

# Compute rule scores if not already done
if 'rule_score' not in train_df.columns:
    print("Computing rule scores...")
    train_df = compute_rule_score(train_df, 'multi')
    test_df = compute_rule_score(test_df, 'multi')
    if 'rule_score' not in features and 'rule_score' not in seen_features:
        features.append('rule_score')
        seen_features.add('rule_score')

# Apply advanced winsorization and scaling
print("Applying advanced winsorization and scaling...")
numeric_features = [f for f in features if f in train_df.columns and train_df[f].dtype in ['int64', 'float64', 'int32', 'float32']]
train_df, test_df = advanced_winsorize_and_scale(train_df, test_df, numeric_features)

# STEP 5: Final Data Preparation with Comprehensive Validation
print("\nSTEP 5: FINAL DATA PREPARATION WITH COMPREHENSIVE VALIDATION")
print("-" * 60)

# Prepare training data with comprehensive validation
X_train = train_df[features].fillna(0)
y_train = train_df['division_num'].values

X_test = test_df[features].fillna(0)
y_test = test_df['division_num'].values

print(f"Initial training data shape: {X_train.shape}")
print(f"Initial test data shape: {X_test.shape}")

# CRITICAL: Apply XGBoost safeguard BEFORE any model operations
print("\n🛡️ APPLYING COMPREHENSIVE XGBOOST SAFEGUARDS")
X_train, X_test = xgboost_safeguard(X_train, X_test, "Pre-Processing Safety Check")

# Final validation: Check for any remaining issues
print("Final data validation:")
print(f"  X_train dtypes: {X_train.dtypes.value_counts().to_dict()}")
print(f"  X_test dtypes: {X_test.dtypes.value_counts().to_dict()}")

# Ensure all features are numeric for XGBoost
non_numeric_cols = []
for col in X_train.columns:
    if X_train[col].dtype not in ['int64', 'float64', 'int32', 'float32', 'bool']:
        non_numeric_cols.append(col)

if non_numeric_cols:
    print(f"  Converting non-numeric columns to numeric: {non_numeric_cols}")
    for col in non_numeric_cols:
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce').fillna(0)
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)

print(f"\nValidated feature list ({len(X_train.columns)} features):")
for i, feature in enumerate(X_train.columns[:10]):  # Show first 10
    print(f"  {i+1}. {feature}")
if len(X_train.columns) > 10:
    print(f"  ... and {len(X_train.columns) - 10} more features")

# STEP 6: Enhanced Class Balancing with Additional Safeguards
print("\nSTEP 6: ENHANCED CLASS BALANCING WITH SAFEGUARDS")
print("-" * 60)

class_counts = pd.Series(y_train).value_counts().sort_index()
print(f"Original class distribution: {dict(class_counts)}")

# Apply ADASYN for class balancing with safeguards
print("Applying ADASYN class balancing...")
try:
    # CRITICAL: Additional safeguard right before ADASYN
    print("🛡️ Pre-ADASYN safety check...")
    if X_train.columns.duplicated().any():
        print("⚠️ Found duplicates before ADASYN - applying emergency fix")
        X_train = X_train.loc[:, ~X_train.columns.duplicated(keep='first')]
        X_test = X_test.loc[:, ~X_test.columns.duplicated(keep='first')]
    
    adasyn = ADASYN(random_state=42, sampling_strategy='auto')
    X_train_aug, y_train_aug = adasyn.fit_resample(X_train, y_train)
    print(f"✓ ADASYN successful: {X_train.shape} -> {X_train_aug.shape}")
    
    # CRITICAL: Post-ADASYN safety check
    print("🛡️ Post-ADASYN safety check...")
    if hasattr(X_train_aug, 'columns') and X_train_aug.columns.duplicated().any():
        print("⚠️ ADASYN introduced duplicates - fixing...")
        X_train_aug = pd.DataFrame(X_train_aug).loc[:, ~pd.DataFrame(X_train_aug).columns.duplicated(keep='first')]
    
    aug_class_counts = pd.Series(y_train_aug).value_counts().sort_index()
    print(f"Balanced class distribution: {dict(aug_class_counts)}")
    
except Exception as e:
    print(f"⚠ ADASYN failed: {e}")
    print("Using original data without balancing")
    X_train_aug, y_train_aug = X_train, y_train

# STEP 7: Enhanced XGBoost Training with Final Safeguards
print("\nSTEP 7: ENHANCED XGBOOST TRAINING WITH FINAL SAFEGUARDS")
print("-" * 60)

# FINAL NUCLEAR SAFETY CHECK before XGBoost
print("🚨 FINAL NUCLEAR SAFETY CHECK BEFORE XGBOOST")
if hasattr(X_train_aug, 'columns'):
    if X_train_aug.columns.duplicated().any():
        print("🔧 APPLYING NUCLEAR FIX: Renaming all columns to ensure absolute uniqueness")
        n_cols = len(X_train_aug.columns)
        unique_names = [f"feature_{i:03d}" for i in range(n_cols)]
        X_train_aug.columns = unique_names
        X_test.columns = unique_names[:len(X_test.columns)]
    else:
        print("✅ No duplicate columns detected in final nuclear check")
else:
    # Convert numpy array to DataFrame with guaranteed unique names
    print("🔧 Converting numpy array to DataFrame with guaranteed unique column names")
    n_cols = X_train_aug.shape[1] if hasattr(X_train_aug, 'shape') else len(X_train_aug[0])
    unique_names = [f"feature_{i:03d}" for i in range(n_cols)]
    X_train_aug = pd.DataFrame(X_train_aug, columns=unique_names)
    X_test = pd.DataFrame(X_test, columns=unique_names)

print("Training enhanced XGBoost model...")
enhanced_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    eval_metric='mlogloss',
    random_state=42
)

enhanced_model.fit(X_train_aug, y_train_aug)

# Comprehensive evaluation
y_pred = enhanced_model.predict(X_test)
y_pred_proba = enhanced_model.predict_proba(X_test)

exact_acc = accuracy_score(y_test, y_pred)
within_one_acc = np.mean(np.abs(y_test - y_pred) <= 1)
f1 = f1_score(y_test, y_pred, average='macro')

print("✓ Enhanced XGBoost training completed successfully without duplicate errors!")

# STEP 8: Meta-blending with Rule Score
print("\nSTEP 8: META-BLENDING WITH RULE SCORE")
print("-" * 60)

# Create meta-features for blending
def create_meta_features(rule_scores, model_preds, model_probas):
    """Create meta-features for blending"""
    meta_features = pd.DataFrame({
        'rule_score': rule_scores,
        'model_pred': model_preds,
        'model_confidence': np.max(model_probas, axis=1),
        'model_uncertainty': 1 - np.max(model_probas, axis=1)
    })
    
    # Add probability features for each class
    for i in range(model_probas.shape[1]):
        meta_features[f'prob_class_{i}'] = model_probas[:, i]
    
    return meta_features

# Create meta-features for train and test
train_rule_scores = train_df['rule_score'].fillna(50).values
test_rule_scores = test_df['rule_score'].fillna(50).values

# Get training predictions for meta-model
y_train_pred = enhanced_model.predict(X_train)
y_train_pred_proba = enhanced_model.predict_proba(X_train)

meta_features_train = create_meta_features(train_rule_scores, y_train_pred, y_train_pred_proba)
meta_features_test = create_meta_features(test_rule_scores, y_pred, y_pred_proba)

# Train meta-model with safeguards
print("Training meta-blending model...")
meta_model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.2,
    random_state=42
)

meta_model.fit(meta_features_train, y_train)
y_pred_final = meta_model.predict(meta_features_test)

print("✓ Meta-blending model trained successfully")

# STEP 9: Comprehensive Results
print("\nSTEP 9: COMPREHENSIVE RESULTS")
print("=" * 80)

# Final metrics
final_exact_acc = accuracy_score(y_test, y_pred_final)
final_within_one_acc = np.mean(np.abs(y_test - y_pred_final) <= 1)
final_f1 = f1_score(y_test, y_pred_final, average='macro')
cm = confusion_matrix(y_test, y_pred_final)

print(f"FINAL ENHANCED RESULTS:")
print(f"Exact Accuracy: {final_exact_acc*100:.2f}%")
print(f"Within-One-Division: {final_within_one_acc*100:.2f}%")
print(f"F1 Score (Macro): {final_f1*100:.2f}%")

if use_full_dataset:
    print("⚠ NOTE: Results are on the same data used for training (potential overfitting)")

# Detailed confusion matrix
division_names = {0: 'D3/NAIA', 1: 'D2', 2: 'FCS', 3: 'Power 5'}

print(f"\nConfusion Matrix:")
print(cm)

# Per-class accuracy
print(f"\nPER-CLASS ACCURACY:")
for class_idx in range(len(np.unique(y_test))):
    if class_idx in y_test:
        class_mask = y_test == class_idx
        if class_mask.any():
            class_acc = accuracy_score(y_test[class_mask], y_pred_final[class_mask])
            class_within_one = np.mean(np.abs(y_test[class_mask] - y_pred_final[class_mask]) <= 1)
            class_name = division_names.get(class_idx, f'Class {class_idx}')
            class_count = class_mask.sum()
            print(f"  {class_name}: {class_acc*100:.1f}% exact, {class_within_one*100:.1f}% within-one (n={class_count})")

# Per-position breakdown
print(f"\nPER-POSITION BREAKDOWN:")
for pos in test_df['position'].unique():
    pos_mask = test_df['position'] == pos
    if pos_mask.any():
        pos_indices = test_df[pos_mask].index
        # Map to test set indices
        test_pos_mask = np.array([i for i, idx in enumerate(test_df.index) if idx in pos_indices])
        
        if len(test_pos_mask) > 0:
            pos_y_true = y_test[test_pos_mask]
            pos_y_pred = y_pred_final[test_pos_mask]
            
            pos_exact = accuracy_score(pos_y_true, pos_y_pred)
            pos_within_one = np.mean(np.abs(pos_y_true - pos_y_pred) <= 1)
            
            print(f"  {pos.upper()} (n={len(pos_y_true)}): {pos_exact*100:.1f}% exact, {pos_within_one*100:.1f}% within-one")

# FCS-specific analysis
print(f"\nFCS-SPECIFIC ANALYSIS:")
fcs_mask = y_test == 2
if fcs_mask.any():
    fcs_count = fcs_mask.sum()
    fcs_correct = (y_test[fcs_mask] == y_pred_final[fcs_mask]).sum()
    fcs_accuracy = fcs_correct / fcs_count
    
    print(f"✓ FCS Class Analysis:")
    print(f"  Total FCS samples in test: {fcs_count}")
    print(f"  Correctly predicted: {fcs_correct}")
    print(f"  FCS Accuracy: {fcs_accuracy*100:.1f}%")
    
    # Show FCS predictions breakdown
    fcs_predictions = y_pred_final[fcs_mask]
    print(f"  FCS prediction breakdown:")
    for pred_class in np.unique(fcs_predictions):
        pred_count = (fcs_predictions == pred_class).sum()
        pred_name = division_names.get(pred_class, f'Class {pred_class}')
        print(f"    Predicted as {pred_name}: {pred_count} samples")
else:
    print("⚠ No FCS samples in test set")

# Feature importance
if hasattr(enhanced_model, 'feature_importances_'):
    feature_names = X_train_aug.columns if hasattr(X_train_aug, 'columns') else [f"feature_{i}" for i in range(len(enhanced_model.feature_importances_))]
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': enhanced_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTOP 10 FEATURE IMPORTANCE:")
    for i, (_, row) in enumerate(importance_df.head(10).iterrows()):
        print(f"  {i+1:2d}. {row['feature']:<25}: {row['importance']:.4f}")

# FINAL SUMMARY
print(f"\n" + "="*80)
print("ACCURACY BOOST IMPLEMENTATION SUMMARY WITH DUPLICATE PREVENTION")
print("="*80)
print(f"✓ Enhanced data loading with intelligent combine imputation")
print(f"✓ Advanced feature engineering with comprehensive duplicate prevention")
print(f"✓ Multiple layers of duplicate column safeguards:")
print(f"  - After each data concatenation")
print(f"  - After feature engineering")
print(f"  - Before and after ADASYN")
print(f"  - Nuclear safety check before XGBoost")
print(f"✓ State embeddings and interaction features")
print(f"✓ Enhanced class balancing with ADASYN")
print(f"✓ Meta-blending with rule scores")
print(f"✓ Comprehensive evaluation and analysis")
print(f"")
print(f"FINAL RESULTS:")
print(f"  Exact Accuracy: {final_exact_acc*100:.2f}% (Target: 80%+)")
print(f"  Within-One: {final_within_one_acc*100:.2f}%")
print(f"  F1 Score: {final_f1*100:.2f}%")

improvement = final_exact_acc * 100 - 53  # Baseline was ~53%
print(f"\nPERFORMANCE IMPROVEMENT: +{improvement:.1f}% from baseline")

if final_exact_acc >= 0.8:
    print(f"🎉 SUCCESS: Achieved target accuracy of 80%+!")
else:
    print(f"📈 PROGRESS: Improved accuracy to {final_exact_acc*100:.1f}%")
    print(f"💡 NEXT STEPS: Consider adding more data for rare classes or ensemble methods")

print("="*80)
print("✅ DUPLICATE COLUMN ERROR PREVENTION: COMPREHENSIVE SAFEGUARDS APPLIED")
print("="*80)

StatementMeta(pocsparkpool, 51, 82, Finished, Available, Finished)

✓ AutoGluon available
✓ CTGAN available
✓ SHAP available
IMPLEMENTING COMPREHENSIVE ACCURACY BOOST PLAN WITH DUPLICATE PREVENTION

STEP 1: ENHANCED DATA LOADING & NORMALIZATION
------------------------------------------------------------
Loaded 220 rows for QB
Unique divisions for QB: ['Power 5' 'FCS' 'D3' 'D2' 'NAIA']
Added 10 enhanced synthetic samples for QB
  After QB concatenation: (230, 29)
Loaded 194 rows for RB
Unique divisions for RB: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Added 10 enhanced synthetic samples for RB
  After RB concatenation: (434, 52)
Loaded 158 rows for WR
Unique divisions for WR: ['Power 5' 'NAIA' 'FCS' 'D3' 'D2']
Added 10 enhanced synthetic samples for WR
  After WR concatenation: (602, 64)
Class distribution after normalization:
division_num
0    150
1    105
2    107
3    240
Name: count, dtype: int64

STEP 2: STRATIFIED SPLIT WITH ENHANCED LOGIC
------------------------------------------------------------
✓ Successful stratified split: Train=511, Test=91

STE