In [2]:
# =============================================================================
# VERIFY: What data do we actually have?
# =============================================================================

import pandas as pd
import numpy as np

# Load the processed feature table that is used for training.
# NOTE: pickle files can run arbitrary code while loading - only read files
# that come from a trusted source.
X = pd.read_pickle('data/processed/X_train_processed.pkl')

# Report banner: the title framed by two 80-character rules.
print("=" * 80, "DATA VERIFICATION", "=" * 80, sep="\n")

# Check if count features are scaled or not.
# For each feature of interest this prints min / max / mean and the first
# three values, then applies a simple heuristic to decide whether the column
# still holds raw values or has been scaled.
print("\nCritical features check:")

# Feature name -> human-readable expectation printed next to its statistics.
features_to_check = {
    'n_diagnoses': 'Should be 0-30 (count)',
    'n_previous_icu_stays': 'Should be 0-24 (count)',
    'Severity_Score': 'Should be 0-5 (score)',
    'age': 'Can be scaled (-2 to +2)',
}

# Heuristic per feature that must stay unscaled:
#   (scaled_below, raw_above, message_if_scaled, message_if_raw)
# A minimum below `scaled_below` is treated as evidence of scaling; otherwise
# a maximum above `raw_above` is treated as evidence of raw values; anything
# else is reported as uncertain. 'age' has no rule because it is allowed to
# be scaled, so only its statistics are printed.
scaling_rules = {
    'n_diagnoses': (
        0, 10,
        "  ❌ SCALED (WRONG!) - should be positive integers",
        "  ✅ NOT SCALED (CORRECT!) - raw counts",
    ),
    'n_previous_icu_stays': (
        0, 5,
        "  ❌ SCALED (WRONG!) - should be 0 or positive",
        "  ✅ NOT SCALED (CORRECT!) - raw counts",
    ),
    'Severity_Score': (
        -0.5, 4,
        "  ❌ SCALED (WRONG!) - should be 0-5",
        "  ✅ NOT SCALED (CORRECT!) - raw score",
    ),
}

for feat, description in features_to_check.items():
    print(f"\n{feat}: {description}")

    # A missing column must be reported, not skipped silently: otherwise a
    # renamed or dropped feature looks exactly like a feature that passed.
    if feat not in X.columns:
        print("  ⚠️ NOT FOUND in X - cannot verify this feature")
        continue

    column = X[feat]
    col_min = column.min()
    col_max = column.max()

    print(f"  Min: {col_min:.3f}")
    print(f"  Max: {col_max:.3f}")
    print(f"  Mean: {column.mean():.3f}")
    print(f"  Sample values: {column.head(3).tolist()}")

    # Diagnosis (only for features that have a scaling rule)
    rule = scaling_rules.get(feat)
    if rule is None:
        continue

    scaled_below, raw_above, scaled_msg, raw_msg = rule
    if col_min < scaled_below:
        print(scaled_msg)
    elif col_max > raw_above:
        print(raw_msg)
    else:
        print("  ⚠️ UNCERTAIN - might be scaled")

print("\n" + "="*80)
print("VERDICT:")
print("="*80)

# Overall verdict across every feature that is expected to stay unscaled.
# Each value is the floor below which the column minimum is treated as
# evidence of scaling (counts cannot be negative; the score is expected to
# be 0-5, with a small tolerance below zero).
raw_floors = {
    'n_diagnoses': 0,
    'n_previous_icu_stays': 0,
    'Severity_Score': -0.5,
}

present = [name for name in raw_floors if name in X.columns]
missing = [name for name in raw_floors if name not in X.columns]
scaled = [name for name in present if X[name].min() < raw_floors[name]]

if not present:
    # Without any of these columns there is nothing to base a verdict on;
    # say so explicitly instead of printing nothing.
    print("\n⚠️ Cannot give a verdict!")
    print("   None of the count/score features were found in X")
elif scaled:
    print("\n❌ You're using INCORRECTLY SCALED data!")
    print("   Count features were scaled when they shouldn't be")
    print(f"   Affected: {', '.join(scaled)}")
    print("   This is why you got 0.41")
else:
    print("\n✅ Data looks CORRECT!")
    print("   Count features are NOT scaled")
    print("   Something else caused 0.41...")

# Make it visible when the verdict is based on only part of the features.
if present and missing:
    print(f"   ⚠️ Not checked (missing from X): {', '.join(missing)}")

DATA VERIFICATION

Critical features check:

n_diagnoses: Should be 0-30 (count)
  Min: 1.000
  Max: 39.000
  Mean: 14.838
  Sample values: [10.0, 8.0, 12.0]
  ✅ NOT SCALED (CORRECT!) - raw counts

n_previous_icu_stays: Should be 0-24 (count)
  Min: 0.000
  Max: 24.000
  Mean: 0.396
  Sample values: [0.0, 0.0, 0.0]
  ✅ NOT SCALED (CORRECT!) - raw counts

Severity_Score: Should be 0-5 (score)
  Min: 0.000
  Max: 5.000
  Mean: 1.176
  Sample values: [1, 2, 2]
  ✅ NOT SCALED (CORRECT!) - raw score

age: Can be scaled (-2 to +2)
  Min: -2.249
  Max: 2.145
  Mean: -0.000
  Sample values: [1.2504291693121448, 0.021009985825780366, 0.10383359283660654]

VERDICT:

✅ Data looks CORRECT!
   Count features are NOT scaled
   Something else caused 0.41...
