In [8]:
# =============================================================================
# EMERGENCY: CHECK SUBMISSION FORMAT
# =============================================================================

import pandas as pd

# Title banner for the whole diagnostic cell.
separator = "=" * 80
print(separator, "SUBMISSION FORMAT CHECK", separator, sep="\n")

# Read the submission under inspection. The name `our_submission` is kept
# as-is because the later checks in this cell read it.
our_submission = pd.read_csv('outputs/predictions/SANITY_CHECK_submission.csv')

# Dimensions and header first, so a wrong column layout is visible at once.
submission_shape = our_submission.shape
submission_columns = our_submission.columns.tolist()
print("\nOur submission:")
print(f"  Shape: {submission_shape}")
print(f"  Columns: {submission_columns}")

# Preview both ends of the file: ten rows from the top, ten from the bottom.
previews = (
    ("\nFirst 10 rows:", our_submission.head(10)),
    ("\nLast 10 rows:", our_submission.tail(10)),
)
for heading, rows in previews:
    print(heading)
    print(rows)

# Basic integrity checks on the submission frame: duplicate IDs, missing
# cells, and predictions outside the [0, 1] interval.
section_rule = "=" * 80
print(f"\n{section_rule}")
print("CHECKING FOR ISSUES")
print(section_rule)

# Check 1: count rows whose icustay_id repeats an earlier row's ID.
id_column = our_submission['icustay_id']
duplicates = id_column.duplicated().sum()
print(f"\n1. Duplicate icustay_ids: {duplicates}")
has_duplicates = duplicates > 0
if has_duplicates:
    print("   ❌ ERROR: Duplicate IDs found!")

# Check 2: total number of null cells across every column.
nulls_per_column = our_submission.isnull().sum()
missing = nulls_per_column.sum()
print(f"\n2. Missing values: {missing}")
has_missing = missing > 0
if has_missing:
    print("   ❌ ERROR: Missing values found!")

# Check 3: smallest and largest predicted value must lie inside [0, 1].
prediction_column = our_submission['prediction']
pred_min = prediction_column.min()
pred_max = prediction_column.max()
print(f"\n3. Prediction range: [{pred_min:.4f}, {pred_max:.4f}]")

below_zero = pred_min < 0
above_one = pred_max > 1
if below_zero or above_one:
    print("   ❌ ERROR: Predictions outside [0,1]!")

# Issue 4: compare the submission's IDs with a reference submission file.
divider = "=" * 80
print("\n" + divider)
print("COMPARING TO SAMPLE SUBMISSION")
print(divider)

# Best-effort section: any failure below is reported by the handler at the
# bottom instead of aborting the rest of the cell.
try:
    from pathlib import Path

    # Candidate reference files, tried in this order; the first one that
    # exists on disk is used.
    possible_paths = [
        Path('outputs/predictions/gradient_boosting/gradient_boosting_conservative_20251122_0107.csv'),
        Path('../data/sample_submission_classification.csv'),
        Path('../../data/sample_submission.csv'),
        Path('../data/sample_submission.csv'),
    ]

    sample = None
    reference_path = next(
        (candidate for candidate in possible_paths if candidate.exists()),
        None,
    )
    if reference_path is not None:
        sample = pd.read_csv(reference_path)
        print(f"\nFound sample submission: {reference_path}")

    if sample is None:
        print("\n⚠️ Could not find sample submission file")
        print("   Please check manually!")
    else:
        # Describe the reference file the same way the submission was described.
        print("\nSample submission:")
        print(f"  Shape: {sample.shape}")
        print(f"  Columns: {sample.columns.tolist()}")
        print("\nFirst 5 rows:")
        print(sample.head())

        print("\n" + divider)
        print("ID COMPARISON")
        print(divider)

        # Set comparison first: which IDs are absent, which are unexpected.
        sample_ids = set(sample['icustay_id'])
        our_ids = set(our_submission['icustay_id'])

        print(f"\nSample IDs: {len(sample_ids)}")
        print(f"Our IDs: {len(our_ids)}")

        missing_in_ours = sample_ids - our_ids
        extra_in_ours = our_ids - sample_ids

        if missing_in_ours:
            print(f"\n❌ Missing {len(missing_in_ours)} IDs that should be there!")
            print(f"   Sample: {list(missing_in_ours)[:5]}")

        if extra_in_ours:
            print(f"\n❌ Have {len(extra_in_ours)} extra IDs that shouldn't be there!")
            print(f"   Sample: {list(extra_in_ours)[:5]}")

        if sample_ids == our_ids:
            print("\n✓ ID sets match perfectly")

            # Same members: now check they also appear in the same row order.
            sample_order = list(sample['icustay_id'])
            our_order = list(our_submission['icustay_id'])
            if sample_order != our_order:
                print("❌ ID order is DIFFERENT!")
                print(f"\nSample first 10: {sample_order[:10]}")
                print(f"Ours first 10:   {our_order[:10]}")
            else:
                print("✓ ID order matches perfectly")

except Exception as exc:
    print(f"\n⚠️ Error loading sample: {exc}")

# Issue 5: Check original test file
# Compare the submission's icustay_id column, position by position, with the
# icustay_id column of the test CSV, and explain any difference found.
print("\n" + "="*80)
print("CHECKING ORIGINAL TEST FILE")
print("="*80)

test_original = pd.read_csv('data/mimic_test_HEF.csv')

print("\nOriginal test file:")
print(f"  Shape: {test_original.shape}")
print(f"  icustay_id column exists: {'icustay_id' in test_original.columns}")

original_ids = test_original['icustay_id'].tolist()
our_ids_list = our_submission['icustay_id'].tolist()

print(f"\nFirst 10 IDs in original test: {original_ids[:10]}")
print(f"First 10 IDs in our submission: {our_ids_list[:10]}")

if original_ids == our_ids_list:
    print("\n✓ IDs match original test file perfectly!")
else:
    print("\n❌ IDs DON'T match original test file!")

    # A differing row count is a mismatch in its own right. Report it
    # explicitly, because zip() below only walks the overlapping prefix and
    # would otherwise leave a pure length difference unexplained.
    if len(original_ids) != len(our_ids_list):
        print(f"\n   Row count differs: original has {len(original_ids)}, "
              f"ours has {len(our_ids_list)}")

    # Find first mismatch within the rows both lists have.
    for i, (orig, ours) in enumerate(zip(original_ids, our_ids_list)):
        if orig != ours:
            print(f"\n   First mismatch at index {i}:")
            print(f"   Original: {orig}")
            print(f"   Ours: {ours}")
            break
    else:
        # No differing pair in the overlap, yet the lists are unequal:
        # one list is a strict prefix of the other.
        shared = min(len(original_ids), len(our_ids_list))
        print(f"\n   First {shared} IDs are identical; "
              f"the lists diverge at index {shared} (one file has extra rows)")

print("\n" + "="*80)

SUBMISSION FORMAT CHECK

Our submission:
  Shape: (5221, 2)
  Columns: ['icustay_id', 'prediction']

First 10 rows:
   icustay_id  prediction
0      208169    0.031823
1      251754    0.336064
2      242171    0.007675
3      263035    0.006094
4      279388    0.011700
5      262988    0.011797
6      291777    0.022569
7      217458    0.064734
8      283955    0.111994
9      287227    0.054409

Last 10 rows:
      icustay_id  prediction
5211      272385    0.078206
5212      223244    0.009639
5213      210299    0.017599
5214      224997    0.007469
5215      282794    0.013100
5216      278087    0.883848
5217      266914    0.046038
5218      213413    0.092823
5219      286384    0.027457
5220      280741    0.072603

CHECKING FOR ISSUES

1. Duplicate icustay_ids: 0

2. Missing values: 0

3. Prediction range: [0.0037, 0.9987]

COMPARING TO SAMPLE SUBMISSION

Found sample submission: outputs\predictions\gradient_boosting\gradient_boosting_conservative_20251122_0107.csv

Sample 