In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && unset MPLBACKEND && python analyze_reconstruction.py \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_path /share/project10/home/prelajarsela/Attention_based_reduction/gridsearch_results_FINAL/gridsearch_autoencoder_20250624_225515/combo_504/best_model.pth \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --endpoints os24 \
    --output_dir ./reconstruction_analysis_256



In [9]:
import pickle
from collections import Counter

def analyze_os_keys(data):
    """
    Report how often the 'OS_6' and 'OS_24' keys occur in a collection of records.

    Prints a per-key summary (missing/present counts with percentages and the
    distribution of the values found) and returns the same numbers.

    Args:
        data: Sized, re-iterable collection of dict-like records. Each record
            is only probed with ``key in item`` and indexed by 'OS_6'/'OS_24'.

    Returns:
        dict with keys:
            'total'        - number of records,
            'os6_missing'  - records without an 'OS_6' key,
            'os24_missing' - records without an 'OS_24' key,
            'os6_values'   - Counter of 'OS_6' values ({} if none were found),
            'os24_values'  - Counter of 'OS_24' values ({} if none were found).

    Note:
        Only key presence is tested. A key that is present but holds a
        placeholder such as None is counted as "has" and shows up in the
        value counts.
    """
    total_items = len(data)

    def _pct(count):
        # An empty dataset has no meaningful percentage; report 0.0 instead of
        # dividing by zero.
        return count / total_items * 100 if total_items else 0.0

    summaries = {}
    for position, key in enumerate(('OS_6', 'OS_24')):
        values = [item[key] for item in data if key in item]
        missing = total_items - len(values)
        counts = Counter(values) if values else {}
        summaries[key] = (missing, counts)

        if position == 0:
            # The total is printed once, under the first section header.
            print(f"🔍 {key} ANALYSIS:")
            print(f"Total items: {total_items}")
        else:
            print(f"\n🔍 {key} ANALYSIS:")
        print(f"Missing {key}: {missing} ({_pct(missing):.1f}%)")
        print(f"Has {key}: {len(values)} ({_pct(len(values)):.1f}%)")

        if values:
            print(f"{key} values: {dict(counts)}")

    return {
        'total': total_items,
        'os6_missing': summaries['OS_6'][0],
        'os24_missing': summaries['OS_24'][0],
        'os6_values': summaries['OS_6'][1],
        'os24_values': summaries['OS_24'][1]
    }

# Load the pickled dataset from a machine-specific absolute path; adjust it for your environment.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only open trusted files.
with open('/home/beshoy/CT-FM_features/APOLLO_w_PET_w_OS.pkl', 'rb') as f:
    data = pickle.load(f)

# Print the OS_6 / OS_24 key summary and keep the returned counts in `results`.
results = analyze_os_keys(data)

🔍 OS_6 ANALYSIS:
Total items: 497
Missing OS_6: 4 (0.8%)
Has OS_6: 493 (99.2%)
OS_6 values: {1.0: 309, 0.0: 184}

🔍 OS_24 ANALYSIS:
Missing OS_24: 32 (6.4%)
Has OS_24: 465 (93.6%)
OS_24 values: {1.0: 107, 0.0: 358}


In [27]:
import torch

def check_feature_tensors_for_nan(data):
    """
    Scan every tensor under each item's 'features' key for NaN values.

    Prints a summary plus the first 10 offending tensors, and returns the
    complete findings.

    Args:
        data: Sized sequence of dict-like items. Items without a 'features'
            key are reported with a warning and skipped; ``item['features']``
            is iterated and each element is passed to ``torch.isnan``.

    Returns:
        dict with keys:
            'total_items'    - len(data),
            'items_with_nan' - number of items with at least one NaN tensor,
            'nan_details'    - one dict per offending tensor with
                               'item_index', 'tensor_index', 'nan_count',
                               'tensor_size' and 'patient_id'
                               ('unknown' when the item has no such key).

    Note:
        Items skipped for lacking 'features' still count towards
        "Items without NaN" and the clean rate.
    """
    total_items = len(data)
    items_with_nan = 0
    nan_details = []

    print(f"🔍 Checking {total_items} items for NaN values in feature tensors...")

    for i, item in enumerate(data):
        if 'features' not in item:
            print(f"⚠️  Item {i}: No 'features' key found")
            continue

        item_has_nan = False

        # Check each tensor in the features list
        for j, tensor in enumerate(item['features']):
            # Build the NaN mask once and reuse it for both the test and the count.
            nan_count = torch.isnan(tensor).sum().item()
            if not nan_count:
                continue

            if not item_has_nan:  # First NaN tensor in this item: count the item once
                items_with_nan += 1
                item_has_nan = True

            nan_details.append({
                'item_index': i,
                'tensor_index': j,
                'nan_count': nan_count,
                'tensor_size': tensor.numel(),
                'patient_id': item.get('patient_id', 'unknown')
            })

    # Results
    clean_items = total_items - items_with_nan
    print(f"\n📊 RESULTS:")
    print(f"Total items checked: {total_items}")
    print(f"Items with NaN: {items_with_nan}")
    print(f"Items without NaN: {clean_items}")
    if total_items:
        print(f"Clean rate: {clean_items/total_items*100:.1f}%")
    else:
        # A rate is undefined for an empty dataset; avoid dividing by zero.
        print("Clean rate: n/a (no items)")

    if nan_details:
        print(f"\n❌ NaN DETAILS:")
        for detail in nan_details[:10]:  # Show first 10
            print(f"  Item {detail['item_index']} (Patient: {detail['patient_id']}): "
                  f"Tensor {detail['tensor_index']} has {detail['nan_count']}/{detail['tensor_size']} NaN values")

        if len(nan_details) > 10:
            print(f"  ... and {len(nan_details) - 10} more")
    else:
        print(f"\n✅ All feature tensors are clean (no NaN values found)!")

    return {
        'total_items': total_items,
        'items_with_nan': items_with_nan,
        'nan_details': nan_details
    }

# Quick version to run directly:
def quick_nan_check(data):
    """
    Count items whose 'features' contain any NaN and print a short summary.

    Args:
        data: Sized collection of dict-like items. Items without a 'features'
            key are ignored; each element of ``item['features']`` is passed to
            ``torch.isnan``.

    Returns:
        bool: True when no item has a NaN in its features (also for empty
        data), False otherwise.
    """
    total_items = len(data)
    items_with_nan = sum(
        1
        for item in data
        if 'features' in item
        and any(torch.isnan(tensor).any() for tensor in item['features'])
    )

    print(f"Items with NaN: {items_with_nan}/{total_items}")
    if total_items:
        print(f"Clean rate: {(total_items - items_with_nan)/total_items*100:.1f}%")
    else:
        # A rate is undefined for an empty dataset; avoid dividing by zero.
        print("Clean rate: n/a (no items)")
    return items_with_nan == 0

# Run the detailed NaN scan on `data` (substitute your own variable if it is named differently).
# NOTE: this rebinds `results`, replacing whatever an earlier cell stored under that name.
results = check_feature_tensors_for_nan(data)
# Alternative: summary-only check that returns True when no NaN is found.
# is_clean = quick_nan_check(data)

🔍 Checking 497 items for NaN values in feature tensors...

📊 RESULTS:
Total items checked: 497
Items with NaN: 2
Items without NaN: 495
Clean rate: 99.6%

❌ NaN DETAILS:
  Item 484 (Patient: INT1010586): Tensor 90 has 512/512 NaN values
  Item 484 (Patient: INT1010586): Tensor 91 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 88 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 89 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 90 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 91 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 92 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 93 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 94 has 512/512 NaN values
  Item 492 (Patient: INT1010732): Tensor 95 has 512/512 NaN values


In [54]:
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/APOLLO_w_PET_w_OS.pkl /home/beshoy/CT-FM_features/LUNG_RADIO_fast.pkl \
    --model_type autoencoder \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 8.0 \
    --val_split 0.10 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
EXCLUDED 2 patients due to NaN values in features:
  - Patient ID: INT1010586
  - Patient ID: INT1010732
Feature validation: kept 1048 patients, excluded 2 patients with NaN features
Filtered patients: kept 1004, excluded 44 patients missing required endpoints
Automatic split: Train 80%, Val 10%, Test 10%
Stratification successful with endpoints: ['os24']
Train samples: 802
Validation samples: 101
Test samples: 101
Creating autoencoder model...
Model parameters: 3,041,777
Starting training...
Starting training for 50 epochs...
Model type: autoencoder
Device: cuda
  New best validation AUC: 0.6391
Epoch 1/50 (0.83s)
  Train Loss: 315.5201
  Train Cosine Sim: 0.4607
  Val Loss: 320.3751
  Val Cosine Sim: 0.6792
  Val OS24 AUC: 0.6391
  LR: 1.00e-03

  New best validation AUC: 0.6688
Epoch 2/50 (0.37s)
  Train Loss: 282.5008
  Train Cosine Sim: 0.7042
  Val Loss: 291.5440
  Val Cosine Sim: 0.7290
  Val OS24 AUC: 0.6688
  LR: 1.00e-03

Epoch 3/50

In [53]:
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/APOLLO_w_PET_w_OS.pkl /home/beshoy/CT-FM_features/LUNG_RADIO_fast.pkl \
    --model_type autoencoder \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 8.0 \
    --val_split 0.15 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
EXCLUDED 2 patients due to NaN values in features:
  - Patient ID: INT1010586
  - Patient ID: INT1010732
Feature validation: kept 1048 patients, excluded 2 patients with NaN features
Filtered patients: kept 1004, excluded 44 patients missing required endpoints
Automatic split: Train 70%, Val 15%, Test 15%
Stratification successful with endpoints: ['os24']
Train samples: 702
Validation samples: 151
Test samples: 151
Creating autoencoder model...
Model parameters: 3,041,777
Starting training...
Starting training for 50 epochs...
Model type: autoencoder
Device: cuda
  New best validation AUC: 0.5847
Epoch 1/50 (0.75s)
  Train Loss: 317.8988
  Train Cosine Sim: 0.4247
  Val Loss: 321.1274
  Val Cosine Sim: 0.6674
  Val OS24 AUC: 0.5847
  LR: 1.00e-03

  New best validation AUC: 0.5935
Epoch 2/50 (0.36s)
  Train Loss: 288.8756
  Train Cosine Sim: 0.6924
  Val Loss: 299.7529
  Val Cosine Sim: 0.7213
  Val OS24 AUC: 0.5935
  LR: 1.00e-03

Epoch 3/50

In [44]:
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/APOLLO_w_PET_w_OS.pkl /home/beshoy/CT-FM_features/LUNG_RADIO_fast.pkl \
    --model_type autoencoder \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 8.0 \
    --val_split 0.20 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
EXCLUDED 2 patients due to NaN values in features:
  - Patient ID: INT1010586
  - Patient ID: INT1010732
Feature validation: kept 1048 patients, excluded 2 patients with NaN features
Filtered patients: kept 1004, excluded 44 patients missing required endpoints
Automatic split: Train 60%, Val 20%, Test 20%
Stratification successful with endpoints: ['os24']
Train samples: 602
Validation samples: 201
Test samples: 201
Creating autoencoder model...
Model parameters: 3,041,777
Starting training...
Starting training for 50 epochs...
Model type: autoencoder
Device: cuda
  New best validation AUC: 0.5542
Epoch 1/50 (0.78s)
  Train Loss: 319.1919
  Train Cosine Sim: 0.4000
  Val Loss: 320.8290
  Val Cosine Sim: 0.6571
  Val OS24 AUC: 0.5542
  LR: 1.00e-03

  New best validation AUC: 0.5981
Epoch 2/50 (0.32s)
  Train Loss: 293.2068
  Train Cosine Sim: 0.6835
  Val Loss: 305.9619
  Val Cosine Sim: 0.7124
  Val OS24 AUC: 0.5981
  LR: 1.00e-03

  New best

In [61]:
# Train the autoencoder on the five listed cohort pickles with the split from train_test_split_w_LR.json.
# NOTE: every continuation backslash must be the last character on its line; whitespace after it
# ends the command early and the remaining flags are never passed to the script.
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/INT.pkl /home/beshoy/CT-FM_features/LUNG_RADIO_fast.pkl /home/beshoy/CT-FM_features/GHD.pkl /home/beshoy/CT-FM_features/VHIO.pkl /home/beshoy/CT-FM_features/SZMC.pkl \
    --train_test_split_json /home/beshoy/CT-FM_features/train_test_split_w_LR.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 128 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 10.0 \
    --val_split 0.20 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
Feature validation: kept 1310 patients, excluded 0 patients with NaN features
Filtered patients: kept 1234, excluded 76 patients missing required endpoints
Stratification successful with endpoints: ['os24']
Train samples: 904
Validation samples: 226
Test samples: 104
Creating autoencoder model...
Model parameters: 6,013,169
Starting training...
Starting training for 100 epochs...
Model type: autoencoder
Device: cuda
  New best validation loss: 332.7836
Epoch 1/100 (0.96s)
  Train Loss: 329.7264
  Train Cosine Sim: 0.3489
  Val Loss: 332.7836
  Val Cosine Sim: 0.6255
  Val OS24 AUC: 0.5243
  LR: 1.00e-03

  New best validation loss: 322.1438
Epoch 2/100 (0.54s)
  Train Loss: 308.0706
  Train Cosine Sim: 0.6727
  Val Loss: 322.1438
  Val Cosine Sim: 0.7094
  Val OS24 AUC: 0.5304
  LR: 1.00e-03

  New best validation loss: 300.3691
Epoch 3/100 (0.50s)
  Train Loss: 286.3739
  Train Cosine Sim: 0.7196
  Val Loss: 300.3691
  Val Cosine Sim: 0.7352

In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/LUNG_RADIO_fast.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split_w_LR.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [48]:
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/OLD/VHIO_w_TNM.pkl /home/beshoy/CT-FM_features/OLD/INT_w_TNM.pkl /home/beshoy/CT-FM_features/OLD/GHD_w_TNM.pkl /home/beshoy/CT-FM_features/OLD/SZMC_w_TNM.pkl \
    --train_test_split_json /home/beshoy/CT-FM_features/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 128 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 3.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
^C
Traceback (most recent call last):
  File "/home/beshoy/code-projects/Attention_based_reduction/main_autoencoder.py", line 244, in <module>
    main()
  File "/home/beshoy/code-projects/Attention_based_reduction/main_autoencoder.py", line 65, in main
    train_loader, val_loader, test_loader, class_weights = create_data_loaders(
                                                           ^^^^^^^^^^^^^^^^^^^^
  File "/home/beshoy/code-projects/Attention_based_reduction/utils/dataloader.py", line 532, in create_data_loaders
    all_data = load_pkl_files(pkl_files)
               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/beshoy/code-projects/Attention_based_reduction/utils/dataloader.py", line 166, in load_pkl_files
    center_data = pickle.load(f)
                  ^^^^^^^^^^^^^^
  File "/home/beshoy/miniconda3/envs/ABR/lib/python3.11/site-packages/torch/storage.py", line 530, in _load_from_bytes
    return torch.load(io.BytesIO(b), weights_onl

In [35]:
!source ~/miniconda3/etc/profile.d/conda.sh && conda activate ABR && python main_autoencoder.py \
    --pkl_files /home/beshoy/CT-FM_features/VHIO.pkl /home/beshoy/CT-FM_features/INT.pkl /home/beshoy/CT-FM_features/GHD.pkl /home/beshoy/CT-FM_features/SZMC.pkl \
    --train_test_split_json /home/beshoy/CT-FM_features/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 3.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



Using device: cuda
Creating data loaders...
Filtered patients: kept 693, excluded 64 patients missing required endpoints
Stratification successful with endpoints: ['os24']
Train samples: 471
Validation samples: 118
Test samples: 104
Creating autoencoder model...
Model parameters: 5,930,993
Starting training...
Starting training for 50 epochs...
Model type: autoencoder
Device: cuda
  New best validation AUC: 0.5042
Epoch 1/50 (0.67s)
  Train Loss: 322.3872
  Train Cosine Sim: 0.3496
  Val Loss: 323.7358
  Val Cosine Sim: 0.6297
  Val OS24 AUC: 0.5042
  LR: 1.00e-03

  New best validation AUC: 0.5151
Epoch 2/50 (0.30s)
  Train Loss: 301.1518
  Train Cosine Sim: 0.6673
  Val Loss: 314.0607
  Val Cosine Sim: 0.7046
  Val OS24 AUC: 0.5151
  LR: 1.00e-03

  New best validation AUC: 0.5736
Epoch 3/50 (0.30s)
  Train Loss: 281.1984
  Train Cosine Sim: 0.7167
  Val Loss: 297.9332
  Val Cosine Sim: 0.7329
  Val OS24 AUC: 0.5736
  LR: 1.00e-03

Epoch 4/50 (0.16s)
  Train Loss: 263.5110
  Train Co

In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && unset MPLBACKEND && python analyze_reconstruction.py \
    --model_path /share/project10/home/prelajarsela/Attention_based_reduction/gridsearch_results_FINAL/gridsearch_autoencoder_20250624_225515/combo_504/best_model.pth \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --endpoints os24 



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/LUNG_RADIO_fast.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 8.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python extract_features.py \
    --model_path /share/project10/home/prelajarsela/Attention_based_reduction/gridsearch_results_FINAL/gridsearch_autoencoder_20250624_225515/combo_504/best_model.pth \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --output_csv /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/I3LUNG_reduced_features_BEST.csv \
    --attention_k 11 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --endpoints os24 


In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 256 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 3.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints os6 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints stage_t \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)


  New best validation loss: 21.3605
Epoch 86/100 (0.81s)
  Train Loss: 22.1671
  Train Cosine Sim: 0.9883
  Val Loss: 21.3605
  Val Cosine Sim: 0.9891
  Val STAGE_T Acc: 0.3252
  LR: 1.00e-03

  New best validation loss: 20.8089
Epoch 87/100 (0.85s)
  Train Loss: 22.4138
  Train Cosine Sim: 0.9885
  Val Loss: 20.8089
  Val Cosine Sim: 0.9891
  Val STAGE_T Acc: 0.2927
  LR: 1.00e-03

  New best validation loss: 20.5409
Epoch 88/100 (0.88s)
  Train Loss: 22.0745
  Train Cosine Sim: 0.9890
  Val Loss: 20.5409
  Val Cosine Sim: 0.9892
  Val STAGE_T Acc: 0.2927
  LR: 1.00e-03

  New best validation loss: 20.2977
Epoch 89/100 (0.87s)
  Train Loss: 21.2613
  Train Cosine Sim: 0.9889
  Val Loss: 20.2977
  Val Cosine Sim: 0.9894
  Val STAGE_T Acc: 0.2276
  LR: 1.00e-03

  New best validation loss: 20.1453
Epoch 90/100 (0.89s)
  Train Loss: 21.0994
  Train Cosine Sim: 0.9891
  Val Loss: 20.1453
  Val Cosine Sim: 0.9898
  Val STAGE_T Acc: 0.3171
  LR: 1.00e-03

  New best validation loss: 20.1342

In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints stage_n \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)


Epoch 86/100 (0.63s)
  Train Loss: 11.8773
  Train Cosine Sim: 0.9885
  Val Loss: 10.9117
  Val Cosine Sim: 0.9893
  Val STAGE_N Acc: 0.3496
  LR: 1.00e-03

Epoch 87/100 (0.60s)
  Train Loss: 11.6764
  Train Cosine Sim: 0.9891
  Val Loss: 10.9410
  Val Cosine Sim: 0.9896
  Val STAGE_N Acc: 0.3171
  LR: 1.00e-03

  New best validation loss: 10.5343
Epoch 88/100 (0.84s)
  Train Loss: 11.7410
  Train Cosine Sim: 0.9894
  Val Loss: 10.5343
  Val Cosine Sim: 0.9899
  Val STAGE_N Acc: 0.3333
  LR: 1.00e-03

  New best validation loss: 10.3985
Epoch 89/100 (0.80s)
  Train Loss: 12.0606
  Train Cosine Sim: 0.9897
  Val Loss: 10.3985
  Val Cosine Sim: 0.9900
  Val STAGE_N Acc: 0.3089
  LR: 1.00e-03

  New best validation loss: 10.0173
Epoch 90/100 (0.84s)
  Train Loss: 11.3113
  Train Cosine Sim: 0.9897
  Val Loss: 10.0173
  Val Cosine Sim: 0.9901
  Val STAGE_N Acc: 0.3659
  LR: 1.00e-03

  New best validation loss: 9.4610
Epoch 91/100 (0.83s)
  Train Loss: 10.7753
  Train Cosine Sim: 0.9900
  

In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD_w_TNM.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC_w_TNM.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints stage_m \
    --selection_metric auc \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)


In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python gridsearch.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --endpoints os24 \
    --model_type autoencoder \
    --save_dir ./gridsearch_results


  New best validation loss: 274.5007
Epoch 32/50 (0.37s)
  Train Loss: 254.1179
  Train Cosine Sim: 0.7346
  Val Loss: 274.5007
  Val Cosine Sim: 0.7409
  Val OS24 AUC: 0.5698
  LR: 1.00e-04

  New best validation loss: 271.8943
Epoch 33/50 (0.36s)
  Train Loss: 251.6065
  Train Cosine Sim: 0.7369
  Val Loss: 271.8943
  Val Cosine Sim: 0.7427
  Val OS24 AUC: 0.5656
  LR: 1.00e-04

  New best validation loss: 271.1156
Epoch 34/50 (0.37s)
  Train Loss: 249.4521
  Train Cosine Sim: 0.7386
  Val Loss: 271.1156
  Val Cosine Sim: 0.7447
  Val OS24 AUC: 0.5648
  LR: 1.00e-04

^C
Traceback (most recent call last):
  File "/share/project10/home/prelajarsela/Attention_based_reduction/gridsearch.py", line 346, in <module>
    main()
  File "/share/project10/home/prelajarsela/Attention_based_reduction/gridsearch.py", line 332, in main
    results_df = grid_search.run_search(max_epochs=50)  # Shorter epochs for grid search
  File "/share/project10/home/prelajarsela/Attention_based_reduction/gridsea

In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 50 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 2.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 11 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 128 \
    --learning_rate 1e-2 \
    --weight_decay 1e-3 \
    --reconstruction_weight 1.0 \
    --prediction_weight 4.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 22 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 4.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



In [None]:
!source /share/data/apps/anaconda3/bin/activate && conda activate ABR && python main_autoencoder.py \
    --pkl_files /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/VHIO.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/INT.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/GHD.pkl /share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/SZMC.pkl \
    --train_test_split_json /share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json \
    --model_type autoencoder \
    --attention_k 44 \
    --latent_dim 512 \
    --encoder_layers 256 128 \
    --predictor_layers 64 32 \
    --dropout_rate 0.3 \
    --epochs 100 \
    --batch_size 64 \
    --learning_rate 1e-3 \
    --weight_decay 1e-4 \
    --reconstruction_weight 1.0 \
    --prediction_weight 4.0 \
    --val_split 0.2 \
    --endpoints os24 \
    --save_dir ./results/autoencoder_$(date +%Y%m%d_%H%M%S)



# ML Prediction from Reduced Features

This notebook predicts clinical endpoints from autoencoder-extracted features using scikit-learn.

## Configuration

In [None]:
# CONFIGURATION - edit these values before running the cells below.
# TARGET_ENDPOINT: name of the label column to predict.
TARGET_ENDPOINT = 'OS_6'  # Options: 'OS_6', 'OS_24', 'STAGE_T', 'STAGE_N', 'STAGE_M'
# FEATURES_CSV: CSV of reduced features; the trailing comment keeps an alternative file for quick switching.
FEATURES_CSV = '/share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/I3LUNG_reduced_features_AUC_sel.csv' # alternative: '/share/project10/home/prelajarsela/CT-FM/PKLs_fixed_dim/I3LUNG_reduced_features.csv'
# TRAIN_TEST_JSON: presumably the split file holding the 'TRAIN_SET'/'TEST_SET' subject lists — confirm against the caller.
TRAIN_TEST_JSON = '/share/project10/data/CTs_I3LUNG_COHORT23/train_test_split.json'
# VAL_SPLIT: presumably the fraction of the training set held out for validation — usage not visible in this cell.
VAL_SPLIT = 0.2

# Echo the selected endpoint so the cell output records which target was used.
print(f"Target endpoint: {TARGET_ENDPOINT}")

## Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, confusion_matrix
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn warnings (e.g. convergence or failed
# grid-search fits) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Global plotting defaults applied to every figure created below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Data Loading and Preprocessing

In [None]:
def load_data(csv_path: str, train_test_json: str):
    """Read the feature table and carve it into train and test frames.

    Rows with a missing ``TARGET_ENDPOINT`` value are discarded first. The remaining
    rows are then assigned by their ``Subject`` value to the ``TRAIN_SET`` /
    ``TEST_SET`` lists stored in the JSON file; subjects listed in neither set are
    left out of both frames.

    Args:
        csv_path: Path to the CSV holding ``Subject``, the endpoint column and the
            ``feature_*`` columns.
        train_test_json: Path to the JSON file with ``TRAIN_SET`` and ``TEST_SET`` lists.

    Returns:
        ``(train_df, test_df)`` as independent copies.
    """
    features_df = pd.read_csv(csv_path)
    n_feature_cols = len([c for c in features_df.columns if c.startswith('feature_')])
    print(f"Loaded {len(features_df)} patients with {n_feature_cols} features")

    with open(train_test_json, 'r') as handle:
        split_spec = json.load(handle)

    # Keep only patients for whom the chosen endpoint is known.
    labelled_df = features_df.dropna(subset=[TARGET_ENDPOINT])
    print(f"After filtering for {TARGET_ENDPOINT}: {len(labelled_df)} patients")

    train_ids = set(split_spec['TRAIN_SET'])
    test_ids = set(split_spec['TEST_SET'])

    in_train = labelled_df['Subject'].isin(train_ids)
    in_test = labelled_df['Subject'].isin(test_ids)
    return labelled_df[in_train].copy(), labelled_df[in_test].copy()

def stratified_split_by_center(train_df, test_size=0.2):
    """Hold out part of ``train_df`` as validation data, balancing center and label.

    Each row gets a stratum label ``"<Center>_<label>"``, where the label is the
    integer ``TARGET_ENDPOINT`` value (collapsed to ``value > 0`` for ``STAGE_T`` and
    ``STAGE_N``). If the stratified split raises ``ValueError``, a plain random split
    with the same seed is used instead.

    Args:
        train_df: Frame with a ``Center`` column and the endpoint column.
        test_size: Share of rows assigned to the validation frame.

    Returns:
        ``(train_part, val_part)`` selected positionally from ``train_df``.
    """
    collapse_stage = TARGET_ENDPOINT in ['STAGE_T', 'STAGE_N']

    def _stratum(row):
        site = row['Center']
        outcome = int(row[TARGET_ENDPOINT])
        if collapse_stage:
            outcome = int(outcome > 0)
        return f"{site}_{outcome}"

    strat_labels = [_stratum(row) for _, row in train_df.iterrows()]
    positions = range(len(train_df))

    try:
        train_idx, val_idx = train_test_split(
            positions, test_size=test_size,
            stratify=strat_labels, random_state=42
        )
    except ValueError:
        # Raised by the stratified attempt; fall back to an unstratified split.
        train_idx, val_idx = train_test_split(
            positions, test_size=test_size, random_state=42
        )
        print("Using random split (stratification failed)")
    else:
        print("Using stratified split by center and label")

    return train_df.iloc[train_idx], train_df.iloc[val_idx]

def prepare_features(df):
    """Split a frame into a feature matrix and an integer target vector.

    Args:
        df: Frame containing ``feature_*`` columns and the ``TARGET_ENDPOINT`` column.

    Returns:
        ``(X, y)`` where ``X`` holds the values of all ``feature_*`` columns (in
        column order) and ``y`` is the endpoint cast to int. For ``STAGE_T`` and
        ``STAGE_N`` the target is first reduced to ``value > 0``.
    """
    selected = [name for name in df.columns if name.startswith('feature_')]
    feature_matrix = df[selected].values
    targets = df[TARGET_ENDPOINT].values

    # Staging endpoints are modelled as "any stage above 0" versus stage 0.
    if TARGET_ENDPOINT in ['STAGE_T', 'STAGE_N']:
        targets = (targets > 0).astype(int)

    return feature_matrix, targets.astype(int)

# Load and split data
# load_data returns the train and test frames defined by the JSON split, restricted
# to rows whose TARGET_ENDPOINT value is present.
train_df, test_df = load_data(FEATURES_CSV, TRAIN_TEST_JSON)
# Carve the validation frame out of the training frame. Note that train_df is
# rebound here to the remaining (smaller) training portion.
train_df, val_df = stratified_split_by_center(train_df, VAL_SPLIT)

# Report how many rows ended up in each split.
print(f"\nDataset sizes:")
print(f"  Train: {len(train_df)}")
print(f"  Val: {len(val_df)}")
print(f"  Test: {len(test_df)}")

## Feature Preparation and EDA

In [None]:
# Prepare features
# Turn each split frame into a feature matrix X_* and an integer label vector y_*.
X_train, y_train = prepare_features(train_df)
X_val, y_val = prepare_features(val_df)
X_test, y_test = prepare_features(test_df)

print(f"Feature dimensions: {X_train.shape[1]}")
print(f"Class distributions:")
for split_name, split_labels in [('Train', y_train), ('Val', y_val), ('Test', y_test)]:
    # minlength=2 guarantees a slot for class 1 even when a split contains only
    # class 0 (or is empty); a plain bincount would make the [1] lookup raise IndexError.
    class_counts = np.bincount(split_labels, minlength=2)
    print(f"  {split_name}: {class_counts} (class 0: {class_counts[0]}, class 1: {class_counts[1]})")


In [None]:
# Visualize class distribution
# One bar chart per split showing how many samples fall into each class.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
datasets = [('Train', y_train), ('Val', y_val), ('Test', y_test)]

for i, (name, y) in enumerate(datasets):
    # Pad to at least two bins so a split without class-1 samples still gets both bars,
    # and derive the bar labels from the counts so they can never mismatch in length
    # (bar() raises when the label and height sequences differ in size).
    counts = np.bincount(y, minlength=2)
    axes[i].bar([f'Class {c}' for c in range(len(counts))], counts, alpha=0.7)
    axes[i].set_title(f'{name} Set - {TARGET_ENDPOINT}')
    axes[i].set_ylabel('Count')
    # Write the exact count just above each bar.
    for j, count in enumerate(counts):
        axes[i].text(j, count + 0.5, str(count), ha='center')

plt.tight_layout()
plt.show()

## Model Definition and Grid Search

In [None]:
def remove_correlated_features(X_train, X_val, X_test, threshold=0.95):
    """Drop one feature from every highly Spearman-correlated feature pair.

    Correlations are estimated on ``X_train`` only, and the resulting column selection
    is applied identically to all three matrices. For each pair whose absolute
    Spearman correlation exceeds ``threshold``, the member with the lower training
    variance is dropped (the second member on ties). A pair is skipped when one of
    its members has already been dropped, so a feature is never removed merely
    because it correlated with a column that is no longer present.

    Args:
        X_train: 2-D array (samples x features) used to compute the correlations.
        X_val: 2-D array with the same columns as ``X_train``.
        X_test: 2-D array with the same columns as ``X_train``.
        threshold: Absolute correlation above which a pair counts as redundant.

    Returns:
        ``(X_train, X_val, X_test)`` restricted to the kept columns, in original order.
    """
    from scipy.stats import spearmanr

    n_features = X_train.shape[1]
    features_to_remove = set()

    # With fewer than two columns there is no pair to compare.
    if n_features >= 2:
        corr_matrix = np.abs(np.asarray(spearmanr(X_train).correlation, dtype=float))
        if corr_matrix.ndim == 0:
            # For exactly two columns spearmanr returns a bare coefficient rather
            # than a matrix; rebuild the 2x2 matrix so the pair search below works.
            rho = float(corr_matrix)
            corr_matrix = np.array([[1.0, rho], [rho, 1.0]])

        # Only look above the diagonal so each pair is visited once.
        upper_tri = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
        high_corr_pairs = np.where((corr_matrix > threshold) & upper_tri)

        for i, j in zip(high_corr_pairs[0], high_corr_pairs[1]):
            i, j = int(i), int(j)
            if i in features_to_remove or j in features_to_remove:
                # This redundancy was already resolved by an earlier removal.
                continue
            # Remove the feature with lower variance
            if np.var(X_train[:, i]) < np.var(X_train[:, j]):
                features_to_remove.add(i)
            else:
                features_to_remove.add(j)

    keep_features = [i for i in range(n_features) if i not in features_to_remove]

    print(f"Removed {len(features_to_remove)} highly correlated features (threshold={threshold})")
    print(f"Kept {len(keep_features)} features")

    return X_train[:, keep_features], X_val[:, keep_features], X_test[:, keep_features]

def create_pipelines(n_features=None):
    """Build the candidate classification pipelines and their hyper-parameter grids.

    Every pipeline has the same three steps: ``StandardScaler`` ->
    ``SelectKBest(f_classif)`` -> classifier. The step names ``scaler``, ``selector``
    and ``clf`` are what the ``selector__k`` / ``clf__*`` grid keys refer to.

    Args:
        n_features: Optional number of input feature columns. When given, every
            ``SelectKBest`` ``k`` (pipeline defaults and grid values) is capped at
            this value and duplicate grid entries are dropped. This avoids requesting
            more features than exist, which scikit-learn either rejects or only warns
            about depending on its version. With the default ``None`` the k values
            are used exactly as listed.

    Returns:
        ``(pipelines, param_grids)``: two dicts keyed by the same model names, holding
        the unfitted ``Pipeline`` objects and their grid-search parameter grids.
    """
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier

    def _cap(k):
        # Limit k to the available number of features when that number is known.
        return k if n_features is None else min(k, n_features)

    def _build(k, clf):
        # Assemble the shared scaler -> selector -> classifier pipeline.
        return Pipeline([
            ('scaler', StandardScaler()),
            ('selector', SelectKBest(f_classif, k=_cap(k))),
            ('clf', clf)
        ])

    def _k_grid():
        # Fresh list per grid so the returned grids never share list objects.
        candidates = [20, 40, 60, 100, 200, 300]
        if n_features is None:
            return candidates
        return sorted({_cap(k) for k in candidates})

    def _c_grid():
        # Regularisation strengths shared by the linear models and the SVMs.
        return [0.001, 0.01, 0.1, 1.0]

    def _forest_grid():
        # Grid shared by RandomForest and ExtraTrees.
        return {
            'selector__k': _k_grid(),
            'clf__n_estimators': [50, 100],
            'clf__max_depth': [3, 5, 10],
            'clf__min_samples_split': [10, 20],
            'clf__min_samples_leaf': [5, 10]
        }

    pipelines = {
        'LogisticRegression_L1': _build(
            50, LogisticRegression(random_state=42, max_iter=2000, penalty='l1', solver='liblinear')),
        'LogisticRegression_L2': _build(
            50, LogisticRegression(random_state=42, max_iter=2000, penalty='l2')),
        # Named 'ElasticNet' but implemented as elastic-net-penalised logistic regression.
        'ElasticNet': _build(
            50, LogisticRegression(random_state=42, max_iter=2000, penalty='elasticnet', solver='saga', l1_ratio=0.5)),
        'RandomForest': _build(100, RandomForestClassifier(random_state=42, n_jobs=-1)),
        'ExtraTrees': _build(100, ExtraTreesClassifier(random_state=42, n_jobs=-1)),
        'GradientBoosting': _build(80, GradientBoostingClassifier(random_state=42)),
        # probability=True is required because evaluation relies on predict_proba.
        'SVM_RBF': _build(60, SVC(random_state=42, probability=True, kernel='rbf')),
        'SVM_Linear': _build(60, SVC(random_state=42, probability=True, kernel='linear'))
    }

    param_grids = {
        'LogisticRegression_L1': {
            'selector__k': _k_grid(),
            'clf__C': _c_grid()
        },
        'LogisticRegression_L2': {
            'selector__k': _k_grid(),
            'clf__C': _c_grid()
        },
        'ElasticNet': {
            'selector__k': _k_grid(),
            'clf__C': _c_grid(),
            'clf__l1_ratio': [0.1, 0.5, 0.9]
        },
        'RandomForest': _forest_grid(),
        'ExtraTrees': _forest_grid(),
        'GradientBoosting': {
            'selector__k': _k_grid(),
            'clf__n_estimators': [50, 100],
            'clf__max_depth': [3, 5],
            'clf__learning_rate': [0.01, 0.1, 0.2],
            'clf__min_samples_split': [10, 20]
        },
        'SVM_RBF': {
            'selector__k': _k_grid(),
            'clf__C': _c_grid(),
            'clf__gamma': ['scale', 'auto']
        },
        'SVM_Linear': {
            'selector__k': _k_grid(),
            'clf__C': _c_grid()
        }
    }

    return pipelines, param_grids

def evaluate_model(model, X, y):
    """Score a fitted classifier on one split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix of the split.
        y: True labels of the split.

    Returns:
        Dict with ``auc``, ``accuracy``, the hard predictions ``y_pred`` and the
        second-column probabilities ``y_prob``. ``auc`` is reported as 0.5 when ``y``
        contains a single class.
    """
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    # AUC needs both classes in y; use the chance level otherwise.
    if len(np.unique(y)) > 1:
        auc_value = roc_auc_score(y, positive_scores)
    else:
        auc_value = 0.5
    accuracy_value = accuracy_score(y, predicted_labels)

    return {
        'auc': auc_value,
        'accuracy': accuracy_value,
        'y_pred': predicted_labels,
        'y_prob': positive_scores,
    }

# Instantiate the candidate pipelines with their grids and list the model names.
pipelines, param_grids = create_pipelines()
print("Created pipelines for:", list(pipelines))

## Training and Evaluation

In [None]:
# Per-model outputs, keyed by pipeline name:
#   results     - best hyper-parameters plus AUC / accuracy / predictions / probabilities per split
#   best_models - the best estimator found by the grid search
results = {}
best_models = {}
# 3-fold stratified CV (shuffled, fixed seed) used inside every grid search.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Grid search for each algorithm
for name, pipeline in pipelines.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    
    # Grid search with cross-validation
    # Candidates are ranked by cross-validated ROC AUC on the training split only.
    grid_search = GridSearchCV(
        pipeline, param_grids[name], cv=cv, 
        scoring='roc_auc', n_jobs=-1, verbose=0
    )
    
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_models[name] = best_model
    
    # Evaluate on all sets
    # Test metrics are recorded for every algorithm; the final model is chosen
    # later from the validation AUC.
    train_metrics = evaluate_model(best_model, X_train, y_train)
    val_metrics = evaluate_model(best_model, X_val, y_val)
    test_metrics = evaluate_model(best_model, X_test, y_test)
    
    # Flatten the per-split metrics into one record; later cells read these keys.
    results[name] = {
        'best_params': grid_search.best_params_,
        'train_auc': train_metrics['auc'],
        'val_auc': val_metrics['auc'], 
        'test_auc': test_metrics['auc'],
        'train_acc': train_metrics['accuracy'],
        'val_acc': val_metrics['accuracy'],
        'test_acc': test_metrics['accuracy'],
        'train_pred': train_metrics['y_pred'],
        'val_pred': val_metrics['y_pred'],
        'test_pred': test_metrics['y_pred'],
        'train_prob': train_metrics['y_prob'],
        'val_prob': val_metrics['y_prob'],
        'test_prob': test_metrics['y_prob']
    }
    
    print(f"Best params: {grid_search.best_params_}")
    print(f"AUCs - Train: {train_metrics['auc']:.3f}, Val: {val_metrics['auc']:.3f}, Test: {test_metrics['auc']:.3f}")
    print(f"Accs - Train: {train_metrics['accuracy']:.3f}, Val: {val_metrics['accuracy']:.3f}, Test: {test_metrics['accuracy']:.3f}")


## Results Summary

In [None]:
# Create results summary
# One row per model; each metric column is pulled from the matching key in `results`.
summary_df = pd.DataFrame({
    'Model': list(results),
    **{
        column: [results[model][key] for model in results]
        for column, key in [
            ('Train_AUC', 'train_auc'),
            ('Val_AUC', 'val_auc'),
            ('Test_AUC', 'test_auc'),
            ('Train_Acc', 'train_acc'),
            ('Val_Acc', 'val_acc'),
            ('Test_Acc', 'test_acc'),
        ]
    },
})

print(f"Results Summary for {TARGET_ENDPOINT}:")
print("="*60)
print(summary_df.round(3).to_string(index=False))

# The winner is the model with the highest validation AUC (test scores are not consulted).
best_model_name = summary_df.at[summary_df['Val_AUC'].idxmax(), 'Model']
print(f"\nBest model by validation AUC: {best_model_name}")

## Visualization

In [None]:
# Plot performance comparison
# Two panels (AUC, accuracy), each with grouped Train / Val / Test bars per model.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

x_pos = np.arange(len(summary_df))
width = 0.25

for panel, (column_suffix, metric_label) in zip(axes, [('AUC', 'AUC'), ('Acc', 'Accuracy')]):
    # Shift the three bars of a group left / centre / right around each model position.
    for bar_offset, split_label in zip((-width, 0, width), ('Train', 'Val', 'Test')):
        panel.bar(x_pos + bar_offset, summary_df[f'{split_label}_{column_suffix}'],
                  width, label=split_label, alpha=0.8)
    panel.set_xlabel('Model')
    panel.set_ylabel(metric_label)
    panel.set_title(f'{metric_label} Comparison - {TARGET_ENDPOINT}')
    panel.set_xticks(x_pos)
    panel.set_xticklabels(summary_df['Model'], rotation=45)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## OS_6 (dim: 256)

In [None]:
# ROC curves for best model
# One panel per split (train / val / test) for the model chosen by validation AUC,
# drawn for whatever TARGET_ENDPOINT is currently configured.
best_result = results[best_model_name]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
datasets = [('Train', y_train, best_result['train_prob']), 
            ('Val', y_val, best_result['val_prob']),
            ('Test', y_test, best_result['test_prob'])]

for i, (name, y_true, y_prob) in enumerate(datasets):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Mirror evaluate_model: AUC is undefined when a split holds a single class, so
    # report the chance level there instead of letting roc_auc_score fail. This
    # keeps the legend consistent with the summary table.
    roc_auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    
    axes[i].plot(fpr, tpr, linewidth=3, label=f'ROC (AUC = {roc_auc:.3f})')
    # Diagonal = performance of a random classifier.
    axes[i].plot([0, 1], [0, 1], 'k--', alpha=0.5)
    axes[i].set_xlim([0.0, 1.0])
    axes[i].set_ylim([0.0, 1.05])
    axes[i].set_xlabel('False Positive Rate')
    axes[i].set_ylabel('True Positive Rate')
    axes[i].set_title(f'{name} Set - {best_model_name}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.suptitle(f'ROC Curves for {TARGET_ENDPOINT}', fontsize=16)
plt.tight_layout()
plt.show()

## OS_24 (dim: 256)

In [None]:
# ROC curves for best model
# One panel per split (train / val / test) for the model chosen by validation AUC,
# drawn for whatever TARGET_ENDPOINT is currently configured.
best_result = results[best_model_name]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
datasets = [('Train', y_train, best_result['train_prob']), 
            ('Val', y_val, best_result['val_prob']),
            ('Test', y_test, best_result['test_prob'])]

for i, (name, y_true, y_prob) in enumerate(datasets):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Mirror evaluate_model: AUC is undefined when a split holds a single class, so
    # report the chance level there instead of letting roc_auc_score fail. This
    # keeps the legend consistent with the summary table.
    roc_auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    
    axes[i].plot(fpr, tpr, linewidth=3, label=f'ROC (AUC = {roc_auc:.3f})')
    # Diagonal = performance of a random classifier.
    axes[i].plot([0, 1], [0, 1], 'k--', alpha=0.5)
    axes[i].set_xlim([0.0, 1.0])
    axes[i].set_ylim([0.0, 1.05])
    axes[i].set_xlabel('False Positive Rate')
    axes[i].set_ylabel('True Positive Rate')
    axes[i].set_title(f'{name} Set - {best_model_name}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.suptitle(f'ROC Curves for {TARGET_ENDPOINT}', fontsize=16)
plt.tight_layout()
plt.show()

## OS_6 (dim: 512)

In [None]:
# ROC curves for best model
# One panel per split (train / val / test) for the model chosen by validation AUC,
# drawn for whatever TARGET_ENDPOINT is currently configured.
best_result = results[best_model_name]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
datasets = [('Train', y_train, best_result['train_prob']), 
            ('Val', y_val, best_result['val_prob']),
            ('Test', y_test, best_result['test_prob'])]

for i, (name, y_true, y_prob) in enumerate(datasets):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Mirror evaluate_model: AUC is undefined when a split holds a single class, so
    # report the chance level there instead of letting roc_auc_score fail. This
    # keeps the legend consistent with the summary table.
    roc_auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    
    axes[i].plot(fpr, tpr, linewidth=3, label=f'ROC (AUC = {roc_auc:.3f})')
    # Diagonal = performance of a random classifier.
    axes[i].plot([0, 1], [0, 1], 'k--', alpha=0.5)
    axes[i].set_xlim([0.0, 1.0])
    axes[i].set_ylim([0.0, 1.05])
    axes[i].set_xlabel('False Positive Rate')
    axes[i].set_ylabel('True Positive Rate')
    axes[i].set_title(f'{name} Set - {best_model_name}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.suptitle(f'ROC Curves for {TARGET_ENDPOINT}', fontsize=16)
plt.tight_layout()
plt.show()

## OS_24 (dim: 512)

In [None]:
# ROC curves for best model
# One panel per split (train / val / test) for the model chosen by validation AUC,
# drawn for whatever TARGET_ENDPOINT is currently configured.
best_result = results[best_model_name]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
datasets = [('Train', y_train, best_result['train_prob']), 
            ('Val', y_val, best_result['val_prob']),
            ('Test', y_test, best_result['test_prob'])]

for i, (name, y_true, y_prob) in enumerate(datasets):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    # Mirror evaluate_model: AUC is undefined when a split holds a single class, so
    # report the chance level there instead of letting roc_auc_score fail. This
    # keeps the legend consistent with the summary table.
    roc_auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.5
    
    axes[i].plot(fpr, tpr, linewidth=3, label=f'ROC (AUC = {roc_auc:.3f})')
    # Diagonal = performance of a random classifier.
    axes[i].plot([0, 1], [0, 1], 'k--', alpha=0.5)
    axes[i].set_xlim([0.0, 1.0])
    axes[i].set_ylim([0.0, 1.05])
    axes[i].set_xlabel('False Positive Rate')
    axes[i].set_ylabel('True Positive Rate')
    axes[i].set_title(f'{name} Set - {best_model_name}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.suptitle(f'ROC Curves for {TARGET_ENDPOINT}', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices for best model
# Look the record up here so this cell does not depend on a plotting cell having
# been run first to define best_result.
best_result = results[best_model_name]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
datasets = [('Train', y_train, best_result['train_pred']), 
            ('Val', y_val, best_result['val_pred']),
            ('Test', y_test, best_result['test_pred'])]

for i, (name, y_true, y_pred) in enumerate(datasets):
    # Fix the class order to [0, 1] so the matrix is always 2x2 and lines up with the
    # Negative / Positive tick labels, even when a split or the predictions contain
    # only one class.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
               xticklabels=['Negative', 'Positive'],
               yticklabels=['Negative', 'Positive'])
    axes[i].set_title(f'{name} Set - {best_model_name}')
    axes[i].set_xlabel('Predicted')
    axes[i].set_ylabel('Actual')

plt.suptitle(f'Confusion Matrices for {TARGET_ENDPOINT}', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Save results to JSON
# Persist the run configuration, split sizes, winning model and the per-model
# metric table; predictions and probabilities are not written.
final_results = dict(
    target_endpoint=TARGET_ENDPOINT,
    dataset_sizes=dict(train=len(train_df), val=len(val_df), test=len(test_df)),
    feature_dims=X_train.shape[1],
    best_model=best_model_name,
    summary=summary_df.to_dict('records'),
)

# The file lands in the current working directory, one file per endpoint.
output_file = f'ml_results_{TARGET_ENDPOINT}.json'
with open(output_file, 'w') as f:
    json.dump(final_results, f, indent=2)

print(f"Results saved to: {output_file}")

# Headline numbers for the model selected by validation AUC.
print(f"\nFinal Summary for {TARGET_ENDPOINT}:")
print(f"Best model: {best_model_name}")
print(f"Test AUC: {results[best_model_name]['test_auc']:.3f}")
print(f"Test Accuracy: {results[best_model_name]['test_acc']:.3f}")