In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv
/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv
/kaggle/input/equity-post-HCT-survival-predictions/train.csv
/kaggle/input/equity-post-HCT-survival-predictions/test.csv


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from torch.utils.data import DataLoader, TensorDataset
import joblib
import os

# Define device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ensure the directory for saving preprocessors exists
preprocessor_dir = "./preprocessor"
os.makedirs(preprocessor_dir, exist_ok=True)

# =============================
# STEP 1: LOAD AND PREPROCESS TRAINING DATA
# =============================

# Load dataset
train_file_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
df = pd.read_csv(train_file_path)

# List of selected columns + target column
selected_columns = [
    "prim_disease_hct", "hla_match_b_low", "prod_type",
    "year_hct", "obesity", "donor_age", "prior_tumor", "gvhd_proph",
    "sex_match", "comorbidity_score", "karnofsky_score", "donor_related",
    "age_at_hct", "efs"  # Target column
]

# Keep only the selected columns
df = df[selected_columns]

# Identify numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target from numerical list
target = "efs"
if target in num_cols:
    num_cols.remove(target)

# Handle missing values
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Encoding categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cats = encoder.fit_transform(df[cat_cols])
cat_feature_names = encoder.get_feature_names_out(cat_cols)

# Convert encoded categories to DataFrame
df_encoded = pd.DataFrame(encoded_cats, columns=cat_feature_names)

# Drop original categorical columns and merge encoded ones
df = df.drop(columns=cat_cols)
df = pd.concat([df, df_encoded], axis=1)

# Standardize numerical features
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Save preprocessors for inference
joblib.dump(num_imputer, os.path.join(preprocessor_dir, "num_imputer.pkl"))
joblib.dump(cat_imputer, os.path.join(preprocessor_dir, "cat_imputer.pkl"))
joblib.dump(encoder, os.path.join(preprocessor_dir, "encoder.pkl"))
joblib.dump(scaler, os.path.join(preprocessor_dir, "scaler.pkl"))

# Split dataset into train and validation sets
X = df.drop(columns=[target])
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# =============================
# STEP 2: DEFINE THE NEURAL NETWORK
# =============================

class EFSModel(nn.Module):
    def __init__(self, input_size):
        super(EFSModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Initialize model
input_size = X_train.shape[1]
model = EFSModel(input_size).to(device)  # Move model to the appropriate device

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# =============================
# STEP 3: TRAIN THE MODEL
# =============================

num_epochs = 100
best_val_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move data to the appropriate device

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move data to the appropriate device
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "/kaggle/working/efs_model.pth")
        print("Model saved!")

Epoch 1/100 - Train Loss: 0.6885 - Val Loss: 0.6829
Model saved!
Epoch 2/100 - Train Loss: 0.6815 - Val Loss: 0.6751
Model saved!
Epoch 3/100 - Train Loss: 0.6747 - Val Loss: 0.6679
Model saved!
Epoch 4/100 - Train Loss: 0.6684 - Val Loss: 0.6609
Model saved!
Epoch 5/100 - Train Loss: 0.6630 - Val Loss: 0.6544
Model saved!
Epoch 6/100 - Train Loss: 0.6575 - Val Loss: 0.6486
Model saved!
Epoch 7/100 - Train Loss: 0.6535 - Val Loss: 0.6436
Model saved!
Epoch 8/100 - Train Loss: 0.6492 - Val Loss: 0.6397
Model saved!
Epoch 9/100 - Train Loss: 0.6464 - Val Loss: 0.6364
Model saved!
Epoch 10/100 - Train Loss: 0.6446 - Val Loss: 0.6339
Model saved!
Epoch 11/100 - Train Loss: 0.6433 - Val Loss: 0.6320
Model saved!
Epoch 12/100 - Train Loss: 0.6415 - Val Loss: 0.6303
Model saved!
Epoch 13/100 - Train Loss: 0.6399 - Val Loss: 0.6289
Model saved!
Epoch 14/100 - Train Loss: 0.6392 - Val Loss: 0.6278
Model saved!
Epoch 15/100 - Train Loss: 0.6383 - Val Loss: 0.6268
Model saved!
Epoch 16/100 - Trai