In [None]:
# Install rclone to enable syncing and mounting of OneDrive.
# The install script is downloaded from rclone.org and piped directly into a root shell.
!curl https://rclone.org/install.sh | sudo bash

In [None]:
# Launch rclone configuration interface.
# NOTE(review): the mount cell further down refers to a remote called `MyOneDrive`,
# so the remote created here presumably has to use that exact name — confirm.
!rclone config

In [None]:
# Update the package index and install FUSE3.
# `&&` makes the install run only if the index update succeeded.
# NOTE(review): presumably a prerequisite of `rclone mount` below — confirm.
!apt-get update && apt-get install -y fuse3

In [None]:
# Create a local directory where OneDrive will be mounted.
# `-p` creates missing parents and does not fail if the directory already exists.
!mkdir -p /content/MyOneDrive

In [None]:
# Mount the remote OneDrive directory to the local path using rclone.
# NOTE(review): assumes a remote named `MyOneDrive` was created in the `rclone config`
# step and that /content/MyOneDrive already exists — confirm both before running.
!rclone mount MyOneDrive: /content/MyOneDrive --vfs-cache-mode full --allow-other --daemon

In [None]:
import os
import pandas as pd
import numpy as np
import nibabel as nib

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.models import resnet18, ResNet18_Weights

from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Paths to the CSV files containing the impression text and outcome labels.
# Both point inside the /content/MyOneDrive mount point; adjust them if the
# remote folder layout differs.
impressions_csv = "/content/MyOneDrive/GAAP Research Resources/Final_Impressions.csv"  # Update this path
labels_csv = "/content/MyOneDrive/GAAP Research Resources/Final_Impressions_labels.csv"              # Update this path

# Directory where CT scan files (.nii.gz) are stored
download_dir = "/content/MyOneDrive/GAAP Research Resources/CT + Radiology Impressions Data/CTPA"                  # Folder with .nii.gz files
# NOTE(review): makedirs creates this directory (empty) when it does not exist,
# so a missing/unmounted data folder will not raise here — the scan filter that
# follows would simply find no files. Confirm that this is intended.
os.makedirs(download_dir, exist_ok=True)

In [None]:
# Read the impression text and the outcome labels into DataFrames
impressions_df = pd.read_csv(impressions_csv)
labels_df = pd.read_csv(labels_csv)

# Join both tables on the shared 'impression_id' column, then derive the
# expected .nii.gz location of every scan from its impression_id
merged_df = impressions_df.merge(labels_df, on="impression_id").assign(
    file_path=lambda frame: frame["impression_id"].apply(
        lambda scan_id: os.path.join(download_dir, f"{scan_id}.nii.gz")
    )
)

# Keep only the rows whose scan file is actually present on disk
merged_df = merged_df[
    lambda frame: frame["file_path"].apply(os.path.exists)
].reset_index(drop=True)
print(f"Final usable sample count: {len(merged_df)}")

In [None]:
# Split into 80/20 train/validation sets (fixed seed for reproducibility).
# train_test_split raises ValueError when it cannot put at least one row in
# each split, which would abort the notebook before the empty-dataset check in
# the training cell is reached. With fewer than two usable rows, keep whatever
# exists for training and leave validation empty instead.
if len(merged_df) < 2:
    train_df, val_df = merged_df.copy(), merged_df.iloc[0:0].copy()
else:
    train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

In [None]:
# Apply windowing (WL: window level, WW: window width) to enhance specific structures in CT slices
def window_ct(ct, WL, WW):
    """Clip intensities to the window [WL - WW/2, WL + WW/2] and rescale to [0, 1].

    Args:
        ct: array of intensities (any shape accepted by ``np.clip``).
        WL: window level, i.e. the centre of the window.
        WW: window width.

    Returns:
        Array of the same shape; values at or below the lower bound map to 0,
        values at or above the upper bound map to 1.
    """
    lower_bound = WL - WW / 2
    upper_bound = WL + WW / 2
    clipped = np.clip(ct, lower_bound, upper_bound)
    return (clipped - lower_bound) / WW

# Load a CT scan and process each 2D slice
def process_ct_scan(path):
    """Turn one .nii.gz CT volume into a stack of 3-channel windowed slices.

    The volume is iterated along its last axis. Every slice is resized to
    256x256, centre-cropped to 224x224 and rendered through three intensity
    windows (lung, PE, mediastinal) that become the three channels.

    Args:
        path: location of the NIfTI file to load.

    Returns:
        float array of shape (N, 224, 224, 3) with values in [0, 1], or
        ``None`` if anything goes wrong while loading or processing (the
        error is printed and the file is skipped on a best-effort basis).
    """
    try:
        # Load 3D volume data from the .nii.gz file
        ct = nib.load(path).get_fdata()
        slices = []
        for i in range(ct.shape[-1]):
            # Get one 2D slice. get_fdata() yields float64 by default, but
            # to_pil_image has no PIL mode for 2D float64 arrays and raises
            # TypeError; float32 maps to PIL mode "F" and keeps the raw values.
            slice_ = ct[:, :, i].astype(np.float32)
            # Resize to 256x256
            slice_resized = T.functional.resize(T.functional.to_pil_image(slice_), [256, 256])
            # Crop the center 224x224 region
            slice_cropped = T.functional.center_crop(slice_resized, [224, 224])
            # Convert to NumPy array and float32 format
            slice_np = np.array(slice_cropped).astype(np.float32)
            # Apply three different window views
            lung     = window_ct(slice_np, WL=-600, WW=1500)
            pe       = window_ct(slice_np, WL=100, WW=700)
            mediast  = window_ct(slice_np, WL=40, WW=400)
            # Stack the three windows
            stacked = np.stack([lung, pe, mediast], axis=-1)  # (224, 224, 3)
            slices.append(stacked)
        # Stack all processed slices into one 4D tensor
        return np.stack(slices)  # (N, 224, 224, 3)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip this scan
        print(f"Skipping file {path} due to error: {e}")
        return None

In [None]:
# Custom PyTorch Dataset for loading and transforming CT scans
class CTScanDataset(Dataset):
    """Dataset yielding one windowed CT volume and its four outcome labels per row.

    Each item is a ``(slices, label)`` pair. ``slices`` is a float32 tensor of
    shape (N, 3, 224, 224) built by ``process_ct_scan`` and normalized with
    the ImageNet mean/std. ``label`` is a float32 tensor holding the columns
    in ``LABEL_COLUMNS`` (same order). ``__getitem__`` returns ``None`` when
    the scan could not be processed, so the collate function can drop it.
    """

    # Outcome columns read from every row, in output order
    LABEL_COLUMNS = ("1_month_readmission", "6_month_readmission", "12_month_readmission", "pe_positive")

    def __init__(self, df):
        """Store the sample table; ``df`` needs ``file_path`` and the label columns."""
        self.df = df
        # Normalization values from ImageNet
        self.mean = [0.485, 0.456, 0.406]
        self.std  = [0.229, 0.224, 0.225]
        self.normalize = T.Normalize(mean=self.mean, std=self.std)

    def __len__(self):
        """Number of rows (scans) in the table."""
        return len(self.df)

    @staticmethod
    def _label_to_float(value):
        """Convert one raw label cell to 0.0 / 1.0.

        Strings are compared case-insensitively: 'TRUE' -> 1.0, anything else
        ('FALSE', 'Censored', unexpected text) -> 0.0. Non-string values are
        passed through ``float``; a missing cell (NaN) is mapped to 0.0, the
        same policy used for unknown strings, because a NaN target would turn
        the BCE loss (and then the weights) into NaN.
        """
        if isinstance(value, str):
            return 1.0 if value.upper() == 'TRUE' else 0.0
        numeric = float(value)
        return 0.0 if np.isnan(numeric) else numeric

    def __getitem__(self, idx):
        """Return ``(slices_tensor, label_tensor)`` for row ``idx`` or ``None`` on failure."""
        row = self.df.iloc[idx]
        ct_path = row["file_path"]
        # Process the CT scan into a stack of slices
        slices = process_ct_scan(ct_path)  # (N, 224, 224, 3)

        if slices is None:  # Skip if processing failed
            return None

        # Reorder axes to (num_slices, channels, height, width)
        slices = slices.transpose(0, 3, 1, 2)  # (N, 3, 224, 224)

        # Convert to tensor and normalize; Normalize broadcasts over the leading
        # slice dimension, so one call replaces a per-slice Python loop
        slices_tensor = torch.tensor(slices, dtype=torch.float32)
        slices_tensor = self.normalize(slices_tensor)

        # Convert raw (string / bool / numeric) labels to numerical values
        label_values = [self._label_to_float(row[col]) for col in self.LABEL_COLUMNS]

        return slices_tensor, torch.tensor(label_values, dtype=torch.float32)

In [None]:
class LRCN(nn.Module):
    """Recurrent-convolutional classifier over a sequence of CT slices.

    A pretrained ResNet18 (without its final fully connected layer) is used as
    a frozen per-slice feature extractor; the resulting 512-d feature sequence
    is fed to an LSTM, and the last time step is mapped to four sigmoid
    outputs (multi-label binary prediction).

    Args:
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, hidden_size=128, num_layers=1):
        super().__init__()
        # Use a pretrained ResNet18 as a feature extractor
        base_cnn = resnet18(weights=ResNet18_Weights.DEFAULT)
        self.cnn = nn.Sequential(*list(base_cnn.children())[:-1])  # Output shape: (batch, 512, 1, 1)
        # The extractor is frozen: no gradients, and inference-mode BatchNorm
        for param in self.cnn.parameters():
            param.requires_grad_(False)
        self.cnn.eval()
        # LSTM to process sequence of feature vectors
        self.rnn = nn.LSTM(input_size=512, hidden_size=hidden_size, batch_first=True, num_layers=num_layers)
        # Final classification layer to predict 4 outcomes
        self.fc = nn.Linear(hidden_size, 4)

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen CNN in eval mode.

        ``torch.no_grad()`` only disables gradients; BatchNorm layers in
        training mode would still normalize with batch statistics and update
        their running statistics, so the "frozen" extractor would drift and
        behave differently between training and validation.
        """
        super().train(mode)
        self.cnn.eval()
        return self

    def forward(self, x):
        """Map ``x`` of shape (batch, slices, channels, height, width) to (batch, 4) probabilities."""
        batch_size, num_slices, C, H, W = x.shape
        # Merge batch and slice axes to process all slices at once
        x = x.reshape(batch_size * num_slices, C, H, W)
        # Extract features using the frozen CNN
        with torch.no_grad():
            # flatten(1) keeps the leading axis even when there is a single slice
            x = self.cnn(x).flatten(1)  # Shape: (batch*slices, 512)
        # Reshape back to sequence format
        x = x.view(batch_size, num_slices, -1)  # Shape: (batch, slices, 512)
        # Pass through LSTM
        x, _ = self.rnn(x)
        # Use last time step's output for prediction
        x = x[:, -1, :]
        # Final layer for multi-label binary classification
        return torch.sigmoid(self.fc(x))

In [None]:
# Select device: prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Skips failed data samples
def custom_collate_fn(batch):
    """Collate a batch after dropping samples that are ``None``.

    Args:
        batch: list of dataset items; failed items are ``None``.

    Returns:
        The result of PyTorch's default collate on the remaining items, or
        ``(None, None)`` when no usable item is left.
    """
    usable = [sample for sample in batch if sample is not None]
    if len(usable) == 0:
        # Nothing survived: hand back a (data, label) pair of Nones
        return None, None
    return torch.utils.data.dataloader.default_collate(usable)

# Ensures dataset is not empty before training
if merged_df.empty:
    print("Error: No matching files found after filtering. Cannot proceed with training.")
else:
    # Create dataset and dataloader (batch_size=1: one whole scan per batch)
    train_dataset = CTScanDataset(train_df)
    val_dataset = CTScanDataset(val_df)

    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

    # Initialize model, loss function, and optimizer
    model = LRCN().to(device)
    criterion = nn.BCELoss()  # the model already applies a sigmoid
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Train the model for 5 epochs
    model.train()
    for epoch in range(5):
        total_loss = 0
        train_batches = 0  # batches that actually contributed to total_loss
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training"):
            # The collate function yields (None, None) when every sample failed to load
            if x is None or y is None:
                continue
            # Move data to GPU/CPU
            x, y = x.to(device), y.to(device)
            # Forward + backward + optimization
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            train_batches += 1
        # Average over processed batches only; dividing by len(train_loader)
        # would count skipped batches and under-report the loss
        avg_loss = total_loss / train_batches if train_batches > 0 else 0
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}")

        # Validation loop
        model.eval()
        total_val_loss = 0
        val_batches = 0  # batches that actually contributed to total_val_loss
        with torch.no_grad():
            for x_val, y_val in tqdm(val_loader, desc=f"Epoch {epoch + 1} - Validation"):
                if x_val is None or y_val is None:
                    continue
                x_val, y_val = x_val.to(device), y_val.to(device)
                out_val = model(x_val)
                val_loss = criterion(out_val, y_val)
                total_val_loss += val_loss.item()
                val_batches += 1
        # Average over processed batches only (see training loss above)
        avg_val_loss = total_val_loss / val_batches if val_batches > 0 else 0
        print(f"Epoch {epoch + 1} Val Loss: {avg_val_loss:.4f}")

        # Set back to train mode for next epoch
        model.train()
