In [1]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import os




In [3]:
class TFRecordVectorDataset(Dataset):
    """Dataset of stored embedding vectors paired with multi-label targets.

    Every row of ``dataframe`` carries a file path in its ``path`` column. The
    embedding is not read from that file directly: ``.tfrecord`` in the path is
    replaced with ``.npy`` and the resulting NumPy file is loaded. The targets
    are the trailing ``num_labels`` columns of the frame.

    Args:
        dataframe: Frame with a ``path`` column and the label columns last.
        embedding_dim: Length of the all-zero placeholder embedding returned
            when a file cannot be loaded. Defaults to 1376.
        num_labels: Number of trailing columns used as targets, and length of
            the all-zero placeholder label vector. Defaults to 14.

    Raises:
        ValueError: If ``num_labels`` is smaller than 1 (a ``-0:`` slice would
            silently select every column instead of none).

    Attributes:
        failed_indices: Set of indices whose embedding could not be loaded in
            this process. Those samples were served as all-zero placeholders
            (zero embedding AND zero labels), so inspect this set before
            trusting training or evaluation numbers.
    """

    def __init__(self, dataframe, embedding_dim=1376, num_labels=14):
        if num_labels < 1:
            raise ValueError("num_labels must be at least 1")
        self.data = dataframe
        self.embedding_dim = embedding_dim
        self.num_labels = num_labels
        self.paths = self.data['path'].tolist()
        # Labels live in the last `num_labels` columns of the frame.
        self.labels = self.data.iloc[:, -num_labels:].values.astype(np.float32)
        # Filled lazily by __getitem__; only reflects accesses made in this
        # process (DataLoader worker processes keep their own copy).
        self.failed_indices = set()

    def __len__(self):
        """Return the number of samples (one per row of the frame)."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(embedding, labels)`` for sample ``idx`` as float32 tensors.

        Loading is best-effort: on any error the failure is printed, ``idx`` is
        recorded in ``failed_indices``, and an all-zero pair of shapes
        ``(embedding_dim,)`` and ``(num_labels,)`` is returned instead.
        """
        path = self.paths[idx]
        try:
            # The embedding is stored next to the .tfrecord as a .npy file.
            npy_path = path.replace('.tfrecord', '.npy')
            embedding = np.load(npy_path)

            # Convert to Tensor
            embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
            label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
            return embedding_tensor, label_tensor
        except Exception as e:
            print(f"Error loading {path}: {str(e)}")
            self.failed_indices.add(idx)
            return torch.zeros(self.embedding_dim), torch.zeros(self.num_labels)

In [4]:
# Expose only device index 5 to CUDA (noted by the author as the L40S GPU)
# and echo the value that was set.
os.environ.update({'CUDA_VISIBLE_DEVICES': "5"})
print(os.environ.get('CUDA_VISIBLE_DEVICES'))

# The training and test splits are each read from their own CSV file.
train_df = pd.read_csv("preprocessed_training.csv")
test_df = pd.read_csv("preprocessed_test.csv")

# Wrap each frame in the embedding dataset.
train_dataset = TFRecordVectorDataset(train_df)
test_dataset = TFRecordVectorDataset(test_df)

# Batches of 32 loaded in the main process; only the training loader shuffles.
loader_options = dict(batch_size=32, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)



5


In [5]:
# Report the training frame's dimensions, then the per-column sum of every
# numeric column.
print(train_df.shape)
print("\nlabel distribution in training set:")
train_numeric_totals = train_df.select_dtypes(include=[np.number]).sum()
print(train_numeric_totals)

(207314, 33)

label distribution in training set:
gender                                112018.0
insurance_Medicaid                     17152.0
insurance_Medicare                     92265.0
insurance_Other                        97897.0
20-30                                   9882.0
30-40                                  12025.0
40-50                                  22666.0
50-60                                  40674.0
60-80                                  87345.0
80+                                    33812.0
race_AMERICAN INDIAN/ALASKA NATIVE       480.0
race_ASIAN                              6630.0
race_BLACK/AFRICAN AMERICAN            32887.0
race_HISPANIC/LATINO                   11035.0
race_OTHER                              9664.0
race_UNABLE TO OBTAIN                    918.0
race_UNKNOWN                            9844.0
race_WHITE                            135856.0
Enlarged Cardiomediastinum              6770.0
Cardiomegaly                           42720.0
Lung Opaci

In [6]:
# Report the test frame's dimensions, then the per-column sum of every
# numeric column.
print(test_df.shape)
print("\nlabel distribution in test set:")
test_numeric_totals = test_df.select_dtypes(include=[np.number]).sum()
print(test_numeric_totals)

(21591, 31)

label distribution in test set:
gender                                11913.0
insurance_Medicaid                     1812.0
insurance_Medicare                     9862.0
insurance_Other                        9917.0
20-30                                  1067.0
30-40                                  1238.0
40-50                                  2425.0
50-60                                  4103.0
60-80                                  9189.0
80+                                    3487.0
race_AMERICAN INDIAN/ALASKA NATIVE      184.0
race_ASIAN                              760.0
race_BLACK/AFRICAN AMERICAN            3685.0
race_HISPANIC/LATINO                   1392.0
race_OTHER                             1062.0
race_WHITE                            14508.0
Enlarged Cardiomediastinum              761.0
Cardiomegaly                           4441.0
Lung Opacity                           4904.0
Lung Lesion                             634.0
Edema                              

In [7]:
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected network with two ReLU-activated hidden layers.

    The last linear layer's output is returned as-is, with no activation
    applied to it.

    Args:
        input_size: Width of the input vector.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Width of the output vector.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # Linear layers, created in input -> output order.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        # One ReLU module shared by both hidden layers.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [8]:

# Network dimensions
input_size = 1376   # length of each input vector
hidden_size1 = 512  # width of the first hidden layer
hidden_size2 = 256  # width of the second hidden layer
output_size = 14    # one output per label

# Use CUDA when it is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

model = MLP(input_size, hidden_size1, hidden_size2, output_size)

# Wrap the model for data-parallel execution when several GPUs are visible.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)

# Loss takes raw logits (the model applies no final activation); Adam optimizer.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

cuda


In [9]:
from sklearn.metrics import roc_auc_score
import numpy as np
from tqdm import tqdm

# Number of training epochs, and the best macro AUROC seen so far (none yet).
num_epochs, best_auroc = 20, 0.0

In [10]:
# Train for `num_epochs` epochs. After every epoch the model is scored on
# `test_loader`, and its weights are checkpointed whenever the macro AUROC
# improves on `best_auroc`.
# NOTE(review): checkpoint selection uses the same split that is reported as
# "Validation", so the best AUROC is optimistically biased; a separate
# held-out validation split would avoid that.
# `all_probs` / `all_labels` hold the LAST epoch's evaluation results once the
# loop finishes (not the best epoch's).
for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} - Training'):
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    train_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

    # ---- evaluation ----
    model.eval()
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc=f'Epoch {epoch+1}/{num_epochs} - Validation'):
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            # The model emits logits; convert them to per-label probabilities.
            probs = torch.sigmoid(outputs)

            # save results
            all_probs.append(probs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    # Stack the per-batch arrays into (num_samples, num_labels) matrices.
    all_probs = np.concatenate(all_probs, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # calculate AUROC, one score per label column
    auroc_scores = []
    for i in range(all_labels.shape[1]):  # traverse all labels
        try:
            auroc = roc_auc_score(all_labels[:, i], all_probs[:, i])
            auroc_scores.append(auroc)
        except ValueError:
            # Record NaN for this label so it is skipped by the nan-aware mean.
            print(f"Label {i} has no positive or negative samples in the test set.")
            auroc_scores.append(np.nan)

    # calculate macro AUROC (NaN entries are ignored)
    macro_auroc = np.nanmean(auroc_scores)
    print(f'Epoch {epoch+1}, Validation Macro AUROC: {macro_auroc:.4f}')

    # save the best model
    if macro_auroc > best_auroc:
        best_auroc = macro_auroc
        # Unwrap nn.DataParallel before saving so the checkpoint keys carry no
        # 'module.' prefix and load directly into a plain MLP.
        model_to_save = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(model_to_save.state_dict(), 'best_mlp_model.pth')
        print(f'Best model saved with AUROC: {best_auroc:.4f}')

print('Training complete.')

Epoch 1/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:55<00:00, 116.29it/s]


Epoch 1, Train Loss: 0.2593


Epoch 1/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 176.45it/s]


Epoch 1, Validation Macro AUROC: 0.8153
Best model saved with AUROC: 0.8153


Epoch 2/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 125.38it/s]


Epoch 2, Train Loss: 0.2537


Epoch 2/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 189.10it/s]


Epoch 2, Validation Macro AUROC: 0.8140


Epoch 3/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 124.97it/s]


Epoch 3, Train Loss: 0.2520


Epoch 3/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 192.27it/s]


Epoch 3, Validation Macro AUROC: 0.8214
Best model saved with AUROC: 0.8214


Epoch 4/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 128.43it/s]


Epoch 4, Train Loss: 0.2510


Epoch 4/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 192.24it/s]


Epoch 4, Validation Macro AUROC: 0.8221
Best model saved with AUROC: 0.8221


Epoch 5/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 128.03it/s]


Epoch 5, Train Loss: 0.2502


Epoch 5/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 188.16it/s]


Epoch 5, Validation Macro AUROC: 0.8199


Epoch 6/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:49<00:00, 130.21it/s]


Epoch 6, Train Loss: 0.2496


Epoch 6/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 191.58it/s]


Epoch 6, Validation Macro AUROC: 0.8260
Best model saved with AUROC: 0.8260


Epoch 7/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:47<00:00, 136.38it/s]


Epoch 7, Train Loss: 0.2492


Epoch 7/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 190.10it/s]


Epoch 7, Validation Macro AUROC: 0.8245


Epoch 8/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:47<00:00, 135.51it/s]


Epoch 8, Train Loss: 0.2487


Epoch 8/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 193.81it/s]


Epoch 8, Validation Macro AUROC: 0.8232


Epoch 9/20 - Training: 100%|███████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 126.85it/s]


Epoch 9, Train Loss: 0.2483


Epoch 9/20 - Validation: 100%|███████████████████████████████████████████████████████| 675/675 [00:03<00:00, 186.07it/s]


Epoch 9, Validation Macro AUROC: 0.8262
Best model saved with AUROC: 0.8262


Epoch 10/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 128.82it/s]


Epoch 10, Train Loss: 0.2480


Epoch 10/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 190.75it/s]


Epoch 10, Validation Macro AUROC: 0.8262


Epoch 11/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:46<00:00, 138.47it/s]


Epoch 11, Train Loss: 0.2477


Epoch 11/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 194.58it/s]


Epoch 11, Validation Macro AUROC: 0.8247


Epoch 12/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:48<00:00, 133.31it/s]


Epoch 12, Train Loss: 0.2475


Epoch 12/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 188.22it/s]


Epoch 12, Validation Macro AUROC: 0.8249


Epoch 13/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 128.92it/s]


Epoch 13, Train Loss: 0.2472


Epoch 13/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 188.04it/s]


Epoch 13, Validation Macro AUROC: 0.8255


Epoch 14/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 125.92it/s]


Epoch 14, Train Loss: 0.2470


Epoch 14/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 190.92it/s]


Epoch 14, Validation Macro AUROC: 0.8218


Epoch 15/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 126.77it/s]


Epoch 15, Train Loss: 0.2468


Epoch 15/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 193.84it/s]


Epoch 15, Validation Macro AUROC: 0.8265
Best model saved with AUROC: 0.8265


Epoch 16/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:47<00:00, 135.09it/s]


Epoch 16, Train Loss: 0.2465


Epoch 16/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 186.94it/s]


Epoch 16, Validation Macro AUROC: 0.8252


Epoch 17/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 129.01it/s]


Epoch 17, Train Loss: 0.2463


Epoch 17/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 191.04it/s]


Epoch 17, Validation Macro AUROC: 0.8235


Epoch 18/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:50<00:00, 129.07it/s]


Epoch 18, Train Loss: 0.2461


Epoch 18/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 190.36it/s]


Epoch 18, Validation Macro AUROC: 0.8221


Epoch 19/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 126.62it/s]


Epoch 19, Train Loss: 0.2461


Epoch 19/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 188.55it/s]


Epoch 19, Validation Macro AUROC: 0.8245


Epoch 20/20 - Training: 100%|██████████████████████████████████████████████████████| 6479/6479 [00:51<00:00, 126.63it/s]


Epoch 20, Train Loss: 0.2459


Epoch 20/20 - Validation: 100%|██████████████████████████████████████████████████████| 675/675 [00:03<00:00, 194.91it/s]


Epoch 20, Validation Macro AUROC: 0.8235
Training complete.


In [11]:
# Write out the ground-truth labels collected during the evaluation pass of
# the last epoch, one column per finding.
column_name = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
    "No Finding",
]
df_labels = pd.DataFrame(data=all_labels, columns=column_name)
df_labels.to_csv("true_labels_sub.csv", index=False)


In [12]:
# Write out the predicted probabilities under the same column names as the
# true-label file.
df_predictions = pd.DataFrame(data=all_probs, columns=column_name)
df_predictions.to_csv("predicted_labels_sub.csv", index=False)