In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
import json
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score


In [2]:
X_train = pd.read_csv('multi_task_data/X_train_all.csv')
X_test = pd.read_csv('multi_task_data/X_test_all.csv')
y_train = pd.read_csv('multi_task_data/y_train_all.csv')
y_test = pd.read_csv('multi_task_data/y_test_all.csv')
y_train['task'] = X_train['task']
y_test['task'] = X_test['task']

In [3]:
with open('task_embeddings.json', 'r') as file:
    task_embeddings = json.load(file)
task_embedding_dim = 128


In [4]:
# unique_tasks
unique_tasks_1 = ['guo_los', 'guo_readmission', 'guo_icu']
unique_tasks_2 = ['new_hypertension', 'new_hyperlipidemia', 'new_pancan', 'new_celiac', 'new_lupus', 'new_acutemi']
unique_tasks_3 = ['lab_thrombocytopenia', 'lab_hyperkalemia', 'lab_hyponatremia', 'lab_anemia', 'lab_hypoglycemia']





In [5]:
def expand_embeddings(df, task_embeddings):
    embeddings = df['task'].map(task_embeddings)
    new_columns = [f'task_emb_{i}' for i in range(task_embedding_dim)]
    df = pd.concat([df.drop('task', axis=1), pd.DataFrame(embeddings.tolist(), columns=new_columns, index=df.index)], axis=1)
    return df

In [6]:
X_train = X_train[X_train['task'].isin(unique_tasks_1)]
X_train = X_train.drop(columns=['Unnamed: 0'])
X_test = X_test[X_test['task'].isin(unique_tasks_1)]
X_test = X_test.drop(columns=['Unnamed: 0'])

y_train = y_train[y_train['task'].isin(unique_tasks_1)]
y_train = y_train.drop(columns=['Unnamed: 0'])
y_test = y_test[y_test['task'].isin(unique_tasks_1)]
y_test = y_test.drop(columns=['Unnamed: 0'])

# X_train = X_train[X_train['task'].isin(unique_tasks_2)]
# X_train = X_train.drop(columns=['Unnamed: 0'])
# X_test = X_test[X_test['task'].isin(unique_tasks_2)]
# X_test = X_test.drop(columns=['Unnamed: 0'])

# y_train = y_train[y_train['task'].isin(unique_tasks_2)]
# y_train = y_train.drop(columns=['Unnamed: 0'])
# y_test = y_test[y_test['task'].isin(unique_tasks_2)]
# y_test = y_test.drop(columns=['Unnamed: 0'])

# X_train = X_train[X_train['task'].isin(unique_tasks_3)]
# X_train = X_train.drop(columns=['Unnamed: 0'])
# X_test = X_test[X_test['task'].isin(unique_tasks_3)]
# X_test = X_test.drop(columns=['Unnamed: 0'])

# y_train = y_train[y_train['task'].isin(unique_tasks_3)]
# y_train = y_train.drop(columns=['Unnamed: 0'])
# y_test = y_test[y_test['task'].isin(unique_tasks_3)]
# y_test = y_test.drop(columns=['Unnamed: 0'])



In [7]:
X_train_1 = expand_embeddings(X_train, task_embeddings)
X_test_1 = expand_embeddings(X_test, task_embeddings)

In [8]:
y_train_1 = y_train[y_train['task'].isin(unique_tasks_1)]['0'].values
y_test_1 = y_test[y_test['task'].isin(unique_tasks_1)]['0'].values

# y_train_1 = y_train[y_train['task'].isin(unique_tasks_2)]['0'].values
# y_test_1 = y_test[y_test['task'].isin(unique_tasks_2)]['0'].values

# y_train_1 = y_train[y_train['task'].isin(unique_tasks_3)]['0'].values
# y_test_1 = y_test[y_test['task'].isin(unique_tasks_3)]['0'].values

In [9]:
X_train_1 = X_train_1.to_numpy()
X_test_1 = X_test_1.to_numpy()

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
class TwoLayerNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TwoLayerNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [12]:
# def train_model(num_epochs, model, train_loader, val_loader, criterion, optimizer):
#     val_accuracy_current = 0
#     val_auc = 0
#     for epoch in range(num_epochs):
#         print(f'Epoch [{epoch+1}/{num_epochs}]')
#         model.train()
#         for inputs, labels in train_loader:
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         # Evaluate on validation set
#         if epoch % 2 == 0:
#             model.eval()
#             with torch.no_grad():
#                 preds = []
#                 gts = []
#                 correct = 0
#                 total = 0
                
#                 for inputs, labels in val_loader:
#                     inputs, labels = inputs.to(device), labels.to(device)
#                     outputs = model(inputs)
#                     _, predicted = torch.max(outputs.data, 1)
#                     total += labels.size(0)
#                     correct += (predicted == labels).sum().item()
#                     preds.extend(list(predicted.cpu().numpy()))
#                     gts.extend(list(labels.cpu().numpy()))
#                 val_accuracy = 100 * correct / total
#                 auc_score = roc_auc_score(gts, preds)
                
                
#                 if auc_score > val_auc:
#                     val_auc = auc_score
#                     torch.save(model.state_dict(), 'best_model_task1.pth')
#                 print(f'Epoch [{epoch+1}/{num_epochs}], Validation Auc: {auc_score:.2f}%')
#     return model

def train_model(num_epochs, model, train_loader, val_loader, criterion, optimizer, task_name):
    val_accuracy_current = 0
    val_auc = 0
    for epoch in range(num_epochs):
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # print(outputs.device.type, labels.device.type)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate on validation set
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                preds = []
                gts = []
                correct = 0
                total = 0
                
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                    preds.extend(list(predicted.cpu().numpy()))
                    gts.extend(list(labels.cpu().numpy()))
                val_accuracy = 100 * correct / total
                auc_score = roc_auc_score(gts, preds)
                
                
                if auc_score > val_auc:
                    val_auc = auc_score
                    model_name = f'multi_task_models/best_model_task_{task_name}.pth'
                    torch.save(model.state_dict(), model_name)
                print(f'Epoch [{epoch+1}/{num_epochs}], Validation Auc: {auc_score:.2f}%')
    return model, model_name

In [13]:
# def evaluate_model(model, test_loader):
#     model.eval()
#     preds = []
#     preds_prob = []
#     gts = []
#     with torch.no_grad():
#         correct = 0
#         total = 0
#         for inputs, labels in test_loader:
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             # _, predicted = torch.max(outputs.data, 1)

#             probabilities = torch.softmax(outputs.data, dim=1)

#             # Get the predicted class for each data point (highest probability)
#             predicted = torch.argmax(probabilities, dim=1)

#             # Gather the probabilities of the predicted classes
#             predicted_probabilities = probabilities[torch.arange(probabilities.shape[0]), predicted]

            

#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()
#             preds.extend(list(predicted.cpu().numpy()))
#             gts.extend(list(labels.cpu().numpy()))
#             preds_prob.extend(list(predicted_probabilities.cpu().numpy()))

#         test_accuracy = 100 * correct / total
#         print(f'Test Accuracy: {test_accuracy:.2f}%')
#         auc_score = roc_auc_score(gts, preds)
#         ave_preds_prob = np.mean(preds_prob)

#     return test_accuracy, auc_score * 100, ave_preds_prob


def evaluate_model(model, test_loader):
    model.eval()
    preds = []
    preds_prob = []
    probs = []
    gts = []
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # _, predicted = torch.max(outputs.data, 1)

            probabilities = torch.softmax(outputs.data, dim=1)

            # Get the predicted class for each data point (highest probability)
            predicted = torch.argmax(probabilities, dim=1)

            # Gather the probabilities of the predicted classes
            predicted_probabilities = probabilities[torch.arange(probabilities.shape[0]), predicted]

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            preds.extend(list(predicted.cpu().numpy()))
            gts.extend(list(labels.cpu().numpy()))
            preds_prob.extend(list(predicted_probabilities.cpu().numpy()))
            probs.extend(list(probabilities.cpu().numpy()))

        test_accuracy = 100 * correct / total
        print(f'Test Accuracy: {test_accuracy:.2f}%')
        auc_score = roc_auc_score(gts, preds)
        ave_preds_prob = np.mean(preds_prob)

    return test_accuracy, auc_score * 100, ave_preds_prob, gts, preds, preds_prob, probs

In [14]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_1), y=y_train_1)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Move weights to the same device as model
if torch.cuda.is_available():
    class_weights = class_weights.to(device=device)

# Convert numpy arrays to PyTorch tensors
X_train_1 = torch.tensor(X_train_1).float()
X_test_1 = torch.tensor(X_test_1).float()
y_train_1 = torch.tensor(y_train_1).long()
y_test_1 = torch.tensor(y_test_1).long()

# Create TensorDatasets
train_dataset = TensorDataset(X_train_1, y_train_1)
test_dataset = TensorDataset(X_test_1, y_test_1)

# Create DataLoaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [15]:
y_train[y_train['task'].isin(unique_tasks_2)]['0'].values

array([], dtype=int64)

In [16]:
input_size = X_train_1.shape[1]
hidden_size = 100  # You can tune this
num_classes = len(torch.unique(y_train_1))
model = TwoLayerNet(input_size, hidden_size, num_classes)
model.to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)  # Suitable for classification with imbalanced dataset
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Learning rate can be tuned

model = train_model(50, model, train_loader, test_loader, criterion, optimizer)

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/50]
Epoch [1/50], Validation Auc: 0.76%
Epoch [2/50]
Epoch [3/50]
Epoch [3/50], Validation Auc: 0.76%
Epoch [4/50]
Epoch [5/50]
Epoch [5/50], Validation Auc: 0.76%
Epoch [6/50]
Epoch [7/50]
Epoch [7/50], Validation Auc: 0.75%
Epoch [8/50]
Epoch [9/50]
Epoch [9/50], Validation Auc: 0.75%
Epoch [10/50]
Epoch [11/50]
Epoch [11/50], Validation Auc: 0.73%
Epoch [12/50]
Epoch [13/50]
Epoch [13/50], Validation Auc: 0.74%
Epoch [14/50]
Epoch [15/50]
Epoch [15/50], Validation Auc: 0.70%
Epoch [16/50]
Epoch [17/50]
Epoch [17/50], Validation Auc: 0.70%
Epoch [18/50]
Epoch [19/50]
Epoch [19/50], Validation Auc: 0.70%
Epoch [20/50]
Epoch [21/50]
Epoch [21/50], Validation Auc: 0.70%
Epoch [22/50]
Epoch [23/50]
Epoch [23/50], Validation Auc: 0.67%
Epoch [24/50]
Epoch [25/50]
Epoch [25/50], Validation Auc: 0.69%
Epoch [26/50]
Epoch [27/50]
Epoch [27/50], Validation Auc: 0.69%
Epoch [28/50]
Epoch [29/50]
Epoch [29/50], Validation Auc: 0.67%
Epoch [30/50]
Epoch [31/50]
Epoch [31/50], Validation

In [17]:
# model = TwoLayerNet(input_size, hidden_size, num_classes)
# model.load_state_dict(torch.load('best_model_task2.pth'))
# model.to(device)

In [18]:
y_test = y_test.reset_index(drop=True, inplace=False)
for task in unique_tasks_1:
# for task in unique_tasks_2:
# for task in unique_tasks_3:
    idx = y_test[y_test['task'].isin([task])].index
    X_test_temp = X_test_1[idx]
    y_test_temp = y_test_1[idx]
    test_temp_dataset = TensorDataset(X_test_temp, y_test_temp)
    test_temp_loader = DataLoader(test_temp_dataset, batch_size=batch_size, shuffle=False)
    acc_temp, auc_temp, outputs_temp = evaluate_model(model, test_temp_loader)
    print(task, acc_temp, auc_temp, outputs_temp)

Test Accuracy: 77.95%
guo_los 77.9498861047836 69.03100108497182 0.9444388
Test Accuracy: 85.29%
guo_readmission 85.29008679762448 57.37787614148422 0.9664198
Test Accuracy: 95.48%
guo_icu 95.48355424644085 60.51048698167792 0.99147075


In [19]:
outputs_temp

0.99147075

In [20]:
torch.(outputs_temp.data, )

SyntaxError: invalid syntax (1282978832.py, line 1)

In [None]:
y_test['task'].isin([unique_tasks_2])

0       False
1       False
2       False
3       False
4       False
        ...  
6416    False
6417    False
6418    False
6419    False
6420    False
Name: task, Length: 6421, dtype: bool

In [None]:
y_test

Unnamed: 0,0,task
0,0,guo_los
1,0,guo_los
2,0,guo_los
3,0,guo_los
4,0,guo_los
...,...,...
6416,0,guo_icu
6417,0,guo_icu
6418,0,guo_icu
6419,0,guo_icu
