In [1]:
import sys
import cv2
import torch
import torch.nn.functional as F
import pandas as pd
from ast import literal_eval
sys.path.append("/Users/claire/Desktop/Stuff-/codes/dissertation/cxr/rgrg/src/full_model")
sys.path.append("/Users/claire/Desktop/Stuff-/codes/dissertation/cxr/rgrg/src")
# import sys
sys.path.append("/Users/claire/Desktop/Stuff-/codes/dissertation/cxr/rgrg/")

from full_model.train_full_model import *
from full_model.generate_reports_for_images import *


In [2]:
def get_datasets():
    PERCENTAGE_OF_TRAIN_SET_TO_USE = 0.1
    PERCENTAGE_OF_VAL_SET_TO_USE = 0.1
    usecols = [
        "mimic_image_file_path",
        "bbox_coordinates",
        "bbox_labels",
        "bbox_phrases",
        "bbox_phrase_exists",
        "bbox_is_abnormal",
        'bbox_abnormalities',
    ]

    # all of the columns below are stored as strings in the csv_file
    # however, as they are actually lists, we apply the literal_eval func to convert them to lists
    converters = {
        "bbox_coordinates": literal_eval,
        "bbox_labels": literal_eval,
        "bbox_phrases": literal_eval,
        "bbox_phrase_exists": literal_eval,
        "bbox_is_abnormal": literal_eval,
        "bbox_abnormalities": literal_eval,
    }

    datasets_as_dfs = {}
    datasets_as_dfs["train"] = pd.read_csv(os.path.join(path_full_dataset, "train_ab.csv"), usecols=usecols, converters=converters)
    datasets_as_dfs["test"] = pd.read_csv(os.path.join(path_full_dataset, "test_ab.csv"), usecols=usecols, converters=converters)

    total_num_samples_train = len(datasets_as_dfs["train"])
    total_num_samples_val = len(datasets_as_dfs["test"])

    # compute new number of samples for both train and val
    new_num_samples_train = int(PERCENTAGE_OF_TRAIN_SET_TO_USE * total_num_samples_train)
    new_num_samples_val = int(PERCENTAGE_OF_VAL_SET_TO_USE * total_num_samples_val)


    from datasets import Dataset
    # limit the datasets to those new numbers
    datasets_as_dfs["train"] = datasets_as_dfs["train"][:new_num_samples_train]
    datasets_as_dfs["test"] = datasets_as_dfs["test"][:new_num_samples_val]

    raw_train_dataset = Dataset.from_pandas(datasets_as_dfs["train"])
    raw_test_dataset = Dataset.from_pandas(datasets_as_dfs["test"])

    return raw_train_dataset, raw_test_dataset


In [3]:
import cv2
import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, dataset_name: str, tokenized_dataset, transforms, log):
        super().__init__()
        self.dataset_name = dataset_name
        self.tokenized_dataset = tokenized_dataset
        self.transforms = transforms
        self.log = log

    def __len__(self):
        return len(self.tokenized_dataset)

    def __getitem__(self, index):
        # get the image_path for potential logging in except block
        image_path = self.tokenized_dataset[index]["mimic_image_file_path"]

        # if something in __get__item fails, then return None
        # collate_fn in dataloader filters out None values
        try:
            bbox_coordinates = self.tokenized_dataset[index]["bbox_coordinates"]  # List[List[int]]
            bbox_labels = self.tokenized_dataset[index]["bbox_labels"]  # List[int]
            input_ids = self.tokenized_dataset[index]["input_ids"]  # List[List[int]]
            attention_mask = self.tokenized_dataset[index]["attention_mask"]  # List[List[int]]
            bbox_phrase_exists = self.tokenized_dataset[index]["bbox_phrase_exists"]  # List[bool]
            bbox_is_abnormal = self.tokenized_dataset[index]["bbox_is_abnormal"]  # List[bool]
            bbox_abnormalities = self.tokenized_dataset[index]["bbox_abnormalities"]  # List[List[int]]
            

            if self.dataset_name != "train":
                # we only need the reference phrases during evaluation when computing scores for metrics
                bbox_phrases = self.tokenized_dataset[index]["bbox_phrases"]  # List[str]

                # same for the reference_report
                reference_report = self.tokenized_dataset[index]["reference_report"]  # str

            # cv2.imread by default loads an image with 3 channels
            # since we have grayscale images, we only have 1 channel and thus use cv2.IMREAD_UNCHANGED to read in the 1 channel
            image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)[:,:,0]
            image = cv2.resize(image, (512, 512))


            # apply transformations to image, bbox_coordinates and bbox_labels
            transformed = self.transforms(image=image, bboxes=bbox_coordinates, class_labels=bbox_labels)

            transformed_image = transformed["image"]

            transformed_bbox_coordinates = transformed["bboxes"]
            transformed_bbox_labels = transformed["class_labels"]

            transformed_bbox_coordinates = [[x * 2 for x in bbox] for bbox in transformed_bbox_coordinates]
            sample = {
                "image": transformed_image,
                "bbox_coordinates": torch.tensor(transformed_bbox_coordinates, dtype=torch.float),
                "bbox_labels": torch.tensor(transformed_bbox_labels, dtype=torch.int64),
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "bbox_phrase_exists": torch.tensor(bbox_phrase_exists, dtype=torch.bool),
                "bbox_is_abnormal": torch.tensor(bbox_is_abnormal, dtype=torch.bool),
                "bbox_abnormalities": torch.tensor(bbox_abnormalities, dtype=torch.int64),
            }

            if self.dataset_name != "train":
                sample["bbox_phrases"] = bbox_phrases
                sample["reference_report"] = reference_report

        except Exception as e:
            self.log.error(f"__getitem__ failed for: {image_path}")
            self.log.error(f"Reason: {e}")
            return None

        return sample


In [4]:
def get_tokenized_datasets(tokenizer, raw_train_dataset):
    def tokenize_function(example):
        phrases = example["bbox_phrases"]  # List[str]
        bos_token = "<|endoftext|>"  # note: in the GPT2 tokenizer, bos_token = eos_token = "<|endoftext|>"
        eos_token = "<|endoftext|>"

        phrases_with_special_tokens = [bos_token + phrase + eos_token for phrase in phrases]

        # the tokenizer will return input_ids of type List[List[int]] and attention_mask of type List[List[int]]
        return tokenizer(phrases_with_special_tokens, truncation=True, max_length=1024)

    tokenized_train_dataset = raw_train_dataset.map(tokenize_function)

    # tokenized datasets will consist of the columns
    #   - mimic_image_file_path (str)
    #   - bbox_coordinates (List[List[int]])
    #   - bbox_labels (List[int])
    #   - bbox_phrases (List[str])
    #   - input_ids (List[List[int]])
    #   - attention_mask (List[List[int]])
    #   - bbox_phrase_exists (List[bool])
    #   - bbox_is_abnormal (List[bool])
    #
    #   val dataset will have additional column:
    #   - reference_report (str)

    return tokenized_train_dataset


In [5]:
import cv2
import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, dataset_name: str, tokenized_dataset, transforms, log):
        
        super().__init__()
        self.dataset_name = dataset_name
        self.tokenized_dataset = tokenized_dataset
        self.transforms = transforms
        self.log = log
    

    def __len__(self):
        return len(self.tokenized_dataset)

    def __getitem__(self, index):
   
        # get the image_path for potential logging in except block
        image_path = self.tokenized_dataset[index]["mimic_image_file_path"]

        # if something in __get__item fails, then return None
        # collate_fn in dataloader filters out None values
        bbox_coordinates = self.tokenized_dataset[index]["bbox_coordinates"]  # List[List[int]]
        bbox_labels = self.tokenized_dataset[index]["bbox_labels"]  # List[int]
        input_ids = self.tokenized_dataset[index]["input_ids"]  # List[List[int]]
        attention_mask = self.tokenized_dataset[index]["attention_mask"]  # List[List[int]]
        bbox_phrase_exists = self.tokenized_dataset[index]["bbox_phrase_exists"]  # List[bool]
        bbox_is_abnormal = self.tokenized_dataset[index]["bbox_is_abnormal"]  # List[bool]
        bbox_abnormalities = self.tokenized_dataset[index]["bbox_abnormalities"]  # List[List[int]]


        # if self.dataset_name != "train":
        #     # we only need the reference phrases during evaluation when computing scores for metrics
        #     bbox_phrases = self.tokenized_dataset[index]["bbox_phrases"]  # List[str]

        #     # same for the reference_report
        #     reference_report = self.tokenized_dataset[index]["reference_report"]  # str

        # cv2.imread by default loads an image with 3 channels
        # since we have grayscale images, we only have 1 channel and thus use cv2.IMREAD_UNCHANGED to read in the 1 channel
        image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)[:,:,0]
        image = cv2.resize(image, (512, 512))


        # apply transformations to image, bbox_coordinates and bbox_labels
        transformed = self.transforms(image=image, bboxes=bbox_coordinates, class_labels=bbox_labels)

        transformed_image = transformed["image"]

        transformed_bbox_coordinates = transformed["bboxes"]
        transformed_bbox_labels = transformed["class_labels"]
     
        transformed_bbox_coordinates = [[x * 2 for x in bbox] for bbox in transformed_bbox_coordinates]
        sample = {
            "image": transformed_image,
            "bbox_coordinates": torch.tensor(transformed_bbox_coordinates, dtype=torch.float),
            "bbox_labels": torch.tensor(transformed_bbox_labels, dtype=torch.int64),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "bbox_phrase_exists": torch.tensor(bbox_phrase_exists, dtype=torch.bool),
            "bbox_is_abnormal": torch.tensor(bbox_is_abnormal, dtype=torch.bool),
            "bbox_abnormalities": bbox_abnormalities,
        }
    
        # if self.dataset_name != "train":
        #     sample["bbox_phrases"] = bbox_phrases
        #     sample["reference_report"] = reference_report



        return sample


In [6]:
raw_train_dataset, raw_test_dataset = get_datasets()

tokenizer = get_tokenizer()
# tokenize the raw datasets
tokenized_train_dataset= get_tokenized_datasets(tokenizer, raw_train_dataset)
train_transforms = get_transforms("train")
train_dataset_complete = CustomDataset("train", tokenized_train_dataset, train_transforms, log)




  0%|          | 0/11391 [00:00<?, ?ex/s]

In [16]:
train_dataset_complete.__getitem__(0)

{'image': tensor([[[-1.5466, -1.5466, -1.5466,  ..., -1.5466, -1.5466, -1.5466],
          [-1.5466, -1.5466, -1.5466,  ..., -1.5466, -1.5466, -1.5466],
          [-1.5466, -1.5466, -1.5466,  ..., -1.5466, -1.5466, -1.5466],
          ...,
          [ 0.9985,  0.9985,  0.9855,  ..., -1.0532, -1.0662, -1.0791],
          [ 1.0375,  1.0375,  1.0504,  ..., -1.1311, -1.1960, -1.2350],
          [ 1.0634,  1.0764,  1.0894,  ..., -1.1571, -1.2609, -1.3129]]]),
 'bbox_coordinates': tensor([[ 82.,  18., 242., 294.],
         [128.,  26., 242., 116.],
         [112., 116., 234., 180.],
         [ 82., 180., 224., 294.],
         [172., 102., 236., 192.],
         [136.,  18., 240.,  80.],
         [ 58., 244., 104., 290.],
         [ 82., 246., 256., 294.],
         [268.,  26., 428., 292.],
         [268.,  34., 386., 116.],
         [268., 116., 394., 180.],
         [268., 180., 428., 292.],
         [268., 102., 330., 192.],
         [272.,  26., 374.,  80.],
         [404., 260., 450., 306

In [7]:
tokenized_val_dataset = get_tokenized_datasets(tokenizer, raw_test_dataset)
val_transforms = get_transforms("val")
val_dataset_complete = CustomDataset("val", tokenized_val_dataset, val_transforms, log)

  0%|          | 0/3080 [00:00<?, ?ex/s]

torch.Size([1, 1, 512, 512])

1. feature data loader
feature, anatomical_ind, finding_labels = next(data_loader)
fetaure: 1 * 1024
anatomical_ind: 1
finding_labels: [0, 4, 19]

2. model
a. 29 models for each anatomical region
b. one model for all anatomical region, 40-class classification



In [8]:
checkpoint_path = "/Users/claire/Desktop/Stuff-/codes/dissertation/data/checkpoints/full_model_checkpoint_val_loss_19.793_overall_steps_155252.pt"
model = get_model(checkpoint_path)





In [9]:
train_dataset_complete = CustomDataset("train", tokenized_train_dataset, train_transforms, log)
val_dataset_complete = CustomDataset("val", tokenized_val_dataset, val_transforms, log)

In [15]:

def get_data_loaders(tokenizer, train_dataset_complete, val_dataset_complete):
    custom_collate_test = CustomCollator(
        tokenizer=tokenizer, is_val_or_test=True, pretrain_without_lm_model=False
    )

    test_loader = DataLoader(
        train_dataset_complete,
        batch_size=64,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
    )

    val_loader = DataLoader(
        val_dataset_complete,
        batch_size=64,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
    )

    return test_loader, val_loader



In [11]:

features = []
one_hot_abs= []
sample = train_dataset_complete[0]
image = sample['image']
image = image.unsqueeze(0)


In [12]:
_, _, feature,_=  model.object_detector(image)



In [13]:
features.append(feature)
bbox_abnormalities = sample['bbox_abnormalities']


In [15]:
def one_hot(lst):
    lst = [0 if x not in lst else 1 for x in range(41)]
    return lst
one_hot_ab = torch.Tensor([one_hot(x) for x in bbox_abnormalities])
one_hot_abs.append(one_hot_ab)

In [17]:
features

[tensor([[[ 0.0382,  0.0046,  0.0988,  ...,  0.0264,  0.2625, -0.1633],
          [-0.2999,  0.1143, -0.1615,  ...,  0.0610,  0.2086, -0.2958],
          [-0.1840, -0.1034,  0.1876,  ...,  0.1570,  0.2957, -0.3913],
          ...,
          [ 0.2341,  0.0404, -0.6592,  ...,  0.4296,  0.0581,  0.2007],
          [-0.2065,  0.0582, -0.2670,  ...,  0.0558, -0.0731, -0.2660],
          [ 0.1962,  0.3054,  0.1338,  ...,  0.0078,  0.0516,  0.0147]]],
        grad_fn=<ViewBackward0>)]

In [21]:

train_dataset_complete[0]['image'].shape

torch.Size([1, 512, 512])

In [25]:
# random tensor of shape 100, 512, 512
tmp = torch.rand(100,1, 512, 512)
model.object_detector(tmp)

: 

: 

In [None]:
one_hot_abs= []
for sample in dataset:
    image = sample['image']
    image = image.unsqueeze(0)
    _, _, feature,_=  model.object_detector(image)
    features.append(feature)
    bbox_abnormalities = sample['bbox_abnormalities']
    one_hot_ab = torch.Tensor([one_hot(x) for x in bbox_abnormalities])
    one_hot_abs.append(one_hot_ab)

In [18]:
def initial_features(dataset, model):
    def one_hot(lst):
        lst = [0 if x not in lst else 1 for x in range(41)]
        return lst
    features = []
    one_hot_abs= []
    for sample in dataset:
        print(1)
        image = sample['image']
        image = image.unsqueeze(0)
        _, _, feature,_=  model.object_detector(image)
        features.append(feature)
        bbox_abnormalities = sample['bbox_abnormalities']
        one_hot_ab = torch.Tensor([one_hot(x) for x in bbox_abnormalities])
        one_hot_abs.append(one_hot_ab)
    return features, one_hot_abs

# save the features and one_hot_abs

val_features, val_one_hot_abs = initial_features(val_dataset_complete, model)


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


: 

: 

In [None]:
train_features, train_one_hot_abs = initial_features(train_dataset_complete, model)
# save the features and one_hot_abs
torch.save(train_features, '/Users/claire/Desktop/Stuff-/codes/dissertation/data/train_features.pt')
torch.save(train_one_hot_abs, '/Users/claire/Desktop/Stuff-/codes/dissertation/data/train_one_hot_abs.pt')
torch.save(val_features, '/Users/claire/Desktop/Stuff-/codes/dissertation/data/val_features.pt')
torch.save(val_one_hot_abs, '/Users/claire/Desktop/Stuff-/codes/dissertation/data/val_one_hot_abs.pt')

# load the features and one_hot_abs
train_features = torch.load('/Users/claire/Desktop/Stuff-/codes/dissertation/data/train_features.pt')
train_one_hot_abs = torch.load('/Users/claire/Desktop/Stuff-/codes/dissertation/data/train_one_hot_abs.pt')
val_features = torch.load('/Users/claire/Desktop/Stuff-/codes/dissertation/data/val_features.pt')
val_one_hot_abs = torch.load('/Users/claire/Desktop/Stuff-/codes/dissertation/data/val_one_hot_abs.pt')

In [21]:
class FeatureDataset(Dataset):
    def __init__(self, customer_dataset, model, mixup=True):
        self.dataset = customer_dataset
        self.model = model
        self.mixup = mixup
    
    def __len__(self):
        return len(self.dataset)
    
    '''
    Returns a list of tuples of the form (top_region_features, region_id, finding_id)
    An index co
    '''
    def one_hot(self, lst):
        lst = [0 if x not in lst else 1 for x in range(41)]
        return lst

    def __getitem__(self, index):
        sample = self.dataset[index]
        image = sample['image']
        image = image.unsqueeze(0)
        _, _, top_region_features,_= self.model.object_detector(image)
        bbox_abnormalities = sample['bbox_abnormalities']
        one_hot_ab = [self.one_hot(x) for x in bbox_abnormalities]
        return top_region_features, one_hot_ab

In [20]:
class FeatureDataset(Dataset):
    def __init__(self, customer_dataset, model):
        self.dataset = customer_dataset
        self.model = model
        features, one_hot_abs = self.initial_features(customer_dataset, model)
        self.features = features
        self.one_hot_abs = one_hot_abs

    
    def initial_features(self, dataset, model):
        def one_hot(lst):
            lst = [0 if x not in lst else 1 for x in range(41)]
            return lst
        features = []
        one_hot_abs= []
        for sample in dataset:
            image = sample['image']
            image = image.unsqueeze(0)
            _, _, feature,_=  model.object_detector(image)
            features.append(feature)
            bbox_abnormalities = sample['bbox_abnormalities']
            one_hot_ab = torch.Tensor([one_hot(x) for x in bbox_abnormalities])
            one_hot_abs.append(one_hot_ab)
        return features, one_hot_abs

    def __len__(self):
        return len(self.dataset)
    
    '''
    Returns a list of tuples of the form (top_region_features, region_id, finding_id)
    An index co
    '''
    

    def __getitem__(self, index):
        feature = self.features[index]
        one_hot_ab = self.one_hot_abs[index]
        return feature, one_hot_ab

In [28]:
# import nn
import torch.nn as nn
class MultiClassClassifier(nn.Module):
    def __init__(self):
        super(MultiClassClassifier, self).__init__()
        self.fc1 = nn.Linear(29*1024, 512)
        self.fc2 = nn.Linear(512, 29*41)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x.view(-1, 29, 41)

In [22]:
from torch.utils.data import DataLoader

# Assume you have split your dataset into training and validation sets
train_dataset = FeatureDataset(train_dataset_complete, model)
val_dataset = FeatureDataset(val_dataset_complete, model)
# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)


In [24]:
val_dataloader.dataset[0][0]

tensor([[[-0.0717, -0.0461,  0.1126,  ..., -0.0140,  0.0137, -0.1753],
         [-0.2667, -0.0901, -0.0809,  ...,  0.0708, -0.0802, -0.2500],
         [-0.2707, -0.0151,  0.2605,  ..., -0.1532, -0.0634, -0.4341],
         ...,
         [ 0.3879,  0.3287, -0.5041,  ...,  0.5047,  0.0249,  0.2009],
         [-0.0717, -0.0461,  0.1126,  ..., -0.0140,  0.0137, -0.1753],
         [ 0.2453,  0.3405,  0.2579,  ..., -0.0862,  0.1149,  0.0326]]],
       grad_fn=<ViewBackward0>)

In [31]:
def train_loop(dataloader, model, loss_fn, optimizer):
    print('start training')
    size = len(dataloader.dataset)
    print(size)
    for batch, (X, y) in enumerate(dataloader):
        print(batch)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [33]:
classifier_model = MultiClassClassifier()
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Train and test the model_
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, classifier_model, loss_fn, optimizer)
    test_loop(val_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
start training
11391


: 

: 

In [52]:
x,y = FeatureDataset(train_dataset_complete, model)[0]



In [53]:
print(a.shape, b.shape)

torch.Size([1, 29, 1024]) torch.Size([29, 41])


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Prepare your data
X = [] # feature vectors
region_ids = [] # region ids
y = [] # disease labels
dataset = FeatureDataset(train_dataset_complete, model)
for i in range(len(dataset)):
    lst = dataset[i]
    for a, b, c in lst:
        X.append(a.detach().numpy()) # convert tensor to numpy array
        region_ids.append(b)
        y.append(c)

# One-hot encode region ids
encoder = OneHotEncoder(sparse=False)
region_ids = encoder.fit_transform(np.array(region_ids).reshape(-1, 1))

# Concatenate feature vectors and region ids
X = np.concatenate((X, region_ids), axis=1)

# Split your data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


KeyboardInterrupt: 

In [17]:

# One-hot encode region ids
encoder = OneHotEncoder(sparse=False)
region_ids = encoder.fit_transform(np.array(region_ids).reshape(-1, 1))

# Concatenate feature vectors and region ids
X = np.concatenate((X, region_ids), axis=1)

# Split your data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40])

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

class DiseaseClassifier(nn.Module):
    def __init__(self):
        super(DiseaseClassifier, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, len(np.unique(y_train))) # number of unique disease labels

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Convert your data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create TensorDatasets for training and validation sets
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

# Initialize your model
nn_model = DiseaseClassifier()

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)

In [None]:
criterion = nn.CrossEntropyLoss()

In [37]:
criterion(torch.tensor([[9, 0.2, 0.3]]), torch.tensor([0]))

tensor(0.0003)

In [39]:
# Train your model
for epoch in range(10): # number of epochs
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = nn_model(X_batch)
        print(output.shape, y_batch.shape)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        print('Epoch {}, loss {}'.format(epoch, loss.item()))

torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.6581358909606934
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.518080472946167
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.9580976963043213
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.9304499626159668
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.4994324445724487
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.8277125358581543
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.553109049797058
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.9471303224563599
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.6802353858947754
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.7639497518539429
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.7882554531097412
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.4971253871917725
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.5017262697219849
torch.Size([64, 41]) torch.Size([64])
Epoch 0, loss 1.661021113395691
torch.Siz

KeyboardInterrupt: 

In [34]:

correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in train_loader:
        output = nn_model(X_batch)
        _, predicted = torch.max(output.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

nn_accuracy = correct / total
print(f'Neural Network training accuracy: {nn_accuracy}')

Neural Network training accuracy: 0.41600199242622593


In [26]:
# Validate your model
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        output = nn_model(X_batch)
        _, predicted = torch.max(output.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

nn_accuracy = correct / total
print(f'Neural Network validation accuracy: {nn_accuracy}')

Neural Network validation accuracy: 0.41135150323124475


In [33]:
for X_batch, y_batch in val_loader:
    print(X_batch)
    print(X_batch.shape)
    print(y_batch)
    print(y_batch.shape)
    break

tensor([[-0.2438,  0.0368,  0.1006,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0031,  0.1986, -0.1564,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0663, -0.0284,  0.1491,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1661, -0.0798,  0.1029,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1276,  0.0789, -0.1726,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2311,  0.0249,  0.1263,  ...,  0.0000,  0.0000,  0.0000]])
torch.Size([64, 1049])
tensor([ 0,  0, 15,  0, 15, 15,  3,  0, 32, 15,  0,  8,  0,  0,  7,  7,  7,  0,
         0, 31,  0,  0,  1, 21,  0,  2, 12, 12,  1,  0, 11,  0, 18,  5,  7,  7,
         0,  8, 21, 15,  0,  3,  0,  0, 11,  0,  0,  7,  3,  1,  0,  0, 11,  8,
         9,  7, 15,  0,  0, 17, 25, 13, 14,  2])
torch.Size([64])
