In [2]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision.transforms as transforms
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder

import os
from PIL import Image

from torch.autograd import Variable


  warn(f"Failed to load image Python extension: {e}")


In [14]:
## Create dataset
class REFLACXWithClinicalDataset(data.Dataset):
    def __init__(self,
        image_size = 224,
        clinical_cols =  ['age', 'gender', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'pain', 'acuity'],
        clinical_numerical_cols = ['age', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity'],
        clinical_categorical_cols = ['gender'],
        labels_cols = [
            'Airway wall thickening', 'Atelectasis', 'Consolidation',
            'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
            'Groundglass opacity', 'Pneumothorax', 'Pulmonary edema',
            'Quality issue', 'Support devices', 'Wide mediastinum',
            'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
            'Hiatal hernia', 'High lung volume / emphysema',
            'Interstitial lung disease', 'Lung nodule or mass',
            'Pleural abnormality'
        ],
        all_disease_cols = [
            'Airway wall thickening', 'Atelectasis', 'Consolidation',
            'Enlarged cardiac silhouette', 'Fibrosis', 'Fracture',
            'Groundglass opacity', 'Pneumothorax', 'Pulmonary edema','Wide mediastinum',
            'Abnormal mediastinal contour', 'Acute fracture', 'Enlarged hilum',
            'Hiatal hernia', 'High lung volume / emphysema',
            'Interstitial lung disease', 'Lung nodule or mass',
            'Pleural abnormality'
        ],
        mode='train',
        horizontal_flip = True,
     ):
        super(REFLACXWithClinicalDataset, self).__init__()

        self.image_size = image_size
        self.df = pd.read_csv('reflacx_with_clinical.csv', index_col=0)
        self.clinical_cols = clinical_cols
        self.clinical_numerical_cols = clinical_numerical_cols
        self.clinical_categorical_cols = clinical_categorical_cols
        self.labels_cols = labels_cols
        self.all_disease_cols = all_disease_cols 
        self.encoder_map = {}
        self.mode = mode

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_transforms_lst = [
            transforms.Resize((self.image_size, self.image_size)),
            transforms.RandomHorizontalFlip() if horizontal_flip else None,
            transforms.ToTensor(),
            normalize,
        ]
        self.train_transform = transforms.Compose([t for t in train_transforms_lst if t])

        self.test_transform = transforms.Compose([
            transforms.Resize((self.image_size, self.image_size)),
            transforms.ToTensor(),
            normalize,
        ])

        self.preprocess_clinical_df()
        self.preprocess_label()

    def preprocess_clinical_df(self, ):

        self.clinical_df = self.df[self.clinical_cols]
        self.encoders_map = {}

        # encode the categorical cols.
        for col in self.clinical_categorical_cols:
            le = LabelEncoder()
            self.clinical_df[col] = le.fit_transform(self.clinical_df[col])
            self.encoders_map[col] = le

        self.clinical_df = self.clinical_df[self.clinical_numerical_cols + self.clinical_categorical_cols]
        

    def preprocess_label(self,):
        self.df[self.all_disease_cols] = self.df[self.all_disease_cols].gt(0)


    def load_image_array(self, image_path):
        return np.asarray(Image.open(image_path))


    def plot_image_from_array(self, image_array):
        im = Image.fromarray(image_array)
        im.show()


    def __getitem__(self, index, mode="train"):
        # find the df

        instance = self.df.iloc[index] 

        img = Image.open(instance['image_path']).convert("RGB")

        label_long_tensor = torch.tensor(np.array(self.df[self.labels_cols].iloc[[index]])).long()

        instance_clinical_df = self.clinical_df.iloc[[index]] 

        return  self.train_transform(img) if mode=="train" else self.test_transform(img) , instance_clinical_df, label_long_tensor # we will feed the categorical column to the model, so we keep it in dataframe form.

    def __len__(self):
        return len(self.df)





In [15]:
reflacx_dataset = REFLACXWithClinicalDataset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.clinical_df[col] = le.fit_transform(self.clinical_df[col])


In [16]:
reflacx_dataset.__getitem__(0)

(tensor([[[ 0.3138,  0.3138,  0.1939,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.3823,  0.2967,  0.1426,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.3823,  0.2624,  0.1254,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [ 1.8379,  1.8379,  1.8208,  ...,  2.2489,  2.2489,  2.2489],
          [ 1.6495,  1.6324,  1.6153,  ...,  2.2489,  2.2489,  2.2489],
          [ 1.3584,  1.3584,  1.3584,  ...,  2.2489,  2.2489,  2.2489]],
 
         [[ 0.4503,  0.4503,  0.3277,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.5203,  0.4328,  0.2752,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.5203,  0.3978,  0.2577,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [ 2.0084,  2.0084,  1.9909,  ...,  2.4286,  2.4286,  2.4286],
          [ 1.8158,  1.7983,  1.7808,  ...,  2.4286,  2.4286,  2.4286],
          [ 1.5182,  1.5182,  1.5182,  ...,  2.4286,  2.4286,  2.4286]],
 
         [[ 0.6705,  0.6705,  0.5485,  ..., -1.8044, -1.8044, -1.8044],
          [ 0.7402,  0.6531,

In [1]:
from data.dataset import REFLACXWithClinicalDataset

  warn(f"Failed to load image Python extension: {e}")


In [2]:
reflacx_dataset = REFLACXWithClinicalDataset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.clinical_df[col] = le.fit_transform(self.clinical_df[col])


In [3]:
reflacx_dataset.__getitem__(0)

(tensor([[[ 0.3138,  0.3138,  0.1939,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.3823,  0.2967,  0.1426,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.3823,  0.2624,  0.1254,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [ 1.8379,  1.8379,  1.8208,  ...,  2.2489,  2.2489,  2.2489],
          [ 1.6495,  1.6324,  1.6153,  ...,  2.2489,  2.2489,  2.2489],
          [ 1.3584,  1.3584,  1.3584,  ...,  2.2489,  2.2489,  2.2489]],
 
         [[ 0.4503,  0.4503,  0.3277,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.5203,  0.4328,  0.2752,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.5203,  0.3978,  0.2577,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [ 2.0084,  2.0084,  1.9909,  ...,  2.4286,  2.4286,  2.4286],
          [ 1.8158,  1.7983,  1.7808,  ...,  2.4286,  2.4286,  2.4286],
          [ 1.5182,  1.5182,  1.5182,  ...,  2.4286,  2.4286,  2.4286]],
 
         [[ 0.6705,  0.6705,  0.5485,  ..., -1.8044, -1.8044, -1.8044],
          [ 0.7402,  0.6531,