In [None]:
# Report which azureml-core release this kernel is running against.
import azureml.core

sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

In [None]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
#
# The Azure ML dataset class is imported under an alias on purpose: a later
# cell runs `from torch.utils.data import Dataset`, which rebinds the bare name
# `Dataset`. With the alias this cell can be re-run at any time without
# accidentally calling `get_by_name` on the PyTorch class, and re-running this
# cell no longer changes what `Dataset` means for the PyTorch dataset classes.
from azureml.core import Workspace
from azureml.core import Dataset as AzureDataset

# Coordinates of the Azure ML workspace that holds the registered datasets.
subscription_id = 'c0e252b0-50d3-44e6-bb79-a007a94e3a9e'
resource_group = 'ml_rg'
workspace_name = 'machine_learning1'

ws = Workspace(subscription_id, resource_group, workspace_name)

# print('Workspace name: ' + ws.name, 
#       'Azure region: ' + ws.location, 
#       'Subscription id: ' + ws.subscription_id, 
#       'Resource group: ' + ws.resource_group, sep='\n')

# Handles to the two datasets registered in the workspace under these names.
ds1 = AzureDataset.get_by_name(ws, name='chexpert')
ds2 = AzureDataset.get_by_name(ws, name='chexpert2')
#ds2.download(target_path='.', overwrite=False)
#print(dataset)


In [None]:
# Materialise the registered dataset as a pandas DataFrame.
df = ds1.to_pandas_dataframe()

# Drop the first 20 characters of every path so it becomes relative to the
# local working directory (presumably this removes the 'CheXpert-v1.0-small/'
# prefix, which is 20 characters long -- confirm against the CSV).
# A single vectorised assignment replaces the former per-row
# `df["Path"][i] = ...` loop: that was chained assignment, which triggers
# SettingWithCopyWarning, silently does nothing under pandas Copy-on-Write,
# and also used the enumerate position as if it were an index label.
PATH_PREFIX_LEN = 20
df["Path"] = df["Path"].str[PATH_PREFIX_LEN:]

In [None]:
import cv2
import matplotlib.pyplot as plt
import os
from os import listdir
from os.path import isfile, join
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
%matplotlib inline

# Display one validation image as a sanity check and print its size.
img0 = 'valid/patient64541/study1/view1_frontal.jpg'

im = cv2.imread(img0)
# cv2.imread signals failure by returning None instead of raising, so check
# explicitly; otherwise the next line fails with an unrelated-looking TypeError.
if im is None:
    raise OSError(f"cv2.imread could not read image: {img0!r}")

height, width = im.shape[:2]
print(height)      # height
print(width)   # width

fig, ax = plt.subplots(figsize=(10,10))
ax.grid(False)
# OpenCV decodes to BGR channel order; matplotlib expects RGB.
ax.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
plt.show()


In [None]:
# img_array = []
# for im_path in df["Path"]:
#     im = cv2.imread(im_path)
#     img_array.append(im)

In [None]:
# def get_max_by_col(li, col):
#     # col - 1 is used to 'hide' the fact lists' indexes are zero-based from the caller
#     return max(li, key=lambda x: x[col - 1])[col - 1]

# def get_min_by_col(li, col):
#     # col - 1 is used to 'hide' the fact lists' indexes are zero-based from the caller
#     return min(li, key=lambda x: x[col - 1])[col - 1]

# ls = []
# for i in img_array:
#     ls.append(list(i.shape)[:2])

# print("Max height: ", get_max_by_col(ls, 1))
# print("Max width: ", get_max_by_col(ls, 2))
# print(get_min_by_col(ls, 1))
# print(get_min_by_col(ls, 2))

In [None]:
import glob
import numpy as np
from PIL import Image
from io import StringIO
from torchvision import transforms

class DiseaseDataset(Dataset):
    """Map-style dataset of JPEG images found on disk, paired with labels by position.

    Every file under ``img_path`` (searched recursively) whose name ends in
    ``jpg`` becomes one sample. Sample ``idx`` is paired with
    ``label_matrix[idx]``.

    Args:
        img_path: Root directory to search for images.
        label_matrix: Indexable collection of label rows; row ``i`` is used
            for the ``i``-th image in sorted path order.
    """

    def __init__(self, img_path, label_matrix):
        self.path = img_path
        # glob returns matches in arbitrary, filesystem-dependent order, but
        # labels are matched purely by position. Sorting makes the pairing
        # deterministic across machines.
        # NOTE(review): assumes the label rows follow lexicographic path
        # order -- confirm against the CSV the labels come from.
        self.folder = sorted(
            p for p in glob.glob(img_path + '/**', recursive=True) if p.endswith('jpg')
        )
        self.labels = label_matrix
        # ndarray -> PIL image -> central 320x320 crop -> tensor.
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.CenterCrop(320),
            transforms.ToTensor(),
            ])

    def __len__(self):
        """Return the number of image files discovered under the root directory."""
        return len(self.folder)

    def __getitem__(self, idx):
        """Load, transform and return sample ``idx``.

        Returns:
            dict with keys ``'image'`` (float32 tensor produced by the
            transform pipeline) and ``'label'`` (float32 tensor built from
            ``label_matrix[idx]``).

        Raises:
            OSError: if OpenCV cannot read the image file.
        """
        img_loc = self.folder[idx]
        image = cv2.imread(img_loc)
        # cv2.imread returns None on failure instead of raising.
        if image is None:
            raise OSError(f"cv2.imread could not read image: {img_loc!r}")
        # OpenCV decodes to BGR; the rest of the pipeline expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image)
        targets = self.labels[idx]

        return {
            # The transform already yields a tensor; as_tensor only casts if
            # needed, avoiding the extra copy and the UserWarning that
            # torch.tensor(<tensor>) emits.
            'image': torch.as_tensor(image, dtype=torch.float32),
            'label': torch.tensor(targets, dtype=torch.float32)
        }

class TrainDiseaseDataset(Dataset):
    """Map-style dataset that reads images straight out of an open zip archive.

    Sample ``idx`` is the archive member ``img_list[idx]`` paired with
    ``label_matrix[idx]``; the caller is responsible for keeping the two
    sequences aligned.

    Args:
        zip_file: Object exposing ``read(name) -> bytes`` (the notebook passes
            an open ``zipfile.ZipFile``).
        img_list: Sequence of archive member names, one per sample.
        label_matrix: Indexable collection of label rows, one per sample.
    """

    def __init__(self, zip_file, img_list, label_matrix):
        self.zip_file = zip_file
        self.folder = img_list
        self.labels = label_matrix
        # ndarray -> PIL image -> central 320x320 crop -> tensor.
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.CenterCrop(320),
            transforms.ToTensor(),
            ])

    def __len__(self):
        """Return the number of archive members in the sample list."""
        return len(self.folder)

    def __getitem__(self, idx):
        """Decode, transform and return sample ``idx``.

        Returns:
            dict with keys ``'image'`` (float32 tensor produced by the
            transform pipeline) and ``'label'`` (float32 tensor built from
            ``label_matrix[idx]``).

        Raises:
            ValueError: if OpenCV cannot decode the member's bytes as an image.
        """
        img_loc = self.folder[idx]
        im_file = self.zip_file.read(img_loc)
        # Decode from the in-memory bytes as a 3-channel colour image.
        image = cv2.imdecode(np.frombuffer(im_file, np.uint8), cv2.IMREAD_COLOR)
        # cv2.imdecode returns None on failure instead of raising.
        if image is None:
            raise ValueError(f"cv2.imdecode could not decode archive member: {img_loc!r}")
        # OpenCV decodes to BGR; the rest of the pipeline expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image)
        targets = self.labels[idx]
        return {
            # The transform already yields a tensor; as_tensor only casts if
            # needed, avoiding the extra copy and the UserWarning that
            # torch.tensor(<tensor>) emits.
            'image': torch.as_tensor(image, dtype=torch.float32),
            'label': torch.tensor(targets, dtype=torch.float32)
        }


In [None]:
# root_folder = 'valid'
# transforms = torchvision.transforms.Compose([
#     torchvision.transforms.CenterCrop(320),
#     torchvision.transforms.ToTensor()
# ])
# # TO DO: normalize

# DataLoader = torch.utils.data.DataLoader(torchvision.datasets.ImageFolder(root_folder, transforms), batch_size=1)

In [None]:
# for _ in range(5):
#     img, lbl = next(iter(DataLoader))
#     print(img.shape)
#     print(lbl)

In [None]:
import os
import zipfile
import pandas as pd

zip_path = 'CheXpert-v1.0-small.zip'
# Fail early with a clear message (the result of this check used to be discarded).
if not os.path.isfile(zip_path):
    raise FileNotFoundError(f"Dataset archive not found: {zip_path!r}")
# The archive is intentionally left open: TrainDiseaseDataset reads image
# bytes from it lazily, one member per sample.
z = zipfile.ZipFile(zip_path)
zip_name_list = z.namelist()

# Every JPEG stored under the archive's train folder.
zip_train_list = [x for x in zip_name_list if x.startswith('CheXpert-v1.0-small/train') and x.endswith('.jpg')]

# Train Labels
train_csv = 'CheXpert-v1.0-small/train.csv'
with z.open(train_csv) as csv_file:
    df_train = pd.read_csv(csv_file)

# Train Image Names
# Labels are matched to images purely by position, so the image list is taken
# from the CSV itself (row i <-> image i). The previous approach sorted the
# archive listing by patient number and then dropped the first entry with
# `[1:]`, which shifted every image by one row relative to its labels and
# relied on the archive's internal order within each patient.
# NOTE(review): assumes the CSV 'Path' values are identical to the archive
# member names (both starting with 'CheXpert-v1.0-small/') -- confirm.
available_images = set(zip_train_list)
df_train = df_train[df_train['Path'].isin(available_images)].reset_index(drop=True)
if df_train.empty:
    raise ValueError(
        "No 'Path' entry of " + train_csv + " matches a train image inside " + zip_path
    )
sorted_train_list = df_train['Path'].tolist()

# Every column from the sixth onward is treated as a label column.
labels = list(df_train.columns[5:])
# BCELoss needs targets in [0, 1]; a single NaN target makes the loss NaN.
# NOTE(review): the public CheXpert train.csv is documented to contain blank
# (unmentioned) and -1 (uncertain) entries; both are mapped to 0 here
# ("U-Zeros" policy). Confirm this is the intended treatment.
label_arr_train = df_train[labels].fillna(0).replace(-1, 0).to_numpy()
assert len(sorted_train_list) == len(label_arr_train)

# Train Data Loader
batch_size = 32
train_data = TrainDiseaseDataset(z, sorted_train_list, label_arr_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)


In [None]:
# Multi-Label Classification

# Test Labels: every column from the sixth onward is treated as a label column.
labels = df.columns[5:].tolist()
label_arr_test = df[labels].to_numpy()

# Test Data Loader: images are discovered under the local 'valid' directory.
batch_size = 5
test_data = DiseaseDataset(img_path='valid', label_matrix=label_arr_test)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)


In [None]:
# labelLoader = [torch.tensor(label).view(1, 14).float() for label in label_arr]
# labelLoader[0].shape
# labelLoader[0].dtype

In [None]:
# import numpy as np
# from sklearn.model_selection import train_test_split

# img_np_array = np.array(img_array)

# index_array = np.arange(234)

# train_id, test_id = train_test_split(index_array, test_size=0.2)

# print(type(label_arr))
# #train_im = img_np_array(list(train_id))
# #test_im = img_np_array(test_id)

# train_label = label_arr(train_id)
# test_label = label_arr(test_id)


In [None]:
# ds_model = Dataset.get_by_name(ws, name='densenet121')
# ds_model.download(target_path='.', overwrite=False)

In [None]:
# MODEL
# Load the whole pickled model object (not just a state_dict).
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine; the model is moved to the selected device in a later cell.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models; if this call fails there, pass
# weights_only=False (not added here because older releases lack the argument).
model = torch.load("models/pretrained_densenet121.pth", map_location="cpu") # to load

#print(model)


In [None]:
# Size of the model's output vector (one sigmoid output per label).
num_output_classes = 14

# Freeze every existing weight; only the replacement head created below
# (whose fresh parameters require gradients by default) will be trained.
model.requires_grad_(False)

# Replacement head: 1024 -> 512 -> num_output_classes, ending in a sigmoid so
# each output can be read as an independent probability.
head_layers = [
    torch.nn.Linear(1024, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, num_output_classes),
    torch.nn.Sigmoid(),
]
new_classifier = torch.nn.Sequential(*head_layers)
model.classifier = new_classifier

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
#device = torch.device('cpu')
print(device)
model.to(device)
#next(model.parameters()).is_cuda

In [None]:
from tqdm import tqdm

# Train with binary cross-entropy on the model's sigmoid outputs, then save
# the whole model object.
n_epoch = 3
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

losses = []             # loss of every batch seen so far, across all epochs
checkpoint_losses = []  # mean of `losses` recorded at each checkpoint
n_total_steps = len(train_loader)
# Report twice per epoch. Clamped to at least 1 because a loader with a single
# batch would otherwise give an interval of 0 and a ZeroDivisionError in the
# modulo below.
checkpoint_every = max(1, n_total_steps // 2)

for epoch in range(n_epoch):
    running_loss = 0.0
    for i, data in enumerate(tqdm(train_loader)):
        # get the inputs; data is a dict with keys 'image' and 'label'
        inputs, labels = data['image'], data['label']
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Read the scalar once; it is reused for both bookkeeping lists below.
        batch_loss = loss.item()
        losses.append(batch_loss)

        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += batch_loss
        if (i+1) % checkpoint_every == 0:
            # Mean over every batch since training started (not just since
            # the previous checkpoint).
            checkpoint_loss = torch.tensor(losses).mean().item()
            checkpoint_losses.append(checkpoint_loss)
            print (f'Epoch [{epoch+1}/{n_epoch}], Step [{i+1}/{n_total_steps}], Loss: {checkpoint_loss:.4f}')

print('Finished Training')
print("Saving model")
torch.save(model, "models/densenet121_valid.pth")
print('Model saved in "models/densenet121_valid.pth"')

In [None]:
# n_epoch = 3
# criterion = torch.nn.BCELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# batch_size = 5

# for epoch in range(n_epoch):
#     running_loss = 0.0
#     for i, (data, label) in enumerate(zip(DataLoader, labelLoader)):
#         # get the inputs; data is a list of [inputs, labels]
#         inputs, _ = data
#         inputs, labels = inputs.to(device), label.to(device)
#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # forward + backward + optimize
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.item()
#         if i % 100 == 0:    # print every 100 mini-batches
#             print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
#             running_loss = 0.0

# print('Finished Training')
# print("Saving model")
# torch.save(model, "models/densenet121_valid.pth")
# print('Model saved in "models/densenet121_valid.pth"')