In [None]:
import torch 
import torchvision.transforms as transforms 
from torchvision.utils import save_image
import numpy as np 
import pandas as pd

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset paths used
# throughout this notebook (/content/drive/MyDrive/HAM10000/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd 
import numpy as np 
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

# Make the dataset folder on the mounted Drive the working directory.
os.chdir("/content/drive/MyDrive/HAM10000")
!ls

HAM10000_images_augmented  HAM10000_segmentations_lesion_tschandl
HAM10000_images_part_1	   training_data.csv
HAM10000_images_part_2	   training_data.npy
HAM10000_metadata.csv


In [None]:
# Load the HAM10000 metadata table and display it.
# (A stray `mdata.iloc[2, 1]` expression used to sit here; it was evaluated and
# discarded without being shown, so it has been dropped.)
mdata = pd.read_csv("/content/drive/MyDrive/HAM10000/HAM10000_metadata.csv")
mdata

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern


In [None]:
from pandas.core.window.ewm import GroupbyIndexer
# Lookup table: image_id -> dict of that image's metadata fields.
# If an image_id appears on more than one row, the values of its first row win.
meta_dic = {
  img_id: {
    field: group[field].tolist()[0]
    for field in ("lesion_id", "image_id", "dx", "dx_type", "age", "sex", "localization")
  }
  for img_id, group in mdata.groupby("image_id")
}

Image Augmentation Section

In [None]:
#random rotation between 45 and 315 degrees 

In [None]:
#transform the function according to the pytorch docs
#random rotation 
#crop ROI 
#random brightness/contrast 
#zoom 
#erosion/dilation
#noise 

#here we are going to introduce several random transforms 
from PIL import Image
# Target side length; note that none of the transforms below actually resizes to it.
img_size = 224

# Random augmentation pipeline. Each step fires independently with its own
# probability/range, and the last step converts the result to a tensor.
# NOTE(review): RandomSolarize runs before ToTensor, yet a threshold of 0.3 looks
# chosen for values in [0, 1] -- confirm the intended pixel range.
preprocess = transforms.Compose(
    [
        transforms.RandomRotation(degrees=(45, 315)),
        transforms.ColorJitter(brightness=0.3, contrast=0.3),
        transforms.RandomSolarize(threshold=0.3, p=0.1),
        transforms.RandomInvert(p=0.2),
        transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.2),
        transforms.ToTensor(),
    ]
)

In [None]:
class DermClassHelper():
  """Maps HAM10000 diagnosis codes to integer labels and tallies images per label.

  Attributes (all per instance, created in ``__init__``):
    mdata: metadata DataFrame; must provide ``image_id`` and ``dx`` columns.
    labels: diagnosis code -> integer label.
    img_map: image id -> integer label.
    training_data: empty list; no method in this class fills it.
    balance: integer label -> number of image files counted by
      ``make_training_data``.
  """
  # Metadata CSV read when no DataFrame is handed to __init__.
  METADATA_CSV = "/content/drive/MyDrive/HAM10000/HAM10000_metadata.csv"
  # Not used by any method of this class.
  IMG_SIZE = 224
  # relevant directories (each is appended to `root`)
  sds = ["/HAM10000_images_part_1/", "/HAM10000_images_part_2/"]
  root = "/content/drive/MyDrive/HAM10000"

  def __init__(self, mdata=None):
    """Build the label table and the image -> label map.

    Args:
      mdata: optional metadata DataFrame. When omitted, ``METADATA_CSV`` is read.
    """
    # Read the CSV on construction rather than at class-definition time, so that
    # merely defining the class performs no file I/O.
    self.mdata = pd.read_csv(self.METADATA_CSV) if mdata is None else mdata
    # Per-instance containers: class-level dicts would be shared by every
    # instance and would keep stale entries between runs.
    self.labels = {}
    self.img_map = {}
    self.training_data = []
    self.balance = {}
    #assign labels
    self.make_labels()
    #make the map
    self.make_image_map()
    # Print a summary only; the full image map has one entry per image.
    print(self.labels)
    print(f"{len(self.img_map)} images mapped to labels")

  def make_labels(self):
    """Assign consecutive integers to the diagnosis codes, in order of first appearance."""
    for i, d in enumerate(self.mdata["dx"].unique()):
      self.labels[d] = i

  def make_image_map(self):
    """Assign to each image id the integer label of its diagnosis."""
    for im, dx in zip(list(self.mdata["image_id"]), list(self.mdata["dx"])):
      self.img_map[im] = self.labels[dx]

  def make_training_data(self):
    """Count the image files per label across the image sub-directories.

    Despite the name, nothing is added to ``training_data``: the method only
    recomputes ``balance`` from scratch (so repeated calls give the same
    result) and prints it. Files whose id has no metadata entry are skipped
    and reported instead of raising ``KeyError``.
    """
    self.balance.clear()
    skipped = 0
    for sd in self.sds:
      for f in tqdm(os.listdir(self.root+sd)):
        # extract the image id from the file name: drop the extension and
        # anything after the first space (e.g. "ISIC_1 (1).jpg" -> "ISIC_1")
        image_id = f.split(".")[0].strip(" ").split(" ")[0]
        label = self.img_map.get(image_id)
        if label is None:
          skipped += 1
          continue
        self.balance[label] = self.balance.get(label, 0) + 1
    print(self.balance)
    if skipped:
      print(f"skipped {skipped} files with no metadata entry")

In [None]:
# Build the diagnosis-label and image-label maps, then count how many image
# files on disk fall under each label (the counts are printed).
dch = DermClassHelper()
dch.make_training_data()

In [None]:
#write a different data loader class 
from torch.utils.data import DataLoader
class URMDermClass():
  """Index-able dataset that yields only the under-represented classes.

  ``dataset[i]`` returns ``((image, image_id), label)`` when row ``i`` belongs to
  one of ``target_classes`` and the placeholder ``((None, None), None)`` for any
  other row, so consumers have to skip the ``None`` entries themselves.

  Args:
    helper: object exposing ``img_map`` (image id -> integer label), such as a
      DermClassHelper instance.
    csv_file: path of the metadata CSV. The image id is taken from its
      ``image_id`` column, or from the second column when no column has that name.
    root_dir: dataset root directory.
    sds: image sub-directories, appended to ``root_dir`` and searched in order.
    transform1: optional callable applied to the opened PIL image.
  """
  def __init__(self, helper, csv_file, root_dir, sds, transform1=None):
    self.derm_frame = pd.read_csv(csv_file)
    self.root_dir = root_dir
    self.sds = sds
    self.transform1 = transform1
    #pass in derm class helper here
    self.helper = helper
    #everything but moles
    # NOTE(review): presumably label 1 is the mole class -- confirm against helper.labels
    self.target_classes = set([0, 2, 3, 4, 5, 6])

  def __len__(self):
    """Number of rows in the metadata table (target and non-target alike)."""
    return len(self.derm_frame)

  def _find_image(self, filename):
    """Return the path of ``<filename>.jpg`` in the first sub-directory that has it.

    Raises:
      FileNotFoundError: if no sub-directory contains the image.
    """
    for sd in self.sds:
      candidate = str(self.root_dir+sd+filename+".jpg")
      if os.path.exists(candidate):
        return candidate
    raise FileNotFoundError(
      "image "+str(filename)+".jpg not found under "+str(self.root_dir)+" in any of "+str(list(self.sds)))

  def __getitem__(self, index):
    """Return ``((image, image_id), label)`` or the ``None`` placeholder.

    Raises:
      IndexError: if ``index`` is out of range (this is what ends plain
        ``for`` iteration over the dataset).
      FileNotFoundError: if the image of a target-class row is missing on disk.
    """
    row = self.derm_frame.iloc[index]
    # Prefer the named column; fall back to position 1 for frames without it.
    filename = row["image_id"] if "image_id" in row.index else row.iloc[1]
    #only get under-represented samples
    c = self.helper.img_map[filename]
    if c not in self.target_classes:
      # Decide before touching the disk: skipped rows need no path lookup.
      return ((None, None), None)
    img_path = self._find_image(filename)
    #get the image using PIL; copy it so the file handle can be closed right away
    with Image.open(img_path) as handle:
      img = handle.copy()
    y_label = torch.tensor(int(c))
    # Without a transform the untouched PIL image is returned.
    image = self.transform1(img) if self.transform1 is not None else img
    return ((image, filename), y_label)

In [None]:
# Dataset over the whole metadata table, reading images from both image folders
# and running each opened image through the `preprocess` pipeline.
dataset = URMDermClass(
    helper=dch,
    csv_file="/content/drive/MyDrive/HAM10000/HAM10000_metadata.csv",
    root_dir="/content/drive/MyDrive/HAM10000",
    sds=["/HAM10000_images_part_1/", "/HAM10000_images_part_2/"],
    transform1=preprocess,
)

In [None]:
#get some randomness: shuffle the dataset into two halves
# random_split requires the lengths to sum to len(dataset). Deriving the second
# length from the first keeps that true for odd and even sizes alike (the old
# `int(n*.5)` / `int(n*.5)+1` pair only added up when n was odd).
n_train = len(dataset) // 2
n_test = len(dataset) - n_train
train_set, test_set = torch.utils.data.random_split(dataset, [n_train, n_test])

In [None]:
# Folder that receives the augmented images, plus an empty metadata table whose
# columns match the rows written for those images.
path = "/content/drive/MyDrive/HAM10000/HAM10000_images_augmented/"
df = pd.DataFrame(
    columns=[
        "image_id", "lesion_id", "dx", "dx_type",
        "age", "sex", "localization", "dataset",
    ]
)
df

Unnamed: 0,image_id,lesion_id,dx,dx_type,age,sex,localization,dataset


In [None]:
#create up to 2000 new augmented images and record one metadata row per image
# NOTE(review): each augmented file and row reuses the original image_id, so the
# augmented table shares ids with the original metadata -- confirm this is intended.
max_augmented = 2000
augmented_rows = []
for (img, filename), label in tqdm(train_set):
  # Samples outside the target classes come back as None placeholders.
  if img is None:
    continue
  meta = meta_dic[filename]
  augmented_rows.append([filename, meta["lesion_id"], meta["dx"], meta["dx_type"], meta["age"], meta["sex"], meta["localization"], "Augmented"])
  save_image(img, str(path+filename+".jpg"))
  if len(augmented_rows) == max_augmented:
    break

# Build the table once from the collected rows: DataFrame.append no longer exists
# in pandas 2.x and growing a frame row by row is quadratic. Rows get a running
# 0..n-1 index, and re-running the cell no longer duplicates rows.
df = pd.DataFrame(augmented_rows, columns=["image_id", "lesion_id","dx",  "dx_type","age", "sex", "localization", "dataset"])
print(len(augmented_rows), "augmented images saved")

In [None]:
# Write the augmented-image metadata to a CSV in the dataset folder.
# NOTE(review): the bare `df` below is not the last expression of the cell, so it
# displays nothing.
df
df.to_csv("/content/drive/MyDrive/HAM10000/HAM10000_metadata_augmented.csv")

In [None]:
# Stack the original and the augmented metadata into one table (columns are
# aligned by name). pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; ignore_index gives the combined rows one unique running index.
combineddf = pd.concat([mdata, df], ignore_index=True)

In [None]:
# Display the combined (original + augmented) metadata table.
combineddf

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
0,HAM_0005335,ISIC_0028389,bkl,histo,40.0,male,upper extremity,Augmented
0,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,Augmented
0,HAM_0000959,ISIC_0026532,bkl,histo,75.0,female,face,Augmented
0,HAM_0003429,ISIC_0028807,bcc,histo,70.0,male,back,Augmented
