In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle &> /dev/null
!cp kaggle.json ~/.kaggle/ &> /dev/null
!chmod 600 ~/.kaggle/kaggle.json &> /dev/null
!kaggle datasets download -d surajghuwalewala/ham1000-segmentation-and-classification &> /dev/null
!unzip ham1000-segmentation-and-classification.zip &> /dev/null

Saving kaggle.json to kaggle.json


In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
from PIL import Image
import shutil

In [3]:
# csv file -> labels
# 2 folders , 1 for images 1 for masks

# image_folder = "/content/images"
# mask_folder = "/content/masks"
csv_path = "/content/GroundTruth.csv"

# output_base = "/content/ham1000_data"
# train_dir = os.path.join(output_base, "train")
# val_dir = os.path.join(output_base, "val")
# test_dir = os.path.join(output_base, "test")


In [4]:
data_table = pd.read_csv(csv_path)
data_table.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# creating a normal encoding for all the columns for straitfying the dataset
enc_label = pd.DataFrame()
labels = data_table.drop(["image"], axis=1)
enc_label['label'] = labels.idxmax(axis=1)
enc_label['label'] = enc_label['label'].str.replace('MEL', '1')
enc_label['label'] = enc_label['label'].str.replace('NV', '2')
enc_label['label'] = enc_label['label'].str.replace('BCC', '3')
enc_label['label'] = enc_label['label'].str.replace('AKIEC', '4')
enc_label['label'] = enc_label['label'].str.replace('BKL', '5')
enc_label['label'] = enc_label['label'].str.replace('DF', '6')
enc_label['label'] = enc_label['label'].str.replace('VASC', '7')
enc_label.head()

Unnamed: 0,label
0,2
1,2
2,2
3,2
4,1


In [6]:
data_table = pd.concat([data_table, enc_label], axis=1)
data_table.head()


Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,label
0,ISIC_0024306,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
1,ISIC_0024307,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
2,ISIC_0024308,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
3,ISIC_0024309,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
4,ISIC_0024310,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [7]:
train_df, x_df = train_test_split(data_table, train_size=0.7, stratify=data_table['label'])
val_df, test_df = train_test_split(x_df, train_size=0.5, stratify=x_df['label'])

train_df.head()

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC,label
7548,ISIC_0031854,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
4589,ISIC_0028895,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
198,ISIC_0024504,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
8483,ISIC_0032789,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
6769,ISIC_0031075,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2


In [9]:
train_images, train_labels = train_df['image'], train_df['label']
val_images, val_labels = val_df['image'], val_df['label']
test_images, test_labels = test_df['image'], test_df['label']

# print(f"Train image: {train_images[0]}")

In [10]:
os.makedirs("data", exist_ok=True)
os.makedirs("data/train", exist_ok=True)
os.makedirs("data/val", exist_ok=True)
os.makedirs("data/test", exist_ok=True)

for image in train_images:
    shutil.copy(f"/content/images/{image}.jpg", "data/train")

for image in val_images:
    shutil.copy(f"/content/images/{image}.jpg", "data/val")

for image in test_images:
    shutil.copy(f"/content/images/{image}.jpg", "data/test")

In [11]:
# images preprocessing
test_valid_transform = transforms.Compose([
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     transforms.Resize((224, 224)),
     transforms.ColorJitter(brightness=0.1, contrast=0.4, saturation=0.1, hue=0.1)
     ])

train_preprocessing = transforms.Compose([
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     transforms.Resize((224, 224)),
     transforms.ColorJitter(brightness=0.1, contrast=0.4, saturation=0.1, hue=0.1),
     transforms.RandomHorizontalFlip(p=0.5),
     transforms.RandomVerticalFlip(p=0.5),
     transforms.RandomRotation(degrees=30),
     ])


In [12]:
train_images = ImageFolder("data", transform=train_preprocessing)
val_images = ImageFolder("data", transform=test_valid_transform)
test_images = ImageFolder("data", transform=test_valid_transform)

FileNotFoundError: Couldn't find any class folder in data/train.

In [None]:
# train_images = pre_processing(train_images.to_numpy())
# val_images = pre_processing(val_images.to_numpy())
# test_images = pre_processing(test_images.to_numpy())

# train_images = train_val_transform(train_images)
# val_images = train_val_transform(val_images)