# Loading and organizing image data

In [1]:
import os
import shutil
import zipfile

import torch

In [2]:
# The two diagnosis classes; also used as folder names under each magnification.
classes = ['benign', 'malignant']

# Magnification labels; used as the top-level folder names under images/.
magnifications = ['40X', '100X', '200X', '400X']

# Maps each tumor subtype name to its diagnosis class.
# Insertion order: the four benign subtypes, then the four malignant ones.
cancer_subtypes = {
    **dict.fromkeys(
        ['adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma'],
        'benign',
    ),
    **dict.fromkeys(
        ['ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma'],
        'malignant',
    ),
}

In [3]:
# Fetch the BreakHis dataset ("ambarish/breakhis") through kagglehub.
# NOTE(review): the original comment says this data is "also for fine-tuning
# as in example" -- confirm which example is meant.
import kagglehub

# dataset_download returns the local path of the downloaded dataset (latest
# version); the slide folders live several levels below it.
img_root_dir = (
    kagglehub.dataset_download("ambarish/breakhis")
    + '/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/'
)

print("Path to dataset files:", img_root_dir)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ambarish/breakhis?dataset_version_number=4...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.99G/3.99G [01:22<00:00, 51.8MB/s]

Extracting files...





Path to dataset files: /home/ubuntu/.cache/kagglehub/datasets/ambarish/breakhis/versions/4/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/


In [4]:
# Organize files into the following local structure:
#
# images/
#     <magnification>/            one of 40X, 100X, 200X, 400X
#         benign/
#             <subtype>/
#                 image_zyx.png
#         malignant/
#             <subtype>/
#                 image_abc.png
#             <subtype>/
#             ...

# Create image folders.
# os.makedirs(..., exist_ok=True) is used instead of shelling out to `mkdir`
# so the cell can be re-run without errors for folders that already exist.
os.makedirs('images', exist_ok=True)

for mag in magnifications:
    for classification in classes:
        # Create the class folder even if no subtype maps to it.
        os.makedirs(os.path.join('images', mag, classification), exist_ok=True)
        for subtype, subtype_class in cancer_subtypes.items():
            # Each subtype folder goes under the class it belongs to.
            if subtype_class == classification:
                os.makedirs(
                    os.path.join('images', mag, classification, subtype),
                    exist_ok=True,
                )


In [47]:
# Move images from the kagglehub download location into the local images/ tree
# (images/<magnification>/<benign|malignant>/<subtype>/).
# NOTE: this moves files out of the kagglehub download directory, so a second
# run finds nothing left to move.
imgs = []
for root, dirs, files in os.walk(img_root_dir):
    # The magnification is the last item of the path, so only directories
    # whose own name is a known magnification hold images to move.
    mag = os.path.basename(root)
    if mag not in magnifications:
        continue

    # Identify the subtype by matching whole folder names below the dataset
    # root. (The previous check tested the class name -- 'benign'/'malignant'
    # -- instead of the subtype name, so every image was filed under the last
    # subtype of its class.)
    rel_parts = os.path.relpath(root, img_root_dir).split(os.sep)
    subtype = next((name for name in cancer_subtypes if name in rel_parts), None)
    if subtype is None:
        # Not under a known subtype folder: leave these files untouched
        # rather than moving them to a bogus 'None' destination.
        continue

    # Derive benign/malignant from the subtype mapping so the destination
    # always matches the folder tree built from cancer_subtypes.
    ben_mal = cancer_subtypes[subtype]

    # Create the image path; makedirs keeps this cell usable even when the
    # folder-creation cell has not been run.
    img_path = os.path.join('images', mag, ben_mal, subtype)
    os.makedirs(img_path, exist_ok=True)

    # Add images for each mag/subtype into the proper folder.
    for file in files:
        imgs.append(file)
        # Explicit destination file name; shutil.move avoids spawning a shell
        # per file and is safe for paths containing spaces.
        shutil.move(os.path.join(root, file), os.path.join(img_path, file))

In [48]:
# Sanity check: number of file names collected in `imgs` by the move loop.
len(imgs)

7909

# Creating train/val/test datasets