# Loading Image Data and Creating Train/Val/Test Datasets

In [1]:
import zipfile 
import os
import torch

In [2]:
# Download the BreakHis dataset through kagglehub and point at its breast-slide folder
import kagglehub

# Location of the breast histology slides relative to the downloaded dataset root
BREAST_SLIDES_SUBDIR = '/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/'

# dataset_download returns the local path of the latest downloaded version
dataset_dir = kagglehub.dataset_download("ambarish/breakhis")
img_root_dir = dataset_dir + BREAST_SLIDES_SUBDIR

print("Path to dataset files:", img_root_dir)

Path to dataset files: /home/ubuntu/.cache/kagglehub/datasets/ambarish/breakhis/versions/4/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/


In [3]:
import random

# Fixed seed so the shuffled order (and therefore any split derived from it)
# is identical on every run.
SPLIT_SEED = 42

# Build "<filename> <label>" entries for the 100X images only
# (label 1 = malignant, 0 = benign).
annotated_files = []

for root, dirs, files in os.walk(img_root_dir):
    # Sort in place so os.walk visits sub-directories in a deterministic order;
    # the raw order depends on the filesystem.
    dirs.sort()

    if not root.endswith('100X'):
        continue

    # Only inspect the part of the path below the dataset root, so that a
    # cache/home directory that happens to contain "benign" or "malignant"
    # cannot influence the label.
    rel_root = os.path.relpath(root, img_root_dir)
    if 'malignant' in rel_root:
        label = 1
    elif 'benign' in rel_root:
        label = 0
    else:
        # Neither class appears in the path: nothing to annotate here.
        continue

    for file in sorted(files):
        annotated_files.append(f'{file} {label}')

# A dedicated, seeded generator keeps the shuffle reproducible without
# touching the global random state.
random.Random(SPLIT_SEED).shuffle(annotated_files)

In [4]:
# Split the annotated files into train (70%), validation (10%) and test (20%)
# by position in the (already shuffled) list.
num_files = len(annotated_files)
train_end = int(num_files * 0.7)
val_end = int(num_files * 0.8)

train_idx = list(range(train_end))
val_idx = list(range(train_end, val_end))
test_idx = list(range(val_end, num_files))

# open(..., 'w') does not create missing parent directories, so make sure
# ./meta exists before writing the split files.
os.makedirs('./meta', exist_ok=True)

# Write one "<filename> <label>" line per sample into ./meta/<split>.txt
for split_name, split_idx in (('train', train_idx),
                              ('val', val_idx),
                              ('test', test_idx)):
    with open(f'./meta/{split_name}.txt', 'w') as f:
        f.writelines(annotated_files[i] + '\n' for i in split_idx)

In [3]:
# Organize files into the desired structure
# images/[40X/100X/200X/400X]/
#    subtype1/
#        benign/
#            image_zyx.png
#        malignant/
#            image_abc.png
#    subtype2/
#    ...

cancer_subtypes = ['adenosis', # benign
                   'fibroadenoma', # benign
                   'phyllodes_tumor', # benign
                   'tubular_adenoma', # benign
                   'ductal_carcinoma', # malignant
                   'lobular_carcinoma', # malignant
                   'mucinous_carcinoma', # malignant
                   'papillary_carcinoma'] # malignant

magnifications = ['40X', '100X', '200X', '400X']
classes = ['benign', 'malignant']


def create_image_dirs(base_dir='images'):
    """Create the <base_dir>/<magnification>/<subtype>/<class>/ folder tree.

    One leaf directory is created for every combination of the module-level
    `magnifications`, `cancer_subtypes` and `classes` lists. Existing
    directories are left untouched, so the call is safe to repeat.

    Args:
        base_dir: Directory under which the tree is created.
    """
    for mag in magnifications:
        for subtype in cancer_subtypes:
            for classification in classes:
                # makedirs also creates the intermediate levels; exist_ok
                # keeps a re-run of this cell from failing.
                os.makedirs(os.path.join(base_dir, mag, subtype, classification),
                            exist_ok=True)


# Create image folders
create_image_dirs()


In [5]:
# Move images from kagglehub download location to local repo
# NOTE: this empties the directories under img_root_dir, so any cell that
# walks img_root_dir afterwards (e.g. the one building annotated_files)
# will no longer find the images there.
import shutil

for root, dirs, files in os.walk(img_root_dir):
    # Images live in leaf folders named after the magnification, so match on
    # the last path component rather than on a substring of the whole path.
    mag = os.path.basename(root)
    if mag not in magnifications or not files:
        continue

    # Only look below the dataset root so the cache/home directory prefix
    # cannot produce a false subtype/class match.
    rel_root = os.path.relpath(root, img_root_dir)

    # Check subtype
    subtype = next((s for s in cancer_subtypes if s in rel_root), None)

    # Check benign/malignant
    ben_mal = next((c for c in classes if c in rel_root), None)

    if subtype is None or ben_mal is None:
        # Without both parts the destination would be an incomplete path;
        # leave the files where they are instead of misplacing them.
        print(f'Skipping {root}: could not determine subtype/class')
        continue

    img_path = os.path.join('images', mag, subtype, ben_mal)
    os.makedirs(img_path, exist_ok=True)

    # Add images for each mag/subtype into proper folder (benign or malignant)
    for file in files:
        # Passing the full destination file name (not just the directory)
        # lets an existing file be replaced, as `mv` would do, and avoids
        # any shell quoting issues with unusual file names.
        shutil.move(os.path.join(root, file), os.path.join(img_path, file))
