# Data wrangling

In [1]:
# Standard library imports
import glob
import shutil
from pathlib import Path

# PyPI imports
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


# 1. Data download

In [2]:
%%time

# Download latest version
path=kagglehub.dataset_download('paultimothymooney/breast-histopathology-images')

print(f'Path to dataset files: {path}\n')

Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/breast-histopathology-images?dataset_version_number=1...


100%|██████████| 3.10G/3.10G [00:55<00:00, 59.9MB/s]


Extracting files...
Path to dataset files: /home/vscode/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1


# 2. Data re-structuring

Right now, the data exists in a set of numbered directories, one for each patient. Each patient directory contains two sub-directories, `0` and `1`, holding the IDC (invasive ductal carcinoma) negative and IDC positive images, respectively. We need to collect all of the IDC negative images into one directory and all of the IDC positive images into another. Let's write a function that copies them into place.

In [3]:
def restructure_data(path: str) -> bool:
    '''Copies all IDC negative and IDC positive images from the per-patient
    directories of a KaggleHub download into two class directories in the
    project data folder.

    The source files are copied, not moved, so the KaggleHub download is left intact.

    Args:
        path: string path to the dataset directory from the KaggleHub download
            call. Expected to contain one sub-directory per patient, each with
            a '0' (IDC negative) and/or '1' (IDC positive) directory of .png images.

    Returns:
        True if at least one image was copied, False if no images were found
        under path.
    '''

    # Get a list of patient directories, sorted so the copy order is reproducible
    patients=sorted(glob.glob(f'{path}/*'))

    # Map each class sub-directory name to its target directory for the file copy
    target_directories={
        '0': '../data/images/idc_negative/',
        '1': '../data/images/idc_positive/'
    }

    for target_directory in target_directories.values():
        Path(target_directory).mkdir(parents=True, exist_ok=True)

    # Count the copies so the caller can tell whether anything was found
    images_copied=0

    # Loop on the patient directories and copy the '0' images to the idc_negative
    # directory and the '1' images to the idc_positive directory
    for patient in patients:
        for label, target_directory in target_directories.items():
            for image in glob.glob(f'{patient}/{label}/*.png'):
                shutil.copy(image, target_directory)
                images_copied+=1

    return images_copied > 0

In [4]:
%%time

# Run the copy for every patient directory in the download; timed with %%time
# so readers can see the cost of re-running this cell
restructure_data(path)

CPU times: user 15.7 s, sys: 54.1 s, total: 1min 9s
Wall time: 3min 20s


Done!