### Dataset import

In [1]:
import kagglehub, os, shutil, random

In [3]:
# Local working copy of the Kaggle chest X-ray dataset, kept next to the notebook.
target_dir = "chest_xray_data"

if os.path.exists(target_dir):
    # A previous run already produced the local copy; nothing to download.
    print(f"Dataset already exists at: {target_dir}")
else:
    # Fetch the dataset via kagglehub, which returns the location it downloaded to.
    path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
    print("Downloaded to cache:", path)

    # Mirror the downloaded tree into the working directory.
    shutil.copytree(path, target_dir)
    print(f"Dataset copied to: {target_dir}")

print("\nDataset ready at:", os.path.abspath(target_dir))

Dataset already exists at: chest_xray_data

Dataset ready at: c:\Users\eirik\Documents\FYSSTK3155\Test\chest_xray_data


### Dataset reshuffle and 80/10/10 train/val/test split

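The images from the original `train`, `val` and `test` folders are pooled per class, shuffled, and re-split into 80% train, 10% validation and 10% test. Code copied from: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia/discussion/485689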

In [4]:
dataset_path = 'chest_xray_data/chest_xray'
new_dataset_path = 'chest_xray_data_new'

# Fixed seed so the 80/10/10 partition is identical every time the split is
# rebuilt from scratch. Delete `new_dataset_path` to regenerate the split.
SPLIT_SEED = 42

if not os.path.exists(new_dataset_path):
    for split in ['train', 'val', 'test']:
        for cls in ['NORMAL', 'PNEUMONIA']:
            os.makedirs(f'{new_dataset_path}/{split}/{cls}', exist_ok=True)

    # Private generator: reproducible shuffling without touching the global
    # `random` state that other cells may rely on.
    rng = random.Random(SPLIT_SEED)

    for cls in ['NORMAL', 'PNEUMONIA']:
        # Pool every image of this class from all three original splits,
        # remembering which folder each file came from.
        all_files = []
        for split in ['train', 'val', 'test']:
            source_folder = f'{dataset_path}/{split}/{cls}'
            # os.listdir returns entries in arbitrary order; sort so the seeded
            # shuffle below yields the same permutation on every machine.
            for file in sorted(os.listdir(source_folder)):
                # Skip hidden entries (e.g. .DS_Store) and anything that is not
                # a regular file, so only images end up in the new dataset.
                if file.startswith('.') or not os.path.isfile(f'{source_folder}/{file}'):
                    continue
                all_files.append((file, source_folder))

        rng.shuffle(all_files)

        # Split boundaries: first 80% train, next 10% val, remainder test.
        train_end = int(len(all_files) * 0.8)
        val_end = int(len(all_files) * 0.9)

        train_files = all_files[:train_end]
        val_files = all_files[train_end:val_end]
        test_files = all_files[val_end:]

        # One copy loop for all three target splits.
        for split, split_files in (('train', train_files),
                                   ('val', val_files),
                                   ('test', test_files)):
            for file, source_folder in split_files:
                dest = f'{new_dataset_path}/{split}/{cls}/{file}'
                shutil.copy(f'{source_folder}/{file}', dest)

    print("\nDataset ready at:", os.path.abspath(new_dataset_path))

else:
    print(f"Dataset already exists at: {new_dataset_path}")

Dataset already exists at: chest_xray_data_new
