## Process dataset with a proper split

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split # create train/val split
import shutil # copy files
import os

raw = Path("../data/raw/chest_xray")
processed = Path("../data/processed/cxr")
metadata = Path("../data/metadata")

raw_train = raw / "train"
raw_test = raw / "test"
# raw_val = raw / "val"  # validation data is too small so will use training data for it

# Processed data that was cleaned then split for a training, validation, and test dataset
processed_train = processed / "train"
processed_val = processed / "val"
processed_test = processed / "test"

metadata.mkdir(parents=True, exist_ok=True)
processed.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the raw validated data
raw_inventory = metadata / "raw_train_inventory.csv"
# corrupted = metadata / "raw_corrupted_files.csv"

train_df = pd.read_csv(raw_inventory)

# if corrupted.exists():
#   bad_df = pd.read_csv(corrupted)
#   bad_paths = set(bad_df["path"].tolist())
# else:
#   bad_paths = set()

print("Loaded train inventory:", len(train_df))
# print("Corrupted files listed:", len(bad_paths))

# Filter out any unreadable files
# before = len(train_df)
# train_df = train_df[~train_df["path"].isin(bad_paths)].copy()
# after = len(train_df)

# print(f"Filtered any unreadable files: {before} -> {after}")
print(train_df["label"].value_counts())

Loaded train inventory: 5216
label
PNEUMONIA    3875
NORMAL       1341
Name: count, dtype: int64


In [5]:
# Training and validation split with a preserved class ratio
train, val = train_test_split(
  train_df,
  test_size=0.15, # 15% of training set becomes the val set
  random_state=42, # reproducible split
  stratify=train_df["label"] # keeps normal/pneumonia image proportions similar in train and val set
)

# Add new "split" column to label data for identification
train["split"] = "train"
val["split"] = "val"

# Combine training and validation paths into a list
split_data = pd.concat([train, val], ignore_index=True)

print("Train split size:", len(train))
print("Val split size:", len(val))
print("\nTrain class distribution:")
print(train["label"].value_counts(normalize=True)) # normalize -> change raw counts to frequencies
print("\nVal class distribution:")
print(val["label"].value_counts(normalize=True))

Train split size: 4433
Val split size: 783

Train class distribution:
label
PNEUMONIA    0.742838
NORMAL       0.257162
Name: proportion, dtype: float64

Val class distribution:
label
PNEUMONIA    0.743295
NORMAL       0.256705
Name: proportion, dtype: float64


In [6]:
# # Create output folders
# classes = sorted(train["label"].unique().tolist())
# print("Classes:", classes) # normal or pneumonia

# for split in [processed_train, processed_val, processed_test]:
#   for cls in classes:
#     (split / cls).mkdir(parents=True, exist_ok=True)

# print("Created processed folder structure")

In [7]:
def copy_image(src_path: str, folder: Path):
  """
  Copies an image into folder to preserve metadata, and if a file name
  already exists then prefix with a counter
  """
  src = Path(src_path)
  dst = folder / src.name

  if dst.exists():
    # create unique image name
    stem = src.stem
    suffix = src.suffix
    i = 1
    while (folder / f"{stem}_{i}{suffix}").exists():
      i += 1
    dst = folder / f"{stem}_{i}{suffix}"

  shutil.copy2(src, dst) # copy2 preserves metadata
  return str(dst)

In [9]:
# Copy processed data into processed folder
copied_rows = []

for row in split_data.itertuples(index=False):
  src_path = row.path
  label = row.label
  split = row.split

  if split == "train":
    base = processed_train
  else:
    base = processed_val

  folder = base / label
  path = copy_image(src_path, folder)

  copied_rows.append({
    "raw_path": src_path,
    "processed_path": path,
    "label": label,
    "split": split
  })

copied_df = pd.DataFrame(copied_rows)

print("Copied train + val images:", len(copied_df))
print(copied_df["split"].value_counts())
  

Copied train + val images: 5216
split
train    4433
val       783
Name: count, dtype: int64


In [None]:
# Copy everything from raw test set
test_rows = []
bad_test = 0

for class_dir in raw_test.iterdir():
  if not class_dir.is_dir():
    continue
  label = class_dir.name

  for img_path in class_dir.glob("*"):
    try:
      from PIL import Image
      with Image.open(img_path) as _:
        pass

      path = copy_image(str(img_path), processed_test / label)

      test_rows.append({
        "raw_path": str(img_path),
        "processed_path": path,
        "label": label,
        "split": "test"
      })
    except:
      bad_test += 1

test_df = pd.DataFrame(test_rows)

print("Copied test image:", len(test_df))
print("Unreadable test images skipped:", bad_test)
print(test_df["label"].value_counts())

Copied test image: 624
Unreadable test images skippsed: 0
label
PNEUMONIA    390
NORMAL       234
Name: count, dtype: int64


In [None]:
# Copy data into ca sv file
inventory = pd.concat([copied_df, test_df], ignore_index=True)

(split_data[["path", "label", "split"]]
 .to_csv(metadata / "manifest.csv", index=False))

inventory.to_csv(metadata / "processed_inventory.csv", index=False)

print("Saved:")
print(" -", metadata / "split_manifest.csv")
print(" -", metadata / "processed_inventory.csv")

Saved:
 - ..\data\metadata\split_manifest.csv
 - ..\data\metadata\processed_inventory.csv


In [16]:
# Make sure there are no overlapping images in each set
train_set = set(copied_df[copied_df["split"]=="train"]["raw_path"])
val_set   = set(copied_df[copied_df["split"]=="val"]["raw_path"])
test_set  = set(test_df["raw_path"])

print("Train ∩ Val:", len(train_set & val_set))
print("Train ∩ Test:", len(train_set & test_set))
print("Val ∩ Test:", len(val_set & test_set))

Train ∩ Val: 0
Train ∩ Test: 0
Val ∩ Test: 0
