In [10]:
from pathlib import Path
import os
import pandas as pd


notebook_dir = Path.cwd()


data_path = notebook_dir / "../../data/downloads/classification/leukemia_classification"
data_path = data_path.resolve()  

print("Notebook dir:", notebook_dir)
print("Data path:", data_path)


Notebook dir: c:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\notebooks\classification
Data path: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\downloads\classification\leukemia_classification


In [16]:
print("Resolved dataset path:", data_path)
print("Exists?", data_path.exists())
print("Subfolders:", [p.name for p in data_path.iterdir() if p.is_dir()])

records = []
for root, dirs, files in os.walk(data_path):
    img_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
    if img_files:
        rel_path = Path(root).relative_to(data_path)
        records.append({
            "folder": str(rel_path),
            "num_images": len(img_files)
        })

if records:
    df_counts = pd.DataFrame(records).sort_values(by="folder").reset_index(drop=True)
    print(df_counts)
    print("\nTotal images:", df_counts['num_images'].sum())
else:
    print("⚠️ No image files found — check dataset path:", data_path)


Resolved dataset path: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\downloads\classification\leukemia_classification
Exists? True
Subfolders: ['testing_data', 'training_data', 'validation_data']
                                         folder  num_images
0      testing_data\C-NMC_test_final_phase_data        2586
1                      training_data\fold_0\all        2397
2                      training_data\fold_0\hem        1130
3                      training_data\fold_1\all        2418
4                      training_data\fold_1\hem        1163
5                      training_data\fold_2\all        2457
6                      training_data\fold_2\hem        1096
7  validation_data\C-NMC_test_prelim_phase_data        1867

Total images: 15114


In [17]:
from collections import defaultdict

folder_filetypes = defaultdict(lambda: defaultdict(int))

for root, dirs, files in os.walk(data_path):
    if files:
        rel_path = Path(root).relative_to(data_path)
        for f in files:
            ext = Path(f).suffix.lower()
            folder_filetypes[str(rel_path)][ext] += 1

records = []
for folder, exts in folder_filetypes.items():
    for ext, count in exts.items():
        records.append({"folder": folder, "extension": ext if ext else "NO_EXT", "count": count})

df_filetypes = pd.DataFrame(records).sort_values(["folder", "extension"]).reset_index(drop=True)
print(df_filetypes)


                                         folder extension  count
0      testing_data\C-NMC_test_final_phase_data      .bmp   2586
1                      training_data\fold_0\all      .bmp   2397
2                      training_data\fold_0\hem      .bmp   1130
3                      training_data\fold_1\all      .bmp   2418
4                      training_data\fold_1\hem      .bmp   1163
5                      training_data\fold_2\all      .bmp   2457
6                      training_data\fold_2\hem      .bmp   1096
7                               validation_data      .csv      1
8  validation_data\C-NMC_test_prelim_phase_data      .bmp   1867


In [19]:
import shutil

test_path = data_path / "testing_data"

if test_path.exists():
    shutil.rmtree(test_path)   
    print(f"Removed: {test_path}")
else:
    print(f"No testing_data folder found at {test_path}")

Removed: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\downloads\classification\leukemia_classification\testing_data


In [20]:
print("Subfolders:", [p.name for p in data_path.iterdir() if p.is_dir()])

Subfolders: ['training_data', 'validation_data']


In [22]:
csv_path = data_path / "validation_data" / "C-NMC_test_prelim_phase_data_labels.csv"

val_labels = pd.read_csv(csv_path)

print("Shape of validation labels:", val_labels.shape)
print("\nFirst 10 rows:\n", val_labels.head(10))


Shape of validation labels: (1867, 3)

First 10 rows:
              Patient_ID new_names  labels
0   UID_57_29_1_all.bmp     1.bmp       1
1   UID_57_22_2_all.bmp     2.bmp       1
2   UID_57_31_3_all.bmp     3.bmp       1
3  UID_H49_35_1_hem.bmp     4.bmp       0
4   UID_58_6_13_all.bmp     5.bmp       1
5   UID_57_8_11_all.bmp     6.bmp       1
6  UID_H49_29_2_hem.bmp     7.bmp       0
7   UID_H30_6_2_hem.bmp     8.bmp       0
8    UID_58_2_1_all.bmp     9.bmp       1
9   UID_54_35_3_all.bmp    10.bmp       1


In [23]:
val_labels_clean = val_labels.drop(columns=["Patient_ID"]).rename(columns={"new_names": "pic_name", "labels": "label"})

print("Cleaned validation labels shape:", val_labels_clean.shape)
print(val_labels_clean.head(10))

Cleaned validation labels shape: (1867, 2)
  pic_name  label
0    1.bmp      1
1    2.bmp      1
2    3.bmp      1
3    4.bmp      0
4    5.bmp      1
5    6.bmp      1
6    7.bmp      0
7    8.bmp      0
8    9.bmp      1
9   10.bmp      1


In [24]:
raw_data_path = Path("../../data/downloads/classification/leukemia_classification").resolve()

processed_data_path = Path("../../data/processed/classification/leukemia_classification").resolve()

processed_data_path.mkdir(parents=True, exist_ok=True)

print("Processed dataset path:", processed_data_path)

Processed dataset path: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification


In [25]:
shutil.copytree(raw_data_path, processed_data_path, dirs_exist_ok=True)
print(f"Copied dataset to: {processed_data_path}")


Copied dataset to: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification


In [27]:
processed_data_path = Path("../../data/processed/classification/leukemia_classification").resolve()

records = []

train_path = processed_data_path / "training_data"
for fold in train_path.iterdir():
    if fold.is_dir():
        for cls in ["all", "hem"]:
            cls_path = fold / cls
            count = len([f for f in cls_path.iterdir() if f.suffix.lower() in [".bmp", ".png", ".jpg", ".jpeg"]])
            records.append({
                "set": f"train_{fold.name}",
                "class": cls,
                "count": count
            })

val_csv = processed_data_path / "validation_data" / "C-NMC_test_prelim_phase_data_labels.csv"
val_df = pd.read_csv(val_csv)

print("Columns in CSV:", val_df.columns.tolist())

if "Patient_ID" in val_df.columns:
    val_df = val_df.drop(columns=["Patient_ID"])

val_df = val_df.rename(columns={"new_names": "pic_name", "labels": "label"})


val_df.to_csv(val_csv, index=False)

print("✅ Cleaned validation CSV saved. Columns now:", val_df.columns.tolist())

Columns in CSV: ['Patient_ID', 'new_names', 'labels']
✅ Cleaned validation CSV saved. Columns now: ['pic_name', 'label']


In [28]:
val_counts = val_df["label"].map({1: "all", 0: "hem"}).value_counts().to_dict()
for cls, count in val_counts.items():
    records.append({
        "set": "validation",
        "class": cls,
        "count": count
    })

df_summary = pd.DataFrame(records)

df_summary = df_summary.groupby(["set", "class"])["count"].sum().reset_index()
df_summary["percentage"] = df_summary["count"] / df_summary["count"].sum() * 100

total_row = pd.DataFrame([{
    "set": "TOTAL",
    "class": "all+hem",
    "count": df_summary["count"].sum(),
    "percentage": 100.0
}])
df_summary = pd.concat([df_summary, total_row], ignore_index=True)

print(df_summary)

            set    class  count  percentage
0  train_fold_0      all   2397   19.133142
1  train_fold_0      hem   1130    9.019796
2  train_fold_1      all   2418   19.300766
3  train_fold_1      hem   1163    9.283206
4  train_fold_2      all   2457   19.612069
5  train_fold_2      hem   1096    8.748404
6    validation      all   1219    9.730204
7    validation      hem    648    5.172414
8         TOTAL  all+hem  12528  100.000000


In [29]:
records = []

# ---- Training (fold_0 + fold_1) ----
for fold in ["fold_0", "fold_1"]:
    for cls in ["all", "hem"]:
        cls_path = processed_data_path / "training_data" / fold / cls
        count = len([f for f in cls_path.iterdir() if f.suffix.lower() in [".bmp", ".png", ".jpg", ".jpeg"]])
        records.append({"set": "train", "class": cls, "count": count})

# ---- Validation (fold_2) ----
for cls in ["all", "hem"]:
    cls_path = processed_data_path / "training_data" / "fold_2" / cls
    count = len([f for f in cls_path.iterdir() if f.suffix.lower() in [".bmp", ".png", ".jpg", ".jpeg"]])
    records.append({"set": "val", "class": cls, "count": count})

# ---- Test (prelim validation with CSV) ----
val_csv = processed_data_path / "validation_data" / "C-NMC_test_prelim_phase_data_labels.csv"
val_df = pd.read_csv(val_csv)

test_counts = val_df["label"].map({1: "all", 0: "hem"}).value_counts().to_dict()
for cls, count in test_counts.items():
    records.append({"set": "test", "class": cls, "count": count})

# ---- summary ----
df_summary = pd.DataFrame(records)
df_summary = df_summary.groupby(["set", "class"])["count"].sum().reset_index()
df_summary["percentage"] = df_summary["count"] / df_summary["count"].sum() * 100


total_row = pd.DataFrame([{
    "set": "TOTAL",
    "class": "all+hem",
    "count": df_summary["count"].sum(),
    "percentage": 100.0
}])
df_summary = pd.concat([df_summary, total_row], ignore_index=True)

print(df_summary)

     set    class  count  percentage
0   test      all   1219    9.730204
1   test      hem    648    5.172414
2  train      all   4815   38.433908
3  train      hem   2293   18.303001
4    val      all   2457   19.612069
5    val      hem   1096    8.748404
6  TOTAL  all+hem  12528  100.000000


In [30]:
import os
import shutil
import random
from pathlib import Path

In [36]:
root_path = Path("../../data/processed/classification/leukemia_classification").resolve()
fold2_path = root_path / "training_data" / "fold_2"
val_path = root_path / "validation" 

In [37]:
for cls in ["all", "hem"]:
    (val_path / cls).mkdir(parents=True, exist_ok=True)

In [38]:
for cls in ["all", "hem"]:
    cls_path = fold2_path / cls
    images = list(cls_path.glob("*.bmp"))
    random.shuffle(images)

    split_idx = len(images) // 2
    val_imgs, train_imgs = images[:split_idx], images[split_idx:]

    # Move half to validation
    for img in val_imgs:
        shutil.move(str(img), str(val_path / cls / img.name))

    print(f"{cls}: {len(val_imgs)} → validation, {len(train_imgs)} remain in fold_2 for training")

all: 1228 → validation, 1229 remain in fold_2 for training
hem: 548 → validation, 548 remain in fold_2 for training


In [41]:
val_data_path = root_path / "validation_data"
test_path = root_path / "test"

In [42]:
if val_data_path.exists():
    if test_path.exists():
        shutil.rmtree(test_path)
    shutil.move(str(val_data_path), str(test_path))
    print(f"Renamed {val_data_path} → {test_path}")
else:
    print("⚠️ No validation_data folder found to rename")

Renamed C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification\validation_data → C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification\test


In [43]:
train_path = root_path / "training_data"
new_train_path = root_path / "train"

if train_path.exists():
    if new_train_path.exists():
        shutil.rmtree(new_train_path)
    shutil.move(str(train_path), str(new_train_path))
    print(f"Renamed {train_path} → {new_train_path}")

Renamed C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification\training_data → C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification\train


In [44]:
test_path = root_path / "test"
csv_path = test_path / "C-NMC_test_prelim_phase_data_labels.csv"
img_dir = test_path / "C-NMC_test_prelim_phase_data"

In [45]:
all_dir = test_path / "all"
hem_dir = test_path / "hem"
all_dir.mkdir(exist_ok=True)
hem_dir.mkdir(exist_ok=True)


In [46]:
df = pd.read_csv(csv_path)

In [47]:
if "labels" in df.columns:
    label_col = "labels"
elif "label" in df.columns:
    label_col = "label"
else:
    raise ValueError("Could not find label column in CSV")

In [49]:
for _, row in df.iterrows():
    img_name = row["pic_name"] 
    label = row[label_col]

    src = img_dir / img_name
    if label == 1:   # ALL (cancer)
        dst = all_dir / img_name
    else:            # HEM (healthy)
        dst = hem_dir / img_name

    if src.exists():
        shutil.move(str(src), str(dst))

In [50]:
empty_test_dir = Path("../../data/processed/classification/leukemia_classification/test/C-NMC_test_prelim_phase_data").resolve()

if empty_test_dir.exists():
    shutil.rmtree(empty_test_dir)
    print(f"✅ Removed empty folder: {empty_test_dir}")
else:
    print("⚠️ Folder does not exist or already removed.")

✅ Removed empty folder: C:\Users\Lenovo\Desktop\Exploration-of-ViT-and-CNN-for-Medical-Image-Processing\data\processed\classification\leukemia_classification\test\C-NMC_test_prelim_phase_data


In [1]:
from PIL import Image
import os
from pathlib import Path

In [7]:
train_dirf0 = Path("../../data/processed/classification/leukemia_classification/train/fold_0/")
train_dirf1 = Path("../../data/processed/classification/leukemia_classification/train/fold_1/")
train_dirf2 = Path("../../data/processed/classification/leukemia_classification/train/fold_2/")
val_dir = Path("../../data/processed/classification/leukemia_classification/validation")
test_dir = Path("../../data/processed/classification/leukemia_classification/test")

In [8]:
def check_image_dimensions(image_dir):
    print(f"Checking images in {image_dir}...")
    for cls in ["all", "hem"]:
        cls_dir = image_dir / cls
        images = list(cls_dir.glob("*.bmp")) + list(cls_dir.glob("*.png")) + list(cls_dir.glob("*.jpg"))
        print(f"\n{cls} images:")
        for img_path in images:
            with Image.open(img_path) as img:
                width, height = img.size
                print(f"{img_path.name}: {width}x{height}")

In [11]:
check_image_dimensions(train_dirf0)

Checking images in ..\..\data\processed\classification\leukemia_classification\train\fold_0...

all images:
UID_11_10_1_all.bmp: 450x450
UID_11_11_1_all.bmp: 450x450
UID_11_11_2_all.bmp: 450x450
UID_11_11_3_all.bmp: 450x450
UID_11_12_1_all.bmp: 450x450
UID_11_12_2_all.bmp: 450x450
UID_11_12_3_all.bmp: 450x450
UID_11_14_1_all.bmp: 450x450
UID_11_15_1_all.bmp: 450x450
UID_11_15_2_all.bmp: 450x450
UID_11_15_3_all.bmp: 450x450
UID_11_15_4_all.bmp: 450x450
UID_11_15_5_all.bmp: 450x450
UID_11_16_1_all.bmp: 450x450
UID_11_16_2_all.bmp: 450x450
UID_11_16_3_all.bmp: 450x450
UID_11_17_1_all.bmp: 450x450
UID_11_17_2_all.bmp: 450x450
UID_11_18_1_all.bmp: 450x450
UID_11_19_1_all.bmp: 450x450
UID_11_1_1_all.bmp: 450x450
UID_11_1_2_all.bmp: 450x450
UID_11_21_1_all.bmp: 450x450
UID_11_21_2_all.bmp: 450x450
UID_11_21_3_all.bmp: 450x450
UID_11_22_1_all.bmp: 450x450
UID_11_22_2_all.bmp: 450x450
UID_11_23_1_all.bmp: 450x450
UID_11_23_2_all.bmp: 450x450
UID_11_24_1_all.bmp: 450x450
UID_11_24_2_all.bmp: 450

In [12]:
check_image_dimensions(val_dir)

Checking images in ..\..\data\processed\classification\leukemia_classification\validation...

all images:
UID_13_11_1_all.bmp: 450x450
UID_13_12_1_all.bmp: 450x450
UID_13_13_2_all.bmp: 450x450
UID_13_14_4_all.bmp: 450x450
UID_13_15_1_all.bmp: 450x450
UID_13_17_1_all.bmp: 450x450
UID_13_18_1_all.bmp: 450x450
UID_13_1_1_all.bmp: 450x450
UID_13_21_1_all.bmp: 450x450
UID_13_21_2_all.bmp: 450x450
UID_13_21_3_all.bmp: 450x450
UID_13_22_1_all.bmp: 450x450
UID_13_23_3_all.bmp: 450x450
UID_13_25_1_all.bmp: 450x450
UID_13_25_2_all.bmp: 450x450
UID_13_25_3_all.bmp: 450x450
UID_13_25_4_all.bmp: 450x450
UID_13_26_1_all.bmp: 450x450
UID_13_26_2_all.bmp: 450x450
UID_13_28_2_all.bmp: 450x450
UID_13_28_3_all.bmp: 450x450
UID_13_29_1_all.bmp: 450x450
UID_13_29_2_all.bmp: 450x450
UID_13_29_4_all.bmp: 450x450
UID_13_30_1_all.bmp: 450x450
UID_13_30_2_all.bmp: 450x450
UID_13_30_4_all.bmp: 450x450
UID_13_31_2_all.bmp: 450x450
UID_13_32_2_all.bmp: 450x450
UID_13_33_1_all.bmp: 450x450
UID_13_33_2_all.bmp: 450x

In [13]:
check_image_dimensions(test_dir)

Checking images in ..\..\data\processed\classification\leukemia_classification\test...

all images:
1.bmp: 450x450
10.bmp: 450x450
100.bmp: 450x450
1000.bmp: 450x450
1001.bmp: 450x450
1003.bmp: 450x450
1004.bmp: 450x450
1005.bmp: 450x450
1006.bmp: 450x450
1008.bmp: 450x450
1009.bmp: 450x450
101.bmp: 450x450
1010.bmp: 450x450
1012.bmp: 450x450
1016.bmp: 450x450
1018.bmp: 450x450
1019.bmp: 450x450
102.bmp: 450x450
1021.bmp: 450x450
1022.bmp: 450x450
1023.bmp: 450x450
1026.bmp: 450x450
1027.bmp: 450x450
1028.bmp: 450x450
1029.bmp: 450x450
103.bmp: 450x450
1030.bmp: 450x450
1031.bmp: 450x450
1032.bmp: 450x450
1033.bmp: 450x450
1036.bmp: 450x450
1037.bmp: 450x450
1038.bmp: 450x450
1039.bmp: 450x450
1040.bmp: 450x450
1041.bmp: 450x450
1042.bmp: 450x450
1044.bmp: 450x450
1045.bmp: 450x450
1046.bmp: 450x450
1047.bmp: 450x450
1049.bmp: 450x450
1051.bmp: 450x450
1053.bmp: 450x450
1054.bmp: 450x450
1055.bmp: 450x450
1058.bmp: 450x450
1059.bmp: 450x450
1060.bmp: 450x450
1061.bmp: 450x450
1065.bmp:

In [14]:
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch

In [15]:
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

In [16]:
train_tfms = transforms.Compose([
    transforms.RandomHorizontalFlip(),       # Random horizontal flip
    transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),  # Color jitter for variety
    transforms.ToTensor(),                   # Convert to tensor
    transforms.Normalize(mean, std),         # Normalize the image
])

In [17]:

val_tfms = transforms.Compose([
    transforms.ToTensor(),                    # Convert to tensor
    transforms.Normalize(mean, std),          # Normalize the image
])

In [18]:
train_datasetf0 = datasets.ImageFolder(train_dirf0, transform=train_tfms)
train_datasetf1 = datasets.ImageFolder(train_dirf1, transform=train_tfms)
train_datasetf2 = datasets.ImageFolder(train_dirf2, transform=train_tfms)
val_dataset = datasets.ImageFolder(val_dir, transform=val_tfms)
test_dataset = datasets.ImageFolder(test_dir, transform=val_tfms) 

In [19]:
batch_size = 32
train_loaderf0 = DataLoader(train_datasetf0, batch_size=batch_size, shuffle=True)
train_loaderf1 = DataLoader(train_datasetf1, batch_size=batch_size, shuffle=True)
train_loaderf2 = DataLoader(train_datasetf2, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [20]:
data_iter = iter(train_loaderf0)
images, labels = next(data_iter)
print(f"Train batch dimensions: {images.shape}")  # Should print: torch.Size([batch_size, 3, 450, 450])

Train batch dimensions: torch.Size([32, 3, 450, 450])
