In [1]:
import os
print(os.listdir())


['.ipynb_checkpoints', 'dicom_batch_1', 'dicom_batch_1.zip', 'dicom_batch_3', 'dicom_batch_3.zip', 'notebooka8cac6eed8 (1).ipynb', 'notebooka8cac6eed8 (2).ipynb', 'notebooka8cac6eed8 (4).ipynb', 'Preprocessing.ipynb', 'processed_images']


In [2]:
input_folders = ["dicom_batch_1", "dicom_batch_3"]
output_folder = "processed_images"

os.makedirs(output_folder, exist_ok=True)


In [3]:
import pydicom
import numpy as np
from PIL import Image

def dicom_to_rgb(dicom_path):
    dcm = pydicom.dcmread(dicom_path)
    arr = dcm.pixel_array.astype(float)

    # normalize 0–255
    arr = (arr - arr.min()) / (arr.max() - arr.min())
    arr = (arr * 255).astype(np.uint8)

    # grayscale → RGB
    img = Image.fromarray(arr).convert("RGB")
    return img


In [4]:
count = 0
for folder in input_folders:
    for idx, file in enumerate(os.listdir(folder)):
        if file.endswith(".dicom") or file.endswith(".dcm"):
            img = dicom_to_rgb(os.path.join(folder, file))
            img.save(os.path.join(output_folder, file.replace(".dicom", ".png").replace(".dcm", ".png")))
            
            count += 1
            
            if count % 50 == 0:
                print(f"{count} images processed...")

print("Total RGB images saved:", count)


50 images processed...
100 images processed...
150 images processed...
200 images processed...
250 images processed...
300 images processed...
350 images processed...
400 images processed...
450 images processed...
500 images processed...
550 images processed...
600 images processed...
650 images processed...
700 images processed...
750 images processed...
800 images processed...
850 images processed...
900 images processed...
950 images processed...
1000 images processed...
1050 images processed...
1100 images processed...
1150 images processed...
1200 images processed...
1250 images processed...
1300 images processed...
1350 images processed...
1400 images processed...
1450 images processed...
1500 images processed...
Total RGB images saved: 1500


In [5]:
from PIL import Image
import os

input_folder = "processed_images"
resized_folder = "resized_images"
os.makedirs(resized_folder, exist_ok=True)

target_size = (224, 224)   # You can change this

count = 0
for file in os.listdir(input_folder):
    if file.endswith(".png"):
        img = Image.open(os.path.join(input_folder, file))
        img = img.resize(target_size)
        img.save(os.path.join(resized_folder, file))
        count += 1

print("Total resized images:", count)

Total resized images: 1500


In [6]:
import numpy as np
from PIL import Image
import os

input_folder = "resized_images"
normalized_folder = "normalized_images"
os.makedirs(normalized_folder, exist_ok=True)

count = 0
for file in os.listdir(input_folder):
    if file.endswith(".png"):
        img = Image.open(os.path.join(input_folder, file)).convert("RGB")
        arr = np.array(img).astype("float32") / 255.0  # normalize
        
        # convert back to image for saving preview (optional)
        arr_uint8 = (arr * 255).astype("uint8")
        out = Image.fromarray(arr_uint8)

        out.save(os.path.join(normalized_folder, file))
        count += 1

print("Total normalized images:", count)


Total normalized images: 1500


In [7]:
from PIL import Image
import os

input_folder = "normalized_images"
downscaled_folder = "downscaled_images"
os.makedirs(downscaled_folder, exist_ok=True)

scale_factor = 0.5   # make images 50% smaller

count = 0
for file in os.listdir(input_folder):
    if file.endswith(".png"):
        img = Image.open(os.path.join(input_folder, file))
        w, h = img.size
        img = img.resize((int(w*scale_factor), int(h*scale_factor)))
        img.save(os.path.join(downscaled_folder, file))
        count += 1

print("Downscaled image count:", count)


Downscaled image count: 1500


In [8]:
import torch
from torchvision import transforms
from PIL import Image
import os

input_folder = "downscaled_images"
tensor_folder = "tensor_images"
os.makedirs(tensor_folder, exist_ok=True)

transform = transforms.ToTensor()  # converts to (C,H,W) tensor in 0–1 range

count = 0
for file in os.listdir(input_folder):
    if file.endswith(".png"):
        img = Image.open(os.path.join(input_folder, file)).convert("RGB")
        tensor = transform(img)

        torch.save(tensor, os.path.join(tensor_folder, file.replace(".png", ".pt")))
        count += 1

print("Total tensors saved:", count)

Total tensors saved: 1500


In [9]:
import os
print(os.listdir())


['.ipynb_checkpoints', 'dicom_batch_1', 'dicom_batch_1.zip', 'dicom_batch_3', 'dicom_batch_3.zip', 'downscaled_images', 'normalized_images', 'notebooka8cac6eed8 (1).ipynb', 'notebooka8cac6eed8 (2).ipynb', 'notebooka8cac6eed8 (4).ipynb', 'Preprocessing.ipynb', 'processed_images', 'resized_images', 'tensor_images']


In [10]:
# Cell 0 — imports + helpers (run once)
import os
from pathlib import Path
import numpy as np
import pydicom
from PIL import Image
import cv2
import torch
from torchvision import transforms
import shutil
from sklearn.model_selection import train_test_split

# folders (adjust if needed)
rgb_folder = Path("processed_images")        # DICOM->RGB output (you said this is done)
grayscale_folder = Path("grayscale_images")
resized_folder = Path("resized_images")
norm_folder = Path("normalized_images")
clahe_folder = Path("clahe_images")
denoise_folder = Path("denoised_images")
cropped_folder = Path("cropped_images")
padded_folder = Path("padded_images")
tensor_folder = Path("tensor_images")
os.makedirs(grayscale_folder, exist_ok=True)
os.makedirs(resized_folder, exist_ok=True)
os.makedirs(norm_folder, exist_ok=True)
os.makedirs(clahe_folder, exist_ok=True)
os.makedirs(denoise_folder, exist_ok=True)
os.makedirs(cropped_folder, exist_ok=True)
os.makedirs(padded_folder, exist_ok=True)
os.makedirs(tensor_folder, exist_ok=True)

def list_pngs(folder):
    return sorted([f for f in os.listdir(folder) if f.lower().endswith(".png")])


In [11]:
# Cell 1 — RGB -> Grayscale
count = 0
for fn in list_pngs(rgb_folder):
    src = rgb_folder / fn
    dst = grayscale_folder / fn
    if dst.exists(): 
        continue
    img = Image.open(src).convert("L")   # L = single channel
    img.save(dst)
    count += 1
    if count % 200 == 0:
        print(f"{count} grayscale images created")
print("Grayscale done:", count)


200 grayscale images created
400 grayscale images created
600 grayscale images created
800 grayscale images created
1000 grayscale images created
1200 grayscale images created
1400 grayscale images created
Grayscale done: 1500


In [15]:
import os

print("Current folder:", os.getcwd())
print("Items in folder:", os.listdir())


Current folder: C:\Users\Rishika Sharma\AI-CliniScan\AI-CliniScan\RishikaSharma
Items in folder: ['.ipynb_checkpoints', 'clahe_images', 'cropped_images', 'denoised_images', 'dicom_batch_1', 'dicom_batch_1.zip', 'dicom_batch_3', 'dicom_batch_3.zip', 'downscaled_images', 'grayscale_images', 'normalized_images', 'notebooka8cac6eed8 (1).ipynb', 'notebooka8cac6eed8 (2).ipynb', 'notebooka8cac6eed8 (4).ipynb', 'padded_images', 'Preprocessing.ipynb', 'processed_images', 'resized_images', 'tensor_images']


In [16]:
len(os.listdir("grayscale_images"))


1501

In [19]:
grayscale_folder = Path("grayscale_images")


In [20]:
import os
print(os.listdir())


['.ipynb_checkpoints', 'clahe_images', 'cropped_images', 'denoised_images', 'dicom_batch_1', 'dicom_batch_1.zip', 'dicom_batch_3', 'dicom_batch_3.zip', 'downscaled_images', 'grayscale_images', 'normalized_images', 'notebooka8cac6eed8 (1).ipynb', 'notebooka8cac6eed8 (2).ipynb', 'notebooka8cac6eed8 (4).ipynb', 'padded_images', 'Preprocessing.ipynb', 'processed_images', 'resized_images', 'tensor_images']


In [21]:
import os

print(os.listdir("grayscale_images"))


['.ipynb_checkpoints', '000434271f63a053c4128a0ba6352c7f.png', '00053190460d56c53cc3e57321387478.png', '0005e8e3701dfb1dd93d53e2ff537b6e.png', '0006e0a85696f6bb578e84fafa9a5607.png', '0007d316f756b3fa0baea2ff514ce945.png', '000ae00eb3942d27e0b97903dd563a6e.png', '000d68e42b71d3eac10ccc077aba07c1.png', '00150343289f317a0ad5629d5b7d9ef9.png', '00176f7e1b1cb835123f95960b9a9efd.png', '001d127bad87592efe45a5c7678f8b8d.png', '0021df30f3fddef551eb3df4354b1d06.png', '00291f7aff0123ea76a59998effef229.png', '0032c6091dc8f1b1245fc2f5f45458fa.png', '003cfe5ce5c0ec5163138eb3b740e328.png', '0046f681f078851293c4e710c4466058.png', '004d2bc2111d639f5e8441ced52d55cb.png', '004dc2a50591fb5f1aaf012bffa95fd9.png', '00575e3846ebd05a909d97ba59c53d30.png', '0059d21bef1793fa9522e4ec8cae1a1a.png', '005be26a68485912e007a3703f43d60a.png', '005d70155f949c7785671800f2c8e1ca.png', '0061cf6d35e253b6e7f03940592cc35e.png', '006501b11e04aec2d403177b9ae0f34c.png', '00675cd546313f912cadd4ad54415d69.png', '006e2726c6aa72f0

In [22]:
import os
from pathlib import Path

folder = Path("grayscale_images")

exts = set([f.suffix.lower() for f in folder.iterdir() if f.is_file()])
print("Extensions found:", exts)

print("Total files:", len(list(folder.iterdir())))


Extensions found: {'.png'}
Total files: 1501


In [24]:
import os

def list_images(folder):
    """Return all .png files in a folder"""
    return [f for f in os.listdir(folder) if f.endswith(".png")]


In [25]:
# Cell 2 — Resize

from PIL import Image
import os
from pathlib import Path

grayscale_folder = Path("grayscale_images")
resized_folder = Path("resized_images")
resized_folder.mkdir(exist_ok=True)

target_size = (512, 512)
count = 0

for fn in list_images(grayscale_folder):
    src = grayscale_folder / fn
    dst = resized_folder / fn

    img = Image.open(src)
    img = img.resize(target_size, Image.BILINEAR)
    img.save(dst)

    count += 1
    if count % 200 == 0:
        print(count, "resized")

print("Resizing done:", count)


200 resized
400 resized
600 resized
800 resized
1000 resized
1200 resized
1400 resized
Resizing done: 1500


In [27]:
# Cell 4 – Normalization (fixed)

from PIL import Image
import numpy as np
import os

def minmax_normalize(img):
    arr = np.array(img).astype('float32')
    arr = (arr - arr.min()) / (arr.max() - arr.min() + 1e-8)
    arr = (arr * 255).astype('uint8')
    return Image.fromarray(arr)

normalized_folder = "normalized_images"
os.makedirs(normalized_folder, exist_ok=True)

count = 0
for fn in list_images(resized_folder):
    src = os.path.join(resized_folder, fn)
    dst = os.path.join(normalized_folder, fn)

    img = Image.open(src)
    img = minmax_normalize(img)
    img.save(dst)

    count += 1
    if count % 200 == 0:
        print(f"{count} normalized")

print("Normalization done:", count)

200 normalized
400 normalized
600 normalized
800 normalized
1000 normalized
1200 normalized
1400 normalized
Normalization done: 1500


In [28]:
import cv2
import numpy as np
from pathlib import Path
from PIL import Image

normalized_folder = Path("normalized_images")
clahe_folder = Path("clahe_images")
clahe_folder.mkdir(exist_ok=True)

def list_images(folder):
    return [f for f in folder.iterdir() if f.suffix.lower() in [".png", ".jpg", ".jpeg"]]

# Create CLAHE object
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

count = 0
for fn in list_images(normalized_folder):
    img = cv2.imread(str(fn), cv2.IMREAD_GRAYSCALE)

    # Apply CLAHE
    enhanced = clahe.apply(img)

    # Save
    out_path = clahe_folder / fn.name
    cv2.imwrite(str(out_path), enhanced)

    count += 1
    if count % 200 == 0:
        print(f"{count} CLAHE images processed")

print("CLAHE enhancement done:", count)


200 CLAHE images processed
400 CLAHE images processed
600 CLAHE images processed
800 CLAHE images processed
1000 CLAHE images processed
1200 CLAHE images processed
1400 CLAHE images processed
CLAHE enhancement done: 1500


In [30]:
from PIL import Image, ImageFilter
import os

clahe_folder = "clahe_images"
denoised_folder = "denoised_images"

os.makedirs(denoised_folder, exist_ok=True)

count = 0

for fn in os.listdir(clahe_folder):
    src = os.path.join(clahe_folder, fn)
    dst = os.path.join(denoised_folder, fn)

    if not src.lower().endswith(".png"):
        continue

    img = Image.open(src)

    # Very light Gaussian blur
    img = img.filter(ImageFilter.GaussianBlur(radius=1))

    img.save(dst)
    count += 1

    if count % 200 == 0:
        print(count, "denoised")

print("Denoising done:", count)

200 denoised
400 denoised
600 denoised
800 denoised
1000 denoised
1200 denoised
1400 denoised
Denoising done: 1500


In [31]:
# Cell: Crop borders

from PIL import Image, ImageOps
import os

denoised_folder = "denoised_images"   # input folder
cropped_folder = "cropped_images"     # output folder

os.makedirs(cropped_folder, exist_ok=True)

def crop_borders(img):
    """Automatically crop black borders."""
    # Convert to grayscale to detect borders
    gray = img.convert("L")
    
    # Get bounding box of non-black areas
    bbox = ImageOps.invert(gray).getbbox()
    
    if bbox:
        return img.crop(bbox)
    else:
        return img  # return original if no border found


count = 0
for fn in os.listdir(denoised_folder):
    if fn.lower().endswith(".png"):
        src = os.path.join(denoised_folder, fn)
        dst = os.path.join(cropped_folder, fn)

        img = Image.open(src)
        img = crop_borders(img)
        img.save(dst)

        count += 1
        
        if count % 200 == 0:
            print(f"{count} cropped")

print("Cropping done:", count)


200 cropped
400 cropped
600 cropped
800 cropped
1000 cropped
1200 cropped
1400 cropped
Cropping done: 1500


In [32]:
from PIL import Image

def pad_to_square(img):
    w, h = img.size
    max_dim = max(w, h)

    # create a new square image (black background)
    new_img = Image.new("RGB", (max_dim, max_dim))

    # paste the original image in the center
    new_img.paste(img, ((max_dim - w) // 2, (max_dim - h) // 2))

    return new_img


count = 0
src_folder = "cropped_images"      # CHANGE if your cropped folder name is different

for fn in os.listdir(src_folder):
    if fn.lower().endswith(".png"):
        src = os.path.join(src_folder, fn)
        dst = os.path.join(padded_folder, fn)

        img = Image.open(src)
        padded = pad_to_square(img)
        padded.save(dst)

        count += 1
        if count % 200 == 0:
            print(f"{count} padded")

print("Padding done:", count)


200 padded
400 padded
600 padded
800 padded
1000 padded
1200 padded
1400 padded
Padding done: 1500


In [34]:
import pandas as pd
import os

df = pd.read_csv("train.csv")

image_folder = "padded_images"
label_folder = "yolo_labels"

os.makedirs(label_folder, exist_ok=True)

print("CSV Loaded. Total rows:", len(df))


CSV Loaded. Total rows: 67914


In [39]:
import pandas as pd

df = pd.read_csv("train.csv")
print("Total rows:", len(df))


Total rows: 67914


In [36]:
import os

padded_folder = "padded_images"

# extract image ids (remove .png)
my_images = [f.replace(".png","") for f in os.listdir(padded_folder)]
print("My images:", len(my_images))


My images: 1500


In [37]:
filtered_df = df[df["image_id"].isin(my_images)]
print("Filtered rows:", len(filtered_df))


Filtered rows: 6756


In [38]:
missing = set(my_images) - set(filtered_df["image_id"].unique())
print("Images with NO annotations:", len(missing))


Images with NO annotations: 0


In [40]:
import pandas as pd

df = pd.read_csv("train.csv")
print("Total rows in original CSV:", len(df))


Total rows in original CSV: 67914


In [41]:
import os

padded_folder = "padded_images"

my_images = [f.replace(".png", "") for f in os.listdir(padded_folder) if f.endswith(".png")]
print("Total padded images:", len(my_images))


Total padded images: 1500


In [42]:
filtered_df = df[df["image_id"].isin(my_images)]
print("Filtered rows:", len(filtered_df))


Filtered rows: 6756


In [43]:
filtered_df.to_csv("train_filtered.csv", index=False)
print("Saved as train_filtered.csv")


Saved as train_filtered.csv


In [45]:
import os
from PIL import Image

label_folder = "yolo_labels"
os.makedirs(label_folder, exist_ok=True)

def convert_bbox(w, h, x_min, y_min, x_max, y_max):
    bw = x_max - x_min
    bh = y_max - y_min
    x_c = x_min + bw/2
    y_c = y_min + bh/2
    return x_c/w, y_c/h, bw/w, bh/h

count = 0

for img_name in os.listdir(padded_folder):
    if not img_name.endswith(".png"):
        continue

    image_id = img_name.replace(".png", "")
    img_path = os.path.join(padded_folder, img_name)
    label_path = os.path.join(label_folder, image_id + ".txt")

    img = Image.open(img_path)
    w, h = img.size

    rows = filtered_df[filtered_df["image_id"] == image_id]

    with open(label_path, "w") as f:
        for _, row in rows.iterrows():
            cls = int(row["class_id"])
            x_min = row["x_min"]
            y_min = row["y_min"]
            x_max = row["x_max"]
            y_max = row["y_max"]

            x_c, y_c, bw, bh = convert_bbox(w, h, x_min, y_min, x_max, y_max)
            f.write(f"{cls} {x_c:.6f} {y_c:.6f} {bw:.6f} {bh:.6f}\n")

    count += 1
    if count % 200 == 0:
        print(count, "YOLO files created")

print("YOLO conversion completed.")


200 YOLO files created
400 YOLO files created
600 YOLO files created
800 YOLO files created
1000 YOLO files created
1200 YOLO files created
1400 YOLO files created
YOLO conversion completed.


In [46]:
import os
from PIL import Image, ImageEnhance
import numpy as np
import random


In [48]:
def horizontal_flip(img):
    return img.transpose(Image.FLIP_LEFT_RIGHT)

def small_rotate(img, angle=5):
    return img.rotate(random.uniform(-angle, angle), expand=True)

def adjust_contrast(img, factor=1.2):
    enhancer = ImageEnhance.Contrast(img)
    return enhancer.enhance(factor)

def mild_zoom(img, zoom=1.05):
    w, h = img.size
    new_w = int(w * zoom)
    new_h = int(h * zoom)

    img_zoom = img.resize((new_w, new_h))
    left = (new_w - w) // 2
    top = (new_h - h) // 2
    return img_zoom.crop((left, top, left + w, top + h))

# Simple elastic-like warp (very mild)
def mild_elastic(img, alpha=2, sigma=5):
    arr = np.array(img)
    dx = (np.random.rand(*arr.shape[:2]) * 2 - 1) * alpha
    dy = (np.random.rand(*arr.shape[:2]) * 2 - 1) * alpha

    x, y = np.meshgrid(np.arange(arr.shape[1]), np.arange(arr.shape[0]))
    x2 = np.clip(x + dx, 0, arr.shape[1] - 1).astype(np.int32)
    y2 = np.clip(y + dy, 0, arr.shape[0] - 1).astype(np.int32)

    return Image.fromarray(arr[y2, x2])


In [49]:
input_folder = "padded_images"
aug_folder = "augmented_images"
os.makedirs(aug_folder, exist_ok=True)

count = 0

for fn in os.listdir(input_folder):
    if not fn.lower().endswith(".png"):
        continue

    img = Image.open(os.path.join(input_folder, fn))

    # Augmentations
    aug1 = horizontal_flip(img)
    aug2 = small_rotate(img)
    aug3 = adjust_contrast(img)
    aug4 = mild_zoom(img)
    aug5 = mild_elastic(img)

    # Save
    base = fn.replace(".png", "")
    aug1.save(f"{aug_folder}/{base}_flip.png")
    aug2.save(f"{aug_folder}/{base}_rot.png")
    aug3.save(f"{aug_folder}/{base}_contrast.png")
    aug4.save(f"{aug_folder}/{base}_zoom.png")
    aug5.save(f"{aug_folder}/{base}_elastic.png")

    count += 5

print("Augmentation complete. Total new images:", count)


Augmentation complete. Total new images: 7500


In [50]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Your image folder after all preprocessing:
image_folder = "padded_images"     # or augmented_images if you want

# Output split folders
train_dir = "dataset/train"
val_dir   = "dataset/val"
test_dir  = "dataset/test"

for d in [train_dir, val_dir, test_dir]:
    os.makedirs(d, exist_ok=True)

print("Split folders created!")


Split folders created!


In [51]:
# List all PNG files
image_files = [f for f in os.listdir(image_folder) if f.endswith(".png")]

print("Total images found:", len(image_files))


Total images found: 1500


In [52]:
train_files, temp = train_test_split(image_files, test_size=0.30, random_state=42)
val_files, test_files = train_test_split(temp, test_size=0.33, random_state=42)

print(len(train_files), "train images")
print(len(val_files), "val images")
print(len(test_files), "test images")


1050 train images
301 val images
149 test images


In [53]:
def move_files(files, src_folder, dst_folder):
    for f in files:
        shutil.copy(os.path.join(src_folder, f), 
                    os.path.join(dst_folder, f))

move_files(train_files, image_folder, train_dir)
move_files(val_files, image_folder, val_dir)
move_files(test_files, image_folder, test_dir)

print("Dataset split complete!")


Dataset split complete!
