In [8]:
import os

data_dir = r"dicom_data_bulk"  # absolute path
print("Files found:", len(os.listdir(data_dir)))


Files found: 6


In [13]:
import os
import cv2
import numpy as np
import pandas as pd
import pydicom
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# ==== CONFIG ====
CSV_PATH = "scan_inventory.csv"
DATA_DIR = "dicom_data_bulk"
IMG_SIZE = (128, 128)

# ==== Load CSV ====
df = pd.read_csv(CSV_PATH)

# Rename BodyPartExamined to organ_type for clarity
if "BodyPartExamined" not in df.columns:
    raise ValueError("CSV must contain a 'BodyPartExamined' column.")

df["organ_type"] = df["BodyPartExamined"].fillna("Unknown")

# Keep only entries that have a matching DICOM file in your directory
df["dicom_path"] = df["SeriesInstanceUID"].apply(
    lambda uid: os.path.join(DATA_DIR, str(uid))
)
df = df[df["dicom_path"].apply(os.path.exists)]

# ==== Encode organ_type ====
organ_types = sorted(df["organ_type"].unique())
organ_map = {org: idx for idx, org in enumerate(organ_types)}
df["label"] = df["organ_type"].map(organ_map)

if len(organ_map) < 2:
    raise ValueError(f"Need at least 2 classes, found only {organ_types}")

# ==== DICOM Loader ====
def load_dicom_images_from_folder(folder_path, img_size=(128, 128)):
    images = []
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith(".dcm"):
            dicom_path = os.path.join(folder_path, file_name)
            try:
                dicom_data = pydicom.dcmread(dicom_path)
                img = dicom_data.pixel_array.astype(np.float32)
                img = cv2.resize(img, img_size)
                img = img / np.max(img) if np.max(img) != 0 else img
                images.append(img)
            except Exception as e:
                print(f"Error reading {dicom_path}: {e}")
    return np.array(images)

# ==== Load all images & labels ====
all_images, all_labels = [], []
for _, row in df.iterrows():
    imgs = load_dicom_images_from_folder(row["dicom_path"], IMG_SIZE)
    all_images.extend(imgs)
    all_labels.extend([row["label"]] * len(imgs))

all_images = np.array(all_images)[..., np.newaxis]
all_labels = np.array(all_labels)

# One-hot encoding if multi-class
if len(organ_map) > 2:
    all_labels = to_categorical(all_labels, num_classes=len(organ_map))

# ==== Train/Test split ====
X_train, X_test, y_train, y_test = train_test_split(
    all_images, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# ==== Model ====
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(IMG_SIZE[0], IMG_SIZE[1], 1)),
    MaxPooling2D((2,2)),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(organ_map) if len(organ_map) > 2 else 1,
          activation='softmax' if len(organ_map) > 2 else 'sigmoid')
])

loss_fn = "categorical_crossentropy" if len(organ_map) > 2 else "binary_crossentropy"

model.compile(optimizer=Adam(1e-4), loss=loss_fn, metrics=["accuracy"])

# ==== Training ====
history = model.fit(
    tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(32).repeat(),
    validation_data=tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32),
    epochs=10,
    steps_per_epoch=max(1, len(X_train)//32),
    validation_steps=max(1, len(X_test)//32)
)

# ==== Results ====
print("Classes found:", organ_map)
print("Final Accuracy:", history.history['accuracy'][-1])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 258ms/step - accuracy: 0.7860 - loss: 0.4982 - val_accuracy: 1.0000 - val_loss: 0.1247
Epoch 2/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 199ms/step - accuracy: 0.9931 - loss: 0.0798 - val_accuracy: 1.0000 - val_loss: 0.0200
Epoch 3/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 181ms/step - accuracy: 1.0000 - loss: 0.0180 - val_accuracy: 1.0000 - val_loss: 0.0055
Epoch 4/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 180ms/step - accuracy: 1.0000 - loss: 0.0096 - val_accuracy: 1.0000 - val_loss: 0.0023
Epoch 5/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 179ms/step - accuracy: 1.0000 - loss: 0.0032 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 6/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 180ms/step - accuracy: 1.0000 - loss: 0.0024 - val_accuracy: 1.0000 - val_loss: 9.9508e-04
Epoch 7/10
[1m14/14[0m

In [16]:
import os
import pandas as pd
import pydicom
import numpy as np
from PIL import Image

# ==== Config ====
csv_file = "scan_inventory.csv"
dicom_dir = "dicom_data_bulk"
output_dir = "jpg_output"

# ==== Step 1: Load CSV ====
df = pd.read_csv(csv_file)
df.columns = [c.strip().lower() for c in df.columns]

# Find matching column for body part
meta_col = "bodypartexamined"

# Normalize labels
df[meta_col] = df[meta_col].astype(str).str.upper().str.strip()

# Only CHEST images (change to None for all)
target_organs = ["CHEST"]

if target_organs:
    df = df[df[meta_col].isin(target_organs)]

print(f"Found {len(df)} rows for {target_organs}")

# ==== Step 2: Map SeriesInstanceUIDs from CSV ====
valid_uids = set(df["seriesinstanceuid"].astype(str))

# ==== Step 3: Scan DICOM directory ====
matched_files = []
for root, _, files in os.walk(dicom_dir):
    for f in files:
        if f.lower().endswith(".dcm"):
            dcm_path = os.path.join(root, f)
            try:
                ds = pydicom.dcmread(dcm_path, stop_before_pixels=True)
                if str(ds.SeriesInstanceUID) in valid_uids:
                    matched_files.append((dcm_path, ds.SeriesInstanceUID))
            except Exception as e:
                print(f"Error reading {dcm_path}: {e}")

print(f"Matched {len(matched_files)} DICOM files from {dicom_dir}")

# ==== Step 4: Convert to JPG ====
os.makedirs(output_dir, exist_ok=True)

for dicom_path, uid in matched_files:
    try:
        ds = pydicom.dcmread(dicom_path)
        pixel_array = ds.pixel_array.astype(float)

        # Normalize to 0–255
        scaled = (np.maximum(pixel_array, 0) / pixel_array.max()) * 255.0
        scaled = np.uint8(scaled)

        # Convert to PIL Image
        img = Image.fromarray(scaled).convert("L")

        # Save under organ-specific folder
        organ_label = df.loc[df["seriesinstanceuid"] == uid, meta_col].values[0]
        organ_folder = os.path.join(output_dir, organ_label)
        os.makedirs(organ_folder, exist_ok=True)

        jpg_path = os.path.join(organ_folder, os.path.basename(dicom_path).replace(".dcm", ".jpg"))
        img.save(jpg_path)
    except Exception as e:
        print(f"Failed to convert {dicom_path}: {e}")

print(f"Conversion complete. JPGs saved to {output_dir}")


Found 123 rows for ['CHEST']
Matched 477 DICOM files from dicom_data_bulk
Conversion complete. JPGs saved to jpg_output
