In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pydicom
import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt

# --- Configuration ---
# Inputs: annotation table and the folder holding the training DICOM files.
ANNOTATIONS_CSV = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
KAGGLE_DICOM_DIR = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/'

# Outputs: size handed to cv2.resize for every image, and where the PNGs are written.
TARGET_IMAGE_SIZE = (512, 512)
PROCESSED_OUTPUT_DIR = '/kaggle/working/processed_images'

# Make sure the output folder exists; re-running the cell is harmless.
os.makedirs(name=PROCESSED_OUTPUT_DIR, exist_ok=True)

# Human-readable name for each class_id (used in visualization).
# The position of a name in the tuple is its class_id, so order matters.
CLASS_LABELS = dict(enumerate((
    'Aortic enlargement',
    'Atelectasis',
    'Calcification',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Fibrosis',
    'Infiltration',
    'Mass',
    'Nodule',
    'Other lesion',
    'Pleural effusion',
    'Pleural thickening',
    'Pneumothorax',
    'No finding',
)))

In [None]:
def preprocess_dicom_image(dicom_path, output_folder, target_size):
    """Convert one DICOM file into a resized 8-bit PNG.

    The pixel data is min-max normalized to [0, 1], inverted when the file
    declares the MONOCHROME1 photometric interpretation (so that every saved
    PNG uses the same polarity), resized, scaled to 0-255 and written to
    ``output_folder``.

    Args:
        dicom_path: Path of the DICOM file to read.
        output_folder: Existing directory that receives the PNG.
        target_size: Size passed to ``cv2.resize`` as ``dsize``.

    Returns:
        The PNG file name (source base name with its extension replaced by
        ``.png``), or ``None`` when the DICOM could not be read or the PNG
        could not be written. Failures are reported with ``print``.
    """
    source_name = os.path.basename(dicom_path)
    try:
        dicom_data = pydicom.dcmread(dicom_path)
        img = dicom_data.pixel_array
    except Exception as e:
        print(f"Error reading DICOM file {source_name}: {e}")
        return None

    # Normalization (scale to 0-1); a constant image becomes all zeros.
    img = img.astype(np.float32)
    lowest, highest = img.min(), img.max()
    if highest > lowest:
        img = (img - lowest) / (highest - lowest)
        # MONOCHROME1 stores the minimum sample value as white. Flip it so the
        # output matches MONOCHROME2 files instead of looking like a negative.
        if getattr(dicom_data, 'PhotometricInterpretation', None) == 'MONOCHROME1':
            img = 1.0 - img
    else:
        img = np.zeros_like(img, dtype=np.float32)

    # Resize image
    img_resized = cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR)

    # Scale back to 0-255 and convert to 8-bit for saving
    img_final = (img_resized * 255).astype(np.uint8)

    # Swap only the real extension, whatever its case (callers select files
    # case-insensitively), rather than replacing '.dicom' anywhere in the name.
    base_filename = os.path.splitext(source_name)[0] + '.png'
    output_path = os.path.join(output_folder, base_filename)

    # cv2.imwrite signals failure by returning False, not by raising.
    if not cv2.imwrite(output_path, img_final):
        print(f"Error writing PNG file {output_path}")
        return None

    return base_filename

def parse_annotations(annotations_csv_path, processed_image_folder):
    """Read the annotation CSV and keep only rows whose processed PNG exists.

    Two columns are added to the table: ``image_filename`` (``image_id`` plus
    ``.png``) and ``image_path`` (that file name joined onto
    ``processed_image_folder``). Rows whose ``image_path`` is not present on
    disk are dropped and the index is renumbered from zero.
    """
    annotations = pd.read_csv(annotations_csv_path)

    # The PNG name is derived directly from the annotation's image_id.
    png_names = annotations['image_id'] + '.png'
    annotations['image_filename'] = png_names
    annotations['image_path'] = [
        os.path.join(processed_image_folder, png_name) for png_name in png_names
    ]

    # Boolean mask: True where the processed image is actually on disk.
    png_on_disk = annotations['image_path'].map(os.path.exists)
    available = annotations[png_on_disk]

    return available.reset_index(drop=True)

In [None]:
import random  # standard library; used below to draw the reproducible file sample

# --- Execute Preprocessing and Create df_model_input (Step 3 Revised) ---

SAMPLE_SIZE = 1000
# Fixed seed so that re-running the notebook converts the same files; otherwise
# each run adds a different random subset of PNGs to PROCESSED_OUTPUT_DIR.
RANDOM_SEED = 42

# Get a list of all DICOM files. os.listdir returns entries in arbitrary order,
# so sort them first: a seeded sample is only repeatable over a stable list.
dicom_files_all = sorted(
    os.path.join(KAGGLE_DICOM_DIR, f)
    for f in os.listdir(KAGGLE_DICOM_DIR)
    if f.lower().endswith(('.dicom', '.dcm'))
)

total_files = len(dicom_files_all)

print(f"Found {total_files} total DICOM files.")

# 1. Randomly sample SAMPLE_SIZE files (or take everything if there are fewer).
#    A private Random instance keeps the global random state untouched.
if total_files > SAMPLE_SIZE:
    dicom_files_sampled = random.Random(RANDOM_SEED).sample(dicom_files_all, SAMPLE_SIZE)
else:
    dicom_files_sampled = dicom_files_all

print(f"Selecting {len(dicom_files_sampled)} files for processing.")
print(f"Starting conversion and resizing to {TARGET_IMAGE_SIZE}...")

processed_count = 0
# 2. Process the sampled list
for i, dicom_path in enumerate(dicom_files_sampled):
    # preprocess_dicom_image returns the PNG name on success and None on failure.
    result = preprocess_dicom_image(dicom_path, PROCESSED_OUTPUT_DIR, TARGET_IMAGE_SIZE)
    if result:
        processed_count += 1

    if (i + 1) % 100 == 0:  # Report progress every 100 files
        print(f"Processed {i + 1} files...")

print(f"Finished processing! Successfully converted {processed_count} images to PNG.")

# Create the df_model_input DataFrame
# parse_annotations keeps only annotation rows whose PNG exists in PROCESSED_OUTPUT_DIR.
df_model_input = parse_annotations(ANNOTATIONS_CSV, PROCESSED_OUTPUT_DIR)
print(f"df_model_input created with {df_model_input['image_id'].nunique()} unique images.")

In [None]:
# --- Step 4: Annotation Grouping (Full Consolidated Code) ---

# 1. Keep only rows that describe an actual finding: class ID 14 is "No finding".
#    .copy() gives df_boxes its own data, independent of df_model_input.
df_boxes = df_model_input.loc[df_model_input['class_id'].ne(14)].copy()

# 2. Define the grouping function
def get_all_boxes(group):
    """Collect the bounding boxes of one image into a list of lists.

    Args:
        group: DataFrame with the annotation rows of a single image. It must
            have the columns ``class_id``, ``x_min``, ``y_min``, ``x_max``
            and ``y_max``.

    Returns:
        A list of ``[class_id, x_min, y_min, x_max, y_max]`` entries in row
        order, with ``class_id`` as ``int`` and the coordinates as ``float``.
        Rows missing the class or any of the four coordinates are skipped, so
        no box ever contains NaN.
    """
    box_columns = ['class_id', 'x_min', 'y_min', 'x_max', 'y_max']

    # A box is only usable when all five fields are present; checking just
    # x_min/y_min would let rows with a missing x_max/y_max through.
    complete_rows = group.dropna(subset=box_columns)

    # Walk the five columns in parallel instead of using the slower iterrows().
    return [
        [int(class_id), float(xmin), float(ymin), float(xmax), float(ymax)]
        for class_id, xmin, ymin, xmax, ymax in zip(
            *(complete_rows[column] for column in box_columns)
        )
    ]

# 3. CREATE df_grouped: one row per image_id holding that image's list of boxes.
#    The groups are iterated explicitly rather than via groupby().apply(): the
#    return type of apply() depends on the data, which makes the follow-up
#    reset_index(name=...) fragile when df_boxes is empty, and newer pandas
#    deprecates apply() operating on the grouping column.
boxes_by_image = {
    image_id: get_all_boxes(image_rows)
    for image_id, image_rows in df_boxes.groupby('image_id')
}
df_grouped = pd.DataFrame(list(boxes_by_image.items()), columns=['image_id', 'boxes'])

# 4. Create df_final_model: one row per processed image, in first-seen order.
df_final_model = (
    df_model_input[['image_id', 'image_path']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# 5. Attach the boxes; images with no findings get their own empty list.
df_final_model['boxes'] = [
    boxes_by_image.get(image_id, []) for image_id in df_final_model['image_id']
]

# Each image must appear exactly once in the model table.
assert df_final_model['image_id'].is_unique, "duplicate image_id rows in df_final_model"

print("\n--- df_final_model created successfully ---")
print(f"Total unique images ready for model training: {len(df_final_model)}")

In [None]:
 def draw_boxes_on_image(image_path, boxes, target_size=TARGET_IMAGE_SIZE[0]):
    """Loads the processed image, scales the boxes, and draws them."""
    
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_with_boxes = img.copy()
    
    # Original image size approximation for VinBigData (needed for rescaling boxes)
    ORIGINAL_SIZE = 1024 
    scale_factor = target_size / ORIGINAL_SIZE
    
    for box_data in boxes:
        class_id = int(box_data[0])
        xmin, ymin, xmax, ymax = box_data[1:]
        
        # Rescale coordinates to the 512x512 image space
        x_min_scaled = int(xmin * scale_factor)
        y_min_scaled = int(ymin * scale_factor)
        x_max_scaled = int(xmax * scale_factor)
        y_max_scaled = int(ymax * scale_factor)
        
        color = (100 + (class_id * 10) % 155, 100 + (class_id * 30) % 155, 100 + (class_id * 50) % 155)
        
        # Draw the rectangle
        cv2.rectangle(img_with_boxes, 
                      (x_min_scaled, y_min_scaled), 
                      (x_max_scaled, y_max_scaled), 
                      color, 2)
        
        # Put the class label text
        label = CLASS_LABELS.get(class_id, f'Class {class_id}')
        cv2.putText(img_with_boxes, label, 
                    (x_min_scaled, y_min_scaled - 10), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        
    return img_with_boxes

# Find the first image that has actual findings (non-empty 'boxes' list)
# This uses the df_final_model created in the previous step.
sample_row = df_final_model[df_final_model['boxes'].apply(len) > 0].iloc[0]

sample_path = sample_row['image_path']
sample_boxes = sample_row['boxes']
sample_id = sample_row['image_id']

print(f"Displaying image ID: {sample_id} with {len(sample_boxes)} findings.")

# Draw and display the image
output_image = draw_boxes_on_image(sample_path, sample_boxes)

plt.figure(figsize=(8, 8))
plt.imshow(output_image)
plt.title(f'Processed X-ray ({TARGET_IMAGE_SIZE[0]}x{TARGET_IMAGE_SIZE[1]}) with Bounding Boxes')
plt.axis('off')
plt.show()