In [None]:
import pandas as pd
import numpy as np
import os

# --- I. CONFIGURATION ---
# This is the directory where the Kaggle data was unzipped.
# Make sure this path reflects where you told the API to download the data.
BASE_DATA_DIR = './vinbigdata_data/'

# The number of unique images you want to sample
N_SAMPLE = 1000
# --- End Configuration ---


# --- II. ROBUSTLY FIND AND LOAD THE train.csv FILE ---
target_file = 'train.csv'
file_path = None
image_dir_path = None # Will store the path to the 'train/' image folder

print(f"Starting robust search for '{target_file}' inside '{BASE_DATA_DIR}'...")

# os.walk recursively searches the directory structure
for root, dirs, files in os.walk(BASE_DATA_DIR):
    # 1. Find the CSV file
    if file_path is None and target_file in files:
        file_path = os.path.join(root, target_file)
        print(f"✅ Found CSV file at: {file_path}")

    # 2. Find the image folder (necessary for the next preprocessing steps)
    if 'train' in dirs:
        image_dir_path = os.path.join(root, 'train')
        print(f"✅ Found image directory at: {image_dir_path}")

    # Stop searching if both paths are found
    if file_path and image_dir_path:
        break


if file_path is None:
    print("-" * 50)
    print(f"FATAL ERROR: '{target_file}' not found.")
    print("Please ensure the Kaggle download and unzip steps were completed in Colab.")
    print("-" * 50)
    raise FileNotFoundError("Could not locate the train.csv file required for labels.")


# --- III. LOAD AND SAMPLE THE DATA ---
try:
    # Load the full training dataframe using the dynamically found path
    train_df_full = pd.read_csv(file_path)
except Exception as e:
    print(f"Error reading CSV file: {e}")
    raise

# 1. Get unique Image IDs
unique_image_ids_full = train_df_full['image_id'].unique()

# 2. Sample 1000 unique Image IDs
if len(unique_image_ids_full) >= N_SAMPLE:
    sampled_ids = np.random.choice(unique_image_ids_full, size=N_SAMPLE, replace=False)
else:
    sampled_ids = unique_image_ids_full

# 3. Filter the original dataframe to include only the sampled images and their annotations
train_df = train_df_full[train_df_full['image_id'].isin(sampled_ids)].copy()

print("-" * 50)
print(f"Original unique image count: {len(unique_image_ids_full)}")
print(f"Successfully sampled and filtered dataframe for {len(train_df['image_id'].unique())} unique images.")
print(f"Total annotation rows in subset: {len(train_df)}")

# --- FINAL STEP (Verification of the result) ---
# The filtered dataframe is named 'train_df' and the image directory path is 'image_dir_path'
# You would use these two variables in the subsequent preprocessing step.
print(f"Variable 'train_df' is ready (sampled labels).")
print(f"Variable 'IMAGE_DIR' for preprocessing should be set to: {image_dir_path}")

Starting robust search for 'train.csv' inside './vinbigdata_data/'...
--------------------------------------------------
FATAL ERROR: 'train.csv' not found.
Please ensure the Kaggle download and unzip steps were completed in Colab.
--------------------------------------------------


FileNotFoundError: Could not locate the train.csv file required for labels.

In [None]:
!pip install kaggle



In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"drashtipatel23","key":"ee37e5b7d1455576e72a78ebad9c63d7"}'}

In [None]:
# Create the .kaggle directory if it doesn't exist
!mkdir -p ~/.kaggle

# Copy the uploaded kaggle.json file into the .kaggle directory
!cp kaggle.json ~/.kaggle/

# Set file permissions to be read/write only by the user for security
!chmod 600 ~/.kaggle/kaggle.json

print("Kaggle API setup is complete. You can now download datasets.")


Kaggle API setup is complete. You can now download datasets.


In [None]:
# This downloads the ZIP file containing the data
!kaggle competitions download -c vinbigdata-chest-xray-abnormalities-detection

# You must then unzip the file(s) into a known directory for access:
!unzip -q vinbigdata-chest-xray-abnormalities-detection.zip -d vinbigdata_data

403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/competitions/data/download-all/vinbigdata-chest-xray-abnormalities-detection
unzip:  cannot find or open vinbigdata-chest-xray-abnormalities-detection.zip, vinbigdata-chest-xray-abnormalities-detection.zip.zip or vinbigdata-chest-xray-abnormalities-detection.zip.ZIP.


In [None]:
# 1. DELETE THE OLD KEY
!rm ~/.kaggle/kaggle.json

# 2. Upload the NEW 'kaggle.json' file
from google.colab import files
print("Please upload the *NEWLY DOWNLOADED* kaggle.json file now:")
files.upload()

# 3. Re-configure permissions with the new key
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Please upload the *NEWLY DOWNLOADED* kaggle.json file now:


Saving kaggle.json to kaggle (1).json


In [None]:
# --- Clean Up ---
!rm -rf vinbigdata_data
!rm -f vinbigdata-chest-xray-abnormalities-detection.zip

# --- Download & Unzip ---
# Download the ZIP file again
!kaggle competitions download -c vinbigdata-chest-xray-abnormalities-detection

# Create the target directory
!mkdir vinbigdata_data

# Unzip the file quietly into the new directory
!unzip -q vinbigdata-chest-xray-abnormalities-detection.zip -d vinbigdata_data

# --- Verification ---
print("\n--- Verification of Files ---")
!ls vinbigdata_data | grep train.csv

403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/competitions/data/download-all/vinbigdata-chest-xray-abnormalities-detection
unzip:  cannot find or open vinbigdata-chest-xray-abnormalities-detection.zip, vinbigdata-chest-xray-abnormalities-detection.zip.zip or vinbigdata-chest-xray-abnormalities-detection.zip.ZIP.

--- Verification of Files ---


In [None]:
# --- 1. DOWNLOAD THE NEW, ACCESSIBLE DATASET ---
# Note: The command changes from 'competitions download' to 'datasets download'
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

# --- 2. UNZIP THE NEW FILE ---
# The downloaded file is usually named 'chest-xray-pneumonia.zip'
!unzip -q chest-xray-pneumonia.zip -d chest_xray_data

# --- 3. VERIFY PATH ---
# The labels are not in a CSV; they are organized by folders.
print("\n--- NEW DATA STRUCTURE ---")
!ls chest_xray_data/chest_xray/train

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
License(s): other
Downloading chest-xray-pneumonia.zip to /content
100% 2.29G/2.29G [00:32<00:00, 196MB/s]
100% 2.29G/2.29G [00:32<00:00, 76.6MB/s]

--- NEW DATA STRUCTURE ---
NORMAL	PNEUMONIA


In [None]:
import numpy as np
import cv2
from tqdm.notebook import tqdm
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# --- CONFIGURATION ---
# Base directory where the new data was unzipped (e.g., './chest_xray_data/chest_xray/')
# The data is structured as chest_xray_data/chest_xray/train/{NORMAL, PNEUMONIA}
BASE_DIR = './chest_xray_data/chest_xray/'
TRAIN_DIR = os.path.join(BASE_DIR, 'train')

IMG_SIZE = 512 # Target size for standardization (Step 3/8)
N_SAMPLE = 1000 # Your target sample size

# Labels are defined by the folder names
CLASSES = ['NORMAL', 'PNEUMONIA']
CLASS_MAP = {'NORMAL': 0, 'PNEUMONIA': 1}
# --- End Configuration ---

X_images = []
y_labels = []
file_paths = []

In [None]:
# List all files and assign labels based on folder structure
for class_name in CLASSES:
    class_path = os.path.join(TRAIN_DIR, class_name)
    class_id = CLASS_MAP[class_name]

    for file_name in os.listdir(class_path):
        if file_name.endswith(('.jpeg', '.jpg', '.png')): # Filter for image files
            file_paths.append((os.path.join(class_path, file_name), class_id))

# Convert to DataFrame for easier sampling
df_paths = pd.DataFrame(file_paths, columns=['path', 'label'])

# Ensure we sample 1000 images (500 from each class if possible, for balance)
df_normal = df_paths[df_paths['label'] == 0].sample(n=min(N_SAMPLE // 2, len(df_paths[df_paths['label'] == 0])), random_state=42)
df_pneumonia = df_paths[df_paths['label'] == 1].sample(n=min(N_SAMPLE // 2, len(df_paths[df_paths['label'] == 1])), random_state=42)

df_sampled = pd.concat([df_normal, df_pneumonia]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Sampling complete. Processing {len(df_sampled)} images (max 1000).")
print("-" * 50)

Sampling complete. Processing 1000 images (max 1000).
--------------------------------------------------


In [None]:
def process_single_image_robust(file_path, size):
    """Applies cleaning, enhancement, and standardization steps."""

    # 1. Load, Convert to Grayscale (Step 2)
    # The dataset uses JPEG, so DICOM conversion (Step 1) is not needed.
    img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)

    if img is None:
        return None

    # 2. Crop borders (Step 7 - Simple proportional crop)
    h, w = img.shape
    crop_h = int(h * 0.05)
    crop_w = int(w * 0.05)
    img = img[crop_h:h-crop_h, crop_w:w-crop_w]

    # 3. Light Contrast Enhancement - CLAHE Method (Step 5)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)

    # 4. Light Denoising - Gaussian Blur (Step 6)
    img = cv2.GaussianBlur(img, (3, 3), 0)

    # 5. Resize and Padding for uniform shape (Steps 3 & 8)
    # Resize handles the primary standardization.
    img = cv2.resize(img, (size, size))

    # 6. Intensity Normalization - MinMax scaling (Step 4)
    img = img.astype(np.float32) / 255.0

    return img

print("Starting robust preprocessing...")

for index, row in tqdm(df_sampled.iterrows(), total=len(df_sampled)):
    processed_img = process_single_image_robust(row['path'], IMG_SIZE)

    if processed_img is not None:
        X_images.append(processed_img)
        y_labels.append(row['label'])

X_images = np.array(X_images)
y_labels = np.array(y_labels)

# Reshape for CNN input (Adding a channel dimension for grayscale)
X_images = X_images.reshape(X_images.shape[0], IMG_SIZE, IMG_SIZE, 1)

print(f"\nProcessed Image Array Shape: {X_images.shape}")
print(f"Processed Label Array Shape: {y_labels.shape}")


Starting robust preprocessing...


  0%|          | 0/1000 [00:00<?, ?it/s]


Processed Image Array Shape: (1000, 512, 512, 1)
Processed Label Array Shape: (1000,)


In [None]:
# 1. Convert integer labels to categorical (one-hot encoding)
num_classes = len(CLASSES)
Y_categorical = to_categorical(y_labels, num_classes=num_classes)

# 2. Train/Test/Val Split (Step 11)
# Split the data into Training and Validation sets (e.g., 80% Train, 20% Val)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_images, Y_categorical, test_size=0.2, random_state=42, stratify=y_labels
)

print("-" * 50)
print("Data Preparation Complete:")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape:   {X_val.shape}")
print(f"Number of classes: {num_classes}")
print("Data is ready to feed into a Neural Network.")


--------------------------------------------------
Data Preparation Complete:
X_train shape: (800, 512, 512, 1)
X_val shape:   (200, 512, 512, 1)
Number of classes: 2
Data is ready to feed into a Neural Network.


In [1]:
!git clone https://github.com/GKSJ-AI-CliniScan/AI-CliniScan.git
%cd AI-CliniScan


Cloning into 'AI-CliniScan'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 67 (delta 16), reused 18 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (67/67), 27.76 MiB | 8.50 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/content/AI-CliniScan


In [2]:
!git checkout -b DrashtiPatel


Switched to a new branch 'DrashtiPatel'


In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
!ls "/content/drive/MyDrive/Colab Notebooks/"


'Copy of Untitled4.ipynb'   Untitled1.ipynb   Untitled3.ipynb
 Untitled0.ipynb	    Untitled2.ipynb   Untitled4.ipynb
