
# Capstone 3 – Thoracic X-Ray Project  
## Step 4: Pre-processing and Training Data Development (Local Paths)



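This notebook implements **Step 4: Pre-processing and Training Data Development** for the multi-label thoracic disease classification project using the NIH Chest X-Ray dataset. It reads and writes files under **local Windows paths** set in the configuration cell, resolves an on-disk path for each image, multi-hot encodes the finding labels, dummy-encodes the categorical columns, standardizes patient age, splits the data into train/validation/test sets by patient (70/15/15), and saves the resulting metadata tables as CSV files.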


In [1]:

import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

print("Libraries imported successfully.")


Libraries imported successfully.


In [2]:

# Project root. Defaults to the original local Windows folder; set the
# THORACIC_BASE_DIR environment variable to run the notebook on another
# machine without editing this cell. An unset or empty variable falls back to
# the default.
DEFAULT_BASE_DIR = r"C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic"
BASE_DIR = Path(os.environ.get("THORACIC_BASE_DIR") or DEFAULT_BASE_DIR)

# Inputs: the cleaned metadata table and the folder tree holding the images.
CLEANED_METADATA_PATH = BASE_DIR / "cleaned_metadata.csv"
IMAGES_ROOT = BASE_DIR / "nih_images_bucket"

# Outputs: everything this notebook saves goes here (created if missing).
OUTPUT_DIR = BASE_DIR / "preprocessed"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong path is visible before any work runs.
print("BASE_DIR:", BASE_DIR)
print("CLEANED_METADATA_PATH:", CLEANED_METADATA_PATH, "| Exists?", CLEANED_METADATA_PATH.exists())
print("IMAGES_ROOT:", IMAGES_ROOT, "| Exists?", IMAGES_ROOT.exists())
print("OUTPUT_DIR:", OUTPUT_DIR)


BASE_DIR: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic
CLEANED_METADATA_PATH: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\cleaned_metadata.csv | Exists? True
IMAGES_ROOT: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\nih_images_bucket | Exists? True
OUTPUT_DIR: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed


In [3]:

# Load the cleaned metadata CSV into the working frame.
df = pd.read_csv(CLEANED_METADATA_PATH)

# Report (rows, columns) as a quick sanity check on the load.
metadata_shape = df.shape
print("Shape of cleaned metadata:", metadata_shape)

# Preview the first rows as the cell's rich output.
df.head()


Shape of cleaned metadata: (112120, 29)


Unnamed: 0,image_index,finding_labels,follow_up,patient_id,patient_age,patient_gender,view_position,originalimage_width,originalimage_height,originalimagepixelspacing_x,...,label_Fibrosis,label_Hernia,label_Infiltration,label_Mass,label_Nodule,label_Pleural_Thickening,label_Pneumonia,label_Pneumothorax,label_count,image_path
0,00000001_000.png,Cardiomegaly,0,1,58.0,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,1,C:\Springboard\Data Science at Scale\23.5 Caps...
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58.0,M,PA,2894,2729,0.143,...,0,0,0,0,0,0,0,0,2,C:\Springboard\Data Science at Scale\23.5 Caps...
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58.0,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,2,C:\Springboard\Data Science at Scale\23.5 Caps...
3,00000002_000.png,No Finding,0,2,81.0,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,C:\Springboard\Data Science at Scale\23.5 Caps...
4,00000003_000.png,Hernia,0,3,81.0,F,PA,2582,2991,0.143,...,0,1,0,0,0,0,0,0,1,C:\Springboard\Data Science at Scale\23.5 Caps...


In [4]:

# Build filename -> full path map.
# A single recursive walk replaces one walk per extension, and the suffix is
# compared case-insensitively so files such as "x.PNG" are also picked up on
# case-sensitive file systems.
IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg"}

image_map = {}
duplicate_names = 0
for p in IMAGES_ROOT.rglob("*"):
    if p.suffix.lower() not in IMAGE_SUFFIXES:
        continue
    if p.name in image_map:
        # Same basename in two folders: the later path wins (as before), but
        # the collision is now counted instead of passing silently.
        duplicate_names += 1
    image_map[p.name] = str(p)

print("Number of image files discovered:", len(image_map))
if duplicate_names:
    print("WARNING: duplicate image file names encountered:", duplicate_names)

if "image_index" not in df.columns:
    raise KeyError("Column 'image_index' not found in cleaned_metadata.csv")

# Rows whose file name is not in the map get NaN here.
df["image_path"] = df["image_index"].map(image_map)

num_unresolved = df["image_path"].isna().sum()
print("Unresolved image paths (NaN):", num_unresolved)

# NOTE(review): this writes back to the same CSV that was read as input, so
# the input file is overwritten in place with paths specific to this machine.
df.to_csv(CLEANED_METADATA_PATH, index=False)
print("Updated cleaned metadata (with image_path) saved to:", CLEANED_METADATA_PATH)


Number of image files discovered: 112120
Unresolved image paths (NaN): 0
Updated cleaned metadata (with image_path) saved to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\cleaned_metadata.csv


In [5]:

# Column holding the pipe-delimited finding names for each image.
LABEL_COL = "finding_labels"

# Label vocabulary. The order here is the column order of the encoded label
# matrix built below, so it must not be rearranged.
DISEASES = [
    "Atelectasis",
    "Cardiomegaly",
    "Effusion",
    "Infiltration",
    "Mass",
    "Nodule",
    "Pneumonia",
    "Pneumothorax",
    "Consolidation",
    "Edema",
    "Emphysema",
    "Fibrosis",
    "Pleural_Thickening",
    "Hernia",
    "No Finding",
]

def parse_label_string(label_str):
    """Split a pipe-delimited label string into a list of label names.

    Parameters
    ----------
    label_str : str or missing value
        Labels separated by ``|``, e.g. ``"Cardiomegaly|Emphysema"``.

    Returns
    -------
    list of str
        Labels with surrounding whitespace stripped. A missing value
        (``None`` / ``NaN``) or an empty string gives ``[]``.
    """
    if pd.isna(label_str):
        return []
    stripped_parts = (part.strip() for part in label_str.split("|"))
    # Drop empty tokens produced by "", "A||B" or a trailing "|", so an empty
    # string is never handed to the binarizer as if it were a label.
    return [token for token in stripped_parts if token]

# Turn each label string into a list of finding names.
label_lists = df[LABEL_COL].apply(parse_label_string)

# Multi-hot encode against the fixed DISEASES vocabulary; the indicator
# columns come out in the order of that list.
mlb = MultiLabelBinarizer(classes=DISEASES)
mlb.fit(label_lists)
Y = mlb.transform(label_lists)
label_df = pd.DataFrame(data=Y, index=df.index, columns=mlb.classes_)

# Place the indicator columns next to the original metadata columns.
df_labels = pd.concat([df, label_df], axis="columns")
print("Shape after adding label columns:", df_labels.shape)
df_labels.head()


Shape after adding label columns: (112120, 44)


Unnamed: 0,image_index,finding_labels,follow_up,patient_id,patient_age,patient_gender,view_position,originalimage_width,originalimage_height,originalimagepixelspacing_x,...,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia,No Finding
0,00000001_000.png,Cardiomegaly,0,1,58.0,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58.0,M,PA,2894,2729,0.143,...,0,0,0,0,0,1,0,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58.0,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,2,81.0,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,1
4,00000003_000.png,Hernia,0,3,81.0,F,PA,2582,2991,0.143,...,0,0,0,0,0,0,0,0,1,0


In [6]:

# Candidate categorical columns; only the ones actually present are encoded.
possible_categorical_cols = ["patient_gender", "gender", "sex", "view_position"]
present_columns = set(df_labels.columns)
categorical_cols = [name for name in possible_categorical_cols if name in present_columns]

print("Categorical columns to dummy-encode:", categorical_cols)

if not categorical_cols:
    print("No categorical columns found for dummy encoding based on the configured list.")
else:
    # drop_first=True leaves out one indicator per encoded column.
    df_labels = pd.get_dummies(df_labels, columns=categorical_cols, drop_first=True)
    print("Shape after dummy-encoding categorical columns:", df_labels.shape)


Categorical columns to dummy-encode: ['patient_gender', 'view_position']
Shape after dummy-encoding categorical columns: (112120, 44)


In [7]:

# Candidate numeric columns; only the ones actually present are standardized.
possible_numeric_cols = ["patient_age", "age"]
numeric_cols = [col for col in possible_numeric_cols if col in df_labels.columns]

print("Numeric columns to standardize:", numeric_cols)

if numeric_cols:
    # Fit and transform from the raw values in `df`, not from `df_labels`.
    # Reading from `df_labels` made this cell non-idempotent: a second run
    # refitted on already-standardized values and overwrote the saved scaler
    # with a near-identity transform. The first run gives the same result
    # either way.
    if not df_labels.index.equals(df.index):
        raise ValueError(
            "df_labels is no longer row-aligned with df; re-run the notebook from the top."
        )
    raw_numeric = df[numeric_cols]

    # NOTE(review): the scaler is fitted on every row, before the
    # train/validation/test split, so validation and test ages contribute to
    # the mean and scale. Fitting on training rows only would need the split
    # to happen before this cell.
    scaler = StandardScaler()
    df_labels[numeric_cols] = scaler.fit_transform(raw_numeric)

    # Persist the fitted scaler so the same transform can be reapplied later.
    scaler_path = OUTPUT_DIR / "numeric_feature_scaler.joblib"
    joblib.dump(scaler, scaler_path)
    print(f"Saved StandardScaler to: {scaler_path}")
else:
    print("No numeric columns found for standardization based on the configured list.")


Numeric columns to standardize: ['patient_age']
Saved StandardScaler to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed\numeric_feature_scaler.joblib


In [8]:

# Fail fast if the image_path column is absent, then preview it next to the
# image file names.
has_image_path = "image_path" in df_labels.columns
if not has_image_path:
    raise ValueError("Expected 'image_path' column to exist in df_labels.")
df_labels.loc[:, ["image_index", "image_path"]].head()


Unnamed: 0,image_index,image_path
0,00000001_000.png,C:\Springboard\Data Science at Scale\23.5 Caps...
1,00000001_001.png,C:\Springboard\Data Science at Scale\23.5 Caps...
2,00000001_002.png,C:\Springboard\Data Science at Scale\23.5 Caps...
3,00000002_000.png,C:\Springboard\Data Science at Scale\23.5 Caps...
4,00000003_000.png,C:\Springboard\Data Science at Scale\23.5 Caps...


In [9]:

# Accept any of these spellings for the patient identifier column; the first
# one present in the frame is used.
possible_patient_cols = ["patient_id", "Patient ID", "PatientID", "patientid"]
patient_col = next(
    (name for name in possible_patient_cols if name in df_labels.columns),
    None,
)

if patient_col is None:
    raise ValueError(
        "Could not find a patient ID column. "
        "Please ensure one of the following columns exists: "
        f"{possible_patient_cols}"
    )

print("Using patient ID column:", patient_col)

# Split on patient identifiers rather than on image rows, so that all images
# of one patient end up in the same subset.
unique_patients = df_labels[patient_col].unique()
print("Number of unique patients:", len(unique_patients))

# Same seed and shuffling for both stages: 70% train, then the remaining 30%
# is halved into validation and test.
split_options = {"random_state": 42, "shuffle": True}
train_patients, temp_patients = train_test_split(
    unique_patients, test_size=0.30, **split_options
)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.50, **split_options
)

print("Train patients:", len(train_patients))
print("Validation patients:", len(val_patients))
print("Test patients:", len(test_patients))

def filter_by_patients(df_in, patients, id_col=None):
    """Return a copy of the rows of ``df_in`` whose patient id is in ``patients``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing a patient identifier column.
    patients : array-like
        Patient identifiers to keep.
    id_col : str, optional
        Name of the patient identifier column. Defaults to the notebook-level
        ``patient_col``, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the matching rows, in their original order.
    """
    if id_col is None:
        # Fall back to the column chosen earlier in the notebook.
        id_col = patient_col
    keep_mask = df_in[id_col].isin(patients)
    return df_in.loc[keep_mask].copy()

# Materialize one frame per subset from the patient-level assignment.
train_df, val_df, test_df = (
    filter_by_patients(df_labels, patient_ids)
    for patient_ids in (train_patients, val_patients, test_patients)
)

# Report the row/column counts of each subset.
for caption, subset in (
    ("Train shape:", train_df),
    ("Validation shape:", val_df),
    ("Test shape:", test_df),
):
    print(caption, subset.shape)


Using patient ID column: patient_id
Number of unique patients: 30805
Train patients: 21563
Validation patients: 4621
Test patients: 4621
Train shape: (78566, 44)
Validation shape: (17063, 44)
Test shape: (16491, 44)


In [10]:

# Save the complete preprocessed table first.
full_metadata_path = OUTPUT_DIR / "chest_xray_preprocessed_metadata.csv"
df_labels.to_csv(full_metadata_path, index=False)
print(f"Saved full preprocessed metadata to: {full_metadata_path}")

# One CSV per subset, written without the index column.
train_path = OUTPUT_DIR / "train_metadata.csv"
val_path = OUTPUT_DIR / "val_metadata.csv"
test_path = OUTPUT_DIR / "test_metadata.csv"

for subset, destination in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    subset.to_csv(destination, index=False)

# Confirm the destinations only after all three files have been written.
print(f"Saved train metadata to: {train_path}")
print(f"Saved validation metadata to: {val_path}")
print(f"Saved test metadata to: {test_path}")


Saved full preprocessed metadata to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed\chest_xray_preprocessed_metadata.csv
Saved train metadata to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed\train_metadata.csv
Saved validation metadata to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed\val_metadata.csv
Saved test metadata to: C:\Springboard\Data Science at Scale\23.5 Capstone 3 - Project Proposals\Thoracic\preprocessed\test_metadata.csv
