In [None]:
!pip install pandas pydicom numpy opencv-python requests tqdm google-cloud-storage

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


#### Google Drive and Colab Auth

In [None]:
# Mount Google Drive at /content/drive; every path under
# /content/drive/MyDrive used later in this notebook depends on this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate the Colab user. Presumably this supplies the credentials that
# storage.Client() picks up in the GCS setup cell — TODO confirm.
from google.colab import auth
auth.authenticate_user()

#### Importing Required Libraries

In [None]:
import os
import pandas as pd
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import numpy as np
import cv2
from tqdm import tqdm
import zipfile
import re
import subprocess
from google.colab import drive
from google.cloud import storage


# --- Inputs ---------------------------------------------------------------
# NOTE: despite its name, EXCEL_PATH points to a CSV (it is read with
# pd.read_csv further down).
EXCEL_PATH = '/content/drive/MyDrive/Dual view Slava/cxr-record-list.csv'
REPORTS_ZIP_PATH = '/content/drive/MyDrive/Dual view Slava/mimic-cxr-reports.zip'

# --- Outputs / working folders ---------------------------------------------
OUTPUT_JPEG_DIR = '/content/drive/MyDrive/Dual view Slava/CXR_IMAGES'
TEMP_DIR = '/content/temp'
DICOM_CACHE_DIR = '/content/drive/MyDrive/Dual view Slava/dicom_cache'

# --- Bookkeeping -------------------------------------------------------------
PROGRESS_CSV = '/content/drive/MyDrive/Dual view Slava/download_progress.csv'

# Make sure every working folder exists before anything writes to it.
for _required_dir in (OUTPUT_JPEG_DIR, TEMP_DIR, DICOM_CACHE_DIR):
    os.makedirs(_required_dir, exist_ok=True)





#### GCS (Google Cloud Storage) Setup

Initializes the GCS client using your project ID and connects to the MIMIC-CXR bucket.
Ensure you have access to the MIMIC-CXR v2 Google Cloud bucket.

In [None]:
# Google Cloud project used both to create the client and as `user_project`
# on the bucket handle (presumably the project billed for requester-pays
# access — TODO confirm). Replace with your own project ID.
project_id = 'myproject64137'
client = storage.Client(project=project_id)
# Bucket that hosts the MIMIC-CXR v2 DICOM files.
bucket_name = 'mimic-cxr-2.0.0.physionet.org'
# `bucket` is read as a global by download_dicom().
bucket = client.bucket(bucket_name, user_project=project_id)

####  Initialize Progress CSV

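Checks whether the progress CSV (stored on Google Drive) already exists; it records which DICOMs have been processed so an interrupted run can resume.
If it doesn't exist, a new one is created containing only the header row.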

In [None]:
# Create the progress file with its header row on the very first run only;
# an existing file is left untouched so earlier progress is preserved.
progress_file_missing = not os.path.exists(PROGRESS_CSV)
if progress_file_missing:
    with open(PROGRESS_CSV, 'w') as progress_file:
        progress_file.write("study_id,subject_id,dicom_id,status,timestamp\n")


#### Dynamically extract all reports from zip file

In [None]:
def extract_reports(zip_path):
    """Read every study report stored in a zip archive.

    Only ``.txt`` members whose path has at least four ``/``-separated
    components and whose file name starts with ``s`` are read.

    Args:
        zip_path: Path to the zip archive containing the report text files.

    Returns:
        dict mapping ``(subject_id, study_id)`` (both strings) to the report
        text. ``subject_id`` is the parent folder name without its first
        character; ``study_id`` is the file name without its first character
        and without the ``.txt`` extension.

    Members that fail to read or decode are reported on stdout and skipped.
    """
    reports = {}
    with zipfile.ZipFile(zip_path, 'r') as archive:
        report_members = [name for name in archive.namelist() if name.endswith('.txt')]

        for file_path in tqdm(report_members, desc="Extracting reports"):
            try:
                path_parts = file_path.split('/')

                # Skip anything that is not nested deeply enough or is not
                # an "s<study_id>.txt" file.
                if len(path_parts) < 4 or not path_parts[-1].startswith('s'):
                    continue

                with archive.open(file_path) as member:
                    content = member.read().decode('utf-8')

                subject_id = path_parts[-2][1:]   # drop the leading letter of the folder name
                study_id = path_parts[-1][1:-4]   # drop leading "s" and trailing ".txt"
                reports[(subject_id, study_id)] = content
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    return reports


#### Identify Non-Normal Impressions

In [None]:
def impression_starts_with_normal_phrases(report_text):
    """Flag reports whose impression is NOT a plain "normal" statement.

    NOTE: the return value is the opposite of what the function name
    suggests. The function returns ``True`` when the impression either does
    not open with a "normal" phrase or mentions any of the listed disease
    terms; it returns ``False`` only for an impression that opens with a
    normal phrase and mentions none of those terms.

    The impression is everything after the first ``IMPRESSION:``,
    ``CONCLUSION:`` or ``DIAGNOSIS:`` header (case-insensitive). When no such
    header exists the impression is treated as empty, which yields ``True``.

    Args:
        report_text: Full text of the radiology report.

    Returns:
        bool as described above.
    """
    # Openers that mark an impression as "normal".
    normal_openers = [
        r"^no\b",
        r"^no evidence\b",
        r"^no acute\b",
        r"^normal\b"
    ]

    # Common disease-related terms in chest X-ray reports.
    disease_terms = [
        r"\bcardiomegaly\b",
        r"\bpneumonia\b",
        r"\bconsolidation\b",
        r"\bpleural effusion\b",
        r"\binfiltrate[s]?\b",
        r"\bedema\b",
        r"\batelectasis\b",
        r"\bopacity\b",
        r"\bmass(es)?\b",
        r"\bnodule(s)?\b",
        r"\bfibrosis\b",
        r"\bscarring\b",
        r"\bpleural thickening\b",
        r"\bpneumothorax\b",
        r"\bemphysema\b",
        r"\bhyperinflation\b",
        r"\btuberculosis\b",
        r"\binterstitial\b",
        r"\bfracture(s)?\b",
        r"\blesion(s)?\b",
        r"\bcalcification(s)?\b",
        r"\bbronchiectasis\b",
        r"\bvascular congestion\b"
    ]

    # Pull out the impression / conclusion / diagnosis section.
    section = re.search(
        r"(IMPRESSION|CONCLUSION|DIAGNOSIS):\s*(.*)",
        report_text, re.DOTALL | re.IGNORECASE
    )
    impression_text = '' if section is None else section.group(2).strip()

    # An impression that does not open with a normal phrase is flagged.
    for opener in normal_openers:
        if re.match(opener, impression_text, re.IGNORECASE):
            break
    else:
        return True

    # It opens with a normal phrase: flag it only if a disease term appears.
    for term in disease_terms:
        if re.search(term, impression_text, re.IGNORECASE):
            return True
    return False

####  Detect PA and Lateral Views

In [None]:
def is_pa_and_lateral_study(report_text):
    """Tell whether a report mentions a combined PA and lateral examination.

    The text is searched case-insensitively for any of several spellings
    ("PA AND LAT", "PA/LAT", "PA & LAT", "PA + LAT", "PA AND LATERAL",
    "CHEST (PA AND LAT)").

    Args:
        report_text: Full text of the radiology report.

    Returns:
        True if at least one spelling occurs anywhere in the text.
    """
    view_spellings = (
        r'PA\s+AND\s+LAT',
        r'PA\/LAT',
        r'PA\s*&\s*LAT',
        r'PA\s*\+\s*LAT',
        r'PA\s*AND\s*LATERAL',
        r'CHEST\s*\(PA\s*AND\s*LAT\)',
    )
    for spelling in view_spellings:
        if re.search(spelling, report_text, re.IGNORECASE) is not None:
            return True
    return False

#### Validate Report Sections

In [None]:
def has_non_empty_findings_and_impression(report_text):
    """Check that a report has non-empty findings and impression sections.

    Findings may be introduced by ``FINDINGS:``, ``OBSERVATION:`` or
    ``DESCRIPTION:``; the impression by ``IMPRESSION:``, ``CONCLUSION:`` or
    ``DIAGNOSIS:`` (all case-insensitive).

    NOTE: a later cell in this notebook defines a function with the same
    name that only recognises ``FINDINGS:`` / ``IMPRESSION:``. Under
    Run All that later definition replaces this one.

    Args:
        report_text: Full text of the radiology report.

    Returns:
        True only if both sections are present and contain text.
    """
    findings_match = re.search(
        r"(FINDINGS|OBSERVATION|DESCRIPTION):\s*(.*?)(IMPRESSION|CONCLUSION|DIAGNOSIS|$)",
        report_text, re.DOTALL | re.IGNORECASE
    )
    impression_match = re.search(
        r"(IMPRESSION|CONCLUSION|DIAGNOSIS):\s*(.*)",
        report_text, re.DOTALL | re.IGNORECASE
    )

    # Group 1 is the header keyword, which is never empty; the section body
    # is group 2. Reading group 1 made every report with headers pass.
    findings_text = findings_match.group(2).strip() if findings_match else ''
    impression_text = impression_match.group(2).strip() if impression_match else ''

    return bool(findings_text) and bool(impression_text)

In [None]:
def has_non_empty_findings_and_impression(report_text):
    """Check that a report has non-empty FINDINGS and IMPRESSION sections.

    The findings are the text between ``FINDINGS:`` and the next
    ``IMPRESSION:`` (or the end of the report); the impression is everything
    after ``IMPRESSION:``. Matching is case-insensitive.

    Args:
        report_text: Full text of the radiology report.

    Returns:
        True only if both sections are present and contain text after
        surrounding whitespace is stripped.
    """
    flags = re.DOTALL | re.IGNORECASE

    findings = re.search(r'FINDINGS:\s*(.*?)(?:IMPRESSION:|$)', report_text, flags)
    if findings is None or not findings.group(1).strip():
        return False

    impression = re.search(r'IMPRESSION:\s*(.*)', report_text, flags)
    if impression is None:
        return False
    return bool(impression.group(1).strip())

#### Convert DICOM to JPEG

In [None]:
def dicom_to_jpeg(dcm_path, study_dir):
    """Convert one DICOM file to an 8-bit JPEG named after its view position.

    The pixel data is passed through ``apply_voi_lut``, min-max scaled to
    0-255, reduced to grayscale if it has three channels, and written to
    ``<study_dir>/<ViewPosition>.jpeg``.

    Args:
        dcm_path: Path to the DICOM file.
        study_dir: Existing directory that receives the JPEG.

    Returns:
        True if the JPEG was written, False on any error (the error is
        printed, not raised).

    Notes:
        * The output name depends only on ``ViewPosition``, so two DICOMs of
          the same study with the same view position overwrite each other.
        * NOTE(review): PhotometricInterpretation is not inspected, so any
          MONOCHROME1 image would be saved with inverted polarity — confirm
          whether that matters for this dataset.
    """
    try:
        ds = pydicom.dcmread(dcm_path)
        img = apply_voi_lut(ds.pixel_array, ds)

        lo, hi = img.min(), img.max()
        if hi > lo:
            img = ((img - lo) / (hi - lo) * 255).astype(np.uint8)
        else:
            # Constant-intensity image: the min-max scale would divide by
            # zero and produce NaNs, so emit an all-black image instead.
            img = np.zeros(img.shape, dtype=np.uint8)

        if len(img.shape) == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        output_jpg_path = os.path.join(study_dir, f'{ds.ViewPosition}.jpeg')

        # cv2.imwrite signals failure through its return value rather than
        # an exception, so it has to be checked explicitly.
        if not cv2.imwrite(output_jpg_path, img):
            print(f"Error converting {dcm_path}: could not write {output_jpg_path}")
            return False
        return True
    except Exception as e:
        print(f"Error converting {dcm_path}: {e}")
        return False


#### Get Local DICOM Path

In [None]:

def get_dicom_path(local_base, remote_path):
    """Map a remote file path onto its location inside the local cache.

    Args:
        local_base: Root directory of the local cache.
        remote_path: Path of the file relative to the remote root.

    Returns:
        ``remote_path`` joined onto ``local_base`` with ``os.path.join``.
    """
    cached_path = os.path.join(local_base, remote_path)
    return cached_path


##### Extracting reports from zip

In [None]:
# Record list: one row per DICOM (subject_id, study_id, dicom_id, path).
df = pd.read_csv(EXCEL_PATH)

print("Extracting reports from zip...")
reports = extract_reports(REPORTS_ZIP_PATH)
print(f"Found {len(reports)} report files")

# One group per study; each group holds that study's DICOM rows.
grouped = df.groupby(['subject_id', 'study_id'])

pa_lateral_studies = []
print("Identifying PA and lateral studies...")

for (subject_id, study_id), group in tqdm(grouped):
    # Report keys are strings, record-list IDs are not.
    report_key = (str(subject_id), str(study_id))
    if report_key not in reports:
        continue

    report_text = reports[report_key]
    # Keep a study only if it passes all three report filters:
    # PA+lateral exam, non-empty findings and impression, and an impression
    # that impression_starts_with_normal_phrases() returns True for.
    keep_study = (
        is_pa_and_lateral_study(report_text)
        and has_non_empty_findings_and_impression(report_text)
        and impression_starts_with_normal_phrases(report_text)
    )
    if keep_study:
        pa_lateral_studies.append((subject_id, study_id, group))

print(f"Found {len(pa_lateral_studies)} studies with PA and lateral views")

# Same studies ordered by study_id.
pa_lateral_studies_sorted = sorted(pa_lateral_studies, key=lambda study: study[1])

Extracting reports from zip...


Extracting reports: 100%|██████████| 227835/227835 [00:09<00:00, 25314.01it/s]


Found 227835 report files
Identifying PA and lateral studies...


100%|██████████| 227835/227835 [00:27<00:00, 8275.60it/s] 

Found 29028 studies with PA and lateral views





#### Resume from Previous Progress

In [None]:
# Study IDs that already have at least one row in the progress file.
# NOTE(review): resume works at study granularity — a study with any
# recorded DICOM is skipped entirely, even if its other DICOMs were never
# processed.
completed_studies = set()
if os.path.exists(PROGRESS_CSV):
    progress_df = pd.read_csv(PROGRESS_CSV)
    completed_studies = set(progress_df['study_id'].astype(str))

# Re-read the record list and drop every row that belongs to a completed study.
df = pd.read_csv(EXCEL_PATH)
belongs_to_completed_study = df['study_id'].astype(str).isin(completed_studies)
filtered_df = df[~belongs_to_completed_study]


In [None]:
# Record-list rows whose study_id is not yet present in the progress file.
filtered_df

Unnamed: 0,subject_id,study_id,dicom_id,path
4,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,files/p10/p10000032/s53911762/68b5c4b1-227d048...
5,10000032,53911762,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,files/p10/p10000032/s53911762/fffabebf-74fd3a1...
6,10000032,56699142,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,files/p10/p10000032/s56699142/ea030e7a-2e3b134...
12,10000898,54205396,8959e402-2175d68d-edba5a6c-baab51c3-9359f700,files/p10/p10000898/s54205396/8959e402-2175d68...
13,10000898,54205396,9e7a6aae-2580e589-6212d336-9813ebbd-a9239a34,files/p10/p10000898/s54205396/9e7a6aae-2580e58...
...,...,...,...,...
377105,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,files/p19/p19999733/s57132437/428e2c18-5721d8f...
377106,19999733,57132437,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,files/p19/p19999733/s57132437/58c403aa-35ff8bd...
377107,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,files/p19/p19999987/s55368167/58766883-376a15c...
377108,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,files/p19/p19999987/s58621812/7ba273af-3d290f8...


#### Filter for Valid Studies

In [None]:

print("Getting DICOM file path ...")

# Paths that still need processing (their study is not in the progress file).
filtered_paths = set(filtered_df['path'])

# Every DICOM path of a selected study that is still pending.
unique_paths = {
    dicom_path
    for _, _, study_rows in pa_lateral_studies
    for dicom_path in study_rows['path']
    if dicom_path in filtered_paths
}

# Order by numeric study ID: the 4th path component is "s<study_id>".
unique_paths = sorted(
    unique_paths,
    key=lambda x: int(x.split('/')[3][1:])
)

Getting DICOM file path ...


In [None]:
# Study IDs still queued for download. The 4th path component is the study
# folder "s<study_id>"; the leading "s" is stripped so these IDs can be
# compared with the bare study_id values stored in the progress CSV.
# (Without the strip the two sets could never intersect and the overlap
# check below would always report 0.)
unique_studies_in_paths = {path.split('/')[3][1:] for path in unique_paths}
print(f"Number of unique studies to process: {len(unique_studies_in_paths)}")


if os.path.exists(PROGRESS_CSV):
    progress_df = pd.read_csv(PROGRESS_CSV)
    unique_studies_completed = set(progress_df['study_id'].astype(str))
    print(f"Number of already processed studies: {len(unique_studies_completed)}")

    # Sanity check: nothing in the queue should already be recorded as done.
    overlap = unique_studies_in_paths & unique_studies_completed
    print(f"Studies both in queue and completed: {len(overlap)} (should be 0 after filtering)")
else:
    print("No progress file found - starting fresh")

Number of unique studies to process: 1979
Number of already processed studies: 39431
Studies both in queue and completed: 0 (should be 0 after filtering)


#### Download DICOM from GCS

In [None]:


def download_dicom(blob_path, output_base_dir):
    """Download one blob from the global ``bucket`` into the local cache.

    The file is stored at
    ``<output_base_dir>/physionet.org/files/mimic-cxr/2.0.0/<blob_path>``;
    missing parent directories are created.

    Args:
        blob_path: Object name inside the bucket.
        output_base_dir: Root directory of the local cache.

    Returns:
        The local file path on success, or None on failure (the error is
        printed, not raised). On failure no partial file is left behind.
    """
    local_path = None
    try:
        local_path = os.path.join(
            output_base_dir, 'physionet.org/files/mimic-cxr/2.0.0/' + blob_path
        )
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        blob = bucket.blob(blob_path)
        with open(local_path, 'wb') as f:
            blob.download_to_file(f)

        return local_path

    except Exception as e:
        print(f"Failed to download {blob_path}: {e}")
        # The target file is opened before the transfer starts, so a failed
        # download leaves an empty or truncated file. Remove it so callers
        # that test for the file's existence do not mistake it for success.
        if local_path is not None and os.path.isfile(local_path):
            try:
                os.remove(local_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial file {local_path}: {cleanup_error}")
        return None


##### Downloading and processing DICOMs

In [None]:


# Download each queued DICOM from GCS, convert it to JPEG, record successes
# in the progress CSV, and delete the cached DICOM afterwards.
base_url = 'https://physionet.org/files/mimic-cxr/2.0.0/'  # not used by the loop; downloads go through GCS
cache_prefix = 'physionet.org/files/mimic-cxr/2.0.0/'      # sub-folder download_dicom() writes into

failed_paths = []
print("Downloading and processing DICOMs...")
for path in tqdm(unique_paths, desc="Processing"):
    # path looks like: files/p10/p<subject_id>/s<study_id>/<dicom_id>.dcm
    path_parts = path.split('/')
    # Take subject_id from the path itself. Previously the loop wrote a
    # stale global left over from the study-selection cell, so every
    # progress row carried the same wrong subject.
    subject_id = path_parts[2][1:]
    study_id = path_parts[3][1:]
    dicom_id = os.path.basename(path).split('.')[0]

    local_dcm_path = get_dicom_path(DICOM_CACHE_DIR, cache_prefix + path)
    downloaded = download_dicom(path, DICOM_CACHE_DIR) is not None

    converted = False
    if downloaded:
        study_dir = os.path.join(OUTPUT_JPEG_DIR, study_id)
        os.makedirs(study_dir, exist_ok=True)
        converted = dicom_to_jpeg(local_dcm_path, study_dir)

    if converted:
        # Only a DICOM that was downloaded AND converted is recorded. The
        # resume cell skips any study_id present in this file, so logging a
        # failure here would prevent it from ever being retried.
        with open(PROGRESS_CSV, 'a') as f:
            f.write(f"{study_id},{subject_id},{dicom_id},success,{pd.Timestamp.now()}\n")
    else:
        failed_paths.append(path)

    # Always drop the cached DICOM (including a partial file left by a
    # failed download) to keep the cache folder small.
    if os.path.isfile(local_dcm_path):
        os.remove(local_dcm_path)
    elif downloaded:
        print(f"File not found or is a directory: {local_dcm_path}")

print(f"Processing complete! JPEGs saved in {OUTPUT_JPEG_DIR}")
if failed_paths:
    print(f"{len(failed_paths)} DICOM(s) failed and were not recorded in {PROGRESS_CSV}")

Downloading and processing DICOMs...


Processing: 100%|██████████| 4294/4294 [1:29:13<00:00,  1.25s/it]

Processing complete! JPEGs saved in /content/drive/MyDrive/Dual view Slava/CXR_IMAGES



