## Description 
This script loads the data and scans over everything, printing it all out so we can analyze the fields and their values to check whether everything was properly anonymized.

In [None]:
import os
import pydicom 
import pandas as pd
from pydicom.errors import InvalidDicomError
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Top-level directory that main() hands to find_all_series().
ROOT_DIR = "Z:/DFCI_Anon"
# Upper bound on worker threads; passed as max_workers to the ThreadPoolExecutor in main().
MAX_THREADS = 8

def is_printable(value):
    """Tell whether *value* can be shown as plain printable text.

    * ``str``   -> result of ``str.isprintable()`` (so control characters
      such as newlines make it ``False``; the empty string is ``True``).
    * ``bytes`` -> decoded as UTF-8 first; undecodable data is ``False``,
      otherwise the decoded text is checked like a ``str``.
    * anything else -> always ``True``.
    """
    if isinstance(value, str):
        return value.isprintable()
    if not isinstance(value, bytes):
        # Non-text values (numbers, lists, None, ...) are never filtered out.
        return True
    try:
        text = value.decode('utf-8')
    except UnicodeDecodeError:
        return False
    return text.isprintable()

def print_dicom_tags(dicom_path, series_path):
    """Build a text report of the tags found in one DICOM file.

    The file is read with ``pydicom.dcmread`` (``stop_before_pixels=True``,
    ``force=True``). Private tags and elements whose value fails
    ``is_printable`` are left out of the report.

    Args:
        dicom_path: Path of the file to read.
        series_path: Series directory, used only for the report header.

    Returns:
        A newline-joined string: a ``[Series]`` header followed by one
        ``name: value`` line per reported element. Read failures are not
        raised; they are appended to the report as ``[SKIP]`` / ``[ERROR]``
        lines after whatever was collected so far.
    """
    lines = [f"\n[Series] {series_path}"]
    try:
        dataset = pydicom.dcmread(dicom_path, stop_before_pixels=True, force=True)
        for element in dataset:
            # Check the private flag first so the value is only inspected
            # for public tags.
            if not element.tag.is_private and is_printable(element.value):
                lines.append(f"{element.name}: {element.value}")
    except InvalidDicomError:
        lines.append(f"[SKIP] Invalid DICOM: {dicom_path}")
    except Exception as exc:
        lines.append(f"[ERROR] {dicom_path} -> {exc}")
    return "\n".join(lines)

def find_all_series(root_dir):
    """Collect every second-level directory under *root_dir*.

    Each directory directly inside *root_dir* is listed in turn, and each
    directory directly inside one of those is returned as a series path.
    Plain files at either level are ignored.

    Args:
        root_dir: Directory to scan.

    Returns:
        A list of series directory paths, in ``os.listdir`` order.
    """
    first_level = (os.path.join(root_dir, name) for name in os.listdir(root_dir))
    series_dirs = []
    for patient_dir in first_level:
        if os.path.isdir(patient_dir):
            second_level = (
                os.path.join(patient_dir, name) for name in os.listdir(patient_dir)
            )
            series_dirs.extend(path for path in second_level if os.path.isdir(path))
    return series_dirs

def process_series(series_path):
    """Report the tags of the first ``.dcm`` file found in a series directory.

    The first entry returned by ``os.listdir`` whose name ends in ``.dcm``
    (case-insensitive) is passed to ``print_dicom_tags``.

    Args:
        series_path: Directory expected to contain the series' files.

    Returns:
        The report from ``print_dicom_tags``, a ``[SKIP]`` message when no
        ``.dcm`` file is present, or an ``[ERROR]`` message if anything
        raised (nothing is propagated to the caller).
    """
    try:
        candidates = (
            name for name in os.listdir(series_path)
            if name.lower().endswith(".dcm")
        )
        chosen = next(candidates, None)
        if chosen is None:
            return f"[SKIP] No DICOM files in {series_path}"
        return print_dicom_tags(os.path.join(series_path, chosen), series_path)
    except Exception as exc:
        return f"[ERROR] Failed to process {series_path} -> {exc}"

def main():
    """Scan every series under ROOT_DIR and print one tag report per series.

    Series are processed concurrently on up to MAX_THREADS threads with a
    progress bar; the reports are printed only after all of them have
    finished, in the order the work completed.
    """
    series_paths = find_all_series(ROOT_DIR)
    print(f"Found {len(series_paths)} series...\n")

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        pending = [pool.submit(process_series, path) for path in series_paths]
        progress = tqdm(as_completed(pending), total=len(pending), desc="Reading series")
        reports = [finished.result() for finished in progress]

    for report in reports:
        print(report)

# Kick off the scan as soon as this code is executed.
main()
