In [2]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
from pathlib import Path

def extract_image_info_from_report(xml_content):
    """
    Parse XML report content and extract the report ID and associated image IDs.
    
    Args:
        xml_content: String containing XML content
        
    Returns:
        A tuple of (report_id, list_of_image_ids, report_findings, report_impression)
    """
    try:
        # Parse the XML string
        root = ET.fromstring(xml_content)
        
        # Extract report ID
        uid_element = root.find('.//uId')
        report_id = uid_element.get('id') if uid_element is not None else None
        
        # Extract findings and impression if available
        findings_element = root.find('.//AbstractText[@Label="FINDINGS"]')
        findings = findings_element.text if findings_element is not None else None
        
        impression_element = root.find('.//AbstractText[@Label="IMPRESSION"]')
        impression = impression_element.text if impression_element is not None else None
        
        # Extract all parentImage elements to get associated image IDs
        parent_images = root.findall('.//parentImage')
        image_ids = [img.get('id') for img in parent_images if img.get('id') is not None]
        
        return report_id, image_ids, findings, impression
    
    except Exception as e:
        print(f"Error processing XML content: {e}")
        return None, [], None, None

def find_xml_files(directory):
    """
    Recursively find all XML files in the given directory and its subdirectories
    """
    xml_files = []
    
    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return xml_files
    
    # Walk through directory and all subdirectories
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check for XML files
            if file.endswith('.xml'):
                xml_files.append(os.path.join(root, file))
            # Also check for files that might contain XML content but don't have .xml extension
            elif file == 'paste.txt' or 'report' in file.lower():
                xml_files.append(os.path.join(root, file))
    
    print(f"Found {len(xml_files)} potential XML files in {directory}")
    return xml_files

def map_reports_to_images(reports_dir, output_file='mapping_results.csv'):
    """
    Process all XML report files in the given directory and create a mapping
    between reports and their associated images.
    
    Args:
        reports_dir: Directory containing the XML report files
        output_file: File to save the mapping results
    """
    # Find all potential XML files in the reports directory
    xml_files = find_xml_files(reports_dir)
    
    # Lists to store the mapping data
    all_report_ids = []
    all_image_ids = []
    all_findings = []
    all_impressions = []
    
    # Process each file
    for xml_file in xml_files:
        try:
            with open(xml_file, 'r', encoding='utf-8') as f:
                xml_content = f.read()
                
            # Check if this is actually XML content
            if '<?xml' in xml_content:
                report_id, image_ids, findings, impression = extract_image_info_from_report(xml_content)
                if report_id and image_ids:
                    print(f"Found report {report_id} with {len(image_ids)} images")
                    for img_id in image_ids:
                        all_report_ids.append(report_id)
                        all_image_ids.append(img_id)
                        all_findings.append(findings)
                        all_impressions.append(impression)
        except Exception as e:
            print(f"Error reading file {xml_file}: {e}")
    
    # Create a DataFrame with the mapping data
    mapping_df = pd.DataFrame({
        'ReportID': all_report_ids,
        'ImageID': all_image_ids,
        'Findings': all_findings,
        'Impression': all_impressions
    })
    
    # Save the mapping to a CSV file
    mapping_df.to_csv(output_file, index=False)
    print(f"Mapping saved to {output_file}")
    
    return mapping_df

def verify_image_existence(mapping_df, png_dir):
    """
    Verify that the PNG files corresponding to the image IDs exist in the given directory.
    
    Args:
        mapping_df: DataFrame containing the mapping between reports and images
        png_dir: Directory containing the PNG image files
    
    Returns:
        DataFrame with an additional column indicating if the image file exists
    """
    # Check if directory exists
    if not os.path.exists(png_dir):
        print(f"PNG directory not found: {png_dir}")
        if len(mapping_df) > 0:
            mapping_df['ImageExists'] = False
        return mapping_df
    
    # Check if DataFrame is empty
    if len(mapping_df) == 0:
        print("No mappings found to verify")
        return mapping_df
    
    # Add a column to track if the image file exists
    mapping_df['ImageExists'] = False
    mapped_to_png = []
    
    # Check each image ID
    for i, image_id in enumerate(mapping_df['ImageID']):
        # Try different potential file patterns
        for pattern in [
            f"{image_id}.png",  # Direct match
            f"{image_id.lower()}.png",  # Lowercase
            f"{image_id.replace('CXR', 'cxr')}.png",  # Different case format
            f"{image_id.split('_')[1]}.png"  # Just the IM-XXXX part
        ]:
            # Check for the file using a recursive search
            found = False
            for root, dirs, files in os.walk(png_dir):
                if pattern in files:
                    mapping_df.at[i, 'ImageExists'] = True
                    mapped_to_png.append(os.path.join(root, pattern))
                    found = True
                    break
            
            if found:
                break
    
    # Print a summary
    existing_count = mapping_df['ImageExists'].sum()
    total_count = len(mapping_df)
    if total_count > 0:
        print(f"Found {existing_count} out of {total_count} images ({existing_count/total_count*100:.2f}%)")
        if existing_count > 0:
            print(f"Sample matched PNG paths:")
            for path in mapped_to_png[:5]:  # Show first 5 examples
                print(f"  {path}")
    else:
        print("No images to verify")
    
    return mapping_df

def main():
    # Set the directories containing the XML reports and PNG images
    reports_dir = input("Enter the directory containing XML reports: ")
    png_dir = input("Enter the directory containing PNG images: ")
    
    # Create the mapping
    mapping_df = map_reports_to_images(reports_dir)
    
    # Verify image existence
    mapping_df = verify_image_existence(mapping_df, png_dir)
    
    # Save the updated mapping
    mapping_df.to_csv('verified_mapping_results.csv', index=False)
    print("Verified mapping saved to verified_mapping_results.csv")
    
    # Display some statistics
    print("\nMapping Statistics:")
    print(f"Total reports: {mapping_df['ReportID'].nunique()}")
    print(f"Total images: {len(mapping_df)}")
    
    # Avoid division by zero
    unique_reports = mapping_df['ReportID'].nunique()
    if unique_reports > 0:
        print(f"Average images per report: {len(mapping_df) / unique_reports:.2f}")
        
        # Display reports with multiple images
        report_counts = mapping_df['ReportID'].value_counts()
        multi_image_reports = report_counts[report_counts > 1]
        print(f"\nReports with multiple images: {len(multi_image_reports)}")
        if len(multi_image_reports) > 0:
            print("Top 5 reports with the most images:")
            print(multi_image_reports.head())
    else:
        print("No reports found to calculate statistics.")

if __name__ == "__main__":
    main()

Enter the directory containing XML reports:  /Users/nuthankishoremaddineni/Downloads/ecgen-radiology
Enter the directory containing PNG images:  /Users/nuthankishoremaddineni/Downloads/NLMCXR_png


Found 3955 potential XML files in /Users/nuthankishoremaddineni/Downloads/ecgen-radiology
Found report CXR162 with 2 images
Found report CXR1390 with 2 images
Found report CXR604 with 2 images
Found report CXR2699 with 2 images
Found report CXR2841 with 1 images
Found report CXR3587 with 2 images
Found report CXR2855 with 2 images
Found report CXR3593 with 2 images
Found report CXR88 with 1 images
Found report CXR610 with 1 images
Found report CXR1384 with 2 images
Found report CXR176 with 2 images
Found report CXR638 with 3 images
Found report CXR1435 with 2 images
Found report CXR3222 with 2 images
Found report CXR2882 with 2 images
Found report CXR3544 with 2 images
Found report CXR1353 with 2 images
Found report CXR1347 with 2 images
Found report CXR3550 with 1 images
Found report CXR2128 with 2 images
Found report CXR3236 with 2 images
Found report CXR1421 with 1 images
Found report CXR2100 with 2 images
Found report CXR1409 with 2 images
Found report CXR63 with 2 images
Found rep