In [None]:
%matplotlib inline

## Imports
import glob
import os
import pickle
import pandas as pd
from pandarallel import pandarallel
from PIL import Image
from tqdm import tqdm
from PIL import UnidentifiedImageError
import stanza

# Initialise pandarallel with its progress bar switched on.
pandarallel.initialize(progress_bar=True)

In [None]:
# Root of the project storage; every other location below is derived from it.
PROJECT_DIR = '/vol/aimspace/projects/practical_WS2425/vision_language'

# NLMCXR dataset layout: the dataset root, its PNG images and its XML reports.
NLMCXR_DIR = os.path.join(PROJECT_DIR, 'data/nlmcxr')
NLMCXR_PNG_DIR = os.path.join(NLMCXR_DIR, 'images')
NLMCXR_REPORTS_DIR = os.path.join(NLMCXR_DIR, 'texts/ecgen-radiology')

# Base directory of the Stanza resources used by the tokenisation cells.
STANZA_DIR = os.path.join(PROJECT_DIR, 'stanza_resources')

# Report whether each dataset directory is present on this machine.
for _label, _path in (
    ("NLMCXR", NLMCXR_DIR),
    ("NLMCXR PNG", NLMCXR_PNG_DIR),
    ("NLMCXR REPORTS", NLMCXR_REPORTS_DIR),
):
    print(f"{_label} directory exists: {os.path.exists(_path)}")

In [None]:
def get_image_paths(image_dir=None):
    """Return the sorted paths of all source PNG images of the NLMCXR dataset.

    The search is rooted at the image directory rather than at the dataset
    root: the resize step of this notebook writes its ``*_resized.png`` files
    below ``NLMCXR_DIR/preprocessed``, so a search from ``NLMCXR_DIR`` would
    also count those once the notebook has been run before.

    Args:
        image_dir (str, optional): Directory to search recursively for
            ``.png`` files. Defaults to ``NLMCXR_PNG_DIR``.

    Returns:
        list[str]: Matching paths in ascending order.
    """
    if image_dir is None:
        image_dir = NLMCXR_PNG_DIR
    # Escape the directory so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(image_dir), '**', '*.png')
    return sorted(glob.glob(pattern, recursive=True))

# Collect the image paths once and report how many were found.
image_paths = get_image_paths()
print(f"Found {len(image_paths)} images")

In [None]:
def get_report_paths(reports_dir=None):
    """Return the sorted paths of all XML reports of the NLMCXR dataset.

    Args:
        reports_dir (str, optional): Directory to search recursively for
            ``.xml`` files. Defaults to ``NLMCXR_REPORTS_DIR``.

    Returns:
        list[str]: Matching paths in ascending order.
    """
    if reports_dir is None:
        reports_dir = NLMCXR_REPORTS_DIR
    # Escape the directory so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(reports_dir), '**', '*.xml')
    return sorted(glob.glob(pattern, recursive=True))

# Collect the report paths once; parse_xml_files() reads this list.
report_paths = get_report_paths()
print(f"Found {len(report_paths)} reports")

In [None]:
import xml.etree.ElementTree as ET

def parse_xml_files(xml_files=None):
    """
    Parse report XML files into a DataFrame with one row per referenced image.

    Args:
        xml_files (iterable of str, optional): Paths of the XML reports to
            parse. Defaults to the notebook-level ``report_paths`` list.

    Returns:
        pd.DataFrame: One row per ``parentImage`` element, with the columns
            ``file_path``, ``comparison``, ``indication``, ``findings``,
            ``impression``, ``image_file_path`` and ``image_caption``.
            A report section that is absent, blank or equal to "None." is
            stored as ``None``, as is a missing or empty caption.

    A file that cannot be processed is reported on stdout and contributes no
    rows; processing continues with the next file.
    """
    if xml_files is None:
        xml_files = report_paths

    # Maps the Label attribute of an AbstractText element to its output column.
    label_to_column = {
        "COMPARISON": "comparison",
        "INDICATION": "indication",
        "FINDINGS": "findings",
        "IMPRESSION": "impression",
    }

    data = []
    for xml_file in tqdm(xml_files):
        try:
            root = ET.parse(xml_file).getroot()

            # Start every section as None so that an absent section looks the
            # same as an empty one to later isnull()-based filtering.
            sections = dict.fromkeys(label_to_column.values())

            abstract = root.find(".//Abstract")
            if abstract is not None:
                for abstract_text in abstract.findall("AbstractText"):
                    column = label_to_column.get(abstract_text.get("Label"))
                    if column is None:
                        # Labels other than the four known sections are ignored.
                        continue
                    text_value = abstract_text.text
                    # Blank text and the literal placeholder "None." both mean
                    # that the section has no content.
                    if text_value is None or text_value.strip() in ("", "None."):
                        text_value = None
                    sections[column] = text_value

            # Collect this report's rows separately so that a failure part-way
            # through cannot leave a partial set of rows in the result.
            rows = []
            for parent_image in root.findall(".//parentImage"):
                image_id = parent_image.get("id", "")
                caption = parent_image.find("caption")
                # A parentImage without a <caption> child has no caption text.
                caption_text = (caption.text or None) if caption is not None else None
                rows.append({
                    'file_path': xml_file,
                    **sections,
                    'image_file_path': os.path.join(NLMCXR_PNG_DIR, image_id + '.png'),
                    'image_caption': caption_text,
                })
            data.extend(rows)

        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error processing {xml_file}: {str(e)}")

    return pd.DataFrame(data)

# Parse all reports; the resulting frame has one row per image referenced in a report.
nlmcxr_df = parse_xml_files()

In [None]:
# Preview the first rows of the parsed reports.
nlmcxr_df.head()

In [None]:
# Print the per-column summary, then display the (rows, columns) shape.
nlmcxr_df.info() 
nlmcxr_df.shape

In [None]:
# Number of distinct values in each column.
nlmcxr_df.nunique()

In [None]:
# Count the null cells (None/NaN) in each column. Empty strings are not null,
# so they are not included in these counts.
null_cells_count = nlmcxr_df.isnull().sum()

print("Number of null cells:")
print(null_cells_count)

In [None]:
# A row is dropped when all four report sections are null at the same time.
mask_to_drop = (
    nlmcxr_df[['comparison', 'indication', 'findings', 'impression']]
    .isnull()
    .all(axis=1)
)

print(f"Number of rows with all report fields empty: {mask_to_drop.sum()}")

print("Before removing samples: ", nlmcxr_df.shape)

# Keep the remaining rows and renumber them from 0.
nlmcxr_df = nlmcxr_df[~mask_to_drop].reset_index(drop=True)

print("After removing samples: ", nlmcxr_df.shape)

In [11]:
def resize_image(img_path: str, max_size: int=512) -> "Image.Image | None":
    """Load an image and shrink it to fit within ``max_size`` x ``max_size``.

    The resizing is done with ``Image.thumbnail`` on an in-memory copy, so the
    file on disk is not modified.

    Args:
        img_path: Path of the image file to read.
        max_size: Upper bound, in pixels, passed to ``thumbnail`` for both
            width and height.

    Returns:
        The resized PIL image, or ``None`` when the file could not be opened
        or decoded (``UnidentifiedImageError`` / ``OSError``); in that case a
        message is printed.
    """
    try:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied, instead of leaving the handle open until the
        # image object happens to be garbage collected.
        with Image.open(img_path) as source_image:
            img_copy = source_image.copy()
        img_copy.thumbnail((max_size, max_size))
        return img_copy
    except (UnidentifiedImageError, OSError):
        print(f"Skipping corrupt image file: {img_path}")
        return None

In [12]:
def resize_and_save_images(row, target_size=512):
    """Create the resized copy of one row's image and record where it lives.

    The resized file is written to
    ``NLMCXR_DIR/preprocessed/<sub-directory relative to NLMCXR_PNG_DIR>/<name>_resized.png``.
    An already existing resized file is reused and not regenerated.

    Side effects: creates the target directory, may write a PNG file, and
    updates the ``image_file_path_resized`` column of the notebook-level
    ``nlmcxr_df`` at index ``row.name``.

    Args:
        row: A row of ``nlmcxr_df`` (e.g. from ``iterrows``); its
            ``image_file_path`` entry and ``name`` (index label) are used.
        target_size (int): Maximum width/height passed to ``resize_image``.

    Returns:
        str or None: Path of the resized image, or ``None`` when the source
        image could not be read and no resized file exists.
    """
    img_path = row['image_file_path']

    # Mirror the image's location below NLMCXR_PNG_DIR inside 'preprocessed'.
    relative_path = os.path.relpath(img_path, NLMCXR_PNG_DIR)
    resized_dir = os.path.join(NLMCXR_DIR, 'preprocessed', os.path.dirname(relative_path))
    os.makedirs(resized_dir, exist_ok=True)

    stem = os.path.splitext(os.path.basename(img_path))[0]
    resized_image_path = os.path.join(resized_dir, f"{stem}_resized.png")

    # Resize and save the image only if the resized file does not exist yet.
    if not os.path.exists(resized_image_path):
        resized_image = resize_image(img_path, target_size)
        if resized_image is None:
            # Do not record a path to a file that was never written. If the
            # column already exists (e.g. from an earlier run), clear any
            # stale value; otherwise the cell is simply left unset.
            if 'image_file_path_resized' in nlmcxr_df.columns:
                nlmcxr_df.loc[row.name, 'image_file_path_resized'] = None
            return None
        resized_image.save(resized_image_path)

    # Record the path only once the resized file is known to exist.
    nlmcxr_df.loc[row.name, 'image_file_path_resized'] = resized_image_path
    return resized_image_path

In [None]:
# Resize every image. resize_and_save_images() writes the resized path into
# nlmcxr_df itself, so its return value is not used here.
for _, row in tqdm(nlmcxr_df.iterrows(), total=len(nlmcxr_df)):
    resize_and_save_images(row)

In [None]:
# Remove the original image path column; errors='ignore' makes this a no-op
# when the column has already been dropped.
nlmcxr_df.drop(columns=['image_file_path'], inplace=True, errors='ignore')

# Lift pandas' display limits (all columns, all rows, full cell content and
# unrestricted total width) so that nothing is truncated in the output.
for _option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(_option, None)

# Show the first rows (head() returns 5 rows by default).
nlmcxr_df.head()

In [None]:

# Directory below STANZA_DIR that is handed to Stanza as its model directory.
model_dir = os.path.join(STANZA_DIR, 'stanza_en')

# Initialize the Stanza pipeline once; only the 'tokenize' processor is requested.
nlp = stanza.Pipeline(lang='en', processors='tokenize', model_dir=model_dir)

def tokenize_sentences(text: str) -> list:
    """Tokenise ``text`` with the ``nlp`` pipeline and return its words as one flat list.

    Despite the name, sentence boundaries are not kept: the words of all
    sentences are concatenated in document order.

    Args:
        text: The text to tokenise.

    Returns:
        list: The ``text`` attribute of every word, or an empty list when
        ``text`` is not a string or contains only whitespace.
    """
    # Anything that is not a non-blank string (None, NaN, "", "   ") has no tokens.
    if not (isinstance(text, str) and text.strip()):
        return []

    return [
        word.text
        for sentence in nlp(text).sentences
        for word in sentence.words
    ]

In [21]:
# Tokenise each text column into a sibling "<column>_tokenized" column.
for _column in ('comparison', 'indication', 'findings', 'impression', 'image_caption'):
    nlmcxr_df[f'{_column}_tokenized'] = nlmcxr_df[_column].apply(tokenize_sentences)


In [None]:
# Preview the frame including the new *_tokenized columns.
nlmcxr_df.head()

In [None]:
# Drop the five raw text columns (the four report sections and the image
# caption); errors='ignore' skips any column that is already gone.
nlmcxr_df.drop(
    columns=['comparison', 'indication', 'findings', 'impression', 'image_caption'],
    inplace=True,
    errors='ignore',
)

nlmcxr_df.head()

In [3]:
# Folder and file that receive the preprocessed DataFrame.
directory = os.path.join(PROJECT_DIR, 'data/interims')

file_path = os.path.join(directory, 'nlmcxr_preprocessed.pkl')

print("Target Directory:", directory)

# open() does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(directory, exist_ok=True)

# Save the DataFrame
with open(file_path, 'wb') as f:
    pickle.dump(nlmcxr_df, f)

In [5]:
# Read the pickle back to confirm that the saved file can be loaded.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
data_pkl = pd.read_pickle(file_path)

In [None]:
# Preview the first rows of the reloaded frame.
data_pkl.head()