In [1]:
import pandas as pd
import re

In [None]:
# keeping the rows if it has clinical findings or xray issues.
def row_has_clinical_findings(row):
    impression = str(row.get('impression', ''))
    findings = str(row.get('findings', ''))
    
    impression_lower = impression.lower()
    findings_lower = findings.lower()
    combined_lower = (impression_lower + " " + findings_lower).strip()
    
    # skipping empty or very short impressions.
    if len(impression.strip()) < 15:
        return False
    

    # pathology patterns.
    pathology_patterns = [
        # masses and growths.
        r'\b(?:mass|tumor|lesion|growth|nodule|cyst)\b',
        # trauma.
        r'\b(?:fracture|dislocation|subluxation|tear)\b',
        # infections.
        r'\b(?:pneumonia|infection|abscess|septic)\b',
        # fluid or swelling.
        r'\b(?:effusion|edema|hematoma|hemorrhage|bleeding)\b',
        # vascular.
        r'\b(?:thrombosis|embolism|aneurysm|ischemi[ac])\b',
        # structural.
        r'\b(?:stenosis|obstruction|compression|impingement)\b',
        r'\b(?:hydrocephalus|herniation)\b',
        # cancer-related.
        r'\b(?:metasta\w+|malignan\w+|carcinom\w+|adenoma)\b',
        # lung-specific.
        r'\b(?:atelectasis|consolidation|pneumothorax|infiltrate)\b',
        # degenerative.
        r'\b(?:degenerative|arthritis|spondylosis)\b',
        # abnormality descriptors.
        r'\b(?:abnormal|pathologic|enlarged|thickened|dilated)\b',
        r'\b(?:opacity|lucency|sclerosis|infarct\w*|necrosis)\b',
    ]

    # strongly show the study has no problems.
    normal_indicators = [
        r'\bunremarkable\s+exam\b',
        r'\bnormal\s+exam\b',
        r'\bwithin\s+normal\s+limits\b',
        r'\banatomic\s+alignment\b',
        r'\bno\s+evidence\s+of\s+(?:metastatic\s+)?disease\b',
        r'\bno\s+acute\s+(?:intracranial\s+)?abnormalit',
        r'\bno\s+acute\s+chest\s+syndrome\b',
        r'\bno\s+evidence\s+of\s+pneumoperitoneum\b',
        r'\bcontinue\s+routine\s+screening\b',
        r'\bcomponents?\s+in\s+anatomic\s+alignment\b',
    ]
    
    # strict patterns that we check in numbered points and general text.
    strict_patterns = [
        # no evidence patterns.
        r'\bno\s+(?:evidence|definite\s+evidence|ct\s+evidence|mammographic\s+evidence|conclusive\s+evidence)\b',
        r'\bno\s+(?:interval\s+)?change\b',
        r'\bno\s+acute\b',
        
        # medical negations.
        r'\bno\s+(?:pneumothorax|hemorrhage|bowel\s+obstruction|leak|orbital|eye)\b',
        r'\bno\s+(?:enlarged|suspicious)\s+lymph\b',
        r'\bno\s+(?:intracranial|acute|significant)\s+(?:hemorrhage|bleeding|abnormality)\b',
        r'\bno\s+(?:lung|orbital|eye)\s+abnormality\b',
        r'\bno\s+metastases\b',
        
        # patterns for common medical negations.
        r'\bno\s+evidence\s+(?:of|for)\s+(?:acute|chronic|significant|definite|conclusive|recurrent)?\s*\w+',
        r'\bno\s+(?:acute|chronic|significant|definite|suspicious|obvious|conclusive)\s+\w+',
        r'(?:^|\.\s*|\d+\.\s*)no\s+\w+',
        
        # patterns for specific cases.
        r'(?:lesions?\s+likely\s+benign|changes?\s+unchanged)',
        r'hernia\s+without\s+obstruction',
        r'mild\s+chronic.*changes',
        r'follow-?up\s+(?:recommended|advised)',
        
        # patterns for mild/minimal findings with improvement.
        r'mild\s+\w+(?:\s+\w+)?\s+(?:but\s+)?(?:significantly\s+)?improved',
        r'minimal\s+\w+(?:\s+\w+)?\s+(?:but\s+)?(?:significantly\s+)?improved',
        
        # pstterns for unchanged findings.
        r'\bunchanged\b(?!\s+(?:from|compared))',
        r'(?:remain|appears?|essentially|grossly)\s+unchanged',
        
        # patterns for non-significant findings,
        r'nondisplaced\s+(?:fracture|break)',
        r'\w+\s+(?:but|however)\s+(?:significantly\s+)?improved',
    ]
    

    def has_mostly_negative_statements(text):
        statements = []
        for part in text.split(';'):
            sub_parts = re.split(r'\s+(?:but|however)\s+', part)
            statements.extend(sub_parts)
            
        negative_count = 0
        for stmt in statements:
            stmt = stmt.strip().lower()
            # for negative patterns.
            if any(re.search(pattern, stmt) for pattern in strict_patterns):
                negative_count += 1
                
        # if we have multiple statements, require more positives.
        if len(statements) > 1:
            return negative_count >= len(statements) / 2  
        return negative_count > 0 
    
    # checking if the text contains a positive finding.
    def has_positive_finding(text):
        parts = text.split("but")
        text_to_check = parts[-1].strip() if len(parts) > 1 else ""
        

        if text_to_check:
            return any(re.search(p, text_to_check) for p in pathology_patterns)
        return False
    
    # checking for normal indicators.
    for pattern in normal_indicators:
        if re.search(pattern, impression_lower):
            return False
            
    # check ingif the text has mostly negative statements.
    if has_mostly_negative_statements(impression_lower):
        return False
        

    for pattern in strict_patterns:
        if re.search(pattern, impression_lower):
            if not has_positive_finding(impression_lower):
                return False
    

    sections = re.split(r'\d+\.\s*', impression_lower)
    for section in sections:
        if section.strip():
            for pattern in strict_patterns:
                if re.search(pattern, section.strip()):
                    if not has_positive_finding(section):
                        return False
    
    # statements that don't really show clinical findings.
    vague_patterns = [
        r'^enteric\s+tube\s+as\s+above\.?\s*$',
        r'^tube\s+(?:placement|position)',
        r'^status\s+post\s+.*\s+appearing\s+similar',
    ]
    
    for pattern in vague_patterns:
        if re.search(pattern, impression_lower.strip()):
            return False
    
    # checking if there are any pathology patterns in the text.
    has_pathology = any(re.search(pattern, impression_lower) 
                       for pattern in pathology_patterns)
    
    has_pathology = any(re.search(pattern, impression_lower) 
                       for pattern in pathology_patterns)
    
    # checking if the impression is just negatives.
    pathology_mention_count = sum(1 for pattern in pathology_patterns 
                                  if re.search(pattern, impression_lower))
    
    no_evidence_phrases = len(re.findall(
        r'\bno\s+(?:evidence|significant|definite|acute)\b', 
        impression_lower
    ))
    
    # checking if the impression is dominated by no evidence statements if it has few or no pathology mentions, I remove it.
    if no_evidence_phrases >= 2 and pathology_mention_count == 0:
        return False
    
    # if theimpression starts with no significant and that's the main message.
    if impression_lower.strip().startswith('no significant'):
        if not re.search(r'\d+\.\s*\w+.*(?:fracture|mass|tumor|stenosis)', impression_lower):
            return False
    
    return has_pathology

In [3]:
# Filtering the data to keep only rows with clinical findings.
def filter_xray_data(input_file, output_file, verbose=True):
    if verbose:
        print(f"Reading data from {input_file}.")
    
    df = pd.read_csv(input_file, index_col=0)
    original_count = len(df)
    
    if verbose:
        print(f"Original dataset size: {original_count:,} rows")
        print("Filtering the dataset.")
    
    filtered_df = df[df.apply(row_has_clinical_findings, axis=1)]
    filtered_count = len(filtered_df)
    removed_count = original_count - filtered_count
    
    if verbose:
        print(f"\n{'='*70}")
        print(f"Filtering results:")
        print(f"{'='*70}")
        print(f"Original rows:  {original_count:,}")
        print(f"Kept rows:      {filtered_count:,} ({100 * filtered_count / original_count:.1f}%)")
        print(f"Removed rows:   {removed_count:,} ({100 * removed_count / original_count:.1f}%)")
        print(f"{'-'*70}\n")
    

    if verbose:
        print(f"Saving filtered data to {output_file}.")
    
    filtered_df.to_csv(output_file)
    
    if verbose:
        print("Done!\n")
        

        removed_df = df[~df.apply(row_has_clinical_findings, axis=1)]
        
        if len(removed_df) > 0:
            print(f"{'='*70}")
            print("Sample of the removed rows (Normal/Unremarkable).")
            print(f"{'-'*70}")
            for i, (idx, row) in enumerate(removed_df.head(5).iterrows(), 1):
                print(f"\n[{i}] {row['report_id']}")
                print(f"    {row['impression'][:150]}...")
        
        if len(filtered_df) > 0:
            print(f"\n{'='*70}")
            print("Sample kept rows (clinical findings present).")
            print(f"{'='*70}")
            for i, (idx, row) in enumerate(filtered_df.head(5).iterrows(), 1):
                print(f"\n[{i}] {row['report_id']}")
                print(f"    {row['impression'][:150]}...")
        
        print(f"\n{'='*70}\n")
    
    return filtered_df

In [4]:
if __name__ == "__main__":
    INPUT_FILE = "/Users/casey/Documents/GitHub/LLM_Healthcare/Radiology_cleaning/Rad_filtered_data_final_v4.csv"
    OUTPUT_FILE = "/Users/casey/Documents/GitHub/LLM_Healthcare/Radiology_cleaning/Rad_filtered_data_final_v5.csv"
    
    print("X-Ray Data Filtering Script v2.0")
    print("-"*70 + "\n")
    
    filtered_data = filter_xray_data(INPUT_FILE, OUTPUT_FILE, verbose=True)
    
    print(f"Filtered dataset saved to: {OUTPUT_FILE}")

X-Ray Data Filtering Script v2.0
----------------------------------------------------------------------

Reading data from /Users/casey/Documents/GitHub/LLM_Healthcare/Radiology_cleaning/Rad_filtered_data_final_v4.csv.
Original dataset size: 248,041 rows
Filtering the dataset.

Filtering results:
Original rows:  248,041
Kept rows:      226,213 (91.2%)
Removed rows:   21,828 (8.8%)
----------------------------------------------------------------------

Saving filtered data to /Users/casey/Documents/GitHub/LLM_Healthcare/Radiology_cleaning/Rad_filtered_data_final_v5.csv.
Done!

Sample of the removed rows (Normal/Unremarkable).
----------------------------------------------------------------------

[1] RAD_19
    1. Syringohydromyelia of the cervical spinal cord extending from C2 through C6 is approximately unchanged relative to the prior examination. Also unch...

[2] RAD_33
    Nondisplaced fracture angle of the left mandible....

[3] RAD_35
    Patchy atelectasis bilaterally in a backg