#### Train and Test split including Prompts

In [None]:
import json
import random
import re


# Path of the JSON file that is loaded as the source dataset.
INPUT_JSON = "radgraph_processed.json"
# Filename prefix of the outputs; "train.json" / "test.json" are appended to it.
OUTPUT_JSON = "slava_llava_split_"
TEST_SPLIT = 0.05  # 5% test split

# Placeholder placed at the very start of every generated prompt.
# NOTE(review): presumably substituted with image features by the downstream
# trainer - confirm against the training code.
IMAGE_TOKEN = "<image>"


# Instruction pool for the recognition task (describe the abnormal findings).
# One entry is drawn at random for each study.
recognition_prompts = [
    "Enumerate all abnormal radiographic findings seen on the frontal and lateral chest X-rays, along with their precise anatomical locations.",
    "List every visible pathology in the lungs, heart, pleura, and bones, as observed on both frontal and lateral views.",
    "Describe only the radiographic abnormalities visible in these dual-view chest X-rays. Exclude normal structures.",
    "Identify and localize any abnormal opacities, effusions, consolidations, or structural deviations present in the chest radiographs.",
    "Specify all observed abnormalities in the dual chest views, including their type (e.g., mass, effusion, opacity) and anatomical location.",
    "Report abnormal findings only from the provided frontal and lateral chest X-ray images. Do not describe normal appearances.",
    "What pathological signs can be identified across both X-ray views? Be specific about laterality and anatomical regions.",
    "Describe all clinically relevant radiographic findings, focusing on abnormalities in the lungs, mediastinum, and chest wall.",
    "From the dual-view chest radiographs, list any deviations from normal radiographic anatomy or pathology that requires clinical attention.",
    "Summarize all abnormal chest X-ray findings, organized by anatomical region (e.g., lungs, heart, pleura, bones)."
]


# Instruction pool for the reasoning task (diagnostic impression).
# One entry is drawn at random for each study.
reasoning_prompts = [
    "Based on the findings in the frontal and lateral chest X-rays, what is the most likely clinical diagnosis?",
    "Interpret the radiographic abnormalities observed in both views and explain their clinical implications.",
    "Given the dual-view chest radiographs, what is your diagnostic impression and reasoning behind it?",
    "Using the observed abnormalities in these X-rays, infer the likely pathology and explain its clinical relevance.",
    "What clinical condition best explains the abnormal findings visible in these frontal and lateral chest radiographs?"
]
def impression_starts_with_normal_phrases(impression):
    """Return True when the impression opens with a "normal study" phrase.

    The test is case-insensitive and anchored at the start of the text after
    surrounding whitespace is stripped. Recognised openers are the whole words
    "no" (which also covers "no evidence ..." and "no acute ...") and
    "normal". Word boundaries keep words such as "nodule" from matching.

    Args:
        impression: Impression text; may be None or empty.

    Returns:
        bool: True if the text starts with one of the normal openers, False
        otherwise (including for None or empty input).
    """
    impression_text = impression.strip() if impression else ""
    # re.match anchors at position 0, so no explicit "^" is needed.
    return re.match(r"(?:no|normal)\b", impression_text, re.IGNORECASE) is not None
def infer_view_hint(findings_text):
    """Return a one-sentence hint naming the views mentioned in the findings.

    The text is upper-cased and searched for the stand-alone tokens "PA" and
    "AP". Whole-word matching is required: a plain substring test would fire
    on ordinary report vocabulary such as "opacity", "patient" or "apical".

    Args:
        findings_text: Findings section of the report; None is treated as
            empty text.

    Returns:
        str: A sentence mentioning PA and/or AP when those tokens occur,
        otherwise the generic "Frontal and lateral" sentence.
    """
    text_upper = (findings_text or "").upper()
    has_pa = re.search(r"\bPA\b", text_upper) is not None
    has_ap = re.search(r"\bAP\b", text_upper) is not None

    if has_pa and has_ap:
        return "PA and AP chest X-ray views are shown."
    if has_pa:
        return "PA and lateral chest X-ray views are shown."
    if has_ap:
        return "AP and lateral chest X-ray views are shown."
    return "Frontal and lateral chest X-ray views are shown."

# Load RadGraph-processed JSON
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

output = []
# Counters referenced only by the optional class-balancing variant that is
# kept (commented out) at the end of the loop body.
normalcount = 0
abnormalCount = 0
for study_id, entry in data.items():
    # `or` fallbacks also cover keys that are present but null in the JSON,
    # which would otherwise crash on `.strip()` / `len()`.
    findings = (entry.get("findings") or "").strip()
    impression = (entry.get("impression") or "").strip()
    image_paths = entry.get("image_paths") or []
    if not findings or not impression:
        continue
    if len(image_paths) < 2:
        continue  # Need both frontal and lateral views

    # Views are taken by position.
    # NOTE(review): assumes image_paths[0] is the frontal view and
    # image_paths[1] the lateral view - confirm against the producer of
    # INPUT_JSON.
    frontal, lateral = image_paths[0], image_paths[1]

    # Dynamic view hint based on findings
    view_hint = infer_view_hint(findings)

    # Build prompts with image tokens and helpful context
    recognition_prompt = f"{IMAGE_TOKEN}{view_hint} {random.choice(recognition_prompts)}"
    reasoning_prompt = f"{IMAGE_TOKEN}{view_hint} {random.choice(reasoning_prompts)}"
    item = {
        "frontal": frontal,
        "lateral": lateral,
        "recognition_input": recognition_prompt,
        "reasoning_input": reasoning_prompt,
        "findings": findings,
        "impression": impression
    }
    output.append(item)

    # Optional balancing variant (disabled): cap normal studies at 100.
    # if impression_starts_with_normal_phrases(impression) and normalcount<100:
    #     output.append(item)
    #     normalcount+=1
    # elif not impression_starts_with_normal_phrases(impression):
    #     abnormalCount+=1
    #     output.append(item)

# Shuffle and split into train/test.
# The RNG is not seeded here, so the split differs from run to run.
random.shuffle(output)
split_index = int(len(output) * (1 - TEST_SPLIT))
train_data = output[:split_index]
test_data = output[split_index:]


with open(OUTPUT_JSON + 'train.json', "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=2)

with open(OUTPUT_JSON + 'test.json', "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=2)


print(f"Saved {len(train_data)} train, {len(test_data)} test")
print(f"   '{OUTPUT_JSON}train.json'")
print(f"   '{OUTPUT_JSON}test.json'")



#### Generate Train Json For Recognition

In [None]:
import json

# Convert the train split into conversation records for the recognition task:
# the human turn is the stored recognition prompt, the gpt turn is the findings.
input_json = "slava_llava_split_train.json"
output_json = "slava_llava_recognition.json"

with open(input_json, "r") as src:
    data = json.load(src)

converted = []

for record in data:
    # Read every required field up front; a record missing any of them is
    # reported and skipped.
    try:
        image_front = record["frontal"]
        image_side = record["lateral"]
        question = record["recognition_input"]
        answer = record["findings"]
    except KeyError as missing:
        print(f"[Skip] Missing key {missing} in item")
        continue

    # Records whose findings are blank carry no target text and are dropped.
    if answer.strip():
        converted.append({
            "frontal": image_front,
            "lateral": image_side,
            "conversations": [
                {"from": "human", "value": question},
                {"from": "gpt", "value": answer},
            ],
        })

with open(output_json, "w") as dst:
    json.dump(converted, dst, indent=2)

print(f"Converted {len(converted)} samples to {output_json}")


#### Generate Train Json For Reasoning

In [None]:
import json
from collections import defaultdict
import random

import re

input_json = "slava_llava_split_train.json"
output_json = "slava_llava_reasoning.json"


with open(input_json, "r") as f:
    data = json.load(f)

# Phrases that mark an impression as "normal". They are matched as whole
# words: a plain substring test for "normal" would also hit "abnormal" and
# "abnormality" and push clearly abnormal studies into the normal bucket.
_NORMAL_IMPRESSION_RE = re.compile(r"\b(?:no acute|normal|no evidence)\b")

# Split the samples into "normal" / "abnormal" buckets by impression text.
sample_buckets = defaultdict(list)
for item in data:
    impression = item.get("impression", "").lower()
    is_normal = _NORMAL_IMPRESSION_RE.search(impression) is not None
    sample_buckets["normal" if is_normal else "abnormal"].append(item)

print(f"Original counts - Normal: {len(sample_buckets['normal'])}, Abnormal: {len(sample_buckets['abnormal'])}")


# Upper bound on the number of extra prompt variants emitted per abnormal sample.
ABNORMAL_OVERSAMPLE_FACTOR = 3
# Templates that combine the findings text with a reasoning instruction.
ENHANCED_PROMPT_TEMPLATES = [
    "GIVEN THESE FINDINGS: {findings}\n{original_instruction}",
    "THE RADIOLOGIST NOTED: {findings}\nBASED ON THIS, {original_instruction}",
    "CLINICAL CONTEXT: {findings}\nPLEASE PROVIDE YOUR ANALYSIS: {original_instruction}",
    "{original_instruction}\nRELEVANT FINDINGS INCLUDE: {findings}"
]
# Instruction pool; build_enhanced_prompt draws one entry at random per call.
REASONING_PROMPTS = [
    "<image> ANALYZE THE DUAL-VIEW CHEST RADIOGRAPHS AND DESCRIBE THE MOST CLINICALLY SIGNIFICANT FINDINGS.",
    "<image> IDENTIFY AND PRIORITIZE THE TOP 3 RADIOGRAPHIC ABNORMALITIES THAT REQUIRE CLINICAL ATTENTION.",
    "<image> COMPARE THE CURRENT STUDY WITH PRIOR IMAGING (IF AVAILABLE). WHAT INTERVAL CHANGES ARE MOST CONCERNING?",
    "<image> WHAT RADIOGRAPHIC SIGNS SUGGEST DECOMPENSATION IN THIS PATIENT'S CONDITION?",
    "<image> WHICH FINDINGS WOULD YOU IMMEDIATELY REPORT TO THE TREATING PHYSICIAN AND WHY?",
    "<image> ASSESS THE POSITIONING AND PLACEMENT OF ALL TUBES, LINES, AND DEVICES.",
    "<image> DESCRIBE ANY FINDINGS THAT SUGGEST ACUTE VERSUS CHRONIC PATHOLOGICAL PROCESSES.",
    "<image> WHAT DIFFERENTIAL DIAGNOSES WOULD YOU CONSIDER BASED ON THESE RADIOGRAPHIC FINDINGS?",
    "<image> EVALUATE THE CARDIOPULMONARY STATUS AND COMMENT ON ANY DECOMPENSATION SIGNS.",
    "<image> IDENTIFY ANY FINDINGS THAT MAY REQUIRE IMMEDIATE INTERVENTION VERSUS FOLLOW-UP MONITORING."
]


converted = []

def build_enhanced_prompt(item):
    """Build the prompt variants for one sample, embedding its findings.

    A single instruction is drawn at random from REASONING_PROMPTS and reused
    for every variant.

    Args:
        item: Mapping for one sample; its "findings" entry is optional.

    Returns:
        list[str]: One prompt per entry of ENHANCED_PROMPT_TEMPLATES, in
        template order. When the findings are missing or blank, a
        single-element list holding only the drawn instruction.
    """
    base_instruction = random.choice(REASONING_PROMPTS)
    findings_text = item.get("findings", "").strip()

    if findings_text:
        variants = []
        for tmpl in ENHANCED_PROMPT_TEMPLATES:
            variants.append(
                tmpl.format(
                    findings=findings_text,
                    original_instruction=base_instruction,
                )
            )
        return variants

    # Nothing to embed: fall back to the bare instruction.
    return [base_instruction]


def _make_reasoning_sample(item, prompt, category):
    """Build one conversation record answering `prompt` with the impression.

    Raises:
        KeyError: if the item lacks "frontal", "lateral" or "impression".
    """
    return {
        "frontal": item["frontal"],
        "lateral": item["lateral"],
        "conversations": [
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": item["impression"]}
        ],
        "category": category
    }


# Normal studies: one record each, using the first prompt variant.
for item in sample_buckets["normal"]:
    try:
        enhanced_prompts = build_enhanced_prompt(item)
        converted.append(
            _make_reasoning_sample(item, enhanced_prompts[0], "normal")
        )
    except KeyError as e:
        print(f"[Skip] Missing key {e} in normal sample")


# Abnormal studies: one record with the first variant, plus up to
# ABNORMAL_OVERSAMPLE_FACTOR further records built from the remaining variants.
for item in sample_buckets["abnormal"]:
    try:
        enhanced_prompts = build_enhanced_prompt(item)
        samples = [
            _make_reasoning_sample(item, enhanced_prompts[0], "abnormal_original")
        ]

        # Start at index 1: variant 0 is already used for "abnormal_original",
        # so slicing from 0 would emit a byte-identical duplicate record and
        # never reach the last template.
        extra_prompts = enhanced_prompts[1:1 + ABNORMAL_OVERSAMPLE_FACTOR]
        for i, prompt in enumerate(extra_prompts, start=1):
            samples.append(
                _make_reasoning_sample(item, prompt, f"abnormal_enhanced_{i}")
            )

        # Extend only once every record was built, so a sample with a missing
        # key contributes nothing.
        converted.extend(samples)

    except KeyError as e:
        print(f"[Skip] Missing key {e} in abnormal sample")


random.shuffle(converted)

with open(output_json, "w") as f:
    json.dump(converted, f, indent=2)


# Per-category tally of the generated records.
category_counts = defaultdict(int)
for item in converted:
    category_counts[item["category"]] += 1

print("\nFinal Dataset Composition:")
for category, count in category_counts.items():
    print(f"- {category}: {count} samples")
print(f"\nSaved {len(converted)} samples to {output_json}")

#### Generate Train Json For Reporting

In [None]:
import json
from collections import defaultdict
import random

import re

input_json = "slava_llava_split_train.json"
output_json = "slava_llava_report.json"

with open(input_json, "r") as f:
    data = json.load(f)

# Phrases that mark an impression as "normal". They are matched as whole
# words: a plain substring test for "normal" would also hit "abnormal" and
# "abnormality" and push clearly abnormal studies into the normal bucket.
_NORMAL_IMPRESSION_RE = re.compile(r"\b(?:no acute|normal|no evidence)\b")

# Split the samples into "normal" / "abnormal" buckets by impression text.
sample_buckets = defaultdict(list)
for item in data:
    impression = item.get("impression", "").lower()
    is_normal = _NORMAL_IMPRESSION_RE.search(impression) is not None
    sample_buckets["normal" if is_normal else "abnormal"].append(item)

print(f"Original counts - Normal: {len(sample_buckets['normal'])}, Abnormal: {len(sample_buckets['abnormal'])}")


# Number of EXTRA copies generated per abnormal sample, on top of the original
# record (so each abnormal study appears ABNORMAL_OVERSAMPLE_FACTOR + 1 times).
ABNORMAL_OVERSAMPLE_FACTOR = 4
# Instruction pool for report generation; one entry is drawn at random per record.
REPORT_INSTRUCTIONS = [
    "<image> Describe the findings in these frontal and lateral chest X-rays in a structured radiology report format.",
    "<image> Generate a complete radiology report for these CXRs including findings and impression.",
    "<image> Interpret these chest X-rays and provide a professional radiology report.",
    "<image> What abnormalities are visible in these CXRs? Provide a structured report.",
    "<image> Analyze these frontal and lateral chest X-rays and summarize the key findings.",
    "<image> Prepare a radiology report for these images with findings and clinical impression.",
    "<image> Evaluate these CXRs and document your observations in standard report format.",
    "<image> Provide a detailed interpretation of these chest X-rays with findings and conclusion.",
    "<image> Write a radiology report for these images following clinical documentation standards.",
    "<image> Identify and describe any pathological findings in these CXRs in report format."
]


converted = []


def _make_report_sample(item, category):
    """Build one report-generation record with a randomly drawn instruction.

    The target text is the findings followed by the impression, each under its
    own header and separated by a newline so the two sections do not run
    together.

    Raises:
        KeyError: if the item lacks "findings", "impression", "frontal" or
            "lateral".
    """
    findings = item["findings"]
    impression = item["impression"]
    report_text = f"FINDINGS: {findings}\nIMPRESSION: {impression}"
    return {
        "frontal": item["frontal"],
        "lateral": item["lateral"],
        "conversations": [
            {"from": "human", "value": random.choice(REPORT_INSTRUCTIONS)},
            {"from": "gpt", "value": report_text}
        ],
        "category": category
    }


# Normal studies: one record each.
for item in sample_buckets["normal"]:
    try:
        converted.append(_make_report_sample(item, "normal"))
    except KeyError as e:
        print(f"[Skip] Missing key {e} in normal sample")


# Abnormal studies: the original record plus ABNORMAL_OVERSAMPLE_FACTOR extra
# copies, each with an independently drawn instruction.
for item in sample_buckets["abnormal"]:
    try:
        samples = [_make_report_sample(item, "abnormal_original")]
        for i in range(ABNORMAL_OVERSAMPLE_FACTOR):
            samples.append(_make_report_sample(item, f"abnormal_enhanced_{i+1}"))
        # Extend only once every record was built, so a sample with a missing
        # key contributes nothing.
        converted.extend(samples)

    except KeyError as e:
        print(f"[Skip] Missing key {e} in abnormal sample")

random.shuffle(converted)


with open(output_json, "w") as f:
    json.dump(converted, f, indent=2)


# Per-category tally of the generated records.
category_counts = defaultdict(int)
for item in converted:
    category_counts[item["category"]] += 1

print("\nFinal Dataset Composition:")
for category, count in category_counts.items():
    print(f"- {category}: {count} samples")
print(f"\n Saved {len(converted)} samples to {output_json}")