#### Importing Required Libraries

In [4]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [177]:
# Load the raw report dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs on the original author's machine; consider a path relative to a configurable data directory.
data=pd.read_csv(r"C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\open_ave_data.csv")

In [178]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [148]:
# Drop the index-artifact columns left behind by earlier CSV round-trips.
# Rebinding `data` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the columns
# are gone. "Unnamed: 0.1" is included because the preview of the loaded file
# shows it alongside "Unnamed: 0".
data = data.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

In [149]:
# Keep only the free-text report column. Selecting with a list of labels
# yields a one-column DataFrame rather than a Series.
df = data.loc[:, ["ReportText"]]
df.head()

Unnamed: 0,ReportText
0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...
3,Exam: - CHEST-PORTABLE History: Chest pain Com...
4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...


In [150]:
# Flatten the report column into a plain list of strings, one per report.
# DataFrame.to_numpy().tolist() on a one-column frame yields [[text], [text], ...],
# so every prompt input was being rendered as the repr of a one-element list
# (brackets, quotes and escaped newlines included) instead of the report text.
l = df["ReportText"].tolist()

#### Setting up LLM

##### Loading environment variables / API keys to use the model

In [179]:
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# Presumably this supplies the Groq API key used by the model below — TODO confirm.
load_dotenv()



True

In [156]:
from langchain_groq import ChatGroq

# temperature=0 makes the extraction as repeatable as the API allows. The task is
# verbatim copying of report sections, so sampling variety only adds run-to-run
# noise to the evaluation scores computed further down.
llm=ChatGroq(model="llama-3.1-8b-instant", temperature=0)

#### Prompt Setup

In [157]:
from langchain_core.prompts import ChatPromptTemplate

#### Pydantic class setup and output parser

- For Data validation
- To ensure the output is in the required JSON format

In [158]:
from pydantic import BaseModel,Field
from langchain_core.output_parsers import PydanticOutputParser


In [159]:
# List the remaining column names; the extraction schema defined below reuses
# the four non-ReportText names as its field names.
data.columns

Index(['ReportText', 'findings', 'clinicaldata', 'ExamName', 'impression'], dtype='object')

In [160]:
# Schema for one extracted report: four flat string fields whose names match
# the ground-truth columns used in the evaluation section.
# (Documented with comments rather than a class docstring so that the schema
# generated from this model, and therefore the prompt text, stays unchanged.)
class FieldsExtraction(BaseModel):
    # Body of the report describing what the radiologist observed.
    findings: str = Field(
        description="Radiologist's technical observations",
    )
    # Why the exam was ordered.
    clinicaldata: str = Field(
        description="Reason for examination (e.g., symptoms like chest pain, shortness of breath)",
    )
    # Exam header, kept as a single string.
    ExamName: str = Field(
        description="Exam type and date",
    )
    # Closing summary of the report.
    impression: str = Field(
        description="Final diagnosis or summary",
    )

In [161]:
# Parser that validates the model's reply against the FieldsExtraction schema.
output_parser=PydanticOutputParser(pydantic_object=FieldsExtraction)
# Text describing the expected output schema.
# (A second, discarded get_format_instructions() call was removed here: it was not
# the cell's last expression, so it neither displayed nor stored anything.)
format_instructions = output_parser.get_format_instructions()
# NOTE(review): `result` is not referenced by any other visible cell; kept in case
# something outside this view relies on it.
result=[]

In [162]:
# Two-message chat prompt: a system message carrying the extraction rules plus one
# worked example, and a user message carrying the report text.
# Template variables: {format_instructions} and {input}.
# The worked example is written as a JSON object (it used to be `key = value` lines,
# which contradicted the "return ONLY a JSON object" rule the parser depends on).
# Literal JSON braces are doubled ({{ }}) so the template does not treat them as variables.
prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a helpful medical data extraction assistant.

            From the given "Report Text", extract the following fields and return ONLY a JSON object, and nothing else.
            Use the format described in {format_instructions}. Do not add explanations, Python code, or markdown formatting.

            Only return a JSON object with the following fields:

            ### Fields to be extracted:
            - findings
            - clinicaldata
            - ExamName
            - impression

            ### Example:

            Input:
            EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. CLINICAL HISTORY: Cough. COMPARISON: None. TECHNIQUE: 2 views. FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None. IMPRESSION: Normal 2-view chest radiography Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM.

            Output:
            {{
              "findings": "FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None.",
              "clinicaldata": "CLINICAL HISTORY: Cough.",
              "ExamName": "EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. TECHNIQUE: 2 views. COMPARISON: None.",
              "impression": "IMPRESSION: Normal 2-view chest radiography Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM."
            }}

            ### NOTE: Return all fields as flat strings — do not nest them or break into subfields like 'examdate' or 'examname'. Return 'ExamName' as a single string, just as found in the report. """),


            ("user", "{input}") ])

In [163]:
# Compose the pipeline: the prompt feeds the model, and the model's reply is
# parsed by output_parser (configured above for FieldsExtraction).
chain = prompt | llm | output_parser

In [None]:
import asyncio
from langchain_core.exceptions import OutputParserException
from tqdm import tqdm
import math
import time

# Number of requests sent per batch, chosen to stay below the per-minute request limit
# (presumably the API rate limit for the configured model — TODO confirm the actual quota).

BATCH_SIZE = 25
DELAY_BETWEEN_BATCHES = 60  # seconds to sleep between consecutive batches

async def process_async(index, text):
    """Run the extraction chain on one report and tag the result with its index.

    Args:
        index: Identifier echoed back in the result and used in error messages.
        text: Value passed to the chain as the prompt's ``input``.

    Returns:
        ``(index, response)`` on success, or ``(index, None)`` when the chain
        raised; the failure is printed rather than propagated.
    """
    try:
        parsed = await chain.ainvoke(
            {
                "input": text,
                "format_instructions": output_parser.get_format_instructions(),
            }
        )
    except OutputParserException as parse_err:
        # The model replied, but not in the expected format: show the raw reply.
        print(f"[OutputParseError] Index {index}:\n", parse_err.llm_output)
        return index, None
    except Exception as err:
        # Any other failure is logged and reported as a missing result.
        print(f"[Error] Index {index}: {err}")
        return index, None
    else:
        return index, parsed

async def run_all(text_list, start_idx=0):
    """Process every text concurrently and return the results in input order.

    Args:
        text_list: Texts to send through ``process_async``.
        start_idx: Index assigned to the first text; later texts count up from it.

    Returns:
        A list of ``(index, response_or_None)`` tuples, one per input text.
    """
    pending = []
    for position, text in enumerate(text_list, start=start_idx):
        pending.append(process_async(position, text))
    return await asyncio.gather(*pending)

async def run_batched(text_list):
    """Extract fields from every text in rate-limited batches, then retry failures once.

    Texts are processed BATCH_SIZE at a time with a DELAY_BETWEEN_BATCHES-second
    sleep between batches. Items that came back as None are retried once, again
    in BATCH_SIZE chunks and only after a further delay, so the retry requests do
    not land in the same rate-limit window as the batch that just ran.

    Args:
        text_list: Sequence of report texts; list positions serve as item indices.

    Returns:
        A list the same length as ``text_list`` holding each item's parsed
        result, or None where both the first attempt and the retry failed.
    """
    total = len(text_list)
    results = [None] * total
    failed_indices = []

    num_batches = math.ceil(total / BATCH_SIZE)

    for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start = batch_idx * BATCH_SIZE
        end = min(start + BATCH_SIZE, total)
        batch_texts = text_list[start:end]

        print(f"Running batch {batch_idx + 1}/{num_batches} ({len(batch_texts)} items)...")

        batch_results = await run_all(batch_texts, start_idx=start)

        # gather() preserves input order, so position i maps to index start + i.
        for i, res in enumerate(batch_results):
            idx = start + i
            results[idx] = res[1]
            if res[1] is None:
                failed_indices.append(idx)

        if batch_idx < num_batches - 1:
            print(f"Waiting {DELAY_BETWEEN_BATCHES}s before next batch...")
            await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    # Single retry pass over the failures.
    if failed_indices:
        print(f"\nRetrying {len(failed_indices)} failed items...")
        for chunk_start in range(0, len(failed_indices), BATCH_SIZE):
            chunk = failed_indices[chunk_start:chunk_start + BATCH_SIZE]

            # Always wait first: the previous batch (or retry chunk) has just
            # used up to BATCH_SIZE requests in the current window.
            print(f"Waiting {DELAY_BETWEEN_BATCHES}s before next batch...")
            await asyncio.sleep(DELAY_BETWEEN_BATCHES)

            # Call process_async directly so any error it logs carries the
            # item's original index instead of its position in the retry list.
            retry_results = await asyncio.gather(
                *(process_async(idx, text_list[idx]) for idx in chunk)
            )
            for idx, res in zip(chunk, retry_results):
                results[idx] = res[1]

    return results


In [None]:
import nest_asyncio
import asyncio

# Patch asyncio to allow re-entering an event loop that is already running.
nest_asyncio.apply()  

# Run the full batched extraction over the report list. The bare top-level
# `await` is only valid in an environment that supports it, such as a notebook cell.
final_results = await run_batched(l)



In [None]:
import pandas as pd


# Convert each parsed result to a plain dict; a missing result (None) becomes
# an empty dict so the report still occupies a row in the output.
cleaned_results = []
for parsed in final_results:
    cleaned_results.append(parsed.dict() if parsed else {})

# One row per report, written without the index column.
df = pd.DataFrame(cleaned_results)
df.to_csv("extracted_medical_reports2.csv", index=False)

print("Saved to 'extracted_medical_reports2.csv'")


Saved to 'extracted_medical_reports2.csv'


### Evaluation of extracted fields against ground truth values

In [5]:
# Load the extraction output that will be scored.
# NOTE(review): this reads 'Final_extracted_medical_reports.csv', whereas the extraction
# cell above writes 'extracted_medical_reports2.csv'; the step producing this file is not
# shown here — confirm how it is derived. The path is also absolute and machine-specific.
extracted_df=pd.read_csv(r'C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\Final_extracted_medical_reports.csv')

In [6]:
# Load the reference values that the extracted fields are compared against.
# NOTE(review): absolute, machine-specific path.
ground_truth_df=pd.read_csv(r'C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\ground_truth.csv')

In [7]:
# Column names compared between extracted_df and ground_truth_df by each metric below.
fields = ["findings", "clinicaldata", "ExamName", "impression"]

In [None]:
from bert_score import score as bert_score

# Compute BERTScore for each field separately.
# Both columns are coerced to plain strings first: an empty CSV cell (e.g. a row
# whose extraction failed) is loaded by read_csv as NaN, a float, and the scorer
# expects text for every candidate and reference.
bert_results = []

for field in fields:
    references = ground_truth_df[field].fillna("").astype(str).tolist()
    candidates = extracted_df[field].fillna("").astype(str).tolist()

    # P, R and F1 each hold one score per row; they are averaged per field below.
    P, R, F1 = bert_score(candidates, references, lang="en", verbose=False)

    bert_results.append({
        "Field": field,
        "BERTScore Precision": round(P.mean().item() * 100, 2),
        "BERTScore Recall": round(R.mean().item() * 100, 2),
        "BERTScore F1": round(F1.mean().item() * 100, 2)
    })

bert_df = pd.DataFrame(bert_results)
bert_df


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import precision_recall_fscore_support
from rouge_score import rouge_scorer

# ROUGE-1 and ROUGE-L F1 per field, treating each field as a separate generation task.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

rouge_results = []

# Compute ROUGE scores for each field
for field in fields:
    # Empty CSV cells load as NaN (a float); score them as empty strings instead
    # of letting the tokenizer fail on a non-string value.
    targets = ground_truth_df[field].fillna("").astype(str)
    predictions = extracted_df[field].fillna("").astype(str)

    # Pair rows by position rather than by index label, so the loop does not
    # depend on both frames carrying a default 0..n-1 index.
    scores = [rouge.score(target, prediction) for target, prediction in zip(targets, predictions)]

    # Guard the averages so an empty input reports 0 instead of dividing by zero.
    count = len(scores)
    avg_rouge1 = sum(s['rouge1'].fmeasure for s in scores) / count if count else 0.0
    avg_rougeL = sum(s['rougeL'].fmeasure for s in scores) / count if count else 0.0

    rouge_results.append({
        "Field": field,
        "ROUGE-1 F1": round(avg_rouge1 * 100, 2),
        "ROUGE-L F1": round(avg_rougeL * 100, 2)
    })

rouge_df = pd.DataFrame(rouge_results)
rouge_df


In [None]:
from fuzzywuzzy import fuzz

# Clean text: normalise whitespace and case so comparisons ignore formatting
def clean_text(s):
    """Return ``s`` lowercased with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed, and newlines, carriage returns,
    tabs and repeated spaces inside the text all become a single space. This
    keeps the exact-match metric from failing on formatting alone (for example
    a ground-truth value containing a blank line where the extracted value has
    a single space) and stops a lone carriage return from gluing two words together.

    Args:
        s: The cell value to clean; may be a non-string such as NaN or None.

    Returns:
        The normalised string, or "" when ``s`` is not a string.
    """
    if isinstance(s, str):
        # split() with no argument splits on any whitespace run and drops the
        # empty leading/trailing pieces, so join() rebuilds a single-spaced string.
        return " ".join(s.split()).lower()
    return ""

# Apply clean_text to every compared column of both frames before scoring.
for frame in (extracted_df, ground_truth_df):
    for field in fields:
        frame[field] = frame[field].apply(clean_text)

# One summary row (dict) per field.
results = []
total = len(extracted_df)

for field in fields:
    predicted = extracted_df[field]
    expected = ground_truth_df[field]

    # Exact match: percentage of rows whose cleaned strings are identical.
    exact_matches = (predicted == expected).sum()
    exact_accuracy = exact_matches / total * 100

    # Fuzzy match: mean token_set_ratio over all rows.
    fuzzy_total = 0
    for i in range(total):
        fuzzy_total += fuzz.token_set_ratio(predicted[i], expected[i])
    avg_fuzzy_score = fuzzy_total / total

    results.append({
        "Field": field,
        "Exact Match Accuracy (%)": round(exact_accuracy, 2),
        "Average Fuzzy Match Score (%)": round(avg_fuzzy_score, 2)
    })

# Display the per-field summary as a table.
evaluation_df = pd.DataFrame(results)
evaluation_df
