#### Importing Required Libraries

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv(r"C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\open_ave_data.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [4]:
# Drop the leftover 'Unnamed: 0' column (presumably an index saved with the CSV).
# Reassigning with errors='ignore' instead of inplace=True keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
df=data[["ReportText"]]
df.head()

Unnamed: 0,ReportText
0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...
3,Exam: - CHEST-PORTABLE History: Chest pain Com...
4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...


In [6]:
# Take the first 30 reports as a flat list of strings.
# Converting the one-column frame with to_numpy().tolist() yields a list of
# single-element lists, so each report would be formatted into the prompt as a
# Python list repr ("['EXAM: ...']") instead of the raw report text.
l = df["ReportText"].tolist()[:30]

#### Setting up LLM

##### Loading environment variables / API keys required by the model

In [7]:
from dotenv import load_dotenv
load_dotenv()



True

In [8]:
from langchain_groq import ChatGroq

# Field extraction should be repeatable, so sampling randomness is turned off
# explicitly instead of relying on the wrapper's default temperature.
llm=ChatGroq(model="llama-3.1-8b-instant", temperature=0)

#### Prompt Setup

In [9]:
from langchain_core.prompts import ChatPromptTemplate

#### Pydantic class setup and output parser

- For Data validation
- To ensure the output is in the required JSON format

In [10]:
from pydantic import BaseModel,Field
from langchain_core.output_parsers import PydanticOutputParser


In [11]:
data.columns

Index(['ReportText', 'findings', 'clinicaldata', 'ExamName', 'impression'], dtype='object')

In [12]:
# Schema for the four report sections extracted from each report's text.
# It is handed to PydanticOutputParser as `pydantic_object`, and its field names
# match the dataset columns ('findings', 'clinicaldata', 'ExamName', 'impression').
# Every field is a required plain string.
class FieldsExtraction(BaseModel):
    # Findings section of the report.
    findings:str=Field(description="Radiologist's technical observations")
    # Clinical history / reason the exam was ordered.
    clinicaldata:str=Field(description="Reason for examination (e.g., symptoms like chest pain, shortness of breath)")
    # Exam header kept as one string (the prompt asks not to split out the date).
    ExamName:str=Field(description="Exam type and date")
    # Impression section of the report.
    impression:str=Field(description="Final diagnosis or summary")

In [13]:
# Parser that validates the model's reply against the FieldsExtraction schema.
output_parser=PydanticOutputParser(pydantic_object=FieldsExtraction)
# Text describing the expected output format; the system prompt has a
# {format_instructions} placeholder for it. It is computed once here; the
# previous second call discarded its result and has been removed.
format_instructions = output_parser.get_format_instructions()
# NOTE(review): `result` is not used by any cell visible here; kept in case a
# later cell relies on it.
result=[]

In [14]:
# Chat prompt for the extraction chain.
# - The system message states the JSON-only contract, inserts the parser's format
#   instructions, and shows one worked example.
# - The example output is now written as JSON (it used to be `key = value` lines,
#   which contradicted the "return ONLY a JSON object" instruction).
# - Literal braces in the example are doubled ({{ }}) because single braces are
#   treated as template variables by ChatPromptTemplate.
# - The user message carries the report text through the {input} variable.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful medical data extraction assistant.

From the given "Report Text", extract the following fields and return ONLY a JSON object, and nothing else.
Use the format described in {format_instructions}.

### Fields to be extracted:
- findings
- clinicaldata
- ExamName
- impression

### Example:

Input:
EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. CLINICAL HISTORY: Cough. COMPARISON: None. TECHNIQUE: 2 views. FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None. IMPRESSION: Normal 2-view chest radiography Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM.

Output:
{{
  "findings": "FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None.",
  "clinicaldata": "CLINICAL HISTORY: Cough.",
  "ExamName": "EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. TECHNIQUE: 2 views. COMPARISON: None.",
  "impression": "IMPRESSION: Normal 2-view chest radiography    Dictated by: [[PERSONALNAME]] on 06/01/2019 08:42 PM. Electronically signed by: [[PERSONALNAME]] on 06/01/2019 08:43 PM."
}}

### NOTE: Return all fields as flat strings - do not nest them or break into subfields like 'examdate' or 'examname'. Return 'ExamName' as a single string, just as found in the report."""),
    ("user", "{input}"),
])

In [15]:
chain = prompt | llm | output_parser

#### Asynchronously Processing requests in Batches

In [16]:
import asyncio
from langchain_core.exceptions import OutputParserException
from tqdm import tqdm
import math
import time



BATCH_SIZE = 25
DELAY_BETWEEN_BATCHES = 60  # seconds

In [17]:


async def process_async(index, text):
    """Run the extraction chain on one report and tag the result with its index.

    Args:
        index: Identifier echoed back to the caller and used in error messages.
        text: Report text sent to the chain as the ``input`` variable.

    Returns:
        ``(index, parsed)`` on success, or ``(index, None)`` when the chain raised;
        failures are printed rather than propagated.
    """
    try:
        payload = {
            "input": text,
            "format_instructions": output_parser.get_format_instructions(),
        }
        parsed = await chain.ainvoke(payload)
    except OutputParserException as e:
        # The model answered, but the reply did not parse; show the raw reply.
        print(f"[OutputParseError] Index {index}:\n", e.llm_output)
        return index, None
    except Exception as e:
        # Any other failure is reported and treated as a failed item.
        print(f"[Error] Index {index}: {e}")
        return index, None
    else:
        return index, parsed

async def run_all(text_list, start_idx=0):
    """Process every text in ``text_list`` concurrently.

    Each text is handed to ``process_async`` with the index ``start_idx`` plus its
    position in the list; the gathered results are returned as one list.
    """
    pending = []
    for index, text in enumerate(text_list, start_idx):
        pending.append(process_async(index, text))
    return await asyncio.gather(*pending)

In [18]:


async def run_batched(text_list):
    """Process all texts in batches, pausing between batches, then retry failures once.

    Texts are sent in chunks of ``BATCH_SIZE`` through ``run_all``, with a pause of
    ``DELAY_BETWEEN_BATCHES`` seconds between consecutive chunks. Items whose
    result is ``None`` are retried once after the last batch.

    Args:
        text_list: Sequence of report texts.

    Returns:
        A list the same length as ``text_list``. Position ``i`` holds the parsed
        result for ``text_list[i]``, or ``None`` if it failed on both attempts.
    """
    total = len(text_list)
    results = [None] * total
    failed_indices = []

    num_batches = math.ceil(total / BATCH_SIZE)

    for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start = batch_idx * BATCH_SIZE
        end = min(start + BATCH_SIZE, total)
        batch_texts = text_list[start:end]

        print(f"Running batch {batch_idx + 1}/{num_batches} ({len(batch_texts)} items)...")

        batch_results = await run_all(batch_texts, start_idx=start)

        # Each result is (absolute index, parsed-or-None); store by that index.
        for idx, res in batch_results:
            results[idx] = res
            if res is None:
                failed_indices.append(idx)

        # No pause after the final batch.
        if batch_idx < num_batches - 1:
            print(f"Waiting {DELAY_BETWEEN_BATCHES}s before next batch...")
            await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    if failed_indices:
        print(f"\nRetrying {len(failed_indices)} failed items...")
        # Retry with each item's real index so any error printed by
        # process_async points at the right report (the previous retry
        # renumbered the items from 0).
        retry_results = await asyncio.gather(
            *(process_async(idx, text_list[idx]) for idx in failed_indices)
        )
        still_failed = []
        for idx, res in retry_results:
            results[idx] = res
            if res is None:
                still_failed.append(idx)

        # Make permanent failures visible instead of leaving silent None entries.
        if still_failed:
            print(f"{len(still_failed)} items still failed after retry: {still_failed}")

    return results


In [19]:
import nest_asyncio
import asyncio

nest_asyncio.apply()  

final_results = await run_batched(l)



Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]

Running batch 1/2 (25 items)...
Waiting 60s before next batch...


Processing batches:  50%|█████     | 1/2 [01:01<01:01, 61.11s/it]

Running batch 2/2 (5 items)...


Processing batches: 100%|██████████| 2/2 [01:02<00:00, 31.00s/it]


In [20]:
import pandas as pd


# Failed extractions (None) become empty dicts so that row i of the output still
# corresponds to report i; those rows simply have no values.
cleaned_results = [res.dict() if res else {} for res in final_results]
df = pd.DataFrame(cleaned_results)

# One variable for the path so the confirmation message always names the file
# that was actually written (the message used to name a different file).
output_path = "extracted_medical_reports_new.csv"
df.to_csv(output_path, index=False)
print(f"Saved to '{output_path}'")


Saved to 'extracted_medical_reports.csv'


### Evaluation of extracted fields with ground truth values

In [21]:
extracted_df=pd.read_csv(r'C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\Final_extracted_medical_reports.csv')

In [22]:
extracted_df.head()

Unnamed: 0,findings,clinicaldata,ExamName,impression
0,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography
1,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormality.
2,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN.,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,Findings: Heart size appears normal. Lungs clear.,History: Chest pain,Exam: - CHEST-PORTABLE History: Chest pain Com...,Impression: Lungs clear
4,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [23]:
ground_truth_df=pd.read_csv(r'C:\Users\divya\Documents\GitHub\Build_Projects_Open_Avenue\week2\ground_truth.csv')

In [24]:
ground_truth_df.head()

Unnamed: 0.1,Unnamed: 0,findings,clinicaldata,ExamName,impression
0,0,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,2,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [25]:
fields = ["findings", "clinicaldata", "ExamName", "impression"]

In [26]:
from fuzzywuzzy import fuzz
import numpy as np

- `fuzz` module provides string similarity metrics.

- `fuzz.token_set_ratio` compares two strings and gives a similarity score (0 - 100), handling word order and partial matches.

In [27]:
fuzz.token_set_ratio("no pleural effusion", "pleural effusion not seen")


91

#### Cleaning text

- Converting non-string entries (e.g. missing values) to empty strings, stripping surrounding whitespace, replacing line breaks such as `\n\n` with spaces, and lowercasing the text

In [28]:
def clean_text(s):
    """Normalise a text cell for comparison.

    Every run of whitespace (spaces, tabs, ``\\n``, ``\\r``) is collapsed to a
    single space, leading/trailing whitespace is dropped, and the text is
    lowercased. Previously ``\\n\\n`` turned into two spaces and a lone ``\\r``
    was deleted outright (joining the words around it), so strings that differed
    only in line breaks did not compare equal.

    Args:
        s: Cell value; anything that is not a ``str`` (e.g. NaN, None) is treated
            as missing.

    Returns:
        The normalised string, or ``""`` for non-string input.
    """
    if not isinstance(s, str):
        return ""
    # str.split() with no argument splits on any whitespace run and drops empties.
    return " ".join(s.split()).lower()


# Normalise every compared column in both frames with clean_text so the
# predicted and reference strings are compared on equal footing.
for frame in (extracted_df, ground_truth_df):
    for field in fields:
        frame[field] = frame[field].apply(clean_text)


# Accumulator for the per-field evaluation rows.
results = []

In [29]:

# Per-field comparison of extracted values against ground truth.
# `results` is reset here so re-running this cell rebuilds the table instead of
# appending duplicate rows to a list created in another cell.
results = []
total = len(extracted_df)  # same for every field, so computed once

for field in fields:
    # Exact match: share of rows whose cleaned strings are identical.
    exact_matches = (extracted_df[field] == ground_truth_df[field]).sum()
    exact_accuracy = exact_matches / total * 100 if total else 0.0

    # Fuzzy match: token_set_ratio (0-100) for each row pair, then the mean.
    fuzzy_scores = [
        fuzz.token_set_ratio(pred_text, ref_text)
        for pred_text, ref_text in zip(extracted_df[field], ground_truth_df[field])
    ]
    avg_fuzzy_score = sum(fuzzy_scores) / total if total else 0.0

    results.append({
        "Field": field,
        "Exact Match Accuracy (%)": round(exact_accuracy, 2),
        "Average Fuzzy Match Score (%)": round(avg_fuzzy_score, 2)
    })

evaluation_df = pd.DataFrame(results)
evaluation_df


Unnamed: 0,Field,Exact Match Accuracy (%),Average Fuzzy Match Score (%)
0,findings,76.52,99.22
1,clinicaldata,83.23,98.82
2,ExamName,0.63,99.28
3,impression,70.34,99.26


In [30]:
Report_Text="EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 08:30 PM. CLINICAL HISTORY: Cough. COMPARISON: None. TECHNIQUE: 2 views. FINDINGS: Lungs/Pleura: No focal opacities evident. No pleural effusion. No pneumothorax. Normal volumes. Mediastinum: Heart and mediastinal contours are unremarkable. Other: None. IMPRESSION: Normal 2-view chest radiography. "

gt= "exam: chest radiography exam date: 06/01/2019 08:30 pm. technique: 2 views. comparison: none."

pred="exam: chest radiography. technique: 2 views. comparison: none. exam date: 06/01/2019 08:30 pm"


In [31]:
# Exact-match check for the single (gt, pred) pair defined above.
# Only one comparison is made, so the denominator is 1 rather than the number of
# rows in extracted_df (which would understate a match as 1/N).
exact_matches = int(gt == pred)
total = 1
exact_accuracy = exact_matches / total * 100
exact_accuracy

0.0

In [32]:
fuzz.token_set_ratio(pred, gt)

100

#### Checking cosine similarity for semantic similarity

In [33]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')  

cosine_results = []

In [34]:

# Semantic similarity per field: embed reference and predicted strings, then
# average the cosine similarity of each (reference, prediction) row pair.
# `cosine_results` is reset here so re-running the cell does not duplicate rows.
cosine_results = []

for field in fields:
    refs = ground_truth_df[field].tolist()
    preds = extracted_df[field].tolist()

    # Embeddings as NumPy arrays, one row per text.
    ref_embeddings = np.asarray(model.encode(refs))
    pred_embeddings = np.asarray(model.encode(preds))

    # Row-wise cosine similarity. Only matching pairs are needed, so this avoids
    # building the full N x N similarity matrix just to read its diagonal.
    dots = np.sum(ref_embeddings * pred_embeddings, axis=1)
    norms = np.linalg.norm(ref_embeddings, axis=1) * np.linalg.norm(pred_embeddings, axis=1)
    # A zero-length vector gives similarity 0 instead of a division by zero.
    cos_sim = dots / np.where(norms == 0, 1.0, norms)
    avg_sim = float(np.mean(cos_sim))

    cosine_results.append({
        "Field": field,
        "Average Cosine Similarity": round(avg_sim * 100, 2)
    })


cosine_df1 = pd.DataFrame(cosine_results)



In [35]:
cosine_df1

Unnamed: 0,Field,Average Cosine Similarity
0,findings,98.55
1,clinicaldata,96.89
2,ExamName,88.3
3,impression,96.16
