In [19]:
import pandas as pd
import time
import os
from langchain_ollama import ChatOllama
from langchain.schema import HumanMessage

In [20]:
# --- Run configuration -------------------------------------------------------
# Model tag handed to ChatOllama.
Model_Name = 'llama3'

# Rows summarized between checkpoints, and the first row to process
# (Start_Index is overwritten by the resume cell when an output file exists).
Batch_Size, Start_Index = 5, 0

# Source notes and the checkpoint/result CSV (paths are relative to the
# notebook's working directory).
Input_File = 'OneDrive - UNT System/cleaned_discharge.csv'
Output_File = 'OneDrive - UNT System/clinical_text_summaries (llama3).csv'

In [21]:
# Load the source notes; later cells read the 'text' column of this frame.
df = pd.read_csv(filepath_or_buffer=Input_File)

In [22]:
# Optional diagnostic: report whether PyTorch can see a CUDA device.
# NOTE(review): this reflects PyTorch's view only; none of the visible cells
# use torch for inference, so confirm separately where the Ollama model runs.
try:
    import torch

    has_cuda = torch.cuda.is_available()
    print("CUDA Available:", has_cuda)
    if has_cuda:
        print("GPU in use:", torch.cuda.get_device_name(0))
except ImportError:
    # torch is not a hard requirement of this notebook.
    print("PyTorch not installed — skipping GPU check")

CUDA Available: True
GPU in use: NVIDIA GeForce RTX 4090


In [23]:
# Resume support: if a previous run left an output file, reload it and continue
# from the first row that has not been summarized yet.
if os.path.exists(Output_File):
    done_df = pd.read_csv(Output_File)
    # Rows are appended to the output in source order, so the number of rows
    # already saved is the next source row to process.
    Start_Index = len(done_df)
    print(f" Resuming from row {Start_Index}")
else:
    # Fresh run: start from a column-less empty frame so the first concat
    # simply adopts the batch's own columns. Seeding placeholder columns here
    # (e.g. an 'id' column the source data does not have) leaves a permanently
    # empty column in the output and can disturb column order/dtypes.
    done_df = pd.DataFrame()

 Resuming from row 1000


In [24]:
# Chat client for the configured Ollama model. The temperature is non-zero,
# so repeated runs may produce different summaries for the same note.
llm = ChatOllama(
    model=Model_Name,
    temperature=0.7,
)

In [26]:
# Upper bound (exclusive) on the source rows summarized across all runs.
Max_Rows = 1500


def _summarize_note(note):
    """Return a one-paragraph LLM summary for a single clinical note.

    Parameters
    ----------
    note : the raw note text taken from the 'text' column.

    Returns
    -------
    str
        The model's reply, or one of two sentinels:
        "EMPTY" when the note is missing/blank or the model returns no content,
        "ERROR" when the model call raises (the error is printed).
    """
    # A missing value would otherwise be interpolated into the prompt as the
    # text "nan" and the model would summarize nothing; skip the call.
    if pd.isna(note) or not str(note).strip():
        return "EMPTY"

    prompt = f"Summarize the following clinical note in one paragraph only. Do not include any headings, section titles, or formatting. Just return a well-written paragraph:\n\n{note}"
    try:
        response = llm.invoke([HumanMessage(content=prompt)])
        return response.content if response and response.content else "EMPTY"
    except Exception as e:
        # Best effort: one failing note must not abort the whole run.
        # NOTE: rows saved as "ERROR" count as done and are not retried on resume.
        print(f"Error while summarizing: {e}")
        return "ERROR"


# Never read past the data or past the configured cap.
End_Index = min(len(df), Max_Rows)

for i in range(Start_Index, End_Index, Batch_Size):
    # Clamp the final batch so it cannot overshoot End_Index when the
    # remaining span is not a multiple of Batch_Size.
    batch = df.iloc[i:min(i + Batch_Size, End_Index)].copy()

    print(f"Processing rows {i} to {i + len(batch) - 1}...")

    summaries = []
    for note in batch['text']:
        summaries.append(_summarize_note(note))
        time.sleep(1)  # pause between requests

    # Exactly one summary is produced per note, so the lengths always match.
    batch['summary'] = summaries

    # Checkpoint after every batch so an interrupted run can resume.
    done_df = pd.concat([done_df, batch], ignore_index=True)
    done_df.to_csv(Output_File, index=False)
    print(f"Saved up to row {i + len(batch)}")


Processing rows 1000 to 1004...
Saved up to row 1005
Processing rows 1005 to 1009...
Saved up to row 1010
Processing rows 1010 to 1014...
Saved up to row 1015
Processing rows 1015 to 1019...
Saved up to row 1020
Processing rows 1020 to 1024...
Saved up to row 1025
Processing rows 1025 to 1029...
Saved up to row 1030
Processing rows 1030 to 1034...
Saved up to row 1035
Processing rows 1035 to 1039...
Saved up to row 1040
Processing rows 1040 to 1044...
Saved up to row 1045
Processing rows 1045 to 1049...
Saved up to row 1050
Processing rows 1050 to 1054...
Saved up to row 1055
Processing rows 1055 to 1059...
Saved up to row 1060
Processing rows 1060 to 1064...
Saved up to row 1065
Processing rows 1065 to 1069...
Saved up to row 1070
Processing rows 1070 to 1074...
Saved up to row 1075
Processing rows 1075 to 1079...
Saved up to row 1080
Processing rows 1080 to 1084...
Saved up to row 1085
Processing rows 1085 to 1089...
Saved up to row 1090
Processing rows 1090 to 1094...
Saved up to ro

In [16]:
# Reload the saved summaries from disk for inspection.
df1 = pd.read_csv(filepath_or_buffer=Output_File)


In [17]:
# Show the first five saved rows.
df1.head(n=5)

Unnamed: 0,id,text,summary,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime
0,,Name: ** Unit No: ** Admission Date: ** Discha...,Here is a summary of the clinical note in one ...,10000032-DS-21,10000032.0,22595853.0,DS,21.0,2180-05-07 00:00:00,2180-05-09 15:26:00
1,,Name: ** Unit No: ** Admission Date: ** Discha...,Here is a summary of the clinical note in one ...,10000032-DS-22,10000032.0,22841357.0,DS,22.0,2180-06-27 00:00:00,2180-07-01 10:15:00
2,,Name: ** Unit No: ** Admission Date: ** Discha...,Here is a summary of the clinical note in one ...,10000032-DS-23,10000032.0,29079034.0,DS,23.0,2180-07-25 00:00:00,2180-07-25 21:42:00
3,,Name: ** Unit No: ** Admission Date: ** Discha...,Here is a summary of the clinical note in one ...,10000032-DS-24,10000032.0,25742920.0,DS,24.0,2180-08-07 00:00:00,2180-08-10 05:43:00
4,,Name: ** Unit No: ** Admission Date: ** Discha...,"A 65-year-old male with Parkinson's disease, d...",10000084-DS-17,10000084.0,23052089.0,DS,17.0,2160-11-25 00:00:00,2160-11-25 15:09:00


In [18]:
# Print a short excerpt of every saved note next to its summary.
# Both fields are read from the same df1 row, so the excerpt always belongs to
# the summary shown beneath it and the cell does not depend on the separately
# loaded source frame being row-aligned with the output file.
for i, row in enumerate(df1[['text', 'summary']].itertuples(index=False), start=1):
    # str() keeps a missing/non-string note from raising on slicing.
    print(f"\n\U0001f4dd Original Note {i} (excerpt):\n{str(row.text)[:10]}...\n")
    print(f"\U0001f4cb Summary {i}:\n{row.summary}")
    print("=" * 80)


📝 Original Note 1 (excerpt):
Name: ** U...

📋 Summary 1:
Here is a summary of the clinical note in one paragraph:

A 56-year-old female with Hepatitis C Virus cirrhosis, HIV on antiretroviral therapy, history of IV drug use, chronic obstructive pulmonary disease, bipolar disorder, and post-traumatic stress disorder presented from the emergency department (ED) with worsening abdominal distension over the past week and confusion. The patient was admitted to the hospital with ascites due to portal hypertension, likely caused by non-compliance with medication and lack of diet restriction. The hospital course included paracentesis to remove 1.5L of fluid, treatment with furosemide and spironolactone, and a plan for follow-up care in liver clinic and outpatient screening procedures such as colonoscopy and esophagogastroduodenoscopy.

📝 Original Note 2 (excerpt):
Name: ** U...

📋 Summary 2:
Here is a summary of the clinical note in one paragraph:

A patient with a history of Hepatitis C Viru