In [2]:
import pandas as pd
import time
import os
from langchain_ollama import ChatOllama
from langchain.schema import HumanMessage

In [3]:
Input_File = 'cleaned_discharge.csv'
Output_File = 'clinical_text_summaries (deepseek-r1_32b).csv'
Batch_Size = 5
Start_Index = 0
MODEL_NAME = 'deepseek-r1:32b'

In [4]:
df = pd.read_csv(Input_File)

In [5]:
try:
    import torch
    print("CUDA Available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU in use:", torch.cuda.get_device_name(0))
except ImportError:
    print("PyTorch not installed — skipping GPU check")

CUDA Available: True
GPU in use: NVIDIA GeForce RTX 4090


In [6]:
if os.path.exists(Output_File):
    done_df = pd.read_csv(Output_File)
    Start_Index = len(done_df)
    print(f" Resuming from row {Start_Index}")
else:
    done_df = pd.DataFrame(columns=['id', 'text', 'summary'])

 Resuming from row 1101


In [7]:
llm = ChatOllama(model=MODEL_NAME,
                temperature = 0.7)

In [8]:
Max_Rows = 1300

for i in range(Start_Index, min(len(df), Max_Rows), Batch_Size):
    batch = df.iloc[i:i+Batch_Size].copy()
    summaries = []

    print(f"Processing rows {i} to {i + len(batch) - 1}...")

    for note in batch['text']:
        prompt = f"Summarize the following clinical note in one paragraph only. Do not include any headings, section titles, or formatting. Just return a well-written paragraph:\n\n{note}"
        messages = [HumanMessage(content=prompt)]
        try:
            response = llm.invoke(messages)
            summary = response.content if response and response.content else "EMPTY"
        except Exception as e:
            print(f"Error while summarizing: {e}")
            summary = "ERROR"
        summaries.append(summary)
        time.sleep(1)


    if len(summaries) == len(batch):
        batch.loc[:, 'summary'] = summaries
        done_df = pd.concat([done_df, batch], ignore_index=True)
        done_df.to_csv(Output_File, index=False)
        print(f"Saved up to row {i + len(batch)}")
    else:
        print(f"Length mismatch — Skipping batch {i}-{i+len(batch)}: summaries={len(summaries)}, rows={len(batch)}")


Processing rows 1101 to 1105...
Saved up to row 1106
Processing rows 1106 to 1110...
Saved up to row 1111
Processing rows 1111 to 1115...
Saved up to row 1116
Processing rows 1116 to 1120...
Saved up to row 1121
Processing rows 1121 to 1125...
Saved up to row 1126
Processing rows 1126 to 1130...
Saved up to row 1131
Processing rows 1131 to 1135...
Saved up to row 1136
Processing rows 1136 to 1140...
Saved up to row 1141
Processing rows 1141 to 1145...
Saved up to row 1146
Processing rows 1146 to 1150...
Saved up to row 1151
Processing rows 1151 to 1155...
Saved up to row 1156
Processing rows 1156 to 1160...
Saved up to row 1161
Processing rows 1161 to 1165...
Saved up to row 1166
Processing rows 1166 to 1170...
Saved up to row 1171
Processing rows 1171 to 1175...
Saved up to row 1176
Processing rows 1176 to 1180...
Saved up to row 1181
Processing rows 1181 to 1185...
Saved up to row 1186
Processing rows 1186 to 1190...
Saved up to row 1191
Processing rows 1191 to 1195...
Saved up to ro

In [9]:
df1 =  pd.read_csv(Output_File)


In [10]:
df1.head()

Unnamed: 0,id,text,summary,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,,Name: ** Unit No: ** Admission Date: ** Discha...,"<think>\nOkay, so I'm trying to understand thi...",10000032-DS-21,10000032,22595853,DS,21,5/7/2180 0:00,5/9/2180 15:26,...,,,,,,,,,,
1,,Name: ** Unit No: ** Admission Date: ** Discha...,"<think>\nOkay, so I'm trying to understand thi...",10000032-DS-22,10000032,22841357,DS,22,6/27/2180 0:00,7/1/2180 10:15,...,,,,,,,,,,
2,,Name: ** Unit No: ** Admission Date: ** Discha...,"<think>\nOkay, I'm trying to understand this m...",10000032-DS-23,10000032,29079034,DS,23,7/25/2180 0:00,7/25/2180 21:42,...,,,,,,,,,,
3,,Name: ** Unit No: ** Admission Date: ** Discha...,"<think>\nOkay, so I need to summarize this det...",10000032-DS-24,10000032,25742920,DS,24,8/7/2180 0:00,8/10/2180 5:43,...,,,,,,,,,,
4,,Name: ** Unit No: ** Admission Date: ** Discha...,"<think>\nOkay, so I need to figure out how to ...",10000084-DS-17,10000084,23052089,DS,17,11/25/2160 0:00,11/25/2160 15:09,...,,,,,,,,,,


In [11]:
for i in range(len(df1)):
    print(f"\n\U0001f4dd Original Note {i+1} (excerpt):\n{df.loc[i, 'text'][:10]}...\n")
    print(f"\U0001f4cb Summary {i+1}:\n{df1.loc[i, 'summary']}")
    print("=" * 80)


📝 Original Note 1 (excerpt):
Name: ** U...

📋 Summary 1:
<think>
Okay, so I'm trying to understand this medical summary of a patient's hospital stay. Let me read through it again and see what I can make sense of.

The patient is Ms. **, admitted with worsening abdominal distension and confusion. She has a history of Hepatitis C cirrhosis, HIV on antiretroviral therapy, past IV drug use, COPD, bipolar disorder, and PTSD. Her main issues were ascites due to portal hypertension from her liver disease.

They did some tests: ultrasound showed nodular liver (cirrhosis), small ascites, splenomegaly, cholelithiasis, but no blockage in the portal veins. They tried a diagnostic paracentesis but couldn't get fluid on admission night. She had chest X-ray and urinalysis which were normal, and urine and blood cultures were negative.

Her treatment included diuretics: furosemide 40 mg daily and spironolactone 50 mg daily. They adjusted the spironolactone dose because her potassium was a bit high (4.