In [1]:
# Version 1 - Simple prompting (with chunking, to reduce hallucination)
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import pandas as pd
from tqdm import tqdm

# Step 1: Authenticate with Hugging Face.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token` is
# None and `login` falls back to its own interactive prompt.
import os

login(token=os.environ.get("HF_TOKEN"))

# Step 2: Load tokenizer and model
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision when a CUDA device is available, full precision otherwise.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# Step 3: Load and prepare data
df = pd.read_csv("filtered_unstructured.csv", low_memory=False)
# Keep rows 10..19 only. The explicit copy makes the subset independent of the
# full frame, since new columns are added to it further down.
df = df.iloc[10:20].copy()

# Step 4: Identify note chunk columns
note_chunk_cols = [col for col in df.columns if col.startswith("note_chunk")]

# Combine only non-empty note chunks per row
def combine_chunks(row):
    """Join a row's non-empty note chunks into a single space-separated string.

    Reads every column listed in the module-level ``note_chunk_cols`` from
    ``row``, skips missing values and values that are blank after stripping,
    and concatenates the remaining stripped values in column order.
    """
    pieces = []
    for col in note_chunk_cols:
        value = row[col]
        if pd.isna(value):
            continue
        cleaned = str(value).strip()
        if cleaned:
            pieces.append(cleaned)
    return " ".join(pieces)

# Build one combined note per row by applying combine_chunks across each row's columns.
df["full_note"] = df.apply(combine_chunks, axis="columns")

# Step 5: Chunking and paraphrasing functions

def split_chunks(text, chunk_size=400, overlap=100):
    """Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``tokenizer`` (no special
    tokens), cut into windows that each start ``chunk_size - overlap`` tokens
    after the previous one, and every window is decoded back to a string.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of decoded chunk strings in document order. The list is empty
        when ``text`` produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
            non-positive stride would otherwise fail inside ``range`` or
            silently return no chunks, and a negative overlap would skip tokens.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokenizer.decode(tokens[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside it and only duplicate already-covered tokens.
        if start + chunk_size >= len(tokens):
            break
    return chunks

def paraphrase(note):
    """Rewrite one clinical note (or note chunk) more concisely with the loaded model.

    Wraps ``note`` in a plain instruction prompt, generates greedily with the
    module-level ``model`` and ``tokenizer``, and returns only the text the
    model produced after the prompt.

    Args:
        note: Text of the note or chunk to paraphrase.

    Returns:
        The generated paraphrase with surrounding whitespace removed. If the
        model repeats a "Paraphrased Note:" / "Paraphrased:" header in its
        output, only the text after the last such header is returned.
    """
    prompt = f"""Rewrite the following clinical note in a more concise and clear way. Keep all important medical details like diagnoses, treatments, medications, history, and vital signs.

Original Note:
{note}

Paraphrased Note:"""

    max_new_tokens = 512
    # The prompt and the generated continuation share the model's 4096-token
    # context window, so the prompt is capped to leave room for the output.
    # Truncation still cuts from the end of the prompt, so callers should keep
    # notes short enough (e.g. via chunking) that the final cue survives.
    max_prompt_tokens = 4096 - max_new_tokens

    # Tokenize and move to model device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=1.0,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than searching the decoded text for the prompt's cue, which
    # may have been truncated away or may also occur inside the note.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Drop a header the model may have echoed before its answer.
    for marker in ("Paraphrased Note:", "Paraphrased:"):
        if marker in generated:
            return generated.split(marker)[-1].strip()
    return generated.strip()


def paraphrase_long_note(note):
    """Paraphrase a long note chunk by chunk and stitch the results together.

    The note is cut up with ``split_chunks``; every chunk that is not blank is
    passed to ``paraphrase``; the paraphrases are joined with single spaces
    and the result is stripped.
    """
    rewritten = [
        paraphrase(piece)
        for piece in split_chunks(note)
        if piece.strip()  # blank chunks are never sent to the model
    ]
    return " ".join(rewritten).strip()

# Step 6: Paraphrase every combined note chunk by chunk, with a tqdm progress bar
tqdm.pandas()  # registers `progress_apply` on pandas objects
paraphrased_notes = df["full_note"].progress_apply(paraphrase_long_note)
df["paraphrased_note"] = paraphrased_notes

# Unchunked alternative (disabled, kept for reference):
#df["paraphrased_note"] = df["full_note"].progress_apply(paraphrase)

# Step 7: Save the whole frame, including the new column, without the row index
df.to_csv("paraphrased_notes_output.csv", index=False)
print("Paraphrased notes saved to 'paraphrased_notes_output.csv'")


In [7]:
# Export only the paraphrased column to its own CSV file (index=True is the
# to_csv default, so the row index is written alongside it).
paraphrased_column = df["paraphrased_note"]
paraphrased_column.to_csv('para_out.csv')

In [3]:
#Version 2: Sentence tokenization + Chunking
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import pandas as pd
import re

# Step 1: Login to Hugging Face
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token` is
# None and `login` falls back to its own interactive prompt.
import os

login(token=os.environ.get("HF_TOKEN"))

# Step 2: Load tokenizer and model
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision when a CUDA device is available, full precision otherwise.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# Step 3: Load and prepare data (ONE NOTE ONLY)
df = pd.read_csv("filtered_unstructured.csv", low_memory=False)
note_chunk_cols = [col for col in df.columns if col.startswith("note_chunk")]
row = df.iloc[11]  # the single row (position 11) used for this experiment

# Combine non-empty chunks: skip missing and blank values, join the rest with spaces
full_note = " ".join([str(row[col]).strip() for col in note_chunk_cols if pd.notna(row[col]) and str(row[col]).strip() != ""])

# Step 4: Regex-based sentence-aware chunking
def regex_sentence_split(text):
    """Split ``text`` at runs of whitespace that directly follow '.', '!' or '?'.

    The punctuation mark stays attached to the sentence it ends; the
    separating whitespace is discarded. Text without such a boundary is
    returned as a single-element list.
    """
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    return sentence_boundary.split(text)

def split_chunks_by_sentence(text, chunk_token_limit=350, overlap_sentences=2):
    """Group the sentences of ``text`` into chunks bounded by a token budget.

    Sentences come from ``regex_sentence_split`` and are measured with the
    module-level ``tokenizer`` (no special tokens). Sentences are accumulated
    until adding the next one would exceed ``chunk_token_limit``; the chunk is
    then emitted and the next chunk starts with the last ``overlap_sentences``
    sentences of the emitted chunk followed by the new sentence.

    Args:
        text: The text to split.
        chunk_token_limit: Token budget used to decide when a chunk is full.
            A chunk can exceed it when a single sentence, or the carried-over
            overlap plus the new sentence, is longer than the budget.
        overlap_sentences: Number of trailing sentences repeated at the start
            of the next chunk. Zero or a negative value means no overlap.

    Returns:
        A list of chunk strings, each made of sentences joined by one space.
    """
    def token_len(sentence):
        return len(tokenizer.encode(sentence, add_special_tokens=False))

    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_lengths = []  # token count of each sentence in current_chunk
    current_length = 0    # running sum of current_lengths

    for sent in regex_sentence_split(text):
        sent_token_len = token_len(sent)

        if current_length + sent_token_len <= chunk_token_limit:
            current_chunk.append(sent)
            current_lengths.append(sent_token_len)
            current_length += sent_token_len
            continue

        if not current_chunk:
            # A single sentence over the budget becomes a chunk of its own.
            chunks.append(sent)
            continue

        chunks.append(" ".join(current_chunk))

        # `lst[-0:]` is the whole list, so a zero overlap must be handled
        # explicitly or the entire previous chunk would be carried forward.
        if overlap_sentences > 0:
            current_chunk = current_chunk[-overlap_sentences:]
            current_lengths = current_lengths[-overlap_sentences:]
        else:
            current_chunk = []
            current_lengths = []
        current_chunk.append(sent)
        current_lengths.append(sent_token_len)
        # Reuse the cached per-sentence counts instead of re-tokenizing the overlap.
        current_length = sum(current_lengths)

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


# Step 5: Paraphrasing function with medical factuality preserving prompt
def paraphrase(note):
    """Rephrase one clinical note chunk without changing its medical content.

    Wraps ``note`` in a Llama-2 chat instruction (``[INST] ... [/INST]``),
    generates greedily with the module-level ``model`` and ``tokenizer``, and
    returns only the text produced after the prompt.

    Args:
        note: Text of the note or chunk to rephrase.

    Returns:
        The generated rephrasing with surrounding whitespace removed.
    """
    # No literal "<s>" here: the tokenizer call below adds the beginning-of-
    # sequence token itself, so spelling it out would produce a doubled BOS.
    prompt = f"""[INST] You are a clinical documentation assistant. 
    Rephrase the following clinical note to make it clearer, but do not add, change, or remove any medical information.
    Keep exact medication names, dosages, and frequencies if present. Do not infer or summarize anything that is not explicitly stated.


{note}

Rephrased Note: [/INST]"""

    max_new_tokens = 512
    # The prompt and the generated continuation share the model's 4096-token
    # context window, so the prompt is capped to leave room for the output.
    max_prompt_tokens = 4096 - max_new_tokens

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding: deterministic, and sampling knobs such as
            # temperature have no effect, so none is passed.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens; slice them off so
    # the result does not depend on finding "[/INST]" in the decoded text.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated.strip()


# Step 6: Split the note into sentence-aligned chunks and paraphrase each one
chunks = split_chunks_by_sentence(full_note)

print("\n=== ORIGINAL vs PARAPHRASED CHUNKS ===\n")
for i, chunk in enumerate(chunks):
    # Blank chunks are skipped; numbering still follows the position in `chunks`.
    if not chunk.strip():
        continue
    paraphrased = paraphrase(chunk)
    print(f"\n--- Chunk {i+1} ---")
    print(f"Original:\n{chunk.strip()}\n")
    print(f"Paraphrased:\n{paraphrased}\n")
