In [4]:
from bert_score import score
import numpy as np
import pandas as pd
import json
import os
from typing import List, Dict, Union
from prompt import *
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModel
import torch

from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


# SOAP Datasets

In [None]:
a_train_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Train.csv") # 603
a_eval_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Eval.csv") # 75
a_test_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Test.csv") # 87

soap_train_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/BioNLP2023-1A-Train.csv") # 765, this includes all of the three above
soap_test_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/BioNLP2023-1A-Test.csv") # for shots

filtered_df = soap_train_df[soap_train_df['File ID'].isin(set(a_test_df['File ID']).union({'190862.txt', '109943.txt', '195790.txt'}))]
len(filtered_df) # 90 - 1 = 89

In [None]:
filtered_df.columns

In [None]:
for i, row in filtered_df.iterrows():
    print(row['Summary'])
    print("-----------")

In [None]:
len_lst = []
for i in range(len(soap_test_df)):
    row = soap_test_df.iloc[i]
    try:
        length = len(row["Subjectives"]) + len(row["Objectives"]) + len(row["Assessment"]) + len(row["Summary"])
        len_lst.append(length)
    except:
        print(row)

smallest_number = min(len_lst)
print(smallest_number)
index_of_smallest = len_lst.index(smallest_number)
index_of_smallest

In [None]:
soap_test_df.iloc[50]["Summary"] # ["Objectives"], ["Assessment"], ["Summary"]

In [None]:
filtered_df.head(2)

# SOAP result

In [None]:
filtered_df['Summary'].iloc[0]

In [5]:
# # zero-shot
# soa_zs_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_soa_zs.csv")
# sa_zs_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_sa_zs.csv")
# so_zs_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_so_zs.csv")
# a_zs_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_a_zs.csv")

# # one-shot
# soa_os_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_soa_os.csv")
# sa_os_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_sa_os.csv")
# so_os_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_so_os.csv")
# a_os_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/soap_result/1024_soap_a_os.csv")

# closed
a_cl_df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1031_soap_closed_a.csv")
so_cl_df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1031_soap_closed_so.csv")

# ltm
so_ltm_open_df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1031_ltm_open.csv")
so_ltm_closed_df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1031_ltm_close.csv")


# SOAP Evaluation

### ROUGE-L F-score

In [None]:
a_zs_df.loc[a_zs_df["File ID"].isin({'190862.txt', '109943.txt'})][['File ID','pred', 'Summary']]

a_zs_df.loc[a_zs_df["File ID"]== '109943.txt']['Summary'].values[0]

In [None]:
rg_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

predictions = ['CAD s/p CABG; recent NSTEMI; HTN; CKD; cholangitis; hypotension (resolved)',
               'ESRD on HD; HTN; HL; acute CHF exacerbation; NSTEMI; s/p cath with DES in LAD and LMCA.']

references = ['# Cholangitis; CAD; HTN; AoCRF',
              'S/P NSTEMI; HTN; Chronic renal failure; HL']

for pred, ref in zip(predictions, references):
    scores = rg_scorer.score(ref, pred)
    rougeL_f = scores['rougeL'].fmeasure
    print(f"ROUGE-L F-score: {rougeL_f:.4f}")

In [None]:
len(so_os_df)

### BERTScore - using SapBERT

In [None]:
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

predictions = ['CAD s/p CABG; recent NSTEMI; HTN; CKD; cholangitis; hypotension (resolved)',
               'ESRD on HD; HTN; HL; acute CHF exacerbation; NSTEMI; s/p cath with DES in LAD and LMCA.']

references = ['# Cholangitis; CAD; HTN; AoCRF',
              'S/P NSTEMI; HTN; Chronic renal failure; HL']

P, R, F1 = score(predictions, references, model_type=model_name, lang="en", rescale_with_baseline=True, num_layers=12)

for i in range(len(predictions)):
    print(f"BERTScore F1: {F1[i]:.4f}")

In [None]:
# Example using BioBERT as a substitute for SapBERT
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

# Define your predictions and references
predictions = sa_zs_df['pred'].tolist()
references = ground_truth.tolist()

# Compute BERTScore
P, R, F1 = score(predictions, references, model_type=model_name, lang="en", rescale_with_baseline=True, num_layers=12)

for i in range(len(predictions)):
    print(f"BERTScore F1: {F1[i]:.4f}")

### Sentence Embedding (the last hidden layer)

In [66]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [68]:
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=1024) # , truncation=True, max_length=1024
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the mean of the last hidden states
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

In [None]:
predictions = ['CAD s/p CABG; recent NSTEMI; HTN; CKD; cholangitis; hypotension (resolved)',
               'ESRD on HD; HTN; HL; acute CHF exacerbation; NSTEMI; s/p cath with DES in LAD and LMCA.']

references = ['# Cholangitis; CAD; HTN; AoCRF',
              'S/P NSTEMI; HTN; Chronic renal failure; HL']


pred_embeddings = [get_sentence_embedding(pred) for pred in predictions]
ref_embeddings = [get_sentence_embedding(ref) for ref in references]

for pred_emb, ref_emb in zip(pred_embeddings, ref_embeddings):
    cos_sim = cosine_similarity(pred_emb, ref_emb)[0][0]
    print(f"Sentence Embedding Cosine Similarity: {cos_sim:.4f}")

In [None]:
pred_embeddings[0][0]

In [None]:
cosine_similarity(pred_embeddings[0], ref_embeddings[0])

### ALL

In [6]:
rg_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [11]:
# def get_sentence_embedding(text):
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=1024) # , truncation=True, max_length=1024
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     # Take the mean of the last hidden states
#     embeddings = outputs.last_hidden_state.max(dim=1).values.cpu().numpy()
#     return embeddings
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling
    mean_embedding = outputs.last_hidden_state.mean(dim=1)
    # Max pooling
    max_embedding = outputs.last_hidden_state.max(dim=1).values
    # Concatenate mean and max
    combined_embedding = torch.cat((mean_embedding, max_embedding), dim=1).cpu().numpy()
    return max_embedding.cpu().numpy()

def count_tokens(sentence):
    return len(tokenizer.tokenize(sentence))

In [12]:
df = so_ltm_open_df
ground_truth = so_ltm_open_df["Summary"]
max_pred_length = max(count_tokens(s) for s in df['pred'].tolist())
max_ref_length = max(count_tokens(s) for s in ground_truth.tolist())

print(f"Max tokens in predictions: {max_pred_length}")
print(f"Max tokens in ground truth: {max_ref_length}")
print()

Max tokens in predictions: 13220
Max tokens in ground truth: 54


In [10]:
# df_lst = [(a_cl_df, "a"), (so_cl_df, "so")]
df_lst = [(so_ltm_open_df, "so_ltm_open")] #(so_ltm_closed_df, "so_ltm_closed")
ground_truth = so_ltm_open_df["Summary"]

outer_dict = {}

for df, name in df_lst:
    print(name)
    outer_dict[name] = {'rougeL': {'scores':[]}, 'bert_score': {'scores':[]}, 'sent_emb_sim': {'scores':[]}}
    for pred, ref in zip(df['pred'].tolist(), ground_truth.tolist()):

        # Calculate ROUGE-L F-score
        scores = rg_scorer.score(ref, pred)
        rougeL_f = scores['rougeL'].fmeasure
        outer_dict[name]['rougeL']['scores'].append(rougeL_f)

        # Calculate embedding similarity
        pred_emb = get_sentence_embedding(pred)
        ref_emb = get_sentence_embedding(ref)
        sim = cosine_similarity(pred_emb, ref_emb)[0][0]
        outer_dict[name]['sent_emb_sim']['scores'].append(sim)

    # Calculate BERTScore
    _, _, bert_scores = score(df['pred'].tolist(), ground_truth.tolist(), model_type=model_name, lang="en", rescale_with_baseline=True, num_layers=12)
    outer_dict[name]['bert_score']['scores'] = bert_scores.tolist()

    outer_dict[name]['rougeL']['average'] = np.mean(outer_dict[name]['rougeL']['scores'])
    outer_dict[name]['rougeL']['std'] = np.std(outer_dict[name]['rougeL']['scores'])

    outer_dict[name]['bert_score']['average'] = np.mean(outer_dict[name]['bert_score']['scores'])
    outer_dict[name]['bert_score']['std'] = np.std(outer_dict[name]['bert_score']['scores'])
    
    outer_dict[name]['sent_emb_sim']['average'] = np.mean(outer_dict[name]['sent_emb_sim']['scores'])
    outer_dict[name]['sent_emb_sim']['std'] = np.std(outer_dict[name]['sent_emb_sim']['scores'])



so_ltm_open


RuntimeError: The expanded size of the tensor (13222) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [64, 13222].  Tensor sizes: [1, 512]

In [16]:
max_length = 510

# Function to truncate sentences
def truncate_sentence(sentence, max_length=max_length):
    tokens = tokenizer.tokenize(sentence)
    if len(tokens) > max_length:
        tokens = tokens[:max_length]
        return tokenizer.convert_tokens_to_string(tokens)
    return sentence

df_lst = [(so_ltm_closed_df, "so_ltm_closed")]  # Add more tuples as needed
ground_truth = so_ltm_closed_df["Summary2"]

outer_dict = {}

for df, name in df_lst:
    print(name)
    outer_dict[name] = {
        'rougeL': {'scores': []},
        'bert_score': {'scores': []},
        'sent_emb_sim': {'scores': []}
    }
    
    # Truncate sentences before processing
    truncated_preds = [truncate_sentence(s) for s in df['pred'].tolist()]
    truncated_refs = [truncate_sentence(s) for s in ground_truth.tolist()]
    
    for pred, ref in zip(truncated_preds, truncated_refs):
        # Calculate ROUGE-L F-score
        scores = rg_scorer.score(ref, pred)
        rougeL_f = scores['rougeL'].fmeasure
        outer_dict[name]['rougeL']['scores'].append(rougeL_f)
        
        # Calculate embedding similarity
        pred_emb = get_sentence_embedding(pred)
        ref_emb = get_sentence_embedding(ref)
        sim = cosine_similarity(pred_emb, ref_emb)[0][0]
        outer_dict[name]['sent_emb_sim']['scores'].append(sim)
    
    # Calculate BERTScore using truncated sentences
    _, _, bert_scores = score(
        truncated_preds,
        truncated_refs,
        model_type=model_name,
        lang="en",
        rescale_with_baseline=True,
        num_layers=12
    )
    outer_dict[name]['bert_score']['scores'] = bert_scores.tolist()
    
    # Compute statistics
    outer_dict[name]['rougeL']['average'] = np.mean(outer_dict[name]['rougeL']['scores'])
    outer_dict[name]['rougeL']['std'] = np.std(outer_dict[name]['rougeL']['scores'])
    
    outer_dict[name]['bert_score']['average'] = np.mean(outer_dict[name]['bert_score']['scores'])
    outer_dict[name]['bert_score']['std'] = np.std(outer_dict[name]['bert_score']['scores'])
    
    outer_dict[name]['sent_emb_sim']['average'] = np.mean(outer_dict[name]['sent_emb_sim']['scores'])
    outer_dict[name]['sent_emb_sim']['std'] = np.std(outer_dict[name]['sent_emb_sim']['scores'])

print(outer_dict)

so_ltm_closed
{'so_ltm_closed': {'rougeL': {'scores': [0.0, 0.4444444444444445, 0.0, 0.15384615384615383, 0.6666666666666666, 0.33333333333333337, 0.22222222222222224, 0.0, 0.11764705882352942, 0.36363636363636365, 0.0, 0.0, 0.30769230769230765, 0.2222222222222222, 0.0, 0.2, 0.0, 0.14285714285714285, 0.2222222222222222, 0.39999999999999997, 0.2608695652173913, 0.5, 0.6666666666666666, 0.1904761904761905, 0.0, 0.1818181818181818, 0.0, 0.0, 0.30769230769230765, 0.20000000000000004, 0.15384615384615385, 0.6, 0.7499999999999999, 0.2727272727272727, 0.0, 0.16666666666666666, 0.0, 0.16666666666666666, 0.26666666666666666, 0.11764705882352941, 0.0, 0.2222222222222222, 0.22222222222222224, 0.0, 0.0, 0.25, 0.3333333333333333, 0.27272727272727276, 0.3333333333333333, 0.0, 0.0, 0.09090909090909091, 0.45454545454545453, 0.16666666666666666, 0.25, 0.36363636363636365, 0.125, 0.17391304347826086, 0.0, 0.3225806451612903, 0.0, 0.0, 0.0, 0.20000000000000004, 0.0, 0.5555555555555556, 0.0, 0.0, 0.258064



In [None]:
#  Max Pooling
for method in zs_outer_dict.keys():
    print(f"{method} ROUGE-L: {zs_outer_dict[method]['rougeL']['average'] * 100:.2f} ± {zs_outer_dict[method]['rougeL']['std'] * 100:.2f}")
    print(f"{method} BERTScore: {zs_outer_dict[method]['bert_score']['average'] * 100:.2f} ± {zs_outer_dict[method]['bert_score']['std'] * 100:.2f}")
    print(f"{method} Sentence Embedding Similarity: {zs_outer_dict[method]['sent_emb_sim']['average'] * 100:.2f} ± {zs_outer_dict[method]['sent_emb_sim']['std'] * 100:.2f}")
    print("-----------")

In [17]:
for method in outer_dict.keys():
    print(f"{method} ROUGE-L: {outer_dict[method]['rougeL']['average'] * 100:.2f} ± {outer_dict[method]['rougeL']['std'] * 100:.2f}")
    print(f"{method} BERTScore: {outer_dict[method]['bert_score']['average'] * 100:.2f} ± {outer_dict[method]['bert_score']['std'] * 100:.2f}")
    print(f"{method} Sentence Embedding Similarity: {outer_dict[method]['sent_emb_sim']['average'] * 100:.2f} ± {outer_dict[method]['sent_emb_sim']['std'] * 100:.2f}")
    print("-----------")

so_ltm_closed ROUGE-L: 18.82 ± 17.84
so_ltm_closed BERTScore: 53.69 ± 13.87
so_ltm_closed Sentence Embedding Similarity: 52.86 ± 14.95
-----------


In [None]:
# 마지막: Max Pooling
for method in zs_outer_dict.keys():
    print(method)
    print(zs_outer_dict[method]['rougeL']['average'])
    # print(zs_outer_dict[method]['rougeL']['std'])
    print(zs_outer_dict[method]['bert_score']['average'])
    # print(zs_outer_dict[method]['bert_score']['std'])
    print(zs_outer_dict[method]['sent_emb_sim']['average'])
    # print(zs_outer_dict[method]['sent_emb_sim']['std'])
    print("-----------")

In [None]:
# 마지막: Mean Pooling
for method in zs_outer_dict.keys():
    print(method)
    print(zs_outer_dict[method]['rougeL']['average'])
    # print(zs_outer_dict[method]['rougeL']['std'])
    print(zs_outer_dict[method]['bert_score']['average'])
    # print(zs_outer_dict[method]['bert_score']['std'])
    print(zs_outer_dict[method]['sent_emb_sim']['average'])
    # print(zs_outer_dict[method]['sent_emb_sim']['std'])
    print("-----------")

In [6]:
import pickle
with open('/home/yl3427/cylab/SOAP_MA/soap_result/1031_raw_ltm.pkl', 'rb') as file:
    raw_ltm = pickle.load(file)
print(len(raw_ltm))
raw_ltm

297


['If the patient has a high potassium level (e.g., K 6.5) in the Subjective section and it is mentioned again in the Objective section, assess for hyperkalemia.',
 'If the patient has a high glucose level or history of high glucose in the Subjective section, assess for hyperglycemia.',
 'If the patient is on respiratory support such as CPAP and has a low PaO2/FiO2 ratio in the Objective section, assess for COPD with acute exacerbation.',
 'If the patient presents with symptoms like malaise and hypoxia in the Subjective section, consider assessing for respiratory conditions such as COPD exacerbation.',
 'If the Subjective section mentions a recent colonoscopy with findings such as ulceration or visible vessels, and the Objective section confirms these findings with imaging or lab results, the Assessment may include complications related to the colonoscopy, such as bleeding.',
 'If the Subjective section notes a history of cardiovascular issues like CAD and recent procedures such as sten

In [7]:
with open('/home/yl3427/cylab/SOAP_MA/soap_result/1031_refined_ltm.pkl', 'rb') as file:
    refined_ltm = pickle.load(file)
print(len(refined_ltm))
refined_ltm

297


['If a patient has elevated potassium levels in both the Subjective and Objective sections, assess for hyperkalemia.',
 'High glucose levels or a history of high glucose in the Subjective section should prompt an assessment for hyperglycemia.',
 'Respiratory support and low PaO2/FiO2 ratio in the Objective section suggest assessing for COPD exacerbation.',
 'Symptoms like malaise and hypoxia in the Subjective section may indicate respiratory conditions such as COPD exacerbation.',
 'Findings from a colonoscopy in the Subjective section, confirmed by Objective data, may lead to an assessment of colonoscopy-related complications.',
 'A history of cardiovascular issues and recent procedures, combined with hypotension in the Objective section, may indicate cardiovascular complications.',
 'Bright red blood per rectum in the Subjective section, supported by low hematocrit or imaging in the Objective section, suggests gastrointestinal bleeding.',
 'Bradycardia and hypotension in the Objectiv

In [5]:
import pandas as pd
df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1031_soap_closed_so.csv")
df.columns

Index(['File ID', 'Assessment', 'Summary', 'Subjective', 'Objective',
       'Summary2', 'pred'],
      dtype='object')

In [9]:
mean_summary2_length = df["Summary2"].astype(str).apply(len).mean()
mean_pred_length = df["pred"].astype(str).apply(len).mean()

mean_summary2_length, mean_pred_length

(np.float64(38.12777777777778), np.float64(56.03888888888889))

In [8]:
mean_summary2_length = df["Summary2"].apply(lambda x: x.split(";")).apply(len).mean()
mean_pred_length = df["pred"].apply(lambda x: x.split(";")).apply(len).mean()

mean_summary2_length, mean_pred_length


(np.float64(2.688888888888889), np.float64(3.6555555555555554))

In [13]:
df[["Summary", "Summary2"]].to_csv("/home/yl3427/cylab/SOAP_MA/soap_result/df.csv", index=False)