# 1. UMLS mapping (QuickUMLS)

In [1]:
import pandas as pd

train_df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/BioNLP2023-1A-Train.csv')
test_df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/BioNLP2023-1A-Test.csv')
reference_df = pd.concat([train_df, test_df], ignore_index=True)
reference_df # Summary

Unnamed: 0,File ID,Assessment,Summary,Subjective,Objective
0,188026.txt,"H/O HYPERKALEMIA (HIGH POTASSIUM, HYPERPOTASSE...",# Hypoxia:; Hyperkalemia,ULTRASOUND - At [**2121-3-16**] 11:32 AM\n- MD...,Last dose of Antibiotics:\nAzithromycin - [**2...
1,101616.txt,67 y/o M CAD s/[**Initials (NamePattern4) **] ...,Lower GI bleed; Hypotension; CAD,COLONOSCOPY - At [**2171-1-25**] 12:31 PM\n- C...,Last dose of Antibiotics:\nInfusions:\nOther I...
2,102486.txt,81F with h/o chronic eosinophilic lung disease...,PULMONARY EMBOLISM; FEVER; HYPOTENSION; OLIGUR...,Pleuritic right chest pain\n- patient started ...,Last dose of Antibiotics:\nCiprofloxacin - [**...
3,198989.txt,"79 yo F w/ a h/o CHF (EF of 20-30%), carotid s...",Sepsis; Altered/Depressed MS\n thought to be [...,- ID: rec bedside echo\n- continued fluid bolu...,Last dose of Antibiotics:\nMetronidazole - [**...
4,193604.txt,Mr. [**Known firstname 1908**] [**Known lastna...,# Bradycardia / Rhythm; #. Hypertension; # CAD...,High-grade AV nodal block\n- Had successful [*...,Last dose of Antibiotics:\nInfusions:\nOther I...
...,...,...,...,...,...
997,194574.txt,"73 yo M w/ mmp, C4-5 paraplegia, TF dependenc...",Resp distress; recurrent asp pna/pneumonitis; ...,Chief Complaint: resp distress and hypotension...,Last dose of Antibiotics: Cefipime - 03:00 PM...
998,194815.txt,"82yo M with h/o CAD s/p PCTA 15 years ago,\n ...",Hypoxia; Fever: Most likely infectious with ei...,Chief Complaint: dyspnea 24 Hour Events: FEVER...,Last dose of Antibiotics: Infusions: Other ICU...
999,198184.txt,"The patient is a 44 yo man with HIV, Aortic st...",Critical Aortic Stenosis; Transaminitis; Hypon...,TITLE: Chief Complaint: Critical AS awaiting ...,Last dose of Antibiotics: Infusions: Other ICU...
1000,199046.txt,"74 yo M w/ hypotension, septic knee and cirrho...",Respiratory failure; mucous plugging\n and G...,TITLE: Chief Complaint: 24 Hour Events: - Cont...,Last dose of Antibiotics: Daptomycin - 12:23 ...


### set up nltk lemmatizer

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import os


def setup_nltk(custom_data_dir=None):
    """Setup NLTK with optional custom data directory"""
    if custom_data_dir:
        # Clean up existing paths to avoid duplicates
        if custom_data_dir in nltk.data.path:
            nltk.data.path.remove(custom_data_dir)
        try:
            # Test if directory is writable
            os.makedirs(custom_data_dir, exist_ok=True)
            test_file = os.path.join(custom_data_dir, 'test.txt')
            with open(test_file, 'w') as f:
                f.write('test')
            os.remove(test_file)
            
            # Add custom path to NLTK data path (only once)
            nltk.data.path.insert(0, custom_data_dir)
            print(f"Using custom NLTK data directory: {custom_data_dir}")
        except (PermissionError, OSError) as e:
            print(f"Warning: Cannot write to {custom_data_dir}: {str(e)}")
            print("Falling back to default locations")
            custom_data_dir = None
    
    # Show NLTK data paths (unique paths only)
    unique_paths = list(dict.fromkeys(nltk.data.path))
    print("NLTK Data Paths:", unique_paths)
    
    # Required packages with their subdirectories
    
    packages = {
        'tokenizers': ['punkt'],
        'taggers': ['averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'],  # Added eng version
        'corpora': ['wordnet', 'omw-1.4']
    }
    
    for subdir, pkgs in packages.items():
        for pkg in pkgs:
            try:
                nltk.data.find(f'{subdir}/{pkg}')
            except LookupError:
                nltk.download(pkg, download_dir=custom_data_dir, quiet=True)

custom_nltk_path = "/secure/shared_data/nltk_data"
setup_nltk(custom_nltk_path)

class ClinicalLemmatizer:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        # Common clinical abbreviations dictionary (you can expand this)
        # self.clinical_abbrev = {
        #     'pt': 'patient',
        #     'dx': 'diagnosis',
        #     'tx': 'treatment',
        #     'hx': 'history',
        #     'temp': 'temperature',
        #     'hr': 'heart rate',
        #     'bp': 'blood pressure',
        #     'htn': 'hypertension',
        #     # Add more abbreviations as needed
        # }

    def get_wordnet_pos(self, word):
        """Map POS tag to WordNet POS tag"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }
        return tag_dict.get(tag, wordnet.NOUN)

    def lemmatize_text(self, text):
        # Convert to lowercase
        text = text.lower()
        
        # Tokenize
        words = word_tokenize(text)
        
        # Process each word
        lemmatized_words = []
        for word in words:
            # Check if it's an abbreviation
            # if word in self.clinical_abbrev:
            #     lemmatized_words.append(self.clinical_abbrev[word])
            # else:
                # Lemmatize based on POS
            pos = self.get_wordnet_pos(word)
            lemmatized_word = self.lemmatizer.lemmatize(word, pos)
            lemmatized_words.append(lemmatized_word)
    
        return ' '.join(lemmatized_words)

clinical_lemmatizer = ClinicalLemmatizer()

Using custom NLTK data directory: /secure/shared_data/nltk_data
NLTK Data Paths: ['/secure/shared_data/nltk_data', '/home/yl3427/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/share/nltk_data', '/home/yl3427/miniconda3/envs/umls_env_py310/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [3]:
# example
raw_text = "Sepsis; Altered/Depressed MS\n thought to be [**1-25**] to ativan yesterday,"
clinical_lemmatizer.lemmatize_text(raw_text)

'sepsis ; altered/depressed m thought to be [ * * 1-25 * * ] to ativan yesterday ,'

In [4]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # Width of the display in characters
pd.set_option('display.max_colwidth', None)
reference_df['Summary_lemma'] = reference_df['Summary'].apply(lambda x: clinical_lemmatizer.lemmatize_text(x) if pd.notnull(x) else x)
reference_df[['Summary', 'Summary_lemma']].head()

Unnamed: 0,Summary,Summary_lemma
0,# Hypoxia:; Hyperkalemia,# hypoxia : ; hyperkalemia
1,Lower GI bleed; Hypotension; CAD,low gi bleed ; hypotension ; cad
2,"PULMONARY EMBOLISM; FEVER; HYPOTENSION; OLIGURIA; NON-GAP ACIDOSIS; HISTORY OF EOSINOPHILIC LUNG DISEASE; DEEP VENOUS THROMBOSIS (DVT), LOWER EXTREMITY; H/O CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, BRONCHITIS,\n EMPHYSEMA) WITH ACUTE EXACERBATION; HEART FAILURE (CHF), DIASTOLIC, CHRONIC","pulmonary embolism ; fever ; hypotension ; oliguria ; non-gap acidosis ; history of eosinophilic lung disease ; deep venous thrombosis ( dvt ) , low extremity ; h/o chronic obstructive pulmonary disease ( copd , bronchitis , emphysema ) with acute exacerbation ; heart failure ( chf ) , diastolic , chronic"
3,"Sepsis; Altered/Depressed MS\n thought to be [**1-25**] to ativan yesterday,; # Acute on Chronic Renal Failure; supertherapeutic INR; Afib, now rate controlled - likely worsened by underlying\n sepsis; s/p BiV placement [**8-31**].; demand ischemia","sepsis ; altered/depressed m thought to be [ * * 1-25 * * ] to ativan yesterday , ; # acute on chronic renal failure ; supertherapeutic inr ; afib , now rate control - likely worsen by underlie sepsis ; s/p biv placement [ * * 8-31 * * ] . ; demand ischemia"
4,# Bradycardia / Rhythm; #. Hypertension; # CAD; #. Chronic Kidney Disease Stage V on HD; . Peripheral Vascular Disease,# bradycardia / rhythm ; # . hypertension ; # cad ; # . chronic kidney disease stage v on hd ; . peripheral vascular disease


### set up a gpt-4 agent

In [5]:
from dotenv import load_dotenv
from pathlib import Path
from openai import OpenAI
from tqdm import tqdm
env_path = Path.home()
load_dotenv(dotenv_path=env_path / ".env")
client = OpenAI()

In [6]:
SYSTEM_MESSAGE = \
    "You are a medical documentation assistant. " \
    "Your task is to expand all abbreviations in the provided patient's progress notes accurately, " \
    "preserving the original formatting exactly. " \
    "Do not reorder, add, or omit any text other than expanding abbreviations. " \
    "Each expanded abbreviation should seamlessly replace the original " \
    "without altering the list's structure or adding comments."

In [7]:
def get_response(text):
    messages = [
    {"role": "system", "content": SYSTEM_MESSAGE},  # System message
    {"role": "user", "content": text}  # Patient's notes with medical abbreviations.
    ]
    response = client.chat.completions.create(
        model = "gpt-4o-2024-08-06",
        messages=messages,
        temperature = 0
    )

    return response.choices[0].message.content

In [8]:
for i in tqdm(range(len(reference_df)), desc="Processing rows"):
    if pd.notnull(reference_df.loc[i, 'Summary']):
        reference_df.loc[i, 'Summary_expanded'] = get_response(reference_df.loc[i, 'Summary'])
print(f"\nCompleted processing {len(reference_df)} rows")

Processing rows: 100%|██████████| 1002/1002 [19:18<00:00,  1.16s/it]


Completed processing 1002 rows





In [9]:
reference_df[['Summary', 'Summary_lemma','Summary_expanded']].head()

Unnamed: 0,Summary,Summary_lemma,Summary_expanded
0,# Hypoxia:; Hyperkalemia,# hypoxia : ; hyperkalemia,# Hypoxia:; Hyperkalemia
1,Lower GI bleed; Hypotension; CAD,low gi bleed ; hypotension ; cad,Lower Gastrointestinal bleed; Hypotension; Coronary Artery Disease
2,"PULMONARY EMBOLISM; FEVER; HYPOTENSION; OLIGURIA; NON-GAP ACIDOSIS; HISTORY OF EOSINOPHILIC LUNG DISEASE; DEEP VENOUS THROMBOSIS (DVT), LOWER EXTREMITY; H/O CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, BRONCHITIS,\n EMPHYSEMA) WITH ACUTE EXACERBATION; HEART FAILURE (CHF), DIASTOLIC, CHRONIC","pulmonary embolism ; fever ; hypotension ; oliguria ; non-gap acidosis ; history of eosinophilic lung disease ; deep venous thrombosis ( dvt ) , low extremity ; h/o chronic obstructive pulmonary disease ( copd , bronchitis , emphysema ) with acute exacerbation ; heart failure ( chf ) , diastolic , chronic","PULMONARY EMBOLISM; FEVER; HYPOTENSION; OLIGURIA; NON-GAP ACIDOSIS; HISTORY OF EOSINOPHILIC LUNG DISEASE; DEEP VENOUS THROMBOSIS (DEEP VENOUS THROMBOSIS), LOWER EXTREMITY; HISTORY OF CHRONIC OBSTRUCTIVE PULMONARY DISEASE (CHRONIC OBSTRUCTIVE PULMONARY DISEASE, BRONCHITIS, EMPHYSEMA) WITH ACUTE EXACERBATION; HEART FAILURE (CONGESTIVE HEART FAILURE), DIASTOLIC, CHRONIC"
3,"Sepsis; Altered/Depressed MS\n thought to be [**1-25**] to ativan yesterday,; # Acute on Chronic Renal Failure; supertherapeutic INR; Afib, now rate controlled - likely worsened by underlying\n sepsis; s/p BiV placement [**8-31**].; demand ischemia","sepsis ; altered/depressed m thought to be [ * * 1-25 * * ] to ativan yesterday , ; # acute on chronic renal failure ; supertherapeutic inr ; afib , now rate control - likely worsen by underlie sepsis ; s/p biv placement [ * * 8-31 * * ] . ; demand ischemia","Sepsis; Altered/Depressed Mental Status thought to be [**1-25**] to ativan yesterday,; # Acute on Chronic Renal Failure; supertherapeutic International Normalized Ratio; Atrial fibrillation, now rate controlled - likely worsened by underlying sepsis; status post Biventricular placement [**8-31**].; demand ischemia"
4,# Bradycardia / Rhythm; #. Hypertension; # CAD; #. Chronic Kidney Disease Stage V on HD; . Peripheral Vascular Disease,# bradycardia / rhythm ; # . hypertension ; # cad ; # . chronic kidney disease stage v on hd ; . peripheral vascular disease,# Bradycardia / Rhythm; #. Hypertension; # Coronary Artery Disease; #. Chronic Kidney Disease Stage V on Hemodialysis; . Peripheral Vascular Disease


In [10]:
reference_df.to_csv('/home/yl3427/cylab/SOAP_MA/mergedBioNLP2023.csv', index=False)

In [10]:
reference_df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/mergedBioNLP2023.csv')

### mapping with QuickUMLS

In [11]:
from quickumls import QuickUMLS
all_matcher = QuickUMLS('/home/yl3427/cylab/QuickUMLS', 
                    overlapping_criteria="length",
                    threshold=0.7,
                    accepted_semtypes= {f"T{str(i).zfill(3)}" for i in range(1, 1000)},
                    )
filtered_matcher = QuickUMLS('/home/yl3427/cylab/QuickUMLS', 
                    overlapping_criteria="length", # fixed
                    # threshold=0.9,
                    similarity_name='dice',
                    accepted_semtypes=['T037', 'T046', 'T047', 'T048', 'T049', 'T190', 'T191'],
                    )

In [12]:
for i in range(len(reference_df)):
    print(reference_df.loc[i, "Summary"])
    print("-------------------")
    for sum in ["Summary", "Summary_lemma", "Summary_expanded"]:
        raw_mapped_results = filtered_matcher.match(reference_df.loc[i, sum], best_match=True)
        term_set = set()
        for result in raw_mapped_results:
            # extracted term (only select the first, sorted by similarity)
            extracted_term = result[0]['term']
            print(f"{sum}: {extracted_term}")
            term_set.add(extracted_term)
        print("\n")
        reference_df.loc[i, f"{sum}_umls"] = ";".join(term_set)


# Hypoxia:; Hyperkalemia
-------------------
Summary: Hypoxia


Summary_lemma: Acute hyperkalemia
Summary_lemma: NB hypoxia


Summary_expanded: Hypoxia


Lower GI bleed; Hypotension; CAD
-------------------
Summary: Lower GI bleeding
Summary: Hypotension NEC


Summary_lemma: gi bleed
Summary_lemma: hypotensive


Summary_expanded: Coronary Artery Disease
Summary_expanded: Gastrointestinal bleed


PULMONARY EMBOLISM; FEVER; HYPOTENSION; OLIGURIA; NON-GAP ACIDOSIS; HISTORY OF EOSINOPHILIC LUNG DISEASE; DEEP VENOUS THROMBOSIS (DVT), LOWER EXTREMITY; H/O CHRONIC OBSTRUCTIVE PULMONARY DISEASE (COPD, BRONCHITIS,
   EMPHYSEMA) WITH ACUTE EXACERBATION; HEART FAILURE (CHF), DIASTOLIC, CHRONIC
-------------------
Summary: chronic obstructive pulmonary disease
Summary: Acute exacerbation of chronic heart failure
Summary: deep venous thrombosis
Summary: eosinophilic lung
Summary: Chr pulmonary embolism
Summary: bronchitis emphysema
Summary: edema lower extremity
Summary: Non-gap acidosis
Summary: h

In [13]:
reference_df.to_csv('/home/yl3427/cylab/SOAP_MA/mergedBioNLP2023.csv', index=False)

In [3]:
reference_df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/mergedBioNLP2023.csv')

### evaluation

In [4]:
from bert_score import score
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
rg_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling
    mean_embedding = outputs.last_hidden_state.mean(dim=1)
    # Max pooling
    max_embedding = outputs.last_hidden_state.max(dim=1).values
    # Concatenate mean and max
    combined_embedding = torch.cat((mean_embedding, max_embedding), dim=1).cpu().numpy()
    return max_embedding.cpu().numpy()

def count_tokens(sentence):
    return len(tokenizer.tokenize(sentence))

In [7]:
outer_dict = {}

for col in ['Summary_umls', 'Summary_lemma_umls', 'Summary_expanded_umls']:
    print(col)
    outer_dict[col] = {'rougeL': {'scores':[]}, 'bert_score': {'scores':[]}, 'sent_emb_sim': {'scores':[]}}
    
    for pred, ref in zip(reference_df[col].tolist(), reference_df["Summary"].tolist()):
        if pd.isna(pred) or pd.isna(ref):
            # Set all scores to 0 for null values
            outer_dict[col]['rougeL']['scores'].append(0.0)
            outer_dict[col]['sent_emb_sim']['scores'].append(0.0)
            outer_dict[col]['bert_score']['scores'].append(0.0)
            print(f"Null value found in {col}")
            continue

        # Calculate ROUGE-L F-score
        scores = rg_scorer.score(str(ref), str(pred))
        rougeL_f = scores['rougeL'].fmeasure
        outer_dict[col]['rougeL']['scores'].append(rougeL_f)

        # Calculate embedding similarity
        pred_emb = get_sentence_embedding(str(pred))
        ref_emb = get_sentence_embedding(str(ref))
        sim = cosine_similarity(pred_emb, ref_emb)[0][0]
        outer_dict[col]['sent_emb_sim']['scores'].append(sim)

        # Calculate BERTScore for single pair
        try:
            _, _, bert_score = score([str(pred)], [str(ref)], 
                                   model_type=model_name, 
                                   lang="en", 
                                   rescale_with_baseline=False, 
                                   num_layers=12)
            outer_dict[col]['bert_score']['scores'].append(bert_score[0])
        except:
            outer_dict[col]['bert_score']['scores'].append(0.0)

    # Calculate statistics
    for metric in ['rougeL', 'bert_score', 'sent_emb_sim']:
        outer_dict[col][metric]['average'] = np.mean(outer_dict[col][metric]['scores'])
        outer_dict[col][metric]['std'] = np.std(outer_dict[col][metric]['scores'])

Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_umls
Null value found in Summary_um

In [10]:
for k, v in outer_dict.items():
    print(k)
    for metric in ['rougeL', 'bert_score', 'sent_emb_sim']:
        print(f"{metric}: {v[metric]['average']:.4f} +/- {v[metric]['std']:.4f}")
    print("\n")

Summary_umls
rougeL: 0.4264 +/- 0.1970
bert_score: 0.7045 +/- 0.1799
sent_emb_sim: 0.7032 +/- 0.1860


Summary_lemma_umls
rougeL: 0.4269 +/- 0.2049
bert_score: 0.7055 +/- 0.1786
sent_emb_sim: 0.7061 +/- 0.1840


Summary_expanded_umls
rougeL: 0.3323 +/- 0.1980
bert_score: 0.7398 +/- 0.1260
sent_emb_sim: 0.7474 +/- 0.1303




In [11]:
import pickle
with open('/home/yl3427/cylab/SOAP_MA/outer_dict.pkl', 'wb') as f:
    pickle.dump(outer_dict, f)

# 2. Getting a Bag of Terms

In [1]:
import pandas as pd

reference_df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/mergedBioNLP2023.csv')

In [2]:
def sepexp(text):
    return [element.strip().lower() for element in text.split(';') if element.strip()]

summary_terms = reference_df['Summary'].apply(lambda x: set(sepexp(x)) if pd.notnull(x) else set())
summary_lemma_terms = reference_df['Summary_lemma'].apply(lambda x: set(sepexp(x)) if pd.notnull(x) else set())
summary_expanded_terms = reference_df['Summary_expanded'].apply(lambda x: set(sepexp(x)) if pd.notnull(x) else set())
summary_terms

0                              {# hypoxia:, hyperkalemia}
1                      {cad, hypotension, lower gi bleed}
2       {oliguria, non-gap acidosis, pulmonary embolis...
3       {supertherapeutic inr, demand ischemia, afib, ...
4       {#. chronic kidney disease stage v on hd, # ca...
                              ...                        
997     {leukocytosis, resp distress, recurrent asp pn...
998     {fever: most likely infectious with either pul...
999     {hyponatremia, hiv, cad, decompensated heart f...
1000    {respiratory failure, atrial fibrillation, muc...
1001    {chf: likely contribution of known as, chronic...
Name: Summary, Length: 1002, dtype: object

In [3]:
def sepexp(text):
    return [element.strip().lower() for element in text.split(';') if element.strip()]

summary_terms = reference_df['Summary'].apply(lambda x: sepexp(x) if pd.notnull(x) else [])
summary_lemma_terms = reference_df['Summary_lemma'].apply(lambda x: sepexp(x) if pd.notnull(x) else [])
summary_expanded_terms = reference_df['Summary_expanded'].apply(lambda x: sepexp(x) if pd.notnull(x) else [])

summary_terms_lst = []
for row in summary_terms:
    summary_terms_lst.extend(row)
len(summary_terms_lst), len(set(summary_terms_lst))

summary_lemma_terms_lst = []
for row in summary_lemma_terms:
    summary_lemma_terms_lst.extend(row)

summary_expanded_terms_lst = []
for row in summary_expanded_terms:
    summary_expanded_terms_lst.extend(row)

In [4]:
from collections import Counter

terms_sorted_frequency = sorted(Counter(summary_terms_lst).items(), key=lambda x: x[1], reverse=True)
lemma_terms_sorted_frequency = sorted(Counter(summary_lemma_terms_lst).items(), key=lambda x: x[1], reverse=True)
expanded_terms_sorted_frequency = sorted(Counter(summary_expanded_terms_lst).items(), key=lambda x: x[1], reverse=True)

terms_sorted_frequency[:10], lemma_terms_sorted_frequency[:10], expanded_terms_sorted_frequency[:10]

([('anemia', 75),
  ('hypotension', 59),
  ('acute renal failure', 59),
  ('hypertension', 49),
  ('htn', 49),
  ('cad', 46),
  ('altered mental status', 40),
  ('arf', 40),
  ('afib', 39),
  ('atrial fibrillation', 39)],
 [('anemia', 75),
  ('hypotension', 59),
  ('acute renal failure', 59),
  ('hypertension', 49),
  ('htn', 49),
  ('cad', 46),
  ('alter mental status', 40),
  ('arf', 40),
  ('afib', 39),
  ('atrial fibrillation', 39)],
 [('acute renal failure', 99),
  ('hypertension', 99),
  ('atrial fibrillation', 86),
  ('anemia', 75),
  ('coronary artery disease', 65),
  ('hypotension', 59),
  ('altered mental status', 50),
  ('respiratory failure', 39),
  ('chronic obstructive pulmonary disease', 35),
  ('thrombocytopenia', 33)])

In [7]:
import matplotlib.pyplot as plt
def list_to_dataframe(term_list, name):
    return pd.DataFrame(term_list, columns=['Term', f'{name}_Frequency'])

terms_df = list_to_dataframe(terms_sorted_frequency, 'Terms')
lemma_terms_df = list_to_dataframe(lemma_terms_sorted_frequency, 'Lemma_Terms')
expanded_terms_df = list_to_dataframe(expanded_terms_sorted_frequency, 'Expanded_Terms')

In [9]:
# 1. Descriptive Statistics Summary
def descriptive_statistics(df, name):
    print(f"Descriptive Statistics for {name}:\n")
    print(df.describe())
    print("\n")

descriptive_statistics(terms_df[:100], 'Terms')
descriptive_statistics(lemma_terms_df[:100], 'Lemma_Terms')
descriptive_statistics(expanded_terms_df[:100], 'Expanded_Terms')

Descriptive Statistics for Terms:

       Terms_Frequency
count       100.000000
mean         14.860000
std          14.066954
min           5.000000
25%           6.000000
50%           9.000000
75%          16.500000
max          75.000000


Descriptive Statistics for Lemma_Terms:

       Lemma_Terms_Frequency
count             100.000000
mean               15.000000
std                14.041065
min                 5.000000
25%                 6.000000
50%                 9.000000
75%                17.250000
max                75.000000


Descriptive Statistics for Expanded_Terms:

       Expanded_Terms_Frequency
count                 100.00000
mean                   15.99000
std                    18.88642
min                     5.00000
25%                     6.00000
50%                     9.00000
75%                    16.00000
max                    99.00000




In [5]:
K = 100
def top_k_terms(df, column_name, k=20):
    top_k = df.nlargest(k, columns=[column_name])
    print(f"Top {k} Terms for {column_name}:\n")
    print(top_k)
    print("\n")

top_k_terms(terms_df, 'Terms_Frequency', k=K)
top_k_terms(lemma_terms_df, 'Lemma_Terms_Frequency', k=K)
top_k_terms(expanded_terms_df, 'Expanded_Terms_Frequency', k=K)

NameError: name 'terms_df' is not defined

In [34]:
# 4. Term Overlap Comparison
def compare_overlap(df1, df2, name1, name2):
    set1 = set(df1['Term'])
    set2 = set(df2['Term'])
    overlap = set1.intersection(set2)
    print(f"Overlap between {name1} and {name2}: {len(overlap)} terms\n")
    print(overlap)
    print("\n")

compare_overlap(terms_df, lemma_terms_df, 'Terms', 'Lemma Terms')
compare_overlap(terms_df, expanded_terms_df, 'Terms', 'Expanded Terms')
compare_overlap(lemma_terms_df, expanded_terms_df, 'Lemma Terms', 'Expanded Terms')

Overlap between Terms and Lemma Terms: 1187 terms

{'right hip metastasis', 's/p nstemi', 'psychiatric disorder/post-traumatic stress disorder', 'left lobectomy of liver for cystic mass', '# esrd on hd', 'acute diastolic chf exacerbation', 'a-fib w/ rvr', '# gi bleed', 'likely lymphoma', 'upper airway obstruction', 'acute on chronic hypercarbic repiratory failure', '# chronic alcohol abuse', 's/p posterior stemi', 'metastastic renal ca', 'continued intubation/respiratory failure', 'line sepsis', 'endocarditis', 'thumb abscess', 'esophageal cancer', 'cabg', 'dvt/pe', 'sepsis/pneumonia', '# alkalemic respiratory failure', 'hx of esrd', 'stable anemia', 'respiratory alkalosis / metabolic alkalosis', 'hypercarbia/hypoxia', 'neutropenic fever/sepsis', 'torsades de pointes', 'esrd on pd', 'acute respiratory failure', 'acute on chronic resp failure', 'post-op afib', 'septic shock/pneumonia', 'parapneumonic effusion/hypoxia', 'heroin abuse', 'cad s/p stemi 2', 'hypercapnia', 'sob', 'systolic l

In [24]:
terms_df.to_csv('/home/yl3427/cylab/SOAP_MA/terms_df.csv', index=False)
lemma_terms_df.to_csv('/home/yl3427/cylab/SOAP_MA/lemma_terms_df.csv', index=False)
expanded_terms_df.to_csv('/home/yl3427/cylab/SOAP_MA/expanded_terms_df.csv', index=False)

In [14]:
df = pd.read_csv('/home/yl3427/cylab/SOAP_MA/terms_df.csv')
df

Unnamed: 0,Term,Terms_Frequency
0,anemia,75
1,hypotension,59
2,acute renal failure,59
3,hypertension,49
4,htn,49
...,...,...
2195,mucous plugging\n and gnr pneumonia,1
2196,mrsa septic prosthetic joint,1
2197,hyponatremia: likely acute on chronic hyponatr...,1
2198,chronic bronchitis,1


In [26]:
terms_df['Term'].tolist()[:100][-1]

'abdominal pain'

In [27]:
reference_df.columns

Index(['File ID', 'Assessment', 'Summary', 'Subjective', 'Objective',
       'Summary_lemma', 'Summary_expanded', 'Summary_umls',
       'Summary_lemma_umls', 'Summary_expanded_umls'],
      dtype='object')

### evaluation

In [28]:
from bert_score import score
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
rg_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device('cuda:5' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [30]:
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling
    mean_embedding = outputs.last_hidden_state.mean(dim=1)
    # Max pooling
    max_embedding = outputs.last_hidden_state.max(dim=1).values
    # Concatenate mean and max
    combined_embedding = torch.cat((mean_embedding, max_embedding), dim=1).cpu().numpy()
    return max_embedding.cpu().numpy()

def count_tokens(sentence):
    return len(tokenizer.tokenize(sentence))

In [31]:
outer_dict = {}

for sum_type in ['raw', 'lemma', 'expanded']:
    for input_type in ['a', 'so']:
        file_name = f"/home/yl3427/cylab/SOAP_MA/soap_result/1107_soap_closed_{sum_type}_{input_type}.csv"
        reference_df = pd.read_csv(file_name)
  
        outer_dict[f"{sum_type}_{input_type}"] = {'rougeL': {'scores':[]}, 'bert_score': {'scores':[]}, 'sent_emb_sim': {'scores':[]}}
        
        for pred, ref in zip(reference_df['pred'].tolist(), reference_df["Summary"].tolist()):
            # Calculate ROUGE-L F-score
            scores = rg_scorer.score(str(ref), str(pred))
            rougeL_f = scores['rougeL'].fmeasure
            outer_dict[f"{sum_type}_{input_type}"]['rougeL']['scores'].append(rougeL_f)

            # Calculate embedding similarity
            pred_emb = get_sentence_embedding(str(pred))
            ref_emb = get_sentence_embedding(str(ref))
            sim = cosine_similarity(pred_emb, ref_emb)[0][0]
            outer_dict[f"{sum_type}_{input_type}"]['sent_emb_sim']['scores'].append(sim)

            _, _, bert_score = score([str(pred)], [str(ref)], 
                                model_type=model_name, 
                                lang="en", 
                                rescale_with_baseline=False, 
                                num_layers=12)
            outer_dict[f"{sum_type}_{input_type}"]['bert_score']['scores'].append(bert_score[0])
        

        # Calculate statistics
        for metric in ['rougeL', 'bert_score', 'sent_emb_sim']:
            outer_dict[f"{sum_type}_{input_type}"][metric]['average'] = np.mean(outer_dict[f"{sum_type}_{input_type}"][metric]['scores'])
            outer_dict[f"{sum_type}_{input_type}"][metric]['std'] = np.std(outer_dict[f"{sum_type}_{input_type}"][metric]['scores'])

In [35]:
for k, v in outer_dict.items():
    print(k)
    for metric in ['rougeL', 'bert_score', 'sent_emb_sim']:
        avg_percentage = v[metric]['average'] * 100
        std_percentage = v[metric]['std'] * 100
        print(f"{metric}: {avg_percentage:.2f}% ± {std_percentage:.2f}%")
    print("\n")


raw_a
rougeL: 24.20% ± 17.26%
bert_score: 66.81% ± 14.27%
sent_emb_sim: 67.13% ± 15.13%


raw_so
rougeL: 16.21% ± 15.31%
bert_score: 54.64% ± 13.97%
sent_emb_sim: 55.65% ± 14.75%


lemma_a
rougeL: 24.36% ± 17.37%
bert_score: 67.02% ± 14.23%
sent_emb_sim: 67.37% ± 15.05%


lemma_so
rougeL: 16.10% ± 14.62%
bert_score: 55.52% ± 13.62%
sent_emb_sim: 56.75% ± 14.26%


expanded_a
rougeL: 21.70% ± 16.53%
bert_score: 66.23% ± 13.96%
sent_emb_sim: 66.89% ± 14.64%


expanded_so
rougeL: 14.77% ± 13.68%
bert_score: 55.76% ± 13.18%
sent_emb_sim: 57.07% ± 13.86%




In [34]:
import pickle
with open('/home/yl3427/cylab/SOAP_MA/result_dict.pkl', 'wb') as f:
    pickle.dump(outer_dict, f)

In [12]:
import pickle
with open('/home/yl3427/cylab/SOAP_MA/result_dict.pkl', 'rb') as f:
    outer_dict = pickle.load(f)

In [13]:
outer_dict

{'raw_a': {'rougeL': {'scores': [0.3333333333333333,
    0.22222222222222224,
    0.2,
    0.1568627450980392,
    0.2608695652173913,
    0.16666666666666666,
    0.0,
    0.5,
    0.36363636363636365,
    0.28571428571428575,
    0.30769230769230765,
    0.22222222222222218,
    0.4444444444444445,
    0.6666666666666665,
    0.0,
    0.0,
    0.0,
    0.5714285714285715,
    0.12500000000000003,
    0.0,
    0.0,
    0.4444444444444444,
    0.23076923076923075,
    0.3333333333333333,
    0.0,
    0.17142857142857143,
    0.125,
    0.06666666666666668,
    0.4615384615384615,
    0.13333333333333333,
    0.0,
    0.3076923076923077,
    0.21052631578947367,
    0.2727272727272727,
    0.13793103448275862,
    0.26666666666666666,
    0.14814814814814817,
    0.16666666666666669,
    0.15384615384615383,
    0.09302325581395349,
    0.18181818181818182,
    0.13333333333333333,
    0.33333333333333337,
    0.3333333333333333,
    0.10526315789473685,
    0.0,
    0.20689655172413793