In [None]:
# Mount Google Drive at /content/drive (this import is only available on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Absolute path, so the cell can be re-run regardless of the current working directory.
%cd /content/drive/My\ Drive/Colab\ Notebooks/apex-codes/entity_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/entity_sum


## https://pypi.org/project/biobert-embedding/

In [None]:
# Pinned to the release this notebook was executed with (see the install log below).
!pip3 install biobert-embedding==0.1.2

Collecting biobert-embedding
  Downloading biobert-embedding-0.1.2.tar.gz (4.8 kB)
Collecting torch==1.2.0
  Downloading torch-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (748.9 MB)
[K     |████████████████████████████████| 748.9 MB 621 bytes/s 
[?25hCollecting pytorch-pretrained-bert==0.6.2
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 91.6 MB/s 
Collecting boto3
  Downloading boto3-1.18.62-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 75.3 MB/s 
Collecting botocore<1.22.0,>=1.21.62
  Downloading botocore-1.21.62-py3-none-any.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 54.3 MB/s 
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 9.4 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting urllib3<1.27,>=1.25.4
  D

In [None]:
import json
import pickle as pk

import nltk

# Fetch the NLTK resources used by this notebook: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
  nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import pickle as pk
import os
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial import distance
from collections import defaultdict
import json
from biobert_embedding.embedding import BiobertEmbedding

## Instantiate a BioBERT model

In [None]:
# Instantiate the BioBERT embedder once; the functions defined below read this module-level name.
biobert = BiobertEmbedding()

### For each cluster in each ICD-11 chapter, generate a BioBERT embedding for every abstract and compute centrality-based document importance

In [None]:
from nltk.tokenize import sent_tokenize

# Read the abstracts within each cluster and generate a BioBERT embedding for every abstract.
# For each cluster of an ICD-11 chapter, the method pickles a dict that maps PMID -> abstract embedding,
# e.g. {pmid_1: embedding_1, pmid_2: embedding_2, ...}; the method itself returns nothing.
# The document importance scores are computed in a later step from these embeddings.

def _get_abstract_biobert_embed(icd_chapter, DATA_PATH, OUTPUT_PATH):
  """Embed every abstract of every cluster file with BioBERT and pickle the result.

  Each file in ``DATA_PATH`` is read as a pickled DataFrame with ``PMID`` and
  ``Abstract`` columns. Every abstract is split into sentences, each sentence
  is embedded with the module-level ``biobert`` instance, and the sentence
  vectors are mean-pooled into one abstract vector. The resulting
  ``{pmid: embedding}`` dict is pickled to ``OUTPUT_PATH`` under the same file
  name as the input cluster file.

  Abstracts that are not strings or that contain no sentence are skipped,
  because they have no sentence vectors to average.

  Args:
    icd_chapter: Name of the ICD-11 chapter. Kept for interface compatibility;
      the body does not use it.
    DATA_PATH: Directory holding one pickled DataFrame per cluster.
    OUTPUT_PATH: Existing directory that receives one pickle per cluster.

  Returns:
    None. The results are written to disk.
  """
  for cluster_id in os.listdir(DATA_PATH):
    # read the pickled dataframe corresponding to the cluster id and icd-11 chapter
    df = pd.read_pickle(f"{DATA_PATH}/{cluster_id}")

    dict_pmid_biobert = {}   # pmid -> mean-pooled abstract embedding, for this cluster only
    # Iterate over the two columns directly instead of looking each cell up by index label.
    for pmid, abstract in zip(df['PMID'], df['Abstract']):
      if not isinstance(abstract, str):
        continue   # missing abstract (e.g. NaN): nothing to tokenize
      sentences = sent_tokenize(abstract)
      if not sentences:
        continue   # empty abstract: the mean over zero sentence vectors would be NaN

      # one BioBERT vector per sentence, then mean-pool over the sentences
      sentence_embeddings = [biobert.sentence_vector(sent).numpy() for sent in sentences]
      dict_pmid_biobert[pmid] = np.mean(np.array(sentence_embeddings), axis=0)

    # The context manager closes (and flushes) the file even if pickling fails.
    with open(f"{OUTPUT_PATH}/{cluster_id}", "wb") as fp:
      pk.dump(dict_pmid_biobert, fp)


In [None]:
!pwd

/content/drive/My Drive/Colab Notebooks/apex-codes/entity_sum


### Call to the above method

In [None]:
icd_chapter = "developmental_anomaly"
# Input: directory of per-cluster pickled DataFrames for this ICD-11 chapter
DATA_PATH = f"pubmed_abstracts_clusters_FINAL/{icd_chapter}_Medline"
# Root folder for the PMID -> BioBERT embedding pickles (768-d embeddings per the original note)
OUTPUT_PATH = "PMID_TO_BIOBERT_EMBED"

# Final output path corresponding to an icd-11 chapter; one pickle per cluster is written into it
FINAL_OUTPUT_PATH = f"{OUTPUT_PATH}/{icd_chapter}"
# makedirs creates the root and the chapter folder in one call and does nothing if they already exist
os.makedirs(FINAL_OUTPUT_PATH, exist_ok=True)

# Call to the method
_get_abstract_biobert_embed(icd_chapter, DATA_PATH, FINAL_OUTPUT_PATH)


### Read back to verify the PMID to biobert embeddings

In [None]:
icd_chapter = "developmental_anomaly"
FINAL_OUTPUT_PATH = f"PMID_TO_BIOBERT_EMBED/{icd_chapter}"
cluster_id = 0

# The context manager closes the file handle once the pickle is loaded.
# NOTE: unpickling can execute arbitrary code; only load pickles written by this notebook.
with open(f"{FINAL_OUTPUT_PATH}/{cluster_id}.pk", "rb") as fp:
  dict_pmid_biobert = pk.load(fp)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import operator

## Method to perform document importance ranking for each cluster in the ICD-11 chapter

In [None]:
def _get_doc_importance(icd_chapter, DATA_PATH, OUTPUT_PATH):
  sorted_dict_pmid_imp = {}
  for cluster_id in os.listdir(DATA_PATH):
    dict_pmid_biobert = pk.load(open(f"{DATA_PATH}/{cluster_id}", "rb"))
    # iterate through the key-value dict
    dict_pmid_imp = {}   # to store avg cosine similarity with others within a cluster of an ICD-11 chapter
    for pmid_src, embed_src in dict_pmid_biobert.items():
      cnt = 0
      total_cosine_sum = 0.0
      for _, embed_tgt in dict_pmid_biobert.items():
        # compute cosine similarity
        #total_cosine_sum += cosine_similarity(embed_src.reshape(-1, 1), embed_tgt.reshape(-1, 1))[0][0]
        total_cosine_sum += distance.euclidean(embed_src, embed_tgt)
        
        cnt += 1
      # average cosine cosine
      avg_cosine_sum = total_cosine_sum / float(cnt)

      dict_pmid_imp[pmid_src] = avg_cosine_sum
      # sort the dictionary in descending order of value (avg cosine sum)
      sorted_dict_pmid_imp = sorted(dict_pmid_imp.items(), key=operator.itemgetter(1))

    # write the sorted dict as json to the file system---note that the dict corresponds to a cluster_id in an ICD-11 chapter
    # write the dict as a pickle file
    pk.dump(sorted_dict_pmid_imp, open(f"{OUTPUT_PATH}/{cluster_id}", "wb"))


In [None]:
# The document importance based ranking is for each abstract in a cluster in an ICD-11 chapter.
# An abstract's score is its mean Euclidean distance to the BioBERT embeddings of the abstracts
# in its cluster; the abstracts are ranked in ascending order of that score.
icd_chapter = "developmental_anomaly"

# data path corresponding to an ICD-11 chapter (the PMID -> BioBERT embedding pickles)
DATA_PATH = f"PMID_TO_BIOBERT_EMBED/{icd_chapter}"

OUTPUT_PATH = "DOCUMENT_IMPORTANCE"

# Final output path corresponding to an icd-11 chapter
FINAL_OUTPUT_PATH = f"{OUTPUT_PATH}/{icd_chapter}"
# makedirs creates the root and the chapter folder in one call and does nothing if they already exist
os.makedirs(FINAL_OUTPUT_PATH, exist_ok=True)

# Call to the actual method
_get_doc_importance(icd_chapter, DATA_PATH, FINAL_OUTPUT_PATH)


### Verify the generated pickle files (i.e., PMID to importance score mappings)



In [None]:
# NOTE(review): this reads the "neoplasms" chapter, while the cell that calls
# _get_doc_importance above is set to "developmental_anomaly"; the neoplasms
# ranking must already exist on disk from an earlier run.
icd_chapter = "neoplasms"
FINAL_OUTPUT_PATH = f"DOCUMENT_IMPORTANCE/{icd_chapter}"
cluster_id = 0

# The context manager closes the file handle once the pickle is loaded.
with open(f"{FINAL_OUTPUT_PATH}/{cluster_id}.pk", "rb") as fp:
  sorted_dict_pmid_imp = pk.load(fp)

In [None]:
sorted_dict_pmid_imp[:20]

[(33706122, 2.8744233041196257),
 (33706130, 2.908323534437128),
 (33706249, 2.9145984005283667),
 (33705923, 2.916296179230149),
 (33705918, 2.918151056444323),
 (33706219, 2.9717274775376192),
 (33706111, 2.9743940040871903),
 (33706022, 2.990836749205718),
 (33705966, 2.9997104889637716),
 (33705881, 2.999973467878393),
 (33706097, 3.005936474413485),
 (33706413, 3.018882939944396),
 (33705970, 3.024985767699577),
 (33706124, 3.031220004365251),
 (33705980, 3.0498031796635807),
 (33705751, 3.060083376394736),
 (33706163, 3.0669418750582516),
 (33705950, 3.0677476831384607),
 (33705896, 3.1070501014992997),
 (33705943, 3.111293166070371)]

In [None]:
len(sorted_dict_pmid_imp)

74

## **Next: Entity-aware sentence selection**

### Read the abstracts in each cluster under each ICD-11 chapter

In [None]:
import pickle as pk

icd_chapter = "neoplasms"
# Directory of the per-cluster pickled DataFrames for this ICD-11 chapter
DATA_PATH = f"pubmed_abstracts_clusters_FINAL/{icd_chapter}_Medline"

# changes for each cluster in an ICD-11 chapter
cluster_id = 0

# Load the DataFrame of the selected cluster from "<cluster_id>.pk"
df_cluster = pd.read_pickle(f"{DATA_PATH}/{cluster_id}.pk")

In [None]:
df_cluster.head()

Unnamed: 0,PMID,Title,Abstract,Named_Entities_mimic_w_i2b2,Named_Entities_genia_w_BC5CDR
0,33706449,[Short-term and long-term outcomes of tricuspi...,Objective: To examine the short-term and long-...,"{'TREATMENT': ['tricuspid valve replacement', ...","{'DISEASE': ['left ventricular dysfunction', '..."
1,33706443,[A prognostic model of intrahepatic cholangioc...,Objective: To examine a survival prognostic mo...,"{'TREATMENT': ['a survival prognostic model', ...",{'DISEASE': ['intrahepatic cholangiocarcinoma'...
2,33706442,[Conversion therapy of biliary tract cancer fr...,Biliary tract cancer is found in the middle an...,"{'PROBLEM': ['Biliary tract cancer', 'Biliary ...","{'DISEASE': ['Biliary tract cancer', 'Biliary ..."
3,33706440,[Attach importance to the standardized diagnos...,"Gallbladder carcinoma,characterized by conceal...","{'PROBLEM': ['Gallbladder carcinoma', 'gallbla...","{'DISEASE': ['Gallbladder carcinoma', 'gallbla..."
5,33706419,Reproducibility of Lung Nodule Radiomic Featur...,PURPOSE: Recent studies have demonstrated a la...,"{'TEST': ['Recent studies', 'CT parameters', '...","{'DISEASE': ['lung cancer', 'lung cancer']}"


In [None]:
df_cluster.shape

(74, 5)

In [None]:
# Drop the Named_Entities_mimic_w_i2b2 column; errors="ignore" keeps this cell
# re-runnable once the column has already been removed.
df_cluster = df_cluster.drop(columns=['Named_Entities_mimic_w_i2b2'], errors="ignore")

In [None]:
df_cluster.head()

Unnamed: 0,PMID,Title,Abstract,Named_Entities_genia_w_BC5CDR
0,33706449,[Short-term and long-term outcomes of tricuspi...,Objective: To examine the short-term and long-...,"{'DISEASE': ['left ventricular dysfunction', '..."
1,33706443,[A prognostic model of intrahepatic cholangioc...,Objective: To examine a survival prognostic mo...,{'DISEASE': ['intrahepatic cholangiocarcinoma'...
2,33706442,[Conversion therapy of biliary tract cancer fr...,Biliary tract cancer is found in the middle an...,"{'DISEASE': ['Biliary tract cancer', 'Biliary ..."
3,33706440,[Attach importance to the standardized diagnos...,"Gallbladder carcinoma,characterized by conceal...","{'DISEASE': ['Gallbladder carcinoma', 'gallbla..."
5,33706419,Reproducibility of Lung Nodule Radiomic Featur...,PURPOSE: Recent studies have demonstrated a la...,"{'DISEASE': ['lung cancer', 'lung cancer']}"


In [None]:
df_cluster.shape

(74, 4)

In [None]:
df_cluster[df_cluster['PMID'] == 33706122]['Named_Entities_genia_w_BC5CDR'].values

array([defaultdict(<class 'list'>, {'DISEASE': ['cancer', 'solid tumors', 'cancer'], 'CHEMICAL': ['ruthenium', 'rofecoxib']})],
      dtype=object)

In [None]:
list(df_cluster[df_cluster['PMID'] == 33706122]['Named_Entities_genia_w_BC5CDR'].values[0].values())

[['cancer', 'solid tumors', 'cancer'], ['ruthenium', 'rofecoxib']]

In [None]:
from itertools import chain

# Flatten the per-type entity lists of PMID 33706122 into one flat list of entity strings.
newList = list(chain.from_iterable(
    df_cluster.loc[df_cluster['PMID'] == 33706122, 'Named_Entities_genia_w_BC5CDR'].values[0].values()
))

In [None]:
newList

['cancer', 'solid tumors', 'cancer', 'ruthenium', 'rofecoxib']

In [None]:
from pprint import pprint

# Tokenize each abstract into sentences, embed the sentences and the abstract's named entities with BioBERT,
# and compute the cosine similarity between every sentence and every entity.
# Within a cluster, the abstracts are processed in document-importance order.

import pickle as pk

# this method takes in an icd chapter and generates entity-aware sentences for each cluster belonging to the icd chapter
def _select_entity_aware_content(icd_chapter, OUTPUT_PATH, percentile_threshold=75):
  """Write one entity-aware pseudo-document per cluster of an ICD-11 chapter.

  For every ranking file in ``DOCUMENT_IMPORTANCE/<icd_chapter>`` the abstracts
  of the cluster are visited in the stored order. Each sentence of an abstract
  is scored as

      mean cosine similarity between the sentence embedding and the embeddings
      of the abstract's named entities
      + fraction of the sentence's tokens that are themselves an entity string

  Sentences whose score reaches the ``percentile_threshold``-th percentile of
  the scores within their abstract are kept. The kept sentences of a cluster
  are joined with single spaces and written to
  ``<OUTPUT_PATH>/<cluster file name without '.pk'>.txt``.

  Abstracts are skipped when their PMID is missing from the cluster DataFrame,
  when they have no (non-blank) named entity, or when they yield no sentence.

  Args:
    icd_chapter: Name of the ICD-11 chapter; selects the input folders.
    OUTPUT_PATH: Existing directory that receives one ``.txt`` file per cluster.
    percentile_threshold: Percentile (0-100) of the per-abstract sentence scores
      used as the keep threshold.

  Returns:
    None. The results are written to disk and progress is printed.
  """
  doc_importance_path = f"DOCUMENT_IMPORTANCE/{icd_chapter}"
  DATA_PATH = f"pubmed_abstracts_clusters_FINAL/{icd_chapter}_Medline"

  for cluster_id in os.listdir(doc_importance_path):
    cluster_id_wo_pk = cluster_id.replace('.pk', '')   # This to be used as the file name of the pseudo-doc
    print("Cluster ID: ", cluster_id_wo_pk)

    final_entity_aware_sentences = []  # This for each cluster in an icd-11 chapter

    # NOTE: unpickling can execute arbitrary code; only feed pickles written by this pipeline.
    with open(f"{doc_importance_path}/{cluster_id}", "rb") as fp:
      sorted_dict_pmid_imp = pk.load(fp)

    # The cluster DataFrame is the same for every PMID, so it is read once per
    # cluster (and only when there is at least one PMID to look up).
    df_cluster = pd.read_pickle(f"{DATA_PATH}/{cluster_id}") if sorted_dict_pmid_imp else None

    for pmid_impVal in sorted_dict_pmid_imp:
      pmid = pmid_impVal[0]

      rows = df_cluster[df_cluster['PMID'] == pmid]
      if rows.empty:
        continue   # the ranked PMID is not present in the cluster DataFrame

      entities_by_type = rows['Named_Entities_genia_w_BC5CDR'].values[0]
      final_lst_of_entities = list(chain.from_iterable(entities_by_type.values()))

      # Entity embeddings do not depend on the sentence, so compute them once per abstract:
      # mean of the first word vector returned by BioBERT for each whitespace token.
      entity_embeds = []
      for entity in final_lst_of_entities:
        entity_tokens = entity.split()
        if not entity_tokens:
          continue   # blank entity string: the mean over zero vectors would be NaN
        token_vectors = [biobert.word_vector(tok)[0].numpy() for tok in entity_tokens]
        entity_embeds.append(np.mean(np.array(token_vectors), axis=0))
      if not entity_embeds:
        continue

      abstract = rows['Abstract'].values[0]
      if not isinstance(abstract, str):
        continue   # missing abstract (e.g. NaN): nothing to tokenize
      sentences = sent_tokenize(abstract)

      entity_matrix = np.vstack(entity_embeds)      # one row per entity
      entity_vocab = set(final_lst_of_entities)     # for the token-level membership test
      # NOTE(review): single tokens are compared against whole entity strings,
      # so only single-token entities can contribute to normalized_count.

      # key as the sentence and value as its score against all entities of the abstract
      dict_sent_cossim_score = {}
      for sent in sentences:
        sent_embed = biobert.sentence_vector(sent).numpy()
        # reshape(1, -1) makes the embedding ONE sample with d features. With
        # reshape(-1, 1) sklearn would compare d one-feature samples and every
        # "similarity" would degenerate to +/-1.
        mean_cosine = float(cosine_similarity(sent_embed.reshape(1, -1), entity_matrix).mean())

        tokenized_sent = nltk.word_tokenize(sent)
        if tokenized_sent:
          normalized_count = sum(tok in entity_vocab for tok in tokenized_sent) / float(len(tokenized_sent))
        else:
          normalized_count = 0.0

        dict_sent_cossim_score[sent] = mean_cosine + normalized_count

      if not dict_sent_cossim_score:
        continue   # no sentences: a percentile of an empty list is undefined

      lst_of_scores = list(dict_sent_cossim_score.values())
      similarity_threshold = float(np.percentile(lst_of_scores, percentile_threshold))
      print("similarity threshold: ", similarity_threshold)
      dict_sent_cossim_score = { k:v for k,v in dict_sent_cossim_score.items() if v >= similarity_threshold }
      print(dict_sent_cossim_score)

      final_entity_aware_sentences += list(dict_sent_cossim_score.keys())
    print("==============================================================================================")

    # Write the pseudo-document (salient sentences) of this cluster to the file system.
    pseudo_doc = " ".join(final_entity_aware_sentences)
    # The with-block closes the file; UTF-8 is set explicitly because abstracts may hold non-ASCII text.
    with open(f"{OUTPUT_PATH}/{cluster_id_wo_pk}.txt", 'w', encoding="utf-8") as fp:
      fp.write(pseudo_doc)
      


In [None]:
!pwd

/content/drive/My Drive/Colab Notebooks/apex-codes/entity_sum


In [None]:
# Create a directory to store the pseudo-documents
# (each consists of the sentences with a high level of entity informativeness in each abstract).

# ICD-11 chapters to process
lst_icd_chapters = [
    'neoplasms',
    'developmental_anomaly',
    'certain_conditions_originating_in_the_perinatal_period',
    'diseases_of_the_blood_and_blood_forming_organs',
    'certain_infectious_or_parasitic_diseases',
    'disorders_involving_the_immune_mechanism',
    'injury_poisoning_or_certain_other_consequences_of_external_cause',
    'pregnancy_childbirth_and_the_puerperium',
]

# Build the entity-aware pseudo-documents chapter by chapter
for icd_chapter in lst_icd_chapters:
  print("ICD Chapter: ", icd_chapter)
  # per-chapter output folder, created on first use
  OUTPUT_PATH = f"ENTITY_AWARE_CONTENTS/{icd_chapter}"
  os.makedirs(OUTPUT_PATH, exist_ok=True)

  _select_entity_aware_content(icd_chapter, OUTPUT_PATH)
