In [None]:
# Mount Google Drive at /content/drive so the cells below can reach the project
# files stored there (the next cell changes into a directory under this mount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/entity_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/entity_sum


## Install the BioBERT embedding package (https://pypi.org/project/biobert-embedding/)

In [None]:
!pip3 install biobert-embedding

Collecting biobert-embedding
  Downloading https://files.pythonhosted.org/packages/d2/f0/f5bd3fd4a0bcef4d85e5e82347ae73d376d68dc8086afde75838ba0473a2/biobert-embedding-0.1.2.tar.gz
Collecting torch==1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/05/65/5248be50c55ab7429dd5c11f5e2f9f5865606b80e854ca63139ad1a584f2/torch-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (748.9MB)
[K     |████████████████████████████████| 748.9MB 25kB/s 
[?25hCollecting pytorch-pretrained-bert==0.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 58.4MB/s 
Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/9e/19/20a0d00e8cd7b812aaf408ce512c7ad41fc0bca4a2206674e9d6bc0c058d/boto3-1.17.43.tar.gz (99kB)
[K     |████████████████████████████████| 102kB 13.1MB/s 
Collecting botocore<1.21.0,>=1.20.

In [None]:
import json
import pickle as pk

import nltk

# Download the NLTK data packages: the 'punkt' tokenizer models first,
# then the 'stopwords' corpus.
for _nltk_package in ('punkt', 'stopwords'):
  nltk.download(_nltk_package)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import pickle as pk
import os
from sklearn.cluster import KMeans
from sklearn import metrics
from collections import defaultdict
import json
from biobert_embedding.embedding import BiobertEmbedding

## Instantiate a BioBERT model

In [None]:
# Create one shared embedder; the clustering function below reads this
# module-level `biobert` and calls its word_vector() for every entity.
# NOTE(review): presumably this loads the pretrained weights on construction — confirm.
biobert = BiobertEmbedding()

## The function below takes an ICD-11 chapter name, loads that chapter's entities from the file system, and clusters them

In [None]:
def _generate_entity_clusters(icd_chapter, min_clusters = 20, max_clusters = 50):
  DATA_PATH = "medline_abstracts_w_entities"
  df = pk.load( open(os.path.join(DATA_PATH, f"{icd_chapter}.pk"), "rb"))

  lst_entities = []
  for dict_entities in list(df['Named_Entities_genia_w_BC5CDR'].values):
    try:
      lst_entities += list(dict_entities.values())[0]
    except (AttributeError, IndexError):
      continue
  lst_entities = list(set(lst_entities))

  embed_values = []
  track_entities = []
  for entity in lst_entities:
    if entity not in track_entities:
      track_entities.append(entity)

      entity_embedding = biobert.word_vector(entity)
      np_entity_embedding = [embed.numpy() for embed in entity_embedding]
      entity_embedding = np.mean(np.array(np_entity_embedding), axis=0)
      embed_values.append(entity_embedding)
      
  # Find the best value for k-number of clusters, iteratively
  max_silhouette_score = 0.0
  if max_clusters >= min_clusters:
    for k in range(min_clusters, max_clusters):
      if k % 5 == 0:
        print("Iteration ", k)
      kmeans = KMeans(n_clusters=k, random_state=0).fit(embed_values)
      labels = kmeans.labels_
      silhouette_avg = metrics.silhouette_score(embed_values, labels)
      if silhouette_avg >= max_silhouette_score:
        max_silhouette_score = silhouette_avg
        K_max = k
    print("Number of entities: ", len(embed_values))
    print("Maximum number of clusters:", K_max)
    # Run K-means with k that has given the best calinski score
    kmeans = KMeans(n_clusters=K_max, random_state=0).fit(embed_values)
    labels = kmeans.labels_

    # A dictionary of cluster label as a key and the entities that fall under that cluster
    dict_cluster_label_entities = defaultdict(list)
    for idx, label in enumerate(labels):
      dict_cluster_label_entities[label].append(lst_entities[idx])

    return dict_cluster_label_entities

## Call the function above and write the result to the file system

In [None]:
icd_chapter = "pregnancy_childbirth_and_the_puerperium_Medline"
dict_cluster_label_entities = _generate_entity_clusters(icd_chapter)

# Write the result to the file system. Create the output directory if it is
# missing, and use a context manager so the pickle is flushed and the file
# handle is closed as soon as the dump finishes.
os.makedirs("entity_clusters", exist_ok=True)
with open(f"entity_clusters/{icd_chapter}.pk", "wb") as cluster_file:
  pk.dump(dict_cluster_label_entities, cluster_file)


## Read back one pickle file to confirm

In [None]:
import os
import pickle as pk

DATA_PATH = "entity_clusters"
icd_chapter = "certain_infectious_or_parasitic_diseases_Medline"

# NOTE(security): pickle.load can execute arbitrary code; only read files that
# this project wrote itself. The context manager guarantees the file handle is
# closed once the mapping has been loaded.
with open(os.path.join(DATA_PATH, f"{icd_chapter}.pk"), "rb") as cluster_file:
  dict_cluster_label_entities = pk.load(cluster_file)

In [None]:
# Display the loaded cluster-label -> entities mapping to inspect the result.
# Note: this renders every entity of every cluster, so the output is long.
dict_cluster_label_entities

defaultdict(list,
            {0: ['Influenza A infection',
              'primary infection',
              'non-union',
              'maturation of cognitive, language and visual skills',
              'necrotizing pulmonary infection',
              'acute vision loss',
              'infectious complications',
              'gastrointestinal bleed',
              'Hepatitis C',
              'Respiratory infections',
              'Tuber infection',
              'immune-mediated disorders',
              'acute urinary retention',
              'Rd-R',
              'chronic infections',
              'Crohn disease',
              'nephrotic syndrome',
              'circulatory disturbance',
              'periodontal infections',
              'lower back ache',
              "Kaposi's sarcoma-associated herpesvirus",
              'CPA-1',
              'chronic illness',
              'streptococcal toxic shock syndrome',
              'postoperative cardiovascular and infec