In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Author: Robert Guthrie

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7feb351bcf50>

In [0]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long).cuda()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast).cuda()).cuda()).cuda()

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.dropout=nn.Dropout(0.25)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size).cuda())

        
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.transitions.data[tag_to_ix["U-ADR"],tag_to_ix["B-ADR"]]=-100
        self.transitions.data[tag_to_ix["O"],tag_to_ix["B-ADR"]]=-100

        self.transitions.data[tag_to_ix["O"],tag_to_ix["I-ADR"]]=-100
        self.transitions.data[tag_to_ix["B-ADR"],tag_to_ix["I-ADR"]]=-100
        self.transitions.data[tag_to_ix["U-ADR"],tag_to_ix["I-ADR"]]=-100

        
        self.transitions.data[tag_to_ix["I-ADR"],tag_to_ix["U-ADR"]]=-100
        self.transitions.data[tag_to_ix["L-ADR"],tag_to_ix["U-ADR"]]=-100

        self.transitions.data[tag_to_ix["I-ADR"],tag_to_ix["O"]]=-100
        self.transitions.data[tag_to_ix["L-ADR"],tag_to_ix["O"]]=-100
        

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(4, 1, self.hidden_dim // 2).cuda(),
                torch.randn(4, 1, self.hidden_dim // 2).cuda())

    def _forward_alg(self, feats):
        
        init_alphas = torch.full((1, self.tagset_size), -10000.).cuda()
        
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        
        forward_var = init_alphas

        
        for feat in feats:
            alphas_t = []  
            for next_tag in range(self.tagset_size):
                
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                
                trans_score = self.transitions[next_tag].view(1, -1)
                
                next_tag_var = forward_var + trans_score + emit_score
                
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).cuda().view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        embeds=self.dropout(embeds)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        
        score = torch.zeros(1).cuda()
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).cuda(), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        
        init_vvars = torch.full((1, self.tagset_size), -10000.).cuda()
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  
            viterbivars_t = []  

            for next_tag in range(self.tagset_size):
                
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1).cuda()
            backpointers.append(bptrs_t)

        
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence): 
       
        lstm_feats = self._get_lstm_features(sentence)

        
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
import pickle

with open('/content/gdrive/My Drive/singular_tags.pkl', 'rb') as f:
  training_data = pickle.load(f)

with open('/content/gdrive/My Drive/test_singular_tags.pkl', 'rb') as f:
  testing_data = pickle.load(f)

# with open('/content/gdrive/My Drive/Task2_data/labels_withmeddra.pkl', 'rb') as f:
  # labels = pickle.load(f)
# # Make up some training data
# training_data = [(
#     "the wall street journal reported today that apple corporation made money".split(),
#     "B I I I O O O B I O O".split()
# ), (
#     "georgia tech is a university in georgia".split(),
#     "B I O O O O B".split()
# )]

In [0]:
import torchtext
vec = torchtext.vocab.Vectors('/content/gdrive/My Drive/BioWordVec_PubMed_MIMICIII_d200.vec.bin', cache='./Downloads/')

  0%|          | 0/1623720 [00:00<?, ?it/s]Skipping token b'16545452' with 1-dimensional vector [b'200']; likely a header
100%|█████████▉| 1622304/1623720 [01:54<00:00, 14239.83it/s]

In [0]:
word_to_ix = vec.stoi

In [0]:
"<UNK>" in word_to_ix

False

In [0]:
len(word_to_ix)

1623720

In [0]:
word_to_ix["<UNK>"]=len(word_to_ix)

In [0]:
for i in range(len(training_data)):
    for j in range(len(training_data[i][0])):
      #print(word)
      word=(training_data[i][0])[j]
      if(word in word_to_ix):
        continue
      else:
        (training_data[i][0])[j] = "<UNK>"

tag_to_ix = {"O": 0, "B-ADR": 1, "I-ADR": 2, "L-ADR": 3, "U-ADR": 4, START_TAG: 5, STOP_TAG: 6}
# tag_values = list(set(labels))
# tag_values.append(START_TAG)
# tag_values.append(STOP_TAG)
# tag_to_ix = {t: i for i, t in enumerate(tag_values)}


In [0]:
print(len(word_to_ix))
# for k,v in word_to_ix.items():
#   print(k,v)

1623721


In [0]:
new_row = torch.Tensor(1, 200)
new_vec= torch.cat([vec.vectors, new_row], dim=0)

In [0]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)

In [0]:
model.word_embeds.weight.data.copy_(new_vec)

tensor([[-2.8328e-01, -1.8125e-01, -2.1761e-01,  ...,  1.2970e-01,
         -1.4910e-01, -1.6567e-01],
        [-2.6965e-01, -1.6921e-01, -2.3850e-01,  ...,  1.4007e-01,
         -1.4538e-01, -1.7382e-01],
        [-2.1836e-01,  4.8046e-02, -1.0087e-01,  ...,  1.7459e-01,
          1.0569e-01, -3.5893e-01],
        ...,
        [ 8.2127e-01,  4.7067e-01,  7.8578e-01,  ..., -3.8980e-01,
          1.6855e-01, -5.0794e-01],
        [ 1.0489e+00,  4.1795e-01,  8.2541e-01,  ..., -3.1072e-01,
          5.1590e-01,  2.5049e-01],
        [ 3.9868e-36,  0.0000e+00,  0.0000e+00,  ..., -3.1072e-01,
          0.0000e+00,  0.0000e+00]])

In [0]:
model.word_embeds.weight.shape

torch.Size([1623721, 200])

In [0]:
optimizer = optim.Adam(model.parameters())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("We are working with ", device)
model=model.to(device)
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long).cuda()
    print(model(precheck_sent))


We are working with  cuda
(tensor(9.7558, device='cuda:0'), [4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 1])


In [0]:
# !pip install -U scikit-learn==0.16.1

In [0]:
!pip install sklearn_crfsuite
def acc(testing_data):
  for i in range(len(testing_data)):
      for j in range(len(testing_data[i][0])):
        word=(testing_data[i][0])[j]
        if((testing_data[i][0])[j] not in word_to_ix):
          (testing_data[i][0])[j]="<UNK>"
  # for sentence, tags in testing_data:
  #   for word in sentence:
  #       if word not in word_to_ix:
  #           word_to_ix[word] = len(word_to_ix)
  test_pred = []
  with torch.no_grad():
      for i in range(0,len(testing_data)):
        precheck_sent = prepare_sequence(testing_data[i][0], word_to_ix)
        #print(precheck_sent)
        test_pred.append(model(precheck_sent)[1])

  test_pred_adr = []
  
 #tag_to_ix = {"O": 0, "B-ADR": 1, "I-ADR": 2, "L-ADR": 3, "U-ADR": 4, START_TAG: 5, STOP_TAG: 6}
  for t in test_pred:
    temp  = []
    for k in range(len(t)):
      if t[k] == 0:
        temp.append('O')
      elif t[k] == 1:
        temp.append('B-ADR')
      elif t[k] == 2:
        temp.append('I-ADR')
      elif t[k] == 3:
        temp.append('L-ADR')
      elif t[k] == 4:
        temp.append('U-ADR')
    test_pred_adr.append(temp)
  test_adr = []
  for sent, tag in testing_data:
    test_adr.append(tag)
  import sklearn_crfsuite
  from sklearn_crfsuite import metrics
  print("\n")
  print("################ STRICT ####################")

  print("f1 score macro:- ",metrics.flat_f1_score(test_adr, test_pred_adr, average='macro')) 
  print("f1 score micro:- ",metrics.flat_f1_score(test_adr, test_pred_adr, average='micro'))

  print("Sequence Accuracy:- ",metrics.sequence_accuracy_score(test_adr, test_pred_adr))

  print(metrics.flat_classification_report(
      test_adr, test_pred_adr, labels=['I-ADR','L-ADR','U-ADR','B-ADR','O'], digits=3))
  t_adr=test_adr
  t_p_adr=test_pred_adr
  relaxed(t_adr,t_p_adr)
  return t_adr, t_p_adr



In [0]:
def relaxed(t_adr,t_p_adr):
  import sklearn_crfsuite
  from sklearn_crfsuite import metrics
  print("\n")
  print("################ RELAXED ####################")

  test_one=[]

  for i in range(len(t_adr)):
    temp=[]
    for j in range(len(t_adr[i])):
      if(t_adr[i][j]=='-' or t_adr[i][j]=='U-ADR'or t_adr[i][j]=='I-ADR' or t_adr[i][j]=='B-ADR' or t_adr[i][j]=='L-ADR'):
        temp.append('ADR')
      else:
        temp.append('O')
    test_one.append(temp)

  pred_one=[]


  for i in range(len(t_p_adr)):
    temp=[]
    for j in range(len(t_p_adr[i])):
      if(t_p_adr[i][j]=='-' or t_p_adr[i][j]=='U-ADR'or t_p_adr[i][j]=='I-ADR' or t_p_adr[i][j]=='B-ADR' or t_p_adr[i][j]=='L-ADR'):
        temp.append('ADR')
      else:
        temp.append('O')
    pred_one.append(temp)

  print("f1 score macro:- ",metrics.flat_f1_score(test_one, pred_one, average='macro')) 
  print("f1 score micro:- ",metrics.flat_f1_score(test_one, pred_one, average='micro'))

  print("Sequence Accuracy:- ",metrics.sequence_accuracy_score(test_one, pred_one))

  print(metrics.flat_classification_report(test_one, pred_one, labels=['ADR','O'], digits=3))

In [0]:
acc(testing_data)



################ STRICT ####################
f1 score macro:-  0.12967574062999712
f1 score micro:-  0.42541224594145466
Sequence Accuracy:-  0.0
              precision    recall  f1-score   support

       I-ADR      0.000     0.000     0.000       182
       L-ADR      0.000     0.000     0.000       177
       U-ADR      0.022     0.446     0.041       184
       B-ADR      0.000     0.000     0.000       177
           O      0.905     0.457     0.607      7103

    accuracy                          0.425      7823
   macro avg      0.185     0.181     0.130      7823
weighted avg      0.822     0.425     0.552      7823



################ RELAXED ####################
f1 score macro:-  0.38012123147684657
f1 score micro:-  0.4633772210149559
Sequence Accuracy:-  0.0
              precision    recall  f1-score   support

         ADR      0.089     0.526     0.153       720
           O      0.905     0.457     0.607      7103

    accuracy                          0.463      78

  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
import progressbar
from time import sleep
bar = progressbar.ProgressBar(maxval=len(training_data), \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

In [0]:
import random
for epoch in range(
        4):  # again, normally you would NOT do 300 epochs, it is toy data
    epoch_loss = 0
    counter=0
    res = random.sample(training_data, len(training_data))
    bar.start()
    print("\n")
    print("Epoch:-",epoch+1)
    for sentence, tags in res:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).cuda()
        assert len(sentence_in)==len(targets)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        #print(counter)
        bar.update(counter+1)
        #sleep(0.1)
        counter=counter+1
        #break
    bar.finish()
    print("\n")
    print("Loss:-",epoch_loss/2246)
    acc(testing_data)

                                                                               [                                                                        ] N/A%



Epoch:- 1






Loss:- 2.516590838228499


                                                                               [                                                                        ] N/A%



################ STRICT ####################
f1 score macro:-  0.38338959500117925
f1 score micro:-  0.9112872299629298
Sequence Accuracy:-  0.4883177570093458
              precision    recall  f1-score   support

       I-ADR      0.419     0.071     0.122       182
       L-ADR      0.463     0.107     0.174       177
       U-ADR      0.500     0.538     0.518       184
       B-ADR      0.390     0.090     0.147       177
           O      0.929     0.983     0.955      7103

    accuracy                          0.911      7823
   macro avg      0.540     0.358     0.383      7823
weighted avg      0.885     0.911     0.890      7823



################ RELAXED ####################
f1 score macro:-  0.6620154611756719
f1 score micro:-  0.9167838425156589
Sequence Accuracy:-  0.4883177570093458
              precision    recall  f1-score   support

         ADR      0.611     0.264     0.369       720
           O      0.929     0.983     0.955      7103

    accuracy           





Loss:- 1.6569583720642864


                                                                               [                                                                        ] N/A%



################ STRICT ####################
f1 score macro:-  0.4915749615505054
f1 score micro:-  0.9124376837530359
Sequence Accuracy:-  0.4953271028037383
              precision    recall  f1-score   support

       I-ADR      0.446     0.225     0.299       182
       L-ADR      0.484     0.260     0.338       177
       U-ADR      0.498     0.609     0.548       184
       B-ADR      0.453     0.243     0.316       177
           O      0.943     0.971     0.957      7103

    accuracy                          0.912      7823
   macro avg      0.565     0.462     0.492      7823
weighted avg      0.899     0.912     0.903      7823



################ RELAXED ####################
f1 score macro:-  0.7227566317262386
f1 score micro:-  0.9198517192892752
Sequence Accuracy:-  0.4976635514018692
              precision    recall  f1-score   support

         ADR      0.592     0.417     0.489       720
           O      0.943     0.971     0.957      7103

    accuracy            





Loss:- 1.3043719396353406


                                                                               [                                                                        ] N/A%



################ STRICT ####################
f1 score macro:-  0.5124366141818916
f1 score micro:-  0.8969704716860539
Sequence Accuracy:-  0.4439252336448598
              precision    recall  f1-score   support

       I-ADR      0.331     0.231     0.272       182
       L-ADR      0.364     0.446     0.401       177
       U-ADR      0.466     0.641     0.540       184
       B-ADR      0.366     0.441     0.400       177
           O      0.955     0.943     0.949      7103

    accuracy                          0.897      7823
   macro avg      0.497     0.540     0.512      7823
weighted avg      0.903     0.897     0.899      7823



################ RELAXED ####################
f1 score macro:-  0.7406517797446739
f1 score micro:-  0.9084750095871148
Sequence Accuracy:-  0.4462616822429907
              precision    recall  f1-score   support

         ADR      0.502     0.565     0.532       720
           O      0.955     0.943     0.949      7103

    accuracy            





Loss:- 1.0521749158258005


################ STRICT ####################
f1 score macro:-  0.5170408825566994
f1 score micro:-  0.9008053176530743
Sequence Accuracy:-  0.48130841121495327
              precision    recall  f1-score   support

       I-ADR      0.294     0.286     0.290       182
       L-ADR      0.367     0.429     0.396       177
       U-ADR      0.616     0.505     0.555       184
       B-ADR      0.366     0.424     0.393       177
           O      0.953     0.950     0.952      7103

    accuracy                          0.901      7823
   macro avg      0.519     0.519     0.517      7823
weighted avg      0.903     0.901     0.902      7823



################ RELAXED ####################
f1 score macro:-  0.7416451488926956
f1 score micro:-  0.9125655119519366
Sequence Accuracy:-  0.48364485981308414
              precision    recall  f1-score   support

         ADR      0.524     0.539     0.532       720
           O      0.953     0.950     0.952      

In [0]:
torch.save(model, "/content/gdrive/My Drive/bicrf.pt")

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
t_adr, t_p_adr = acc(testing_data)



################ STRICT ####################
f1 score macro:-  0.527015833721951
f1 score micro:-  0.9032340534321871
Sequence Accuracy:-  0.5070093457943925
              precision    recall  f1-score   support

       I-ADR      0.322     0.313     0.318       182
       L-ADR      0.403     0.446     0.424       177
       U-ADR      0.578     0.505     0.539       184
       B-ADR      0.383     0.424     0.402       177
           O      0.953     0.952     0.953      7103

    accuracy                          0.903      7823
   macro avg      0.528     0.528     0.527      7823
weighted avg      0.904     0.903     0.904      7823



################ RELAXED ####################
f1 score macro:-  0.744607223015711
f1 score micro:-  0.9140994503387447
Sequence Accuracy:-  0.5070093457943925
              precision    recall  f1-score   support

         ADR      0.533     0.540     0.537       720
           O      0.953     0.952     0.953      7103

    accuracy              

In [0]:
for i in range(len(testing_data)):
  tokens, test_labels = testing_data[i]

  if len(tokens) != len(pred_one[i]):
    print("false",i)

In [0]:

pred_one = []
for i in range(len(t_p_adr)):
    temp=[]
    for j in range(len(t_p_adr[i])):
      if(t_p_adr[i][j]=='-' or t_p_adr[i][j]=='U-ADR'or t_p_adr[i][j]=='I-ADR' or t_p_adr[i][j]=='B-ADR' or t_p_adr[i][j]=='L-ADR'):
        temp.append('ADR')
      else:
        temp.append('O')
    pred_one.append(temp)

pred_extractions = []
for i in range(len(testing_data)):
  tokens, test_labels = testing_data[i]
  temp_extraction = []
  for j in range(len(pred_one[i])):
    
    if pred_one[i][j] == "ADR":
      temp_extraction.append(tokens[j])
  pred_extractions.append(temp_extraction)






In [0]:
pred_extractions

NameError: ignored

In [0]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[980][0], word_to_ix)
    print(model(precheck_sent))

(tensor(29.4858, device='cuda:0'), [0, 0, 0, 0, 4, 0, 0, 0])


In [0]:
training_data[980]

(['vyvanse', 'commonly', 'known', 'a', 'ocd', 'in', 'a', 'pill'],
 ['O', 'O', 'O', 'O', 'U-ADR', 'O', 'O', 'O'])

In [0]:
with torch.no_grad():
    for i in range(0,len(training_data)):
      precheck_sent = prepare_sequence(training_data[i][0], word_to_ix)
      print(str(i),"--->",model(precheck_sent)[1])
      #print(z)

0 ---> [0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0]
1 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]
2 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4 ---> [0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0]
5 ---> [0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
6 ---> [0, 0, 0, 1, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]
7 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]
8 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
10 ---> [0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
11 ---> [0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0]
12 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
13 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]
14 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
15 ---> [0, 0, 0, 0, 0, 0, 4, 0]
16 ---> [0, 0, 0, 0, 0,

In [0]:
!pip install biobert-embedding==0.1.2

Collecting biobert-embedding==0.1.2
  Downloading https://files.pythonhosted.org/packages/d2/f0/f5bd3fd4a0bcef4d85e5e82347ae73d376d68dc8086afde75838ba0473a2/biobert-embedding-0.1.2.tar.gz
Collecting torch==1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/30/57/d5cceb0799c06733eefce80c395459f28970ebb9e896846ce96ab579a3f1/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (748.8MB)
[K     |████████████████████████████████| 748.9MB 21kB/s 
[?25hCollecting pytorch-pretrained-bert==0.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 58.2MB/s 
Building wheels for collected packages: biobert-embedding
  Building wheel for biobert-embedding (setup.py) ... [?25l[?25hdone
  Created wheel for biobert-embedding: filename=biobert_embedding-0.1.2-cp36-none-any.whl size=5702 sha256=8f7ae085351676e283e246

In [0]:
from biobert_embedding.embedding import BiobertEmbedding

text = "Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis."\

# Class Initialization (You can set default 'model_path=None' as your finetuned BERT model path while Initialization)
biobert = BiobertEmbedding()
#(biobert.word_vector("pyeloonly"))
biobert.tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[unused1]', 1),
             ('[unused2]', 2),
             ('[unused3]', 3),
             ('[unused4]', 4),
             ('[unused5]', 5),
             ('[unused6]', 6),
             ('[unused7]', 7),
             ('[unused8]', 8),
             ('[unused9]', 9),
             ('[unused10]', 10),
             ('[unused11]', 11),
             ('[unused12]', 12),
             ('[unused13]', 13),
             ('[unused14]', 14),
             ('[unused15]', 15),
             ('[unused16]', 16),
             ('[unused17]', 17),
             ('[unused18]', 18),
             ('[unused19]', 19),
             ('[unused20]', 20),
             ('[unused21]', 21),
             ('[unused22]', 22),
             ('[unused23]', 23),
             ('[unused24]', 24),
             ('[unused25]', 25),
             ('[unused26]', 26),
             ('[unused27]', 27),
             ('[unused28]', 28),
             ('[unused29]', 29),
             ('[unused30]', 30),
 