In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

KeyboardInterrupt: ignored

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fc5d7f1a530>

In [0]:
def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long).cuda()


def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast).cuda()).cuda()).cuda()

In [0]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.dropout=nn.Dropout(0.25)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True)
        
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size).cuda())

        # Constraining the Tranisiton Probabilties
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.transitions.data[tag_to_ix["U-ADR"],tag_to_ix["B-ADR"]]=-100
        self.transitions.data[tag_to_ix["O"],tag_to_ix["B-ADR"]]=-100

        self.transitions.data[tag_to_ix["O"],tag_to_ix["I-ADR"]]=-100
        self.transitions.data[tag_to_ix["B-ADR"],tag_to_ix["I-ADR"]]=-100
        self.transitions.data[tag_to_ix["U-ADR"],tag_to_ix["I-ADR"]]=-100

        
        self.transitions.data[tag_to_ix["I-ADR"],tag_to_ix["U-ADR"]]=-100
        self.transitions.data[tag_to_ix["L-ADR"],tag_to_ix["U-ADR"]]=-100

        self.transitions.data[tag_to_ix["I-ADR"],tag_to_ix["O"]]=-100
        self.transitions.data[tag_to_ix["L-ADR"],tag_to_ix["O"]]=-100
        

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(4, 1, self.hidden_dim // 2).cuda(),
                torch.randn(4, 1, self.hidden_dim // 2).cuda())

    def _forward_alg(self, feats):
        init_alphas = torch.full((1, self.tagset_size), -10000.).cuda()
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        for feat in feats:
            alphas_t = []  
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                trans_score = self.transitions[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).cuda().view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        embeds=self.dropout(embeds)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        score = torch.zeros(1).cuda()
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).cuda(), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        init_vvars = torch.full((1, self.tagset_size), -10000.).cuda()
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []
            viterbivars_t = []

            for next_tag in range(self.tagset_size):

                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))

            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1).cuda()
            backpointers.append(bptrs_t)


        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]


        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
import pickle

with open('/content/gdrive/My Drive/Task2_data/singular_tags.pkl', 'rb') as f:
  training_data = pickle.load(f)

with open('/content/gdrive/My Drive/Task2_data/test_singular_tags.pkl', 'rb') as f:
  testing_data = pickle.load(f)

with open('/content/gdrive/My Drive/Task2_data/labels_withmeddra.pkl', 'rb') as f:
  labels = pickle.load(f)

In [0]:
import torchtext
vec = torchtext.vocab.Vectors('/content/gdrive/My Drive/BioWordVec_PubMed_MIMICIII_d200.vec.bin', cache='./Downloads/')

  0%|          | 0/1623720 [00:00<?, ?it/s]Skipping token b'16545452' with 1-dimensional vector [b'200']; likely a header
100%|█████████▉| 1622867/1623720 [02:09<00:00, 12690.33it/s]

In [0]:
word_to_ix = vec.stoi

In [0]:
"<UNK>" in word_to_ix

False

In [0]:
len(word_to_ix)

1623720

In [0]:
word_to_ix["<UNK>"]=len(word_to_ix)

In [0]:
# import pickle
# with open('/content/gdrive/My Drive/Task3/word_toix.pkl', 'wb') as f:
#   pickle.dump(word_to_ix, f)

100%|█████████▉| 1622867/1623720 [02:20<00:00, 12690.33it/s]

In [0]:
for i in range(len(training_data)):
    for j in range(len(training_data[i][0])):
      #print(word)
      word=(training_data[i][0])[j]
      if(word in word_to_ix):
        continue
      else:
        (training_data[i][0])[j] = "<UNK>"

tag_to_ix = {"O": 0, "B-ADR": 1, "I-ADR": 2, "L-ADR": 3, "U-ADR": 4, START_TAG: 5, STOP_TAG: 6}
# tag_values = list(set(labels))
# tag_values.append(START_TAG)
# tag_values.append(STOP_TAG)
# tag_to_ix = {t: i for i, t in enumerate(tag_values)}


In [0]:
print(len(word_to_ix))
# for k,v in word_to_ix.items():
#   print(k,v)

1623721


In [0]:
new_row = torch.Tensor(1, 200)
new_vec= torch.cat([vec.vectors, new_row], dim=0)

In [0]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)

In [0]:
model.word_embeds.weight.data.copy_(new_vec)

tensor([[-2.8328e-01, -1.8125e-01, -2.1761e-01,  ...,  1.2970e-01,
         -1.4910e-01, -1.6567e-01],
        [-2.6965e-01, -1.6921e-01, -2.3850e-01,  ...,  1.4007e-01,
         -1.4538e-01, -1.7382e-01],
        [-2.1836e-01,  4.8046e-02, -1.0087e-01,  ...,  1.7459e-01,
          1.0569e-01, -3.5893e-01],
        ...,
        [ 8.2127e-01,  4.7067e-01,  7.8578e-01,  ..., -3.8980e-01,
          1.6855e-01, -5.0794e-01],
        [ 1.0489e+00,  4.1795e-01,  8.2541e-01,  ..., -3.1072e-01,
          5.1590e-01,  2.5049e-01],
        [ 1.4160e-35,  0.0000e+00,  0.0000e+00,  ..., -3.1072e-01,
          0.0000e+00,  0.0000e+00]])

In [0]:
model.word_embeds.weight.shape

torch.Size([1623721, 200])

In [0]:
optimizer = optim.Adam(model.parameters())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("We are working with ", device)
model=model.to(device)
# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long).cuda()
    print(model(precheck_sent))


We are working with  cuda
(tensor(9.7558, device='cuda:0'), [4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 1])


In [0]:
#!pip install -U scikit-learn==0.16.1

Collecting scikit-learn==0.16.1
[?25l  Downloading https://files.pythonhosted.org/packages/40/91/ec319f8ddad10539440192ac0ed6f445eda57472f370e66a70bdaf90003d/scikit-learn-0.16.1.tar.gz (7.3MB)
[K     |████████████████████████████████| 7.3MB 2.7MB/s 
[?25hBuilding wheels for collected packages: scikit-learn
  Building wheel for scikit-learn (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-learn: filename=scikit_learn-0.16.1-cp36-cp36m-linux_x86_64.whl size=7301438 sha256=df6da99e24884efab0dbaf0046167a5032a3113604982ba9f217eabe22a47630
  Stored in directory: /root/.cache/pip/wheels/7a/b6/39/1d54c9badbe43b63c939f2d47a9d90c545506b99e6bf73691f
Successfully built scikit-learn
[31mERROR: yellowbrick 0.9.1 has requirement scikit-learn>=0.20, but you'll have scikit-learn 0.16.1 which is incompatible.[0m
[31mERROR: umap-learn 0.4.1 has requirement scikit-learn>=0.20, but you'll have scikit-learn 0.16.1 which is incompatible.[0m
[31mERROR: mlxtend 0.14.0 has requirement scikit-l

In [0]:
!pip install sklearn_crfsuite
def acc(testing_data):
  for i in range(len(testing_data)):
      for j in range(len(testing_data[i][0])):
        word=(testing_data[i][0])[j]
        if((testing_data[i][0])[j] not in word_to_ix):
          (testing_data[i][0])[j]="<UNK>"
  # for sentence, tags in testing_data:
  #   for word in sentence:
  #       if word not in word_to_ix:
  #           word_to_ix[word] = len(word_to_ix)
  test_pred = []
  with torch.no_grad():
      for i in range(0,len(testing_data)):
        precheck_sent = prepare_sequence(testing_data[i][0], word_to_ix)
        #print(precheck_sent)
        test_pred.append(model(precheck_sent)[1])

  test_pred_adr = []
  
 #tag_to_ix = {"O": 0, "B-ADR": 1, "I-ADR": 2, "L-ADR": 3, "U-ADR": 4, START_TAG: 5, STOP_TAG: 6}
  for t in test_pred:
    temp  = []
    for k in range(len(t)):
      if t[k] == 0:
        temp.append('O')
      elif t[k] == 1:
        temp.append('B-ADR')
      elif t[k] == 2:
        temp.append('I-ADR')
      elif t[k] == 3:
        temp.append('L-ADR')
      elif t[k] == 4:
        temp.append('U-ADR')
    test_pred_adr.append(temp)
  test_adr = []
  for sent, tag in testing_data:
    test_adr.append(tag)
  import sklearn_crfsuite
  from sklearn_crfsuite import metrics
  print("\n")
  print("################ STRICT ####################")

  print("f1 score macro:- ",metrics.flat_f1_score(test_adr, test_pred_adr, average='macro')) 
  print("f1 score micro:- ",metrics.flat_f1_score(test_adr, test_pred_adr, average='micro'))

  print("Sequence Accuracy:- ",metrics.sequence_accuracy_score(test_adr, test_pred_adr))

  print(metrics.flat_classification_report(
      test_adr, test_pred_adr, labels=['I-ADR','L-ADR','U-ADR','B-ADR','O'], digits=3))
  t_adr=test_adr
  t_p_adr=test_pred_adr
  relaxed(t_adr,t_p_adr)

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 11.5MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [0]:
def relaxed(t_adr,t_p_adr):
  import sklearn_crfsuite
  from sklearn_crfsuite import metrics
  print("\n")
  print("################ RELAXED ####################")

  test_one=[]

  for i in range(len(t_adr)):
    temp=[]
    for j in range(len(t_adr[i])):
      if(t_adr[i][j]=='-' or t_adr[i][j]=='U-ADR'or t_adr[i][j]=='I-ADR' or t_adr[i][j]=='B-ADR' or t_adr[i][j]=='L-ADR'):
        temp.append('ADR')
      else:
        temp.append('O')
    test_one.append(temp)

  pred_one=[]


  for i in range(len(t_p_adr)):
    temp=[]
    for j in range(len(t_p_adr[i])):
      if(t_p_adr[i][j]=='-' or t_p_adr[i][j]=='U-ADR'or t_p_adr[i][j]=='I-ADR' or t_p_adr[i][j]=='B-ADR' or t_p_adr[i][j]=='L-ADR'):
        temp.append('ADR')
      else:
        temp.append('O')
    pred_one.append(temp)

  print("f1 score macro:- ",metrics.flat_f1_score(test_one, pred_one, average='macro')) 
  print("f1 score micro:- ",metrics.flat_f1_score(test_one, pred_one, average='micro'))

  print("Sequence Accuracy:- ",metrics.sequence_accuracy_score(test_one, pred_one))

  print(metrics.flat_classification_report(test_one, pred_one, labels=['ADR','O'], digits=3))

In [0]:
acc(testing_data)



################ STRICT ####################
f1 score macro:-  0.514056305376555
f1 score micro:-  0.9195960628914738
Sequence Accuracy:-  0.5257009345794392
              precision    recall  f1-score   support

       I-ADR      0.479     0.187     0.269       182
       L-ADR      0.496     0.339     0.403       177
       U-ADR      0.642     0.467     0.541       184
       B-ADR      0.492     0.333     0.397       177
           O      0.943     0.979     0.961      7103

    accuracy                          0.920      7823
   macro avg      0.610     0.461     0.514      7823
weighted avg      0.905     0.920     0.909      7823



################ RELAXED ####################
f1 score macro:-  0.7358922936231912
f1 score micro:-  0.927137926626614
Sequence Accuracy:-  0.5257009345794392
              precision    recall  f1-score   support

         ADR      0.668     0.414     0.511       720
           O      0.943     0.979     0.961      7103

    accuracy              

In [0]:
import progressbar
from time import sleep
bar = progressbar.ProgressBar(maxval=len(training_data), \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

In [0]:
import random
for epoch in range(
        1):  # again, normally you would NOT do 300 epochs, it is toy data
    epoch_loss = 0
    counter=0
    res = random.sample(training_data, len(training_data))
    bar.start()
    print("\n")
    print("Epoch:-",epoch+1)
    for sentence, tags in res:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).cuda()
        assert len(sentence_in)==len(targets)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        #print(counter)
        bar.update(counter+1)
        #sleep(0.1)
        counter=counter+1
        #break
    bar.finish()
    print("\n")
    print("Loss:-",epoch_loss/2246)
    acc(testing_data)

                                                                               [                                                                        ] N/A%



Epoch:- 1






Loss:- 0.5666915766383959


################ STRICT ####################
f1 score macro:-  0.5200972714819245
f1 score micro:-  0.9079636967915123
Sequence Accuracy:-  0.5233644859813084
              precision    recall  f1-score   support

       I-ADR      0.339     0.313     0.326       182
       L-ADR      0.447     0.379     0.410       177
       U-ADR      0.571     0.527     0.548       184
       B-ADR      0.396     0.333     0.362       177
           O      0.949     0.961     0.955      7103

    accuracy                          0.908      7823
   macro avg      0.540     0.503     0.520      7823
weighted avg      0.902     0.908     0.905      7823



################ RELAXED ####################
f1 score macro:-  0.7405804992044166
f1 score micro:-  0.9178064681068644
Sequence Accuracy:-  0.530373831775701
              precision    recall  f1-score   support

         ADR      0.560     0.496     0.526       720
           O      0.949     0.961     0.955      710

In [0]:
torch.save(model, "bicrf54.4.pt")

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[980][0], word_to_ix)
    print(model(precheck_sent))

(tensor(29.4858, device='cuda:0'), [0, 0, 0, 0, 4, 0, 0, 0])


In [0]:
training_data[980]

(['vyvanse', 'commonly', 'known', 'a', 'ocd', 'in', 'a', 'pill'],
 ['O', 'O', 'O', 'O', 'U-ADR', 'O', 'O', 'O'])

In [0]:
with torch.no_grad():
    for i in range(0,len(training_data)):
      precheck_sent = prepare_sequence(training_data[i][0], word_to_ix)
      print(str(i),"--->",model(precheck_sent)[1])
      #print(z)

0 ---> [0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0]
1 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]
2 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4 ---> [0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0]
5 ---> [0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
6 ---> [0, 0, 0, 1, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]
7 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]
8 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
10 ---> [0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
11 ---> [0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0]
12 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
13 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]
14 ---> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
15 ---> [0, 0, 0, 0, 0, 0, 4, 0]
16 ---> [0, 0, 0, 0, 0,