In [None]:
!pip install transformers

In [173]:
from transformers import BertTokenizer, BertModel

In [174]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.data import Field, BucketIterator, TabularDataset

import spacy
import numpy as np

import random
import math
import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [175]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [176]:
bert = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = False,
                                  )
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [177]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [178]:
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.train.examples"
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.test.examples"

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.train.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])


In [179]:

import pandas as pd

df = pd.DataFrame({'utterance':utterance, 'original':original})

# df['utterance'] = df['utterance'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
# df['original'] = df['original'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
df['utterance_mask'] = df['utterance'].apply(lambda x : ' '.join(['1']*(len(bert_tokenizer.tokenize(x))+2)))
df['original_mask'] = df['original'].apply(lambda x : ' '.join(['1']*(len(bert_tokenizer.tokenize(x))+2)))
df.to_csv('train_data.csv',index=False)

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.test.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]

utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])

df = pd.DataFrame({'utterance':utterance, 'original':original})

# df['utterance'] = df['utterance'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
# df['original'] = df['original'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
df['utterance_mask'] = df['utterance'].apply(lambda x : ' '.join(['1']*(len(bert_tokenizer.tokenize(x))+2)))
df['original_mask'] = df['original'].apply(lambda x : ' '.join(['1']*(len(bert_tokenizer.tokenize(x))+2)))
df.to_csv('test_data.csv',index=False)

In [180]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [181]:
len(train_df)

640

In [182]:
train_df.head()

Unnamed: 0,utterance,original,utterance_mask,original_mask
0,article with the largest amount of authors,article that has the most number of author,1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1
1,article citing article published in annals of ...,article that article whose venue is annals of ...,1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1
2,what article from 2004 cites multivariate data...,article whose publication date is 2004 and tha...,1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3,find an article published in 2004,article whose publication date is 2004,1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1
4,articles that have a publication date close to...,article whose publication date is at most publ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


In [183]:
cls_token_idx = bert_tokenizer.cls_token_id
sep_token_idx = bert_tokenizer.sep_token_id
pad_token_idx = bert_tokenizer.pad_token_id
unk_token_idx = bert_tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [184]:
UTTERANCE = Field(sequential=True,
                tokenize = bert_tokenizer.tokenize, 
                use_vocab = False,    
                preprocessing = bert_tokenizer.convert_tokens_to_ids,             
                pad_token = pad_token_idx,
                init_token = cls_token_idx,
                eos_token = sep_token_idx,
                unk_token = unk_token_idx,
                batch_first = True,
                lower = True)

ORIGINAL = Field(sequential=True,
                tokenize = bert_tokenizer.tokenize, 
                use_vocab = False,    
                preprocessing = bert_tokenizer.convert_tokens_to_ids,             
                pad_token = pad_token_idx,
                init_token = cls_token_idx,
                eos_token = sep_token_idx,
                unk_token = unk_token_idx,
                batch_first = True,
                lower = True)

UTTERANCE_MASK = Field(sequential=True,
                  tokenize = lambda x : x.split(),
                  use_vocab = False, 
                  preprocessing = lambda x : [int(i) for i in x],
                  pad_token = 0,
                  batch_first = True
                  )

ORIGINAL_MASK = Field(sequential=True,
                tokenize = lambda x : x.split(), 
                use_vocab = False,        
                preprocessing = lambda x : [int(i) for i in x],         
                pad_token = 0,
                batch_first = True
                )

fields = [('utterance', UTTERANCE), ('original', ORIGINAL),('utterance_mask', UTTERANCE_MASK), ('original_mask', ORIGINAL_MASK)] # ('token_type', TTYPE), ('start',START), ('end',END)]

In [185]:
train_data, test_data = TabularDataset.splits(
                                        path = '/content/',
                                        train = 'train_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
print(type(train_data))

<class 'torchtext.legacy.data.dataset.TabularDataset'>


In [186]:
UTTERANCE.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'
UTTERANCE_MASK.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL_MASK.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>

In [187]:
BATCH_SIZE = 1

train_iterator, test_iterator = BucketIterator.splits(
                                                      (train_data, test_data), 
                                                      batch_size = BATCH_SIZE,
                                                      sort_key = lambda x : len(x.original),
                                                      shuffle = False,
                                                      device = device)

In [188]:
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [189]:
bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.tokenize(train_df['utterance'][0]))

[3720, 2007, 1996, 2922, 3815, 1997, 6048]

In [207]:
import pickle

class BERT_LSH:
  def __init__(self, num_bits, emb_dim, bert):
    self.l = num_bits
    self.emb_dim = emb_dim
    self.hyperplanes = torch.rand(num_bits,emb_dim)
    self.hyperplanes = self.hyperplanes - self.hyperplanes.mean(dim=-1,keepdims = True)
    self.hyperplanes.requires_grad = False
    self.bert = bert
    self.dynamic_hash = {}
    self.cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
    self.vector_list = None

  def create_hash_database(self, _iterator):
    
    for id,batch in enumerate(_iterator):
      assert batch.utterance.shape[0] == 1 #batch_size == 1
      assert batch.utterance.shape == batch.utterance_mask.shape
      assert batch.original.shape == batch.original_mask.shape

      output = self.bert(batch.original,batch.original_mask)[0][0,:,:].mean(dim = 0)
      if self.vector_list == None:
        self.vector_list = output.unsqueeze(dim = 0)
        # print(self.vector_list.shape)
      else:
        self.vector_list = torch.cat((self.vector_list,output.unsqueeze(dim = 0)),dim = 0)

      if self.vector_list.shape[0] == 100 or id == len(_iterator)-1:
        file_pi = open('vector_' + str(id//100) + '.obj', 'wb') 
        pickle.dump({'data':self.vector_list}, file_pi)
        self.vector_list = None


      output = (torch.matmul(self.hyperplanes,output) >= 0).type(torch.uint8)
      
      finger_print = output.detach().cpu().numpy().tolist()
      finger_print = ''.join([str(x) for x in finger_print])

      try:
        self.dynamic_hash[finger_print].append(id)
      except:
        self.dynamic_hash[finger_print] = [id]


    # print(self.vector_list.shape)

  def cosine_dist(self, u, v):
    return 1 - self.cos(u,v)

  def find_closest_projection(self, _iterator):
    closest_proj = []
    for id,batch in enumerate(_iterator):
      output = self.bert(batch.utterance,batch.utterance_mask)[0][0,:,:].mean(dim = 0)
      out = (torch.matmul(self.hyperplanes,output) >= 0).type(torch.uint8)
      temp,idx = -float('inf'),0
      finger_print = out.detach().cpu().numpy().tolist()
      finger_print = ''.join([str(x) for x in finger_print])
      # print(finger_print)
      try:
        for val in self.dynamic_hash[finger_print]:
          id = val//100
          file_pi = open('vector_' + str(id) + '.obj', 'rb') 
          pickle_obj = pickle.load(file_pi)
          # print(pickle_obj['data'])
          self.vector_list = pickle_obj['data']
          if temp < self.cosine_dist(self.vector_list[val%100,:],output):
            temp = self.cosine_dist(self.vector_list[val%100,:],output)
            idx = val

        closest_proj.append(idx)
      except:
        print("Key out of training data")
        closest_proj.append(0)

    return closest_proj

In [213]:
lsh_obj = BERT_LSH(num_bits = 2, emb_dim = bert.config.to_dict()['hidden_size'], bert = bert)

In [214]:
lsh_obj.create_hash_database(train_iterator)

In [215]:
list = lsh_obj.find_closest_projection(test_iterator)

In [216]:
exact_match = 0
for id,val in enumerate(list):
  exact_match += int(test_df['original'][id] == train_df['original'][val])

In [219]:
exact_match,len(list)

(7, 161)

In [218]:
lsh_obj.dynamic_hash.keys()

dict_keys(['10', '00', '11', '01'])

LSH with Glove/Fasttext Embeddings

In [153]:
import pandas as pd

df = pd.DataFrame({'utterance':utterance, 'original':original})

df['utterance'] = df['utterance'].apply(lambda x : ' '.join(x.split()))
df['original'] = df['original'].apply(lambda x : ' '.join(x.split()))
df.to_csv('train_data.csv',index=False)

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.test.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])

df = pd.DataFrame({'utterance':utterance, 'original':original})

df['utterance'] = df['utterance'].apply(lambda x : ' '.join(x.split()))
df['original'] = df['original'].apply(lambda x : ' '.join(x.split()))
df.to_csv('test_data.csv',index=False)

In [154]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [155]:
train_df.head()

Unnamed: 0,utterance,original
0,what article has the most number of articles c...,article that the most number of article cites
1,person who has not published article in multiv...,person that is not author of multivariate data...
2,what person is not the author of multivariate ...,person that is not author of multivariate data...
3,article cited by two articles,article that two article cites
4,articles that do not cite multivariate data an...,article that not cites multivariate data analysis


In [156]:
test_df.head()

Unnamed: 0,utterance,original
0,what article has the most number of articles c...,article that the most number of article cites
1,person who has not published article in multiv...,person that is not author of multivariate data...
2,what person is not the author of multivariate ...,person that is not author of multivariate data...
3,article cited by two articles,article that two article cites
4,articles that do not cite multivariate data an...,article that not cites multivariate data analysis


In [157]:
# spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
  return bert_tokenizer.tokenize(text)

def normal_tokenizer(text):
  return text.split()

UTTERANCE = Field(sequential=True,
                  tokenize = normal_tokenizer,
                  use_vocab = True,
                  # preprocessing = bert_tokenizer.convert_tokens_to_ids, 
                  pad_token = 0,
                  batch_first = True,
                  lower = True)

ORIGINAL = Field(sequential=True,
                tokenize = normal_tokenizer, 
                use_vocab = True,       
                batch_first = True,
                pad_token = 0,
                lower = True)

fields = [('utterance', UTTERANCE), ('original', ORIGINAL)] # ('token_type', TTYPE), ('start',START), ('end',END)]

In [158]:
train_data, test_data = TabularDataset.splits(
                                        path = '/content/',
                                        train = 'train_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
print(type(train_data))

<class 'torchtext.legacy.data.dataset.TabularDataset'>


In [159]:
UTTERANCE.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'

len(ORIGINAL.vocab),len(UTTERANCE.vocab)

(44, 122)

In [160]:
BATCH_SIZE = 1

train_iterator, test_iterator = BucketIterator.splits(
                                                      (train_data, test_data), 
                                                      batch_size = BATCH_SIZE,
                                                      sort_key = lambda x : len(x.utterance),
                                                      shuffle = False,
                                                      device = device)

In [161]:
len(test_iterator)

161

In [162]:
import pickle

class Embedding_LSH:

  def __init__(self, num_bits, input_dim, emb_dim, embedding1, embedding2):
    self.l = num_bits
    self.emb_dim = emb_dim
    self.hyperplanes = torch.rand(num_bits,emb_dim)
    self.hyperplanes = self.hyperplanes - self.hyperplanes.mean(dim=-1,keepdims = True)

    self.hyperplanes.requires_grad = False
    self.embedding1 = nn.Embedding.from_pretrained(embedding1)
    self.embedding2 = nn.Embedding.from_pretrained(embedding2)


    self.dynamic_hash = {}
    self.cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
    self.vector_list = None

  def create_hash_database(self, _iterator):
    
    for id,batch in enumerate(_iterator):
      # print(batch.original.shape)

      assert batch.utterance.shape[0] == 1 #batch_size == 1
      
      output = self.embedding1(batch.original).mean(dim = 1)
      # print(output.shape) # [batch_sz, 300]
      if self.vector_list == None:
        self.vector_list = output
        # print(self.vector_list.shape)
      else:
        self.vector_list = torch.cat((self.vector_list,output),dim = 0)

      if self.vector_list.shape[0] == 100 or id == len(_iterator)-1:
        file_pi = open('vector_' + str(id//100) + '.obj', 'wb')
        pickle.dump({'data':self.vector_list}, file_pi)
        self.vector_list = None

      output = (torch.matmul(self.hyperplanes,output.permute(1,0)) >= 0).type(torch.uint8)
      # print(output.shape) [2,1]
      finger_print = output.squeeze(-1).detach().cpu().numpy().tolist()
      finger_print = ''.join([str(x) for x in finger_print])
      # print(finger_print)

      try:
        self.dynamic_hash[finger_print].append(id)
      except:
        self.dynamic_hash[finger_print] = [id]

    # print(self.vector_list.shape)

  def cosine_dist(self, u, v):
    return 1 - self.cos(u,v)

  def find_closest_projection(self, _iterator):
    closest_proj = []
    for id,batch in enumerate(_iterator):
      # print(self.embedding2(batch.utterance).shape)
      output = self.embedding2(batch.utterance).mean(dim = 1)
      # print(output.shape)
      out = (torch.matmul(self.hyperplanes,output.permute(1,0)) >= 0).type(torch.uint8)
      temp,idx = -float('inf'),0
      finger_print = out.squeeze(-1).detach().cpu().numpy().tolist()
      finger_print = ''.join([str(x) for x in finger_print])
      # print(finger_print)
      try:
        for val in self.dynamic_hash[finger_print]:
          id = val//100
          file_pi = open('vector_' + str(id) + '.obj', 'rb') 
          pickle_obj = pickle.load(file_pi)
          # print(pickle_obj['data'])
          self.vector_list = pickle_obj['data']
          cosine = self.cosine_dist(self.vector_list[val%100,:],output.squeeze(0))
          if temp < cosine.item():
            temp = cosine.item()
            idx = val

        closest_proj.append(idx)
      except:
        print("Key out of training data")
        closest_proj.append(0)

    return closest_proj

In [163]:
# import torchtext
# glove = torchtext.vocab.GloVe(name='42B', dim=300)
# # fasttext = torchtext.vocab.FastText(language='en')

In [164]:
ORIGINAL.vocab.itos[0]

'<unk>'

In [165]:
glove_emb1 = []
glove_emb2 = []

for i in range(len(ORIGINAL.vocab)):
  try:
    glove_emb1.append(glove[ORIGINAL.vocab.itos[i]].detach().cpu().numpy().tolist())
  except:
    glove_emb1.append(glove[ORIGINAL.vocab.itos[0]].detach().cpu().numpy().tolist())

for i in range(len(UTTERANCE.vocab)):
  try:
    glove_emb2.append(glove[UTTERANCE.vocab.itos[i]].detach().cpu().numpy().tolist())
  except:
    glove_emb2.append(glove[UTTERANCE.vocab.itos[0]].detach().cpu().numpy().tolist())

In [166]:
INPUT_DIM = len(ORIGINAL.vocab)
EMB_DIM = 300

lsh_obj = Embedding_LSH(num_bits = 2, input_dim = INPUT_DIM, emb_dim = EMB_DIM, embedding1 = torch.tensor(glove_emb1), embedding2 = torch.tensor(glove_emb2))
torch.tensor(glove_emb1).shape

torch.Size([44, 300])

In [167]:
lsh_obj.create_hash_database(train_iterator)

In [168]:
list = lsh_obj.find_closest_projection(test_iterator)

In [169]:
lsh_obj.dynamic_hash.keys()

dict_keys(['11', '01', '00', '10'])

In [170]:
exact_match = 0
for id,val in enumerate(list):
  exact_match += int(test_df['original'][id] == train_df['original'][val])

In [220]:
exact_match

9