This notebook generates word embeddings using BERT. Instead of passing whole sentences to BERT, each word (token) is passed to the model on its own. It is based on the following blog post: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

In [None]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 2.8MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2


In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import numpy as np
import time

In [None]:
# Check whether a CUDA GPU is visible to PyTorch before the model is loaded.
torch.cuda.is_available()

True

# Methods

In [None]:
def read_data(data_directory, data_file, tokenizer, old_dictionary=None):
  """
  This method reads the lines of a data file, wraps each one as "[CLS] <line> [SEP]" and tokenizes it
    with the given tokenizer. A dictionary is used to collect the unique tokens of the data. Each value
    in the dictionary is set to 0, and it is not used.

  This method is used once to collect all unique words for a given dataset. Then, the words are written to a vocab file. When one
    wants to generate embeddings, they do not have to run this method. They just need to read the vocab file corresponding to the dataset.

  If vocabs (unique words) of several datasets are to be generated, then one needs to set "old_dictionary" to "None" for the first dataset. After that,
    they need to set "old_dictionary" to dictionary returned by the previous call to the method. See Read Data section for an example.

  Args:
    data_directory: directory prefix of the data file. It is concatenated directly with "data_file",
      so it has to end with a path separator.
    data_file: name of the text file to read; every line is treated as one sentence.
    tokenizer: object with a "tokenize(text)" method that returns a list of tokens.
    old_dictionary: dictionary returned by a previous call, or None to start from scratch.

  Returns:
    The dictionary of unique tokens (every value is 0). If "old_dictionary" is given (even an empty one),
    that same object is updated and returned.
  """

  data_path = data_directory + data_file
  print("Processing data in {}".format(data_path))

  dictionary = old_dictionary if old_dictionary is not None else {}
  counter = 0

  start_time = time.time()
  # A separate name for the file handle keeps the "data_file" parameter intact.
  with open(data_path, mode="r") as lines:
    for counter, line in enumerate(lines, start=1):
      sentence = "[CLS] " + line + " [SEP]"
      tokenized_sentence = tokenizer.tokenize(sentence)

      for element in tokenized_sentence:
        dictionary[element] = 0

      if counter % 100000 == 0:
        print("Done with line %d" % counter)
        print("Elapsed time in minutes is:", round((time.time() - start_time)/60, 4))

  # An empty file leaves counter at 0; report 0 instead of dividing by zero.
  elapsed = time.time() - start_time
  avg_time = elapsed * (100000 / counter) * (1 / 60) if counter else 0.0
  print("Process is done, and average time (in minutes) needed for every 100000 lines is", round(avg_time, 4))
  return dictionary

In [None]:
def write_vocab(dictionary, vocab_directory, vocab_file="vocab.txt"):
  """
  Save the keys of "dictionary" (the unique words) to a vocab file, one word per line.

  The target path is "vocab_directory" concatenated directly with "vocab_file".
  """
  vocab_path = vocab_directory + vocab_file
  with open(vocab_path, 'w') as out:
    out.writelines(word + "\n" for word in dictionary)

In [None]:
def read_vocab(vocab_directory, vocab_file="vocab.txt"):
  """
  This method reads a vocab file (one word per line) into a dictionary.

  Every word becomes a key; each value is set to 0 as a placeholder. Surrounding whitespace
    is stripped from each line.

  Args:
    vocab_directory: directory prefix of the vocab file. It is concatenated directly with
      "vocab_file", so it has to end with a path separator.
    vocab_file: name of the vocab file.

  Returns:
    A dictionary mapping every word of the file to 0, in file order.
  """
  dictionary = {}
  with open(vocab_directory + vocab_file, 'r') as f:
    for line in f:
      word = line.strip()
      # Skip blank lines so that an empty string never ends up as a vocab entry.
      if word:
        dictionary[word] = 0

  return dictionary

In [None]:
def get_embeddings(dictionary, tokenizer, model, mode="first"):
  """
  This method extracts embeddings from BERT's layers as specified by "mode".

  Every key of "dictionary" is passed to the model on its own (a sequence holding only that
    token), and the value of the key is replaced by a numpy vector built from the layer
    outputs returned by the model:

    "first"          : output of the first returned layer
    "mid"            : mean of the returned layers at index 4 and 5
    "last"           : output of the last returned layer
    "avg_1st_2nd"    : mean of the first two returned layers
    "concat_1st_2nd" : concatenation of the first two returned layers
    "avg"            : mean of the last four returned layers

  Args:
    dictionary: dictionary whose keys are the tokens to embed (presumably tokens known to
      "tokenizer"); it is updated in place.
    tokenizer: object with a "convert_tokens_to_ids(tokens)" method.
    model: called as "model(tokens_tensor, segments_tensors)"; has to return a pair whose first
      item is the sequence of per-layer output tensors.
    mode: one of the modes listed above.

  Returns:
    The same dictionary object, with every value replaced by the embedding of its key.

  Raises:
    ValueError: if "mode" is not one of the supported modes.
  """

  valid_modes = ("first", "mid", "last", "avg_1st_2nd", "concat_1st_2nd", "avg")
  if mode not in valid_modes:
    # Fail before any work is done instead of silently leaving the placeholder values in place.
    raise ValueError("Unknown mode {!r}; expected one of {}".format(mode, valid_modes))

  # Create the inputs on the device that holds the model instead of assuming "cuda:0".
  try:
    device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  counter = 0
  start_time = time.time()
  for key in dictionary:

    counter += 1
    if (counter%1000 == 0):
      print("Processing element: ", counter)
      print("Elapsed time in minutes is:", round((time.time() - start_time)/60, 4))

    # The "sentence" consists of this single token only.
    list_ = [key]
    segments_ids = [1] * len(list_)

    id_tokens = tokenizer.convert_tokens_to_ids(list_)

    # Convert inputs to PyTorch tensors (the outer list adds a batch dimension of size 1)
    tokens_tensor = torch.tensor([id_tokens], device=device)
    segments_tensors = torch.tensor([segments_ids], device=device)

    with torch.no_grad():
      encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Stack the layers along a new leading axis, then drop axis 1
    # (assumed to be the batch axis of size 1 -- TODO confirm for other models).
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Index 0 on the second axis selects the only token of the sequence.
    if (mode=="first"):
      dictionary[key] = token_embeddings[0][0].cpu().numpy()
    elif (mode=="mid"):
      dictionary[key] = np.mean(token_embeddings[4:6, 0, :].cpu().numpy(), axis=0)
    elif (mode=="last"):
      dictionary[key] = token_embeddings[-1][0].cpu().numpy()
    elif (mode=="avg_1st_2nd"):
      dictionary[key] = np.mean(token_embeddings[0:2, 0, :].cpu().numpy(), axis=0)
    elif (mode=="concat_1st_2nd"):
      dictionary[key] = np.concatenate((token_embeddings[0:2, 0, :].cpu().numpy()), axis=0)
    elif (mode=="avg"):
      dictionary[key] = np.mean(token_embeddings[-4:, 0, :].cpu().numpy(), axis=0)

  # An empty dictionary leaves counter at 0; report 0 instead of dividing by zero.
  elapsed = time.time() - start_time
  avg_time = elapsed * (1000 / counter) * (1 / 60) if counter else 0.0
  print("Process is done, and average time (in minutes) needed for every 1000 elements is", round(avg_time, 4))
  return dictionary

In [None]:
def write_embeddings(dictionary, embeddings_directory, embeddings_file="embeddingsFile.txt"):
  """
  Dump the embeddings to a text file, one word per line.

  Every line holds the word followed by the components of its vector; each item is followed
    by a single space. The target path is "embeddings_directory" concatenated directly with
    "embeddings_file".
  """
  target_path = embeddings_directory + embeddings_file
  with open(target_path, mode='w+') as out:
    for word, vector in dictionary.items():
      out.write(word + ' ')
      # newline=' ' keeps all components of the vector on the current line.
      np.savetxt(out, vector, delimiter=' ', newline=' ')
      out.write('\n')

In [None]:
def read_embeddings(embeddings_directory, embeddings_file):
  """
  This method reads an embeddings text file into a dictionary.

  Every non-blank line is split on whitespace: the first item is the word, and the remaining
    items are parsed as the components of its vector.

  Args:
    embeddings_directory: directory prefix of the embeddings file. It is concatenated directly
      with "embeddings_file", so it has to end with a path separator.
    embeddings_file: name of the embeddings file.

  Returns:
    A dictionary mapping every word to a 1-D numpy array of dtype float32.
  """

  embedding_matrix = {}
  with open(embeddings_directory + embeddings_file, mode='r') as data_file:
    for line in data_file:
      element = line.split()
      # A blank line has no word to use as a key; skip it instead of failing on element[0].
      if not element:
        continue
      embedding_matrix[element[0]] = np.asarray([float(x) for x in element[1:]], dtype=np.float32)

  return embedding_matrix

# Define Model

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the first GPU when one is available; otherwise fall back to the CPU so that the cell still runs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = BertModel.from_pretrained('bert-base-uncased')
model.to(device=device)
# The notebook only runs inference, so put the model in evaluation mode.
# This is the last expression of the cell, so the model structure is displayed.
model.eval()

100%|██████████| 231508/231508 [00:00<00:00, 352234.20B/s]
100%|██████████| 407873900/407873900 [00:30<00:00, 13263858.56B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

# Read Data

## Ready vocab file

If a vocab file already exists, run the next cell and skip the remaining cells of this section (they build the vocab file from the raw datasets).

In [None]:
# Load the vocab file (default name "vocab.txt") from the current directory.
# The keys of `vocab` are the words; every value is a placeholder 0 until embeddings are computed.
vocab_directory = "./"
vocab = read_vocab(vocab_directory)

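Skip the following cells if a vocab file already exists. Otherwise, run them to build the vocab file, and then run the cell above to load it into `vocab`.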

In [None]:
# Collect the unique tokens of the first dataset; old_dictionary=None starts from an empty dictionary.
data_directory = "./"
dictionary = read_data(data_directory, "CBT-NE.txt", tokenizer, old_dictionary=None)

In [None]:
# Add the tokens of the second dataset to the dictionary returned by the previous call.
dictionary = read_data(data_directory, "CBT-CN.txt", tokenizer, old_dictionary = dictionary)

In [None]:
# Write the collected tokens to "vocab.txt" (the default file name) in the current directory.
vocab_directory = "./"
write_vocab(dictionary, vocab_directory)

# Get Embeddings

In [None]:
# Replace every placeholder value in `vocab` with the embedding of its key.
# mode="avg_1st_2nd" averages the first two layer outputs returned by the model.
# NOTE(review): `vocab` is only assigned by the read_vocab cell; after building a new vocab file,
# that cell has to be run before this one.
vocab = get_embeddings(vocab, tokenizer, model, mode="avg_1st_2nd")

Processing element:  1000
Elapsed time in minutes is: 0.1982
Processing element:  2000
Elapsed time in minutes is: 0.3911
Processing element:  3000
Elapsed time in minutes is: 0.5831
Processing element:  4000
Elapsed time in minutes is: 0.7749
Processing element:  5000
Elapsed time in minutes is: 0.9693
Processing element:  6000
Elapsed time in minutes is: 1.1615
Processing element:  7000
Elapsed time in minutes is: 1.357
Processing element:  8000
Elapsed time in minutes is: 1.549
Processing element:  9000
Elapsed time in minutes is: 1.7433
Processing element:  10000
Elapsed time in minutes is: 1.9431
Processing element:  11000
Elapsed time in minutes is: 2.1387
Processing element:  12000
Elapsed time in minutes is: 2.3344
Processing element:  13000
Elapsed time in minutes is: 2.5313
Processing element:  14000
Elapsed time in minutes is: 2.7236
Processing element:  15000
Elapsed time in minutes is: 2.9167
Processing element:  16000
Elapsed time in minutes is: 3.1128
Processing element:

# Write Embeddings

In [None]:
# NOTE(review): absolute Google Drive path -- presumably requires Drive to be mounted at
# /content/drive; no mount cell is visible in this part of the notebook.
embeddings_directory = "/content/drive/My Drive/192/Thesis/embeddings/"
# The file name matches the mode used when the embeddings were computed ("avg_1st_2nd").
write_embeddings(vocab, embeddings_directory, embeddings_file="embeddingsAvg_1st_2nd.txt")

# Read Embeddings

In [None]:
embeddings_directory = "/content/drive/My Drive/192/Thesis/embeddings/"
# NOTE(review): this reads "embeddingsConcat_1st_2nd.txt", whereas the Write Embeddings cell writes
# "embeddingsAvg_1st_2nd.txt" -- confirm that the file exists (presumably from an earlier run
# with mode="concat_1st_2nd").
embedding_matrix = read_embeddings(embeddings_directory, "embeddingsConcat_1st_2nd.txt")