In [1]:
import pandas as pd
import numpy as np
import torch

from transformers import BertTokenizer, BertModel
from datasets import load_dataset

from pprint import pprint

import logging
logging.basicConfig(level=logging.INFO) 

import matplotlib.pyplot as plt
%matplotlib inline

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda', index=0)

In [2]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the forward pass expose `hidden_states`,
# which the embedding code below indexes into.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [3]:
def load_dataset(dataset, nrows=None):
    """Read a JSON-lines file and keep only its ``abstract`` column.

    NOTE: this definition shadows the ``load_dataset`` imported from
    ``datasets`` at the top of the notebook; after this cell runs, the name
    refers to this function.

    Args:
        dataset: Path (or other input accepted by ``pd.read_json``) of a file
            holding one JSON object per line.
        nrows: Optional number of leading lines to parse. ``None`` (the
            default) reads the whole file, as before. Passing a small value
            avoids parsing the entire file when only the first rows are used.

    Returns:
        pd.DataFrame with a single ``abstract`` column.

    Raises:
        KeyError: If the parsed records have no ``abstract`` field.
    """
    records = pd.read_json(dataset, lines=True, nrows=nrows)
    return pd.DataFrame(records['abstract'])

# Uses the notebook's own load_dataset defined just above (not the `datasets` one).
arxiv_json_path = 'dataset/arxiv_data.json'
arxiv = load_dataset(arxiv_json_path)

In [4]:
# Number of leading abstracts to work with; `k` is also used by the progress
# messages of the embedding loop.
k = 1000
arxiv_first_k = arxiv.iloc[:k]

display(arxiv_first_k)

Unnamed: 0,abstract
0,A fully differential calculation in perturba...
1,"We describe a new algorithm, the $(k,\ell)$-..."
2,The evolution of Earth-Moon system is descri...
3,We show that a determinant of Stirling cycle...
4,In this paper we show how to compute the $\L...
...,...
995,Five dimensional neutral rotating black ring...
996,Suppose that $h$ and $g$ belong to the algeb...
997,Let F be a finite extension of Qp and G be G...
998,We relate a generic character sheaf on a dis...


In [5]:
%%time
def get_embeddings(data, max_seq_length=256):
    """Embed one abstract with BERT's second-to-last hidden layer.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data: Mapping-like row (e.g. a DataFrame row) whose ``'abstract'``
            entry is the text to embed.
        max_seq_length: Fixed number of tokens every abstract is padded or
            truncated to, counting the ``[CLS]`` and ``[SEP]`` markers.

    Returns:
        torch.Tensor on ``device`` with one row per token position
        (``max_seq_length`` rows). Rows at padded positions are still
        present in the output; callers that pool over tokens should
        presumably skip them (not handled here).
    """
    abstract = data['abstract']

    # Let the tokenizer add [CLS]/[SEP], truncate and pad in one step.
    # Compared with slicing the id list by hand, truncation keeps the closing
    # [SEP], and the returned attention_mask records which positions are padding.
    encoded = tokenizer(
        abstract,
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        # Forwarding the attention mask keeps real tokens from attending to
        # [PAD] positions, which would otherwise distort their embeddings.
        outputs = model(**encoded)

    # hidden_states[-2] is the second-to-last layer; squeeze(0) drops the
    # batch dimension of size 1.
    return outputs.hidden_states[-2].squeeze(0)
    
# One embedding tensor per abstract, collected in row order.
bert_embeddings = list()

for i, row in arxiv_first_k.iterrows():
    # `i` is the DataFrame index label; announce every block of 100 rows.
    if not i % 100:
        print(f'Processing Entry: [{i}-{i+100}]/{k}')
    embedding = get_embeddings(row)
    bert_embeddings += [embedding]

Processing Entry: [0-100]/1000
Processing Entry: [100-200]/1000
Processing Entry: [200-300]/1000
Processing Entry: [300-400]/1000
Processing Entry: [400-500]/1000
Processing Entry: [500-600]/1000
Processing Entry: [600-700]/1000
Processing Entry: [700-800]/1000
Processing Entry: [800-900]/1000
Processing Entry: [900-1000]/1000
CPU times: total: 2.73 s
Wall time: 10.3 s


In [6]:
# Combine the per-abstract tensors into one tensor with a leading batch axis.
# The guard keeps this cell re-runnable: after the first run `bert_embeddings`
# is already a tensor, and torch.stack only accepts a sequence of tensors.
if not isinstance(bert_embeddings, torch.Tensor):
    bert_embeddings = torch.stack(bert_embeddings)

In [7]:
# Dimensions of the stacked embedding tensor.
bert_embeddings.size()

torch.Size([1000, 256, 768])

In [8]:
# Persist the stacked embeddings to a file in the current working directory.
torch.save(obj=bert_embeddings, f='bert_embeddings.pt')

In [9]:
# load the embeddings
# map_location places the tensor on this session's `device`, so a file that
# was written from a GPU session still loads when only a CPU is available.
bert_embeddings_loaded = torch.load('bert_embeddings.pt', map_location=device)

bert_embeddings_loaded.shape

torch.Size([1000, 256, 768])