In [1]:
import torch
from transformers import BertTokenizer, BertModel

import pickle
import os
import random

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Send the model to the GPU
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def bert_encode(article:str) -> torch.Tensor :
    input_ids = tokenizer.encode(article, truncation=False)
    chunks = [input_ids[i:i + 512] for i in range(0, len(input_ids), 512)]
    embeddings = []

    for chunk in chunks:
        # Add the required special tokens
        chunk = chunk[:510]  # in case chunk is the last one and has more than 510 tokens
        chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
        
        # Convert to tensor and add batch dimension
        chunk_tensor = torch.tensor(chunk).unsqueeze(0).to(device)
        
        # Run through the model
        with torch.no_grad():
            outputs = model(chunk_tensor)
            
        # Take the mean of the sequence output (could also use [CLS] token, etc.)
        embedding = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(embedding)

    # Concatenate or average the embeddings from each chunk
    document_embedding = torch.cat(embeddings, dim=0).mean(dim=0)

    return document_embedding


# Wikipedia Corpus - 2018

In [5]:
corpus_path = 'D:/INFO323/TokenizedToast/corpus-all.pkl'

In [6]:
if os.path.exists(corpus_path):
    with open(corpus_path, 'rb') as file:
        corpus = pickle.load(file)

# Using UMAP to lower dimensionality of BERT Encodings. lol. Lower the dimensionality of encodings.

In [7]:
# import umap
# descr_vect_3d = umap.UMAP(n_components=3).fit_transform(doc_encodes.cpu().numpy())

# Encoding Corpus - Test 100000. 

In [8]:
corpus = random.sample(corpus, 100000)

In [9]:
corpus[0][1]

'tang ab also romanized as tang b is a village in juyom rural district juyom district larestan county fars province iran at the census its population was in families'

In [10]:
print(type(corpus))

<class 'list'>


In [13]:
embeddings = []

for article in corpus:
    article = article[1] # A text document or article.
    encoding = bert_encode(article) # Encoding Articles
    
    # Turning BERT Encoding to CPU to save GPU memory.
    encoding = encoding.cpu().numpy()

    # Concatenation of encodings
    embeddings.append(encoding)

embeddings_tensor = torch.tensor(embeddings)

Token indices sequence length is longer than the specified maximum sequence length for this model (1126 > 512). Running this sequence through the model will result in indexing errors
  embeddings_tensor = torch.tensor(embeddings)
