In [12]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import pickle

In [13]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
df = pd.read_csv("lab2_oil_gas_field_construction_data.csv")

In [15]:
texts = list(filter(lambda x: x is not np.nan, df.to_numpy().reshape(-1)))
texts = list(set(texts))
texts = list(map(lambda x: x.lower(), texts))

In [16]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()

In [17]:
embeddings = [embed_bert_cls(list(text), model, tokenizer) for text in tqdm(np.array_split(texts, 100))]

  0%|          | 0/100 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 100/100 [05:13<00:00,  3.13s/it]


In [18]:
embeddings = np.vstack(embeddings)

In [19]:
name2emb = {n: emb for n, emb in zip(texts, embeddings)}

In [20]:
with open("name2emb.pickle", "wb") as f:
    pickle.dump(name2emb, f)