Imports

In [2]:
from typing import List, Dict, Tuple, Any, Callable
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import itertools
import torch
from transformers import AutoTokenizer, AutoModel

Code

In [12]:
# Quick check: embed one sentence and look at the shape of the model's first output.
# NOTE: this cell relies on `tokenizer` and `model`, which are created in the
# setup cell (In [3]) located further down in this notebook.
input_text = "Это пример предложения для эмбеддингов"
# Encode to token ids, then add a leading batch axis of size 1.
input_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)
with torch.no_grad():  # inference only, no gradient tracking needed
    model_output = model(input_ids)
bert_embeddings = model_output[0]
print(bert_embeddings.shape)

torch.Size([1, 11, 768])


In [3]:
# Notebook-wide settings
_verbose = 0
_batch_size = 1
# bert_path = "../"

# Load the tokenizer and the encoder a single time; the functions below reuse them.
# Hidden states are requested so that `output.hidden_states` is available to
# get_hidden_states.
_model_name = "deepvk/deberta-v1-base"
tokenizer = AutoTokenizer.from_pretrained(_model_name)
model = AutoModel.from_pretrained(_model_name, output_hidden_states=True)

# Helper func 1
def get_word_idx(sent: str, word: str) -> int:
    """Return the position of the first occurrence of `word` in `sent`.

    The sentence is lower-cased and split on single spaces; `word` is lower-cased
    as well so that the lookup is case-insensitive on both sides.
    Args:
        sent (str): sentence in string
        word (str): word in string
    Returns:
        int: zero-based index of the first space-separated word of `sent` equal to `word`
    Raises:
        ValueError: if `word` does not occur in `sent`
    """
    # The sentence is lower-cased before splitting, so the needle must be
    # lower-cased too; otherwise any word with an uppercase letter is never found.
    return sent.lower().split(" ").index(word.lower())


# Helper func 2
def get_hidden_states(sent, tokenizer, model, layers):
    """Push `sent` through `model` and return the sum of the requested hidden layers.

    The sentence is encoded without special tokens, run through the model with
    gradient tracking disabled, and the hidden states at the indices in `layers`
    are stacked and summed element-wise. No word selection or averaging is done
    here; reducing over the token axis is left to the caller.
    Args:
        sent (str): Input sentence
        tokenizer : Tokenizer exposing `encode_plus`
        model: Model whose output exposes `hidden_states`
        layers : Indices into `hidden_states` to sum, e.g. [-4, -3, -2, -1]
    Returns:
        output: torch tensor holding the summed hidden states with the leading
            (batch) axis removed. For a Hugging Face encoder this is expected to
            be (num_tokens, hidden_size). The token axis is kept even when the
            sentence encodes to a single token.
    """
    # encode without adding [CLS] and [SEP] tokens
    encoded = tokenizer.encode_plus(sent, return_tensors="pt", add_special_tokens=False)

    with torch.no_grad():
        output = model(**encoded)

    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    summed = torch.stack([states[i] for i in layers]).sum(0)
    # Drop only the leading axis. A bare squeeze() would also remove a token axis
    # of size 1, so a single-token sentence would come back as a 1-D vector and
    # break callers that average over axis 0.
    return summed.squeeze(0)


# Helper func 3
def chunking(max_len, sent):
    """Split a sentence into chunks of at most `max_len` space-separated words.

    The sentence is lower-cased and split on single spaces, so `max_len` counts
    words, not model tokens. A word may map to several subword tokens, so choose
    `max_len` with enough headroom for the model's input limit.

    If the last chunk would hold a single word and `max_len` is at least 3, the
    last word of the preceding chunk is moved into it, so a sentence whose word
    count is one more than a multiple of `max_len` is still accepted.
    Args:
        max_len (int): maximum number of words for each chunk, must be >= 1
        sent (str): input sentence
    Returns:
        sent_chunk (List(str)): list of chunked sentences
    Raises:
        ValueError: if `max_len` is smaller than 1
        AssertionError: if a chunk would still contain fewer than two words
            (e.g. a one-word sentence, or `max_len` of 1)
    """
    if max_len < 1:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    words = sent.lower().split(" ")
    chunks = [words[start : start + max_len] for start in range(0, len(words), max_len)]

    # A trailing one-word chunk would trip the check below. Borrow one word from
    # the preceding chunk when that chunk can spare it (it must keep >= 2 words).
    if len(chunks) > 1 and len(chunks[-1]) == 1 and len(chunks[-2]) > 2:
        chunks[-1].insert(0, chunks[-2].pop())

    # join back to sentences for each of the chunks
    sent_chunk = []
    for chunk in chunks:
        # make sure the len(items) > 1 or else some of the embeddings will appear as len 1 instead of 768.
        assert len(chunk) > 1
        sent_chunk.append(" ".join(chunk))
    return sent_chunk

In [4]:
def main_avg(sent: str, layers=None, chunk_size=300):
    """Compute one averaged embedding vector for a whole sentence.

    The sentence is split into chunks with `chunking`, each chunk is embedded with
    `get_hidden_states` using the module-level `tokenizer` and `model`, and the
    per-token vectors of each chunk are averaged. The final result is the plain
    (unweighted) mean of those per-chunk averages.

    Side effect: sets numpy's global print options so floats are printed in
    fixed-point notation.

    Args:
        sent (str): The input sentence
        layers: Hidden-state indices to sum; the last four layers when None
        chunk_size (int): Maximum number of words per chunk passed to `chunking`

    Returns:
        numpy array: averaged embedding for the sentence, dim = 768
    """
    # Print floats as plain decimals rather than in scientific notation.
    np.set_printoptions(formatter={"float_kind": "{:f}".format})

    # Fall back to the last four layers when the caller gives none.
    if layers is None:
        layers = [-4, -3, -2, -1]

    # Embed every chunk on its own and keep one mean vector per chunk.
    chunk_means = []
    for chunk_text in chunking(chunk_size, sent):  # helper func 3
        token_states = get_hidden_states(chunk_text, tokenizer, model, layers)  # helper func 2
        # Move to numpy and average over the first axis.
        chunk_means.append(np.mean(token_states.cpu().detach().numpy(), axis=0))

    sentence_embedding = np.mean(chunk_means, axis=0)
    assert len(sentence_embedding) == 768
    return sentence_embedding

In [10]:
# Embed the sample sentence with the default layers and chunk size, then show the vector.
avg_input = main_avg(
    sent="Это пример предложения для эмбеддингов",
    layers=None,
    chunk_size=300,
)
print(avg_input)

[0.591490 -0.380062 0.935977 -1.466756 0.808263 0.013020 -0.496918
 -10.941804 -1.330297 -0.217808 0.064943 0.358653 -0.874851 -0.792649
 0.958106 -0.463012 0.428570 -0.055375 -1.106878 1.328627 0.185171
 -0.444248 -0.940792 -1.615614 0.415217 0.418257 -0.478086 0.272854
 1.911883 1.071831 0.520323 -0.097729 0.761777 -1.053714 -0.428464
 0.723403 -0.143116 -0.338280 0.087615 -0.608163 -0.684337 0.241025
 0.205047 0.160684 0.560690 -0.509371 0.228526 0.139592 -0.401513 0.466110
 0.504656 0.647467 0.920083 0.063520 0.490110 0.844734 -0.185446 -0.947576
 -0.017977 -1.306925 0.441733 0.623836 -0.391002 0.676817 -0.097150
 -0.819193 -0.442789 0.830710 -0.973627 0.454027 0.652821 0.430550
 -1.082851 0.571368 -0.128744 -2.208430 0.294621 0.664620 -0.327884
 -1.605323 0.538414 -0.597383 0.371870 0.279613 0.642930 -0.065847
 0.761038 -2.042186 0.613059 -0.712953 0.242563 -0.091222 0.557234
 -0.637208 1.312360 0.338818 -0.652879 -0.948241 -0.194665 -1.251956
 0.374977 0.655686 -0.814976 1.093540