Imports

In [2]:
from typing import List, Dict, Tuple, Any, Callable
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import itertools
import torch
from transformers import AutoTokenizer, AutoModel

Code

In [12]:
input_text = "Это пример предложения для эмбеддингов"
input_ids = torch.tensor([tokenizer.encode(input_text)])
with torch.no_grad():
    model_output = model(input_ids)
bert_embeddings = model_output[0]
print(bert_embeddings.shape)

torch.Size([1, 11, 768])


In [3]:
# Global vars
_verbose = 0
_batch_size = 1
# bert_path = "../"
# Init word embedding models once
tokenizer = AutoTokenizer.from_pretrained("deepvk/deberta-v1-base")
model = AutoModel.from_pretrained("deepvk/deberta-v1-base", output_hidden_states=True)

# Helper func 1
def get_word_idx(sent: str, word: str) -> int:
    """split sentences and add index to each word. Each word has its own index based on when it was added to the list first
    Args:
        sent (str): sentence in string
        word (str): word in string
    Returns:
        int: output the index of where the word correspond to in each sentence input
    """
    return sent.lower().split(" ").index(word)


# Helper func 2
def get_hidden_states(sent, tokenizer, model, layers):
    """Push input IDs through model. Stack and sum `layers` (last four by default).
       Select only those subword token outputs that belong to our word of interest
       and average them.
    Args:
        sent (str): Input sentence
        tokenizer : Tokenizer function
        model: bert model
        layers : last 4 model of model
    Returns:
        output: tensor torch
    """
    # encode without adding [CLS] and [SEP] tokens
    encoded = tokenizer.encode_plus(sent, return_tensors="pt", add_special_tokens=False)

    with torch.no_grad():
        output = model(**encoded)

    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
    # Only select the tokens that constitute the requested word
    return output


# Helper func 3
def chunking(max_len, sent):
    """because the embedding function is trained on dim 512, so we have to limit the size of the sentences using max_len so the final chunked sentences wont exceed length 512
    Args:
        max_len (int): maximum number of tokens for each chunk
        sent (str): input sentence
    Returns:
        sent_chunk (List(str)): list of chunked sentences
    """
    tokenized_text = sent.lower().split(" ")
    # using list comprehension
    final = [
        tokenized_text[i * max_len : (i + 1) * max_len]
        for i in range((len(tokenized_text) + max_len - 1) // max_len)
    ]

    # join back to sentences for each of the chunks
    sent_chunk = []
    for item in final:
        # make sure the len(items) > 1 or else some of the embeddings will appear as len 1 instead of 768.
        assert len(item) > 1
        sent_chunk.append(" ".join(item))
    return sent_chunk

In [4]:
def main_avg(sent: str, layers=None, chunk_size=300):
    """Gives the average word embedding per sentence

    Args:
        sent (str): The input sentence

    Returns:
        torch tensor: word embedding per sentence, dim = 768
    """
    # change all standard form numbers to decimal
    np.set_printoptions(formatter={"float_kind": "{:f}".format})

    # Use last four layers by default
    layers = [-4, -3, -2, -1] if layers is None else layers
    global tokenizer
    global model

    # chunking
    chunked_tokens = chunking(chunk_size, sent)  # helper func 3

    # initialise a outside chunk
    word_embedding_avg_collective = []
    # for each chunked token, we embed them separately
    for item in chunked_tokens:
        # adding tensors
        word_embedding_torch = get_hidden_states(
            item, tokenizer, model, layers
        )  # helper fun 2

        # convert torch tensor to numpy array
        word_embedding_avg_np = word_embedding_torch.cpu().detach().numpy()
        word_embedding_avg_chunks = np.mean(word_embedding_avg_np, axis=0)
        word_embedding_avg_collective.append(word_embedding_avg_chunks)
    word_embedding_avg = np.mean(word_embedding_avg_collective, axis=0)
    assert len(word_embedding_avg) == 768
    return word_embedding_avg

In [12]:
avg_input = main_avg("Это пример предложения для эмбеддингов", layers=None, chunk_size=300)
print(avg_input)

[0.369115 -0.290790 1.333342 -1.083138 0.412907 -0.443546 -0.069624
 -9.860280 -0.538551 -0.654225 0.108648 0.365405 -0.416808 -0.951250
 -0.081191 -0.049205 0.543378 -0.570664 -1.160450 0.053071 0.479706
 -0.052462 -0.656499 -1.505008 -0.520346 0.673665 0.210586 0.923983
 1.010185 1.910658 -0.319075 -0.472100 0.524774 -0.170134 -1.138236
 1.173258 0.018227 -1.051024 -0.073268 -0.387759 -0.466832 0.402000
 0.102785 -0.588362 0.659817 -0.775767 0.111295 0.172673 -0.769767
 0.703399 -0.053231 0.234463 0.915914 0.395673 -0.006962 1.110569 0.036302
 -0.935259 0.581140 -0.721152 0.726095 1.674190 -0.610191 -0.252199
 -0.367069 -0.388982 -0.030001 0.590770 -1.815267 0.988255 1.028563
 0.361742 -0.014706 0.669818 0.277727 -1.271986 -0.158736 -0.208780
 -0.084705 -1.422081 0.298740 -0.520150 -0.281948 0.482153 0.552460
 -0.143972 0.025073 -1.099061 -0.242412 -0.018601 0.073086 0.165035
 0.613909 -0.220386 0.740834 0.685412 -0.402224 0.034771 0.746321
 -1.078444 0.474506 -0.368160 -0.528261 0.1

In [1]:
import pyarrow.parquet as pq

df = pq.read_table('player_starts_train.parquet').to_pandas()

In [4]:
print(df.head(10))

                        date        user_id        item_id  watch_time  \
0  2023-07-21 19:04:50+03:00  user_12964323  video_1042531          51   
1  2023-07-21 02:02:41+03:00     user_16517  video_1707159          31   
2  2023-07-21 22:00:47+03:00  user_15057892  video_1989987           9   
3  2023-07-21 19:09:43+03:00   user_2846972  video_1356486          -1   
4  2023-07-21 11:06:58+03:00  user_20517034  video_1380654          11   
5  2023-07-21 23:24:41+03:00   user_8293675   video_331810         340   
6  2023-07-21 21:07:45+03:00   user_9408179   video_168375         281   
7  2023-07-21 20:08:10+03:00   user_3700170  video_1092603           2   
8  2023-07-21 04:39:05+03:00  user_11171041  video_1511844          90   
9  2023-07-21 08:06:33+03:00  user_17072821  video_1818125         342   

   is_autorized  
0             0  
1             0  
2             0  
3             0  
4             0  
5             0  
6             0  
7             0  
8             0  
9    

In [5]:
df1 = pq.read_table('videos.parquet').to_pandas()

In [8]:
from tqdm import tqdm

tqdm.pandas()
df3 = df.merge(df1, on='item_id')

MemoryError: Unable to allocate 2.08 GiB for an array with shape (4, 69954383) and data type int64