<a href="https://colab.research.google.com/github/BunnyNoBugs/infosearch/blob/master/hw/hw4/hw4_compute_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from tqdm import tqdm
import json
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
import pickle

In [3]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes (much more slowly) on a runtime without CUDA instead
# of failing when the model is moved to the device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Upper bound on how many question/answer pairs are taken from the corpus.
samples_limit = 50000

In [4]:
# Load the question -> answer corpus (a JSON object) from Google Drive.
# NOTE(review): the path assumes Drive is already mounted at /content/drive.
with open('/content/drive/MyDrive/Colab Notebooks/infosearch/hw4/corpus_50000.json', encoding='utf-8') as f:
    corpus = json.loads(f.read())

In [5]:
# Fetch the pretrained tokenizer and encoder for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
# `.to(device)` returns the module itself, so chaining it both moves the
# weights and avoids echoing the model repr as cell output.
model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru").to(device)

In [6]:
# Corpus keys are the questions and the values their answers; keep the first
# `samples_limit` pairs, with both lists in the same (dict insertion) order.
questions = list(corpus)[:samples_limit]
answers = [corpus[question] for question in questions]

In [7]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Mask tensor with one entry per token; positions with
            value 0 contribute nothing to the average.

    Returns:
        Tensor of mask-weighted mean embeddings, one row per batch item.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every
    # embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(1)
    # Clamp the token count so a fully masked row divides by 1e-9, not by 0.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_total / token_counts

In [8]:
def get_embeddings(sentences, tokenizer, model, batch_size=1, device=None):
    """Encode sentences into mean-pooled embeddings stacked in one NumPy array.

    Args:
        sentences: Iterable of texts to embed.
        tokenizer: Tokenizer called with ``padding=True, truncation=True,
            max_length=512, return_tensors='pt'``.
        model: Encoder whose first output holds the token embeddings consumed
            by ``mean_pooling``.
        batch_size: Number of sentences per forward pass. The default of 1
            feeds each item to the tokenizer on its own, exactly as before;
            larger values tokenize a padded batch and typically run much
            faster on a GPU.
        device: Device the encoded inputs are moved to. Defaults to the device
            of the model's parameters, so the function no longer depends on a
            notebook-level ``device`` variable being defined.

    Returns:
        2-D ``numpy.ndarray`` with one row per sentence, in input order. Empty
        input yields an array with zero rows instead of an error from
        ``np.vstack``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so generators can be measured and sliced into batches.
    sentences = list(sentences)
    if not sentences:
        # Width falls back to 0 when the model exposes no `config.hidden_size`.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    if device is None:
        device = next(model.parameters()).device

    embeddings = []
    with tqdm(total=len(sentences)) as progress:
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            # With batch_size == 1 hand over the item itself (not a 1-element
            # list) so the tokenizer sees exactly what it saw before batching
            # was introduced.
            tokenizer_input = batch if batch_size > 1 else batch[0]
            encoded = tokenizer(tokenizer_input, padding=True, truncation=True, max_length=512, return_tensors='pt')
            encoded = encoded.to(device)
            # Inference only: skip autograd bookkeeping for model and pooling.
            with torch.no_grad():
                model_output = model(**encoded)
                batch_embeddings = mean_pooling(model_output, encoded['attention_mask'])
            # Convert explicitly rather than relying on np.vstack to coerce
            # torch tensors.
            embeddings.append(batch_embeddings.cpu().numpy())
            progress.update(len(batch))
    return np.vstack(embeddings)

In [9]:
# Embed every question and every answer. These are the long-running cells of
# the notebook: the recorded progress bars show roughly 33 and 26 minutes.
questions_embeddings = get_embeddings(sentences=questions, tokenizer=tokenizer, model=model)
answers_embeddings = get_embeddings(sentences=answers, tokenizer=tokenizer, model=model)

100%|██████████| 50000/50000 [32:50<00:00, 25.37it/s]
100%|██████████| 50000/50000 [25:39<00:00, 32.48it/s]


In [10]:
# Persist the question embeddings (the stacked NumPy array) to Google Drive
# as a pickle file.
with open('/content/drive/MyDrive/Colab Notebooks/infosearch/hw4/questions_embeddings.pickle', 'wb') as f:
    pickle.dump(questions_embeddings, f)
# Persist the answer embeddings to the same folder, in the same format.
with open('/content/drive/MyDrive/Colab Notebooks/infosearch/hw4/answers_embeddings.pickle', 'wb') as f:
    pickle.dump(answers_embeddings, f)