In [23]:
import torch
from sentence_transformers import SentenceTransformer, util

In [24]:
MIN_SENTENCE_LEN = 20
BATCH_LEN = 32


model: SentenceTransformer = None
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
def get_cos_scores(job_embeddings, resume_embeddings):
    from sentence_transformers import util

    cos_scores = util.pytorch_cos_sim(job_embeddings, resume_embeddings)
    # cos_scores = sigmoid(cos_scores, a=9.8, b=1.017, c=5.8)  # a=8.5, b=1.03, c=5)

    return cos_scores


def init_model():
    global model
    if model is None:

        # List of pre-trained models: https://www.sbert.net/docs/pretrained_models.html
        # These will be downloaded from https://sbert.net/models/<model_name>.zip
        # The model will be cached in ~/.cache/torch/sentence_transformers/sbert.net_models_<model_name>

        model_name = 'distilbert-base-nli-stsb-mean-tokens'  # small <250 MB RAM
        # model_name = 'roberta-base-nli-stsb-mean-tokens'      # medium
        # model_name = 'roberta-large-nli-stsb-mean-tokens'  # Large 1.3 GB RAM

        model = SentenceTransformer(model_name, device=device)


In [25]:
init_model()

In [38]:
sentence = ' As part of a cross-functional Agile development team, the SDETs primary role is to ensure quality through delivery of test automation best practices'

In [39]:
tokens = torch.tensor(model.tokenize(sentence))

In [17]:
embeddings = model.encode([tokens], batch_size=BATCH_LEN, output_value='token_embeddings', is_pretokenized=True, device=device)[0]

In [5]:
for token in tokens:
    print(model.tokenizer.ids_to_tokens[token], end=' ')

as part of a cross - functional agile development team , the sd ##ets primary role is to ensure quality through delivery of test automation best practices 

In [48]:

segment_len = 3
segments = []

for i in range(len(tokens) - segment_len):
    segments.append(range(i, i+segment_len))

segments = torch.tensor(segments)

In [49]:
combinations = embeddings[segments].mean(1)

  combinations = embeddings[segments].mean(1)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [29]:
sentence_embedding = torch.tensor(embeddings).mean(0).view((1, -1))

In [32]:
cos_scores = util.pytorch_cos_sim(sentence_embedding, combinations)[0]

In [34]:
top_combinations_idx = cos_scores.argsort()[:5]

In [36]:
top_combinations = segments[top_combinations_idx]

In [40]:
top_segment_tokens = tokens[top_combinations]

In [42]:
ids_to_tokens = model.tokenizer.ids_to_tokens

In [45]:
for segment in top_segment_tokens:
    for token in segment:
        print(ids_to_tokens[int(token)], end=' ')
    print()

of a cross 
part of a 
a cross - 
quality through delivery 
- functional agile 
