In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; positions set to 0 do
            not contribute to the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the per-feature
        mask count.
    """
    per_token = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_sum = (per_token * weights).sum(1)
    # Clamp the divisor so a fully masked sequence does not divide by zero.
    token_count = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_count


# Fetch the tokenizer and the encoder for the same checkpoint from the HuggingFace Hub.
# The checkpoint id is bound once so both loaders cannot drift apart.
MODEL_NAME = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Sentence embeddings:
tensor([[ 0.1145,  0.0768,  0.0263,  ..., -0.1323, -0.0056,  0.3162],
        [ 0.0065,  0.1654, -0.0364,  ...,  0.1892,  0.2014,  0.2443]])


In [14]:
def embbed_sentence(sentence: str):
    """Encode one sentence with the notebook-level tokenizer and model.

    The sentence is tokenized as a batch of one, run through the model with
    gradient tracking disabled, and the token embeddings are reduced with
    ``mean_pooling``.

    Args:
        sentence: The text to embed.

    Returns:
        The pooled embedding as returned by ``mean_pooling``; since the batch
        holds a single sentence, callers take row ``[0]`` to get the vector.
    """
    batch = tokenizer([sentence], padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])

In [15]:
# Embed the first example sentence; the trailing expression displays the tensor's shape.
s1 = embbed_sentence("Thats a nice cheescake!")
s1.shape

torch.Size([1, 768])

In [16]:
# Embed a second sentence on the same topic (cheesecake) and display its shape.
s2 = embbed_sentence("That cheesecake is tasty!")
s2.shape

torch.Size([1, 768])

In [22]:
# Embed a third sentence on a different topic (rain) to compare against the first.
s3 = embbed_sentence("Today it rains heavily.")

In [20]:
from numpy import dot
from numpy.linalg import norm

def cosine_sim(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as the dot product of ``v1`` and ``v2`` divided by the product of
    their norms.
    """
    numerator = dot(v1, v2)
    denominator = norm(v1) * norm(v2)
    return numerator / denominator

In [21]:
# Similarity between s1 and s2, the two cheesecake sentences ([0] selects the single embedding row).
cosine_sim(s1[0], s2[0])

0.89961934

In [23]:
# Similarity between s1 (cheesecake) and s3 (rain), i.e. two sentences on different topics.
cosine_sim(s1[0], s3[0])

0.03820859

In [24]:
# German inputs on different topics: a sentence about today's weather vs. one about Real Madrid and the Champions League.
cosine_sim(embbed_sentence("Das heutige Wetter ist wunderbar!")[0], embbed_sentence("Real Madrid ist Champions League Sieger")[0])

0.088764265

In [25]:
# German inputs on the same topic: both sentences talk about the weather.
cosine_sim(embbed_sentence("Das heutige Wetter ist wunderbar!")[0], embbed_sentence("Gestern war das Wetter auch schön")[0])

0.79186964

In [36]:
# In-memory document store: a plain list that will hold (text, embedding) tuples.
db = []

def import_data_to_db(sentences: list, db: list) -> list:
    """Return a new list containing ``db`` followed by one entry per sentence.

    Each added entry is a ``(sentence, embedding)`` tuple, where the embedding
    is row 0 of ``embbed_sentence(sentence)``.

    Args:
        sentences: Texts to embed and add.
        db: Existing entries. This list is not modified; callers must use the
            returned list.

    Returns:
        A new list with the existing entries first and the new entries after,
        in the order of ``sentences``.
    """
    # Build all new entries first and concatenate once, instead of copying the
    # whole list on every iteration.
    new_entries = [(sentence, embbed_sentence(sentence)[0]) for sentence in sentences]
    return db + new_entries

In [37]:
# Small German example corpus to index.
text = ["Hallo wie geht es dir?", "Fußball ist toll", "Ich habe gestern Pizza gegessen.", "Das Wetter ist schön.", "Die Nacht war kurz"]

# Start from an empty list instead of feeding the previous `db` back in, so that
# re-running this cell rebuilds the store rather than appending every document again.
db = import_data_to_db(text, [])

In [44]:
from operator import itemgetter

def calculate_similarity(query: str, db: list) -> list:
    """Rank every document in ``db`` by cosine similarity to ``query``.

    Args:
        query: Text to compare against the stored documents.
        db: Entries whose element 0 is the document text and element 1 its
            embedding (the layout produced by ``import_data_to_db``).

    Returns:
        A list of ``{"text": ..., "similarity": ...}`` dicts, one per entry in
        ``db``, sorted from highest to lowest similarity.
    """
    # Embed the query once; every document is compared against the same vector.
    embedded_query = embbed_sentence(query)[0]
    scored = [
        {"text": doc[0], "similarity": cosine_sim(doc[1], embedded_query)}
        for doc in db
    ]
    return sorted(scored, key=itemgetter("similarity"), reverse=True)

In [45]:
# Rank the stored documents against a German query about basketball ("Ich mag Basketball").
calculate_similarity("Ich mag Basketball", db)

[{'text': 'Fußball ist toll', 'similarity': 0.43767485},
 {'text': 'Das Wetter ist schön.', 'similarity': 0.2850722},
 {'text': 'Ich habe gestern Pizza gegessen.', 'similarity': 0.14241163},
 {'text': 'Hallo wie geht es dir?', 'similarity': 0.13822904},
 {'text': 'Die Nacht war kurz', 'similarity': 0.107494265}]

In [46]:
# Rank the stored documents against a German query about the sun shining today.
calculate_similarity("Heute scheint die Sonne", db)

[{'text': 'Das Wetter ist schön.', 'similarity': 0.49877402},
 {'text': 'Die Nacht war kurz', 'similarity': 0.3375984},
 {'text': 'Ich habe gestern Pizza gegessen.', 'similarity': 0.21767616},
 {'text': 'Fußball ist toll', 'similarity': 0.20279406},
 {'text': 'Hallo wie geht es dir?', 'similarity': 0.04581321}]