In [1]:
from pathlib import Path
import itertools
import urllib.request

import numpy as np
from numpy.linalg import norm
import pandas as pd
from transformers import AutoModel
import zipfile

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the embedding model that is passed to AutoModel.from_pretrained below.
model_id = "jinaai/jina-embeddings-v2-base-en"

In [3]:
# Load the embedding model named by `model_id`.
# NOTE(review): trust_remote_code=True lets transformers run Python code shipped
# in the model repository. No `revision` is pinned here, so a re-run may pick up
# different remote code/weights — consider pinning a known-good revision.
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True,
)

In [4]:
# Three sample sentences: two about dinosaurs and one about a stock market crash,
# so the pairwise scores computed later can contrast related vs. unrelated topics.
# NOTE(review): "occured" is misspelled in the third sentence; it is left as-is
# because the string is model input and also labels the result table.
texts = [
    "Dinosaurs once roamed the earth",
    "The triceratops lived in what is now western North America",
    "Black Monday refers to a stock market crash that occured on October 19, 1987"
]
# Encode all sentences in one call; the displayed result is a float32 array
# with one row per input text.
embeddings = model.encode(texts)

In [5]:
# Display the encoded vectors (numpy abbreviates the long rows with "...").
embeddings

array([[-0.2775978 , -0.43230128,  0.42803136, ...,  0.5659601 ,
        -0.47726437, -0.8063539 ],
       [-0.550837  , -0.60276926,  0.6015245 , ...,  0.28022295,
        -0.26890475, -0.56855124],
       [ 0.01751115, -0.9987813 ,  0.2592022 , ...,  0.9591321 ,
        -0.29912838, -0.54930294]], dtype=float32)

In [6]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    The result is the dot product of ``a`` and ``b`` divided by the product
    of their Euclidean norms: 1.0 for vectors pointing in the same direction,
    0.0 for orthogonal vectors and -1.0 for opposite vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (norm(a) * norm(b))``. If either vector has zero norm
        the division is 0/0 and the result is ``nan``.
    """
    return np.dot(a, b) / (norm(a) * norm(b))


def cosine_distance(a, b):
    """Backward-compatible alias of :func:`cosine_similarity`.

    NOTE: despite its name, this function returns the cosine *similarity*
    (higher means more alike; identical directions give 1.0), not a distance.
    The conventional cosine distance would be ``1 - cosine_similarity(a, b)``.
    The name and return value are kept unchanged so existing callers keep
    producing the same numbers.
    """
    return cosine_similarity(a, b)

In [7]:
# Build the full pairwise score matrix as a list of lists:
# distances[r][c] == cosine_distance(embeddings[r], embeddings[c]).
# Note that cosine_distance, as defined in this notebook, returns the normalised
# dot product, so larger values mean the two texts are more alike.
distances = []
for row_vector in embeddings:
    row_scores = [
        cosine_distance(row_vector, column_vector)
        for column_vector in embeddings
    ]
    distances.append(row_scores)


In [8]:
# Show the pairwise scores as a table labelled by the input sentences on both axes.
pd.DataFrame(data=distances, columns=texts, index=texts)

Unnamed: 0,Dinosaurs once roamed the earth,The triceratops lived in what is now western North America,"Black Monday refers to a stock market crash that occured on October 19, 1987"
Dinosaurs once roamed the earth,1.0,0.720321,0.569723
The triceratops lived in what is now western North America,0.720321,1.0,0.55539
"Black Monday refers to a stock market crash that occured on October 19, 1987",0.569723,0.55539,1.0
