In [2]:
from torch.cuda import is_available as is_cuda_available
from transformers import AutoModel

from xlm_roberta.modeling_lora import XLMRobertaLoRA

# Directory holding the locally stored jina-embeddings-v3 checkpoint
model_folder = './models/jina-embeddings-v3/'

# Load the checkpoint from disk. The XLMRobertaLoRA annotation is only a hint
# for editors/type checkers; the object itself comes from AutoModel.
# NOTE(review): trust_remote_code=True presumably allows the checkpoint's own
# modelling code to be used — confirm the checkpoint comes from a trusted source.
model: XLMRobertaLoRA = AutoModel.from_pretrained(model_folder, trust_remote_code=True)

# Move the model to the GPU only when CUDA is reported as available
if is_cuda_available():
    model.to('cuda')

  def forward(
  def backward(ctx, dout, *args):


In [24]:
# Free-text queries to embed
texts = ["sample text", "Look at her face", "a love song", "a sad song"]

# `encode` takes an optional `task` chosen by use case:
#   'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'
# When `task` is omitted, no specific LoRA adapter is used.
query_embeddings = model.encode(texts, task="retrieval.query")

# Print the shape of the result, then the first embedding on its own line
print(query_embeddings.shape, query_embeddings[0], sep="\n")

(4, 1024)
[ 0.07359939 -0.09012064  0.08909674 ...  0.01743072 -0.00241766
  0.0039986 ]


In [4]:
import numpy as np

# Read the stored passage embeddings from the .npy file
passage_embeddings = np.load('./embeddings/jina-embeddings-v3_retrieval.passage.npy')

# Print the array's shape, then its first row on its own line
print(passage_embeddings.shape, passage_embeddings[0], sep="\n")

(57650, 1024)
[-0.1406054  -0.04126596  0.02898028 ... -0.0118162  -0.0104178
 -0.00558778]


In [25]:
# Dot product of every query embedding with every passage embedding:
# row i holds the scores for query i, column j the scores for passage j
similarities = np.matmul(query_embeddings, passage_embeddings.T)
print(similarities)

[[0.1181575  0.20447016 0.1517792  ... 0.12681568 0.1679917  0.17886987]
 [0.30442497 0.11763603 0.15821353 ... 0.0948153  0.08668843 0.08245966]
 [0.3070848  0.35255226 0.37161088 ... 0.30424643 0.27756616 0.34715414]
 [0.32861927 0.3058338  0.3108795  ... 0.2291998  0.30555478 0.32890457]]


In [12]:
%%time
# get top k indices
top_k = 5
top_k_indices = np.argsort(-similarities, axis=1)[:, :top_k]
print(top_k_indices)

[[12383 46207 47392 44620 12494]
 [43615  4627 33356 20269 26647]
 [25317  1841 31527 44260  6152]]
CPU times: user 17.3 ms, sys: 103 μs, total: 17.4 ms
Wall time: 16.3 ms


In [26]:
%%time
# a more efficient way to get top k indices
num_queries = similarities.shape[0]
arange = np.arange(num_queries)[:, None]

top_k = 5
top_k_indices = np.argpartition(-similarities, top_k, axis=1)[:, :top_k]
# Sort the top_k indices to get them in order
top_k_indices = top_k_indices[arange, np.argsort(-similarities[arange, top_k_indices])]
print(top_k_indices)

[[12383 46207 47392 44620 12494]
 [43615  4627 33356 20269 26647]
 [25317  1841 31527 44260 24504]
 [23647 32184 45343  4425 33847]]
CPU times: user 3.02 ms, sys: 1.01 ms, total: 4.03 ms
Wall time: 3.37 ms


In [27]:
# Print, one line per query, the similarity scores of that query's top-k passages
for query_idx, passage_indices in enumerate(top_k_indices):
    print(similarities[query_idx, passage_indices])

[0.31351554 0.3099225  0.3026087  0.30215356 0.3013426 ]
[0.52556664 0.52281755 0.50563866 0.49271652 0.48811904]
[0.57675344 0.57347375 0.567487   0.5579922  0.5570458 ]
[0.6042438  0.58098495 0.57753104 0.57650334 0.55687964]


## Data

In [28]:
import pandas as pd

# Path of the CSV file that is loaded into `df`
data_path = './spotify_millsongdata.csv'
df = pd.read_csv(data_path)

# Preview the first five rows
print(df.head(n=5))

  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [31]:
# get entries for top k indices of query "a love song"
# Locate the query's row by its text instead of a fixed negative position, so
# the lookup keeps pointing at this query if `texts` is reordered or extended.
# Assumes row i of top_k_indices corresponds to texts[i] — TODO confirm that
# `encode` preserves input order.
love_song_row = texts.index("a love song")
top_k_entries = df.iloc[top_k_indices[love_song_row]]
print(top_k_entries)

                artist                   song  \
25317   Billie Holiday  Lover Come Back To Me   
1841      Bonnie Raitt          Cure For Love   
31527   Emmylou Harris                Love Is   
44260   Modern Talking      Romantic Warriors   
24504  Backstreet Boys          Love Somebody   

                                                    link  \
25317  /b/billie+holiday/lover+come+back+to+me_200180...   
1841         /b/bonnie+raitt/cure+for+love_20022697.html   
31527            /e/emmylou+harris/love+is_20050047.html   
44260  /m/modern+talking/romantic+warriors_20094699.html   
24504     /b/backstreet+boys/love+somebody_21063816.html   

                                                    text  
25317  The sky was blue  \r\nAnd high above  \r\nThe ...  
1841   You bring me roses  \r\nYou give me kisses  \r...  
31527  Love is a shiny car  \r\nLove is a steel guita...  
44260  In the nights of lost and found  \r\nMany stra...  
24504  Love  \r\nIs it too much to ask for  \r\nO

In [30]:
# get entries for top k indices of query "a sad song"
# Locate the query's row by its text instead of a fixed negative position, so
# the lookup keeps pointing at this query if `texts` is reordered or extended.
# Assumes row i of top_k_indices corresponds to texts[i] — TODO confirm that
# `encode` preserves input order.
sad_song_row = texts.index("a sad song")
top_k_entries = df.iloc[top_k_indices[sad_song_row]]
print(top_k_entries)

              artist           song  \
23647  Alison Krauss  This Sad Song   
32184       Everlast       Sad Girl   
45343    Neil Sedaka  Sad Sad Story   
4425    Donna Summer     Sing Along   
33847  George Strait  Blue Melodies   

                                               link  \
23647  /a/alison+krauss/this+sad+song_20521716.html   
32184            /e/everlast/sad+girl_20181664.html   
45343    /n/neil+sedaka/sad+sad+story_20613341.html   
4425       /d/donna+summer/sing+along_10087839.html   
33847  /g/george+strait/blue+melodies_21061476.html   

                                                    text  
23647  Well, the rain is apourin' down in a fury  \r\...  
32184  I seen her at a stop light on Alverano  \r\nWa...  
45343  Look at the lady she's the one with the broken...  
4425   I've an emptiness inside  \r\nThat can only be...  
33847  I don't know how to write you a song  \r\nThat...  
