In [None]:
!pip install -r requirements.txt
!pip install flash-attn --no-build-isolation
!huggingface-cli download jinaai/jina-embeddings-v3 --local-dir ./models/jina-embeddings-v3

In [1]:
from torch.cuda import is_available as is_cuda_available
from transformers import AutoModel

# from xlm_roberta.modeling_lora import XLMRobertaLoRA

# Local directory that holds the downloaded jina-embeddings-v3 files.
model_folder = './models/jina-embeddings-v3/'

# Loader options: the checkpoint ships its own modelling code (hence
# trust_remote_code) and flash attention is explicitly switched off.
load_options = dict(trust_remote_code=True, use_flash_attn=False)
model = AutoModel.from_pretrained(model_folder, **load_options)
# NOTE(review): judging by the commented-out import above, the loaded object is
# presumably an XLMRobertaLoRA instance - confirm.

# Move the model to the GPU only when one is actually available.
use_gpu = is_cuda_available()
if use_gpu:
    model.to('cuda')

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. 

In [2]:
# Free-text queries to embed; each entry yields one row of `query_embeddings`.
texts = [
    # "sample text",
    "Look at her face",
    "a love song",
    "a sad song",
]

# `encode` takes an optional `task` chosen according to the use case:
# 'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'.
# If no `task` is passed, no specific LoRA adapter is used.
query_task = "retrieval.query"
query_embeddings = model.encode(texts, task=query_task)

# Show the overall shape and the first query's vector as a sanity check.
first_query_embedding = query_embeddings[0]
print(query_embeddings.shape)
print(first_query_embedding)

(3, 1024)
[ 0.01671595 -0.05356369  0.07552624 ...  0.00800042 -0.02177768
 -0.00731576]


In [3]:
import numpy as np

# Read the precomputed passage embeddings from their .npy file.
passage_embeddings_path = './embeddings/jina-embeddings-v3_retrieval.passage.npy'
passage_embeddings = np.load(passage_embeddings_path)

# Show the overall shape and the first passage's vector as a sanity check.
first_passage_embedding = passage_embeddings[0]
print(passage_embeddings.shape)
print(first_passage_embedding)

(57650, 1024)
[-0.1406054  -0.04126596  0.02898028 ... -0.0118162  -0.0104178
 -0.00558778]


1. Multi-threaded parallel MapReduce
This approach utilizes Python's ThreadPoolExecutor to slice the similarity computation task and process it in parallel:
Slicing (Map): The large embedding matrix is split by rows into a number of smaller shards, and the similarity is computed for each shard independently.
Parallel Computing: Multiple shards are processed simultaneously by a pool of threads, making full use of multi-core CPUs to improve computational efficiency.
Reduce: The per-shard similarity results are concatenated into the complete similarity matrix.

Applicable Scenarios:
The embedding matrix is large and a single machine does not have enough memory to load all of the data at once. (Note that this demo still slices an array that has already been loaded fully into memory.)

In [4]:
%%time
from concurrent.futures import ThreadPoolExecutor
import numpy as np


def compute_shard_similarity(shard, query_embeddings):
    """Score every query against every row of one shard.

    Args:
        shard: Array holding one row-wise slice of the passage embeddings
            (one embedding per row, as produced by the splitting step in
            this notebook).
        query_embeddings: Array of query embeddings (one per row) whose
            embedding width matches that of ``shard``.

    Returns:
        The matrix product ``query_embeddings @ shard.T``; for 2-D inputs,
        entry ``[i, j]`` is the dot product of query ``i`` with shard row ``j``.
    """
    shard_transposed = shard.T
    scores = query_embeddings @ shard_transposed
    return scores

# Map: cut `passage_embeddings` row-wise into `num_shards` pieces.
num_shards = 10
shards = np.array_split(passage_embeddings, num_shards, axis=0)

# Score the shards on a small thread pool. `executor.map` yields results in
# the same order as `shards`, so the columns line up with the passage rows
# once the pieces are stitched back together.
with ThreadPoolExecutor(max_workers=4) as executor:
    shard_similarities = list(
        executor.map(
            lambda shard: compute_shard_similarity(shard, query_embeddings),
            shards,
        )
    )

# Reduce: join the per-shard score blocks side by side into one matrix.
similarities = np.hstack(shard_similarities)
print("MapReduce-style computation completed!")
print(similarities)

MapReduce-style computation completed!
[[0.30582494 0.11828539 0.15932773 ... 0.0956329  0.0874272  0.08308057]
 [0.30726215 0.35276058 0.36984026 ... 0.30388635 0.27699167 0.34673175]
 [0.32911277 0.30546767 0.3108574  ... 0.22833987 0.30470648 0.32797337]]
CPU times: user 179 ms, sys: 246 ms, total: 426 ms
Wall time: 16.5 ms


2. MapReduce based on map and functools.reduce
This approach simulates the MapReduce workflow using Python's built-in map and functools.reduce:
Map: Each shard of the embedding matrix is passed to the mapper function, which computes the similarity for that shard.
Reduce: The results of all shards are merged with functools.reduce to obtain the full similarity matrix.

Applicable Scenarios:
The embedding matrix is moderately sized and only needs to be processed on a single machine.

In [49]:
%%time
import functools
import numpy as np

# define the mapper and the reducer
def mapper(shard):
    """Score all queries against one shard of passage embeddings.

    Reads the notebook-level ``query_embeddings`` and returns
    ``query_embeddings @ shard.T`` for the given shard.
    """
    return query_embeddings @ shard.T

def reducer(p, c):
    """Join the accumulated result ``p`` with the next block ``c``.

    Both arguments are handed to ``np.hstack`` as a pair, so ``c`` is
    appended to the right of ``p`` and the joined array is returned.
    """
    pair = [p, c]
    return np.hstack(pair)

# Map input: cut `passage_embeddings` row-wise into `num_shards` pieces.
num_shards = 10
shards = np.array_split(passage_embeddings, num_shards, axis=0)

# Lazily score each shard, then fold the per-shard blocks together from left
# to right with the reducer.
mapped = (mapper(shard) for shard in shards)
reduced = functools.reduce(reducer, mapped)

print("MapReduce-style computation completed!")
print(reduced)

MapReduce-style computation completed!
[[0.30582494 0.11828539 0.15932773 ... 0.0956329  0.0874272  0.08308057]
 [0.30726215 0.35276058 0.36984026 ... 0.30388635 0.27699167 0.34673175]
 [0.32911277 0.30546767 0.3108574  ... 0.22833987 0.30470648 0.32797337]]
CPU times: user 227 ms, sys: 43.6 ms, total: 271 ms
Wall time: 8.79 ms


Comparison Summary
The multi-threaded MapReduce variant parallelizes the work with a thread pool. It is better suited to very large embedding matrices, especially on multi-core CPUs, where it can take full advantage of the hardware.

The variant based on map and functools.reduce is more lightweight and suited to medium-sized data; it tends to perform better when computing resources are limited or the task is small.

In [16]:
%%time
# Baseline: score every query against every passage with one matrix product.
# Row i of the result holds the scores of query i over all passages.
passage_embeddings_t = passage_embeddings.T
similarities = query_embeddings @ passage_embeddings_t
print(similarities)

[[0.30582494 0.11828539 0.15932773 ... 0.0956329  0.0874272  0.08308057]
 [0.30726215 0.35276058 0.36984026 ... 0.30388635 0.27699167 0.34673175]
 [0.32911277 0.30546767 0.3108574  ... 0.22833987 0.30470648 0.32797337]]
CPU times: user 269 ms, sys: 102 ms, total: 370 ms
Wall time: 12.2 ms


In [6]:
%%time
# Number of best-scoring passages to keep for each query.
top_k = 5

# Fully rank each row from highest to lowest score (an ascending argsort of
# the negated scores), then keep only the first `top_k` columns.
ranked_indices = np.argsort(-similarities, axis=1)
top_k_indices = ranked_indices[:, :top_k]
print(top_k_indices)

[[43615  4627 33356 20269 11272]
 [25317  1841 31527 44260  6152]
 [23647 32184 45343  4425 33847]]
CPU times: user 16.5 ms, sys: 298 μs, total: 16.8 ms
Wall time: 16.7 ms


In [7]:
%%time
# A more efficient way to get the top-k indices: np.argpartition only
# separates the k best candidates from the rest instead of sorting whole rows,
# and just those k candidates are sorted afterwards.
num_queries = similarities.shape[0]
# Column vector of row ids; it broadcasts against the (num_queries, k) index
# arrays below so that each row is indexed with its own candidates.
arange = np.arange(num_queries)[:, None]

top_k = 5
# Never ask for more results than there are passages; argpartition raises when
# kth is out of bounds.
k = min(top_k, similarities.shape[1])
# `kth` is a zero-based position, so placing the k smallest negated scores
# (i.e. the k largest scores) in the first k slots needs kth = k - 1.
top_k_indices = np.argpartition(-similarities, max(k - 1, 0), axis=1)[:, :k]
# argpartition leaves the k candidates unordered; sort them by descending score.
top_k_indices = top_k_indices[arange, np.argsort(-similarities[arange, top_k_indices])]
print(top_k_indices)

[[43615  4627 33356 20269 11272]
 [25317  1841 31527 44260  6152]
 [23647 32184 45343  4425 33847]]
CPU times: user 3.42 ms, sys: 534 μs, total: 3.96 ms
Wall time: 4.05 ms


In [8]:
# For each query, print the similarity scores of its top-k passages.
for query_idx, passage_ids in enumerate(top_k_indices):
    print(similarities[query_idx, passage_ids])

[0.526491   0.5235444  0.5057533  0.49374926 0.4898193 ]
[0.5771628  0.5733152  0.5681341  0.5577022  0.55689794]
[0.6037626  0.5811384  0.57794267 0.5760369  0.55670303]


## Data

In [10]:
import pandas as pd

# CSV with the song metadata and lyrics, relative to the working directory.
data_path = './data/spotify_millsongdata.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first rows to check what was loaded.
preview = df.head()
print(preview)

  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [11]:
# Look up the songs retrieved for the query "a love song", which is the
# second-to-last entry of `texts` (hence index -2).
# NOTE(review): assumes row i of the embeddings file corresponds to row i of
# the CSV - confirm.
love_song_indices = top_k_indices[-2]
top_k_entries = df.iloc[love_song_indices]
print(top_k_entries)

               artist                   song  \
25317  Billie Holiday  Lover Come Back To Me   
1841     Bonnie Raitt          Cure For Love   
31527  Emmylou Harris                Love Is   
44260  Modern Talking      Romantic Warriors   
6152    Fleetwood Mac         It's Only Love   

                                                    link  \
25317  /b/billie+holiday/lover+come+back+to+me_200180...   
1841         /b/bonnie+raitt/cure+for+love_20022697.html   
31527            /e/emmylou+harris/love+is_20050047.html   
44260  /m/modern+talking/romantic+warriors_20094699.html   
6152        /f/fleetwood+mac/its+only+love_20632943.html   

                                                    text  
25317  The sky was blue  \r\nAnd high above  \r\nThe ...  
1841   You bring me roses  \r\nYou give me kisses  \r...  
31527  Love is a shiny car  \r\nLove is a steel guita...  
44260  In the nights of lost and found  \r\nMany stra...  
6152   I think I met my match again  \r\nPassing 'rou..

In [12]:
# Look up the songs retrieved for the query "a sad song", which is the last
# entry of `texts` (hence index -1).
# NOTE(review): assumes row i of the embeddings file corresponds to row i of
# the CSV - confirm.
sad_song_indices = top_k_indices[-1]
top_k_entries = df.iloc[sad_song_indices]
print(top_k_entries)

              artist           song  \
23647  Alison Krauss  This Sad Song   
32184       Everlast       Sad Girl   
45343    Neil Sedaka  Sad Sad Story   
4425    Donna Summer     Sing Along   
33847  George Strait  Blue Melodies   

                                               link  \
23647  /a/alison+krauss/this+sad+song_20521716.html   
32184            /e/everlast/sad+girl_20181664.html   
45343    /n/neil+sedaka/sad+sad+story_20613341.html   
4425       /d/donna+summer/sing+along_10087839.html   
33847  /g/george+strait/blue+melodies_21061476.html   

                                                    text  
23647  Well, the rain is apourin' down in a fury  \r\...  
32184  I seen her at a stop light on Alverano  \r\nWa...  
45343  Look at the lady she's the one with the broken...  
4425   I've an emptiness inside  \r\nThat can only be...  
33847  I don't know how to write you a song  \r\nThat...  
