In [None]:
# import os
# from pathlib import Path
# from sentence_transformers import SentenceTransformer

# # Set up caching directory (optional - HF uses default cache if not set)
# # You can set a custom cache directory if needed
# # os.environ["HF_HOME"] = "/path/to/your/cache"
# # os.environ["TRANSFORMERS_CACHE"] = "/path/to/your/cache"

# # Check current cache directory
# from transformers import file_utils
# print(f"Default HF cache directory: {file_utils.default_cache_path}")
# print(f"Cache size: {sum(f.stat().st_size for f in Path(file_utils.default_cache_path).rglob('*') if f.is_file()) / 1024**3:.2f} GB")

In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    # device="cuda",
    device="cpu",  # Use CPU for compatibility
    model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
    tokenizer_kwargs={"padding_side": "left"},
)

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

In [4]:
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device="cpu")

In [5]:
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Encode the queries and documents. Note that queries benefit from using a prompt
# Here we use the prompt called "query" stored under `model.prompts`, but you can
# also pass your own prompt via the `prompt` argument
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

# Compute the (cosine) similarity between the query and document embeddings
similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)

tensor([[0.7646, 0.1414],
        [0.1355, 0.6000]])


# Transformer implementation

In [6]:
#coding:utf8
import os
from typing import Dict, Optional, List, Union
import torch
from torch import nn
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from transformers.utils import is_flash_attn_2_available
import numpy as np
from collections import defaultdict

class Qwen3TextEmbedding():
    def __init__(self, model_name_or_path, instruction=None,  use_fp16: bool = True, use_cuda: bool = True, max_length=8192):
        if instruction is None:
            instruction = 'Given a user query, retrieve relevant passages that answer the query'
        self.instruction = instruction
        if is_flash_attn_2_available() and use_cuda:
            self.model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16)
        else:
            self.model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.float16)
        if use_cuda:
            self.model = self.model.cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, padding_side='left')
        self.max_length=max_length
    
    def last_token_pool(self, last_hidden_states: Tensor,
        attention_mask: Tensor) -> Tensor:
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])  # last token mask is always `1`
        if left_padding:
            return last_hidden_states[:, -1]  # '<|endoftext|>' on the right hand side.. this is used as sentence embedding
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1   # deduct the '<|endoftext|>' token
            batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        if task_description is None:
            task_description = self.instruction
        return f'Instruct: {task_description}\nQuery:{query}'

    def encode(self, sentences: Union[List[str], str], is_query: bool = False, instruction=None, dim: int = -1):
        if isinstance(sentences, str):
            sentences = [sentences]
        if is_query:
            sentences = [self.get_detailed_instruct(instruction, sent) for sent in sentences]
        inputs = self.tokenizer(sentences, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
        inputs.to(self.model.device)
        with torch.no_grad():
            model_outputs = self.model(**inputs)
            output = self.last_token_pool(model_outputs.last_hidden_state, inputs['attention_mask'])
            if dim != -1:
                output = output[:, :dim]  # top-k embedding dimension preserving most significant dimensions
            output  = F.normalize(output, p=2, dim=1)
        return output

In [8]:
model_path = "Qwen/Qwen3-Embedding-0.6B"
model = Qwen3TextEmbedding(model_path, use_cuda=False)
queries = ['What is the capital of China?', 'Explain gravity']
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
dim = 1024
# It looks like the diff dim generates different embeddings not just truncating the output. However, cosine similarity is still valid.
query_outputs = model.encode(queries, is_query=True, dim=dim)
doc_outputs = model.encode(documents, dim=dim)
print('query outputs', query_outputs)
print('doc outputs', doc_outputs)
scores = (query_outputs @ doc_outputs.T) * 100
print(scores.tolist())

query outputs tensor([[-0.0524, -0.0293, -0.0020,  ...,  0.0753,  0.0379, -0.0138],
        [-0.0073, -0.0385, -0.0034,  ..., -0.0291,  0.0065, -0.0105]],
       dtype=torch.float16)
doc outputs tensor([[-0.0471, -0.0209,  0.0036,  ...,  0.0563,  0.0707, -0.0171],
        [-0.0531, -0.0152, -0.0012,  ...,  0.0036, -0.0206,  0.0196]],
       dtype=torch.float16)
[[78.25, 20.140625], [13.28125, 58.3125]]


In [9]:
len(model.tokenizer.vocab)

151669

In [8]:
sentences = documents
inputs = model.tokenizer(sentences, padding=True, truncation=True, max_length=model.max_length, return_tensors='pt')

In [20]:
model.tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen3-Embedding-0.6B', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, n

In [None]:
# get the original token from token id
def get_token_from_id(tokenizer, token_id):
    if hasattr(tokenizer, 'convert_ids_to_tokens'):
        return tokenizer.convert_ids_to_tokens(token_id)
    else:
        return tokenizer._convert_id_to_token(token_id)

In [4]:
model.tokenizer.convert_ids_to_tokens(0)

NameError: name 'model' is not defined

In [9]:
inputs

{'input_ids': tensor([[151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643,    785,   6722,    315,   5616,
            374,  26549,     13, 151643],
        [ 38409,    374,    264,   5344,    429,  60091,   1378,  12866,   6974,
           1817,   1008,     13,   1084,   6696,   4680,    311,   6961,   6171,
            323,    374,   8480,    369,    279,   7203,    315,  32875,   2163,
            279,   7015,     13, 151643]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}

In [11]:
attention_mask = inputs["attention_mask"]
attention_mask

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])

In [12]:
attention_mask.shape

torch.Size([2, 31])

In [17]:
attention_mask[:, -1].sum()

tensor(2)

In [2]:
model_path = "Qwen/Qwen3-Embedding-0.6B"
model = Qwen3TextEmbedding(model_path, use_cuda=False)
queries = ['What is the capital of China?', 'Explain gravity']
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
dim = 1024
# It looks like the diff dim generates different embeddings not just truncating the output. However, cosine similarity is still valid.
query_outputs = model.encode(queries, is_query=True, dim=dim)
doc_outputs = model.encode(documents, dim=dim)
print('query outputs', query_outputs)
print('doc outputs', doc_outputs)
scores = (query_outputs @ doc_outputs.T) * 100
print(scores.tolist())

query outputs tensor([[-0.0525, -0.0296, -0.0019,  ...,  0.0754,  0.0379, -0.0137],
        [-0.0073, -0.0387, -0.0034,  ..., -0.0292,  0.0064, -0.0105]],
       dtype=torch.float16)
doc outputs tensor([[-0.0472, -0.0208,  0.0037,  ...,  0.0562,  0.0707, -0.0172],
        [-0.0530, -0.0152, -0.0013,  ...,  0.0037, -0.0206,  0.0196]],
       dtype=torch.float16)
[[78.25, 20.140625], [13.234375, 58.25]]


In [None]:
type(model.tokenizer)

# Qwen3 pkg

In [5]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%autoreload 2

In [7]:
from nlp_rag.models.qwen3 import Qwen3Embedding

In [8]:
model_path = "Qwen/Qwen3-Embedding-0.6B"
qwen3_emb = Qwen3Embedding(model_name=model_path, instruction="Given a user query, retrieve relevant passages that answer the query")

In [9]:
queries = ['What is the capital of China?', 'Explain gravity']
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]

In [10]:
qwen3_emb._model

Qwen3Model(
  (embed_tokens): Embedding(151669, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): Qwen3RMSNorm((102

In [11]:
# for queries 
query_emb = qwen3_emb.embed_data(data=queries)

In [12]:
qwen3_emb.instruction=None
qwen3_emb

Qwen3Embedding(model_name='Qwen/Qwen3-Embedding-0.6B', instruction=None, max_length=8192, dim_embedding=-1, use_cuda=True)

In [13]:
doc_emb = qwen3_emb.embed_data(data=documents)

In [14]:
raw_query_emb=qwen3_emb.encode(sentences=queries, instruction="Given a user query, retrieve relevant passages that answer the query", max_length=8192, dim_embedding = -1)

In [15]:
type(raw_query_emb)

torch.Tensor

In [16]:
raw_query_emb.shape

torch.Size([2, 1024])

In [17]:
raw_query_emb

tensor([[-0.0524, -0.0294, -0.0019,  ...,  0.0752,  0.0377, -0.0139],
        [-0.0073, -0.0386, -0.0035,  ..., -0.0292,  0.0065, -0.0103]],
       device='cuda:0', dtype=torch.float16)

In [18]:
doc_outputs

NameError: name 'doc_outputs' is not defined

In [None]:
doc_emb[1]

In [29]:
doc_emb[0]

[-0.047210693359375,
 -0.021087646484375,
 0.003673553466796875,
 -0.033538818359375,
 -0.058990478515625,
 -0.005336761474609375,
 0.0164031982421875,
 0.04931640625,
 -0.04681396484375,
 0.05096435546875,
 0.028778076171875,
 0.01393890380859375,
 0.053985595703125,
 0.00102996826171875,
 -0.01959228515625,
 0.0204620361328125,
 -0.0002906322479248047,
 0.047882080078125,
 0.03204345703125,
 -0.0292510986328125,
 0.0682373046875,
 0.0250396728515625,
 0.06622314453125,
 -0.056976318359375,
 0.006549835205078125,
 0.037628173828125,
 -0.08514404296875,
 0.04779052734375,
 0.032379150390625,
 0.0172882080078125,
 0.09478759765625,
 0.0157470703125,
 0.0209808349609375,
 0.029937744140625,
 0.01203155517578125,
 0.0010499954223632812,
 0.060882568359375,
 0.01332855224609375,
 0.0018167495727539062,
 -0.01568603515625,
 -0.0011758804321289062,
 -0.0208892822265625,
 -0.041900634765625,
 0.01468658447265625,
 0.01064300537109375,
 0.00725555419921875,
 0.02081298828125,
 0.00255775451660

In [27]:
doc_emb

[[-0.047210693359375,
  -0.021087646484375,
  0.003673553466796875,
  -0.033538818359375,
  -0.058990478515625,
  -0.005336761474609375,
  0.0164031982421875,
  0.04931640625,
  -0.04681396484375,
  0.05096435546875,
  0.028778076171875,
  0.01393890380859375,
  0.053985595703125,
  0.00102996826171875,
  -0.01959228515625,
  0.0204620361328125,
  -0.0002906322479248047,
  0.047882080078125,
  0.03204345703125,
  -0.0292510986328125,
  0.0682373046875,
  0.0250396728515625,
  0.06622314453125,
  -0.056976318359375,
  0.006549835205078125,
  0.037628173828125,
  -0.08514404296875,
  0.04779052734375,
  0.032379150390625,
  0.0172882080078125,
  0.09478759765625,
  0.0157470703125,
  0.0209808349609375,
  0.029937744140625,
  0.01203155517578125,
  0.0010499954223632812,
  0.060882568359375,
  0.01332855224609375,
  0.0018167495727539062,
  -0.01568603515625,
  -0.0011758804321289062,
  -0.0208892822265625,
  -0.041900634765625,
  0.01468658447265625,
  0.01064300537109375,
  0.007255554

In [14]:
import numpy as np

In [32]:
np.array(doc_emb) @ np.array(doc_outputs).T

  np.array(doc_emb) @ np.array(doc_outputs).T


array([[0.99992317, 0.31809559],
       [0.31796764, 0.99995463]])

In [30]:
np.array(query_emb) @ np.array(doc_emb).T *100

array([[78.24808013, 20.13974079],
       [13.25720041, 58.27236779]])

In [15]:
np.array(query_emb) @ np.array(doc_emb).T *100

array([[82.71647956, 18.87513176],
       [20.24335442, 82.45363315]])

In [33]:
model_path = "Qwen/Qwen3-Embedding-0.6B"
qwen3_emb_cpu = Qwen3Embedding(model_name=model_path, instruction="Given a user query, retrieve relevant passages that answer the query", use_cuda=False)

In [34]:
query_emb_cpu = qwen3_emb_cpu.embed_data(data=queries)

In [35]:
query_emb_cpu

[[-0.052459716796875,
  -0.0295867919921875,
  -0.0019397735595703125,
  -0.0318603515625,
  -0.030609130859375,
  -0.045440673828125,
  0.0489501953125,
  0.0215911865234375,
  -0.051361083984375,
  0.034698486328125,
  0.05206298828125,
  0.02001953125,
  0.0794677734375,
  -0.0022563934326171875,
  -0.0341796875,
  0.0023670196533203125,
  -0.0013418197631835938,
  0.10467529296875,
  -0.012176513671875,
  -0.0340576171875,
  0.039520263671875,
  -0.022186279296875,
  0.04388427734375,
  -0.06597900390625,
  0.06536865234375,
  0.01503753662109375,
  -0.089111328125,
  0.042266845703125,
  0.01593017578125,
  -0.0087127685546875,
  0.09979248046875,
  0.0257720947265625,
  -0.001308441162109375,
  0.031829833984375,
  0.014556884765625,
  -0.0059967041015625,
  0.08746337890625,
  0.01300048828125,
  -0.00791168212890625,
  -0.004482269287109375,
  -0.038177490234375,
  -0.0108184814453125,
  -0.09149169921875,
  0.01018524169921875,
  -0.013946533203125,
  0.02972412109375,
  -0.00

In [1]:
from nlp_rag.models.fastembed import FastEmbedRerankerEmbeddings

In [3]:
colbertv2 = FastEmbedRerankerEmbeddings()

In [14]:
colbertv2.embed_query(queries[0])

array([[ 0.12253779, -0.01348286,  0.02574195, ..., -0.09389692,
        -0.17026047,  0.01661755],
       [ 0.14403951,  0.00942109, -0.06113891, ..., -0.17349637,
        -0.16948836,  0.11560889],
       [ 0.11211355, -0.0192696 ,  0.05480438, ..., -0.10986683,
        -0.15306032,  0.05582669],
       ...,
       [ 0.09928965,  0.0459879 ,  0.05173886, ..., -0.22249354,
        -0.06116874, -0.06851213],
       [ 0.        ,  0.        , -0.        , ..., -0.        ,
        -0.        ,  0.        ],
       [ 0.13992333, -0.03676378, -0.04938003, ..., -0.1030542 ,
        -0.18879594,  0.01579314]], shape=(10, 128), dtype=float32)

In [None]:
from typing import List
import numpy as np

queries = ['What is the capital of China?', 'Explain gravity']
doc_embeddings = colbertv2.embed_data(data=queries)
# Display the doc_embeddings
print(f"Type: {type(doc_embeddings)}")
print(f"Number of queries: {len(doc_embeddings)}")
print(f"Shape of first embedding: {doc_embeddings[0].shape}")

# Sample of the first embedding values
print("\nSample of first embedding values:")
print(doc_embeddings[0][0][:5])  # First 5 values of the first vector

# Return the full embeddings
doc_embeddings: List[np.ndarray]

Type: <class 'list'>
Number of queries: 2
Shape of first embedding: (10, 128)

Sample of first embedding values:
[ 0.12253779 -0.01348286  0.02574195  0.02927173 -0.02230829]


[array([[ 0.12253779, -0.01348286,  0.02574195, ..., -0.09389692,
         -0.17026047,  0.01661755],
        [ 0.14403951,  0.00942109, -0.06113891, ..., -0.17349637,
         -0.16948836,  0.11560889],
        [ 0.11211355, -0.0192696 ,  0.05480438, ..., -0.10986683,
         -0.15306032,  0.05582669],
        ...,
        [ 0.09928965,  0.0459879 ,  0.05173886, ..., -0.22249354,
         -0.06116874, -0.06851213],
        [ 0.        ,  0.        , -0.        , ..., -0.        ,
         -0.        ,  0.        ],
        [ 0.13992333, -0.03676378, -0.04938003, ..., -0.1030542 ,
         -0.18879594,  0.01579314]], shape=(10, 128), dtype=float32),
 array([[-0.00997523,  0.03878174,  0.01485046, ...,  0.02029536,
         -0.06985453,  0.00785969],
        [-0.12207086,  0.0041189 , -0.06974896, ..., -0.06515262,
         -0.07959309,  0.00259247],
        [-0.18004848,  0.00299347, -0.11695206, ..., -0.05418336,
         -0.08153487,  0.00154958],
        ...,
        [ 0.        , 

In [6]:
type(doc_embeddings)

list

In [10]:
len(doc_embeddings[0].tolist())

10

In [11]:
len(doc_embeddings[0].tolist()[0])

128