# Embedding models

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [3]:
tokenized_data = model.tokenize(["walker walked a long walk"])
tokenized_data


{'input_ids': tensor([[ 101, 5232, 2939, 1037, 2146, 3328,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [4]:
first_module = model._first_module()
first_module.auto_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
embeddings = first_module.auto_model.embeddings
embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 384, padding_idx=0)
  (position_embeddings): Embedding(512, 384)
  (token_type_embeddings): Embedding(2, 384)
  (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

## No context aware embeddings

In [9]:
import torch
import plotly.express as px

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device = 'mps'


first_sentence = "vector search optimization"
second_sentence = "we learn about vector search optimization"

with torch.no_grad():
    first_tokens = model.tokenize([first_sentence])
    second_tokens = model.tokenize([second_sentence])

     # Convert the input_ids tensors to the device
    first_embedding = embeddings.word_embeddings(first_tokens['input_ids'].to(device))
    second_embedding = embeddings.word_embeddings(second_tokens['input_ids'].to(device))


first_embedding.shape, second_embedding.shape



(torch.Size([1, 5, 384]), torch.Size([1, 8, 384]))

In [12]:
from sentence_transformers import util



distances= util.cos_sim(first_embedding.squeeze(), second_embedding.squeeze()).cpu().numpy()
distances = distances.T
px.imshow(
    distances,
    x=model.tokenizer.convert_ids_to_tokens(
        first_tokens['input_ids'][0]
        ),
    y=model.tokenizer.convert_ids_to_tokens(
        second_tokens['input_ids'][0]
        ),
    text_auto=True,
)


In [13]:
token_embeddings = first_module.auto_model \
                    .embeddings \
                    .word_embeddings \
                    .weight \
                    .detach() \
                    .cpu() \
                    .numpy()

token_embeddings.shape

(30522, 384)

In [14]:
import random

vocabulary = first_module.tokenizer.get_vocab()
sorted_vocabulary = sorted(
    vocabulary.items(),
    key=lambda x: x[1]
)

sorted_tokens = [token for token, _ in sorted_vocabulary]
random.choices(sorted_tokens, k=100)


['whorls',
 'teachings',
 'blond',
 '[unused421]',
 'tempered',
 '##oed',
 'series',
 'mickey',
 'periodical',
 'outstanding',
 'inflicted',
 'offs',
 'flying',
 'embracing',
 'growling',
 'majors',
 'adolescents',
 'geological',
 'huddled',
 'safer',
 'attributes',
 '‡',
 'societal',
 'politically',
 'vigor',
 'offenses',
 'depart',
 'documentation',
 'cello',
 '##dora',
 'overdose',
 'pagoda',
 'draper',
 'translations',
 '[unused236]',
 'aspect',
 'affairs',
 'fathers',
 '276',
 '##riding',
 'automated',
 'leukemia',
 'troll',
 'creeks',
 'imminent',
 'hated',
 'founders',
 'residency',
 '##ː',
 '##economic',
 '1920',
 'hostile',
 'francesco',
 'robbins',
 'downloaded',
 'francais',
 'accepting',
 '##app',
 'twinkle',
 'satisfy',
 '##⁴',
 'myanmar',
 'irony',
 'adjustments',
 'flanders',
 'solved',
 'eireann',
 'doncaster',
 '##olio',
 'promotes',
 'distal',
 '##ी',
 'nixon',
 '##igh',
 'straw',
 'bradley',
 'eel',
 'thirty',
 '和',
 'cia',
 'lovely',
 'enroll',
 '##erate',
 'landmar

In [15]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42, metric='cosine')
tsne_embeddings_2d = tsne.fit_transform(token_embeddings)
tsne_embeddings_2d.shape


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(30522, 2)

In [16]:
token_colors = []

for token in sorted_tokens:
    if token[0] == "[" and token[-1] == "]":
        token_colors.append("red")
    elif token.startswith("##"):
        token_colors.append("blue")
    else:
        token_colors.append("green")


In [18]:
import plotly.graph_objs as go

scatter = go.Scatter(
    x=tsne_embeddings_2d[:, 0],
    y=tsne_embeddings_2d[:, 1],
    text=sorted_tokens,
    mode="markers",
    marker=dict(
        color=token_colors,
        size=3,
    ),
)

fig = go.Figure(data=[scatter],
                layout=dict(
                    width=600,
                    height=900,
                    margin=dict(l=0, r=0,),
                )
)

fig.show()



## Context aware embeddings

In [19]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device = 'mps'


first_sentence = "vector search optimization"
second_sentence = "we learn about vector search optimization"

with torch.no_grad():
    first_tokens = model.tokenize([first_sentence])
    second_tokens = model.tokenize([second_sentence])

    first_embedding = model.encode([first_sentence], output_value='token_embeddings')
    second_embedding = model.encode([second_sentence], output_value='token_embeddings')

distances = util.cos_sim(first_embedding[0], second_embedding[0]).cpu().numpy()
distances = distances.T


In [20]:
px.imshow(
    distances,
    x=model.tokenizer.convert_ids_to_tokens(first_tokens['input_ids'][0]),
    y=model.tokenizer.convert_ids_to_tokens(second_tokens['input_ids'][0]),
    text_auto=True,
)

# Role of tokenizers

## BPE - Byte Pair Encoding

In [22]:
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace

bpe_tokenizer = Tokenizer(BPE())
bpe_tokenizer.pre_tokenizer = Whitespace()

bpe_trainer = BpeTrainer(vocab_size=14)

In [23]:
training_data = ["walker walked a long walk"]

bpe_tokenizer.train_from_iterator(training_data, trainer=bpe_trainer)






In [24]:
bpe_tokenizer.get_vocab()

{'d': 1,
 'al': 10,
 'e': 2,
 'g': 3,
 'n': 6,
 'walk': 12,
 'wal': 11,
 'l': 5,
 'o': 7,
 'walke': 13,
 'k': 4,
 'r': 8,
 'w': 9,
 'a': 0}

In [26]:
bpe_tokenizer.encode("walker walked a long walk").tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [27]:
bpe_tokenizer.encode("wlk").tokens

['w', 'l', 'k']

In [28]:
bpe_tokenizer.encode("she walked").tokens

['e', 'walke', 'd']

## WordPiece

In [32]:
from real_wordpiece.trainer import RealWordPieceTrainer
from tokenizers.models import WordPiece

real_wordpiece_tokenizer = Tokenizer(WordPiece())
real_wordpiece_tokenizer.pre_tokenizer = Whitespace()

real_wordpiece_trainer = RealWordPieceTrainer(vocab_size=27)

In [33]:
real_wordpiece_trainer.train_tokenizer(
    training_data, real_wordpiece_tokenizer
)
real_wordpiece_tokenizer.get_vocab()

{'long': 21,
 '##ed': 23,
 '##lk': 25,
 'o': 16,
 'w': 0,
 'd': 15,
 'k': 12,
 '##l': 2,
 'lo': 19,
 'g': 18,
 '##ng': 20,
 '##k': 3,
 '##r': 7,
 'l': 6,
 '##o': 9,
 '##e': 4,
 '##a': 1,
 'e': 13,
 '##n': 10,
 '##er': 22,
 'walk': 26,
 'r': 14,
 'a': 5,
 '##d': 8,
 'wa': 24,
 'n': 17,
 '##g': 11}

In [34]:
real_wordpiece_tokenizer.encode("walker walked a long walk").tokens

['walk', '##er', 'walk', '##ed', 'a', 'long', 'walk']

In [35]:
real_wordpiece_tokenizer.encode("wlk").tokens

['w', '##lk']

## Huggingface wordpiece

In [37]:
from tokenizers.trainers import WordPieceTrainer

unk_token = "[UNK]"

wordpiece_model = WordPiece(unk_token=unk_token)
wordpiece_tokenizer = Tokenizer(wordpiece_model)
wordpiece_tokenizer.pre_tokenizer = Whitespace()
wordpiece_trainer = WordPieceTrainer(vocab_size=28, 
                                     special_tokens=[unk_token])

In [38]:
wordpiece_tokenizer.train_from_iterator(training_data,
                                        wordpiece_trainer)
wordpiece_tokenizer.get_vocab()






{'##n': 18,
 '##r': 15,
 'l': 6,
 '##a': 11,
 'wa': 20,
 'walk': 22,
 '##ng': 25,
 '[UNK]': 0,
 '##l': 12,
 '##k': 13,
 'e': 3,
 'k': 5,
 '##lk': 21,
 'g': 4,
 '##g': 19,
 '##d': 16,
 'walke': 23,
 'r': 9,
 'd': 2,
 'o': 8,
 'lo': 24,
 'walker': 26,
 'n': 7,
 'w': 10,
 '##e': 14,
 'a': 1,
 '##o': 17,
 'walked': 27}

In [39]:
wordpiece_tokenizer.encode("walker walked a long walk").tokens

['walker', 'walked', 'a', 'lo', '##ng', 'walk']

In [40]:
wordpiece_tokenizer.encode("wlk").tokens

['w', '##lk']

In [41]:
wordpiece_tokenizer.encode("she walked").tokens

['[UNK]', 'walked']

## Unigram

In [42]:
from tokenizers.trainers import UnigramTrainer
from tokenizers.models import Unigram

unigram_tokenizer = Tokenizer(Unigram())
unigram_tokenizer.pre_tokenizer = Whitespace()

unigram_trainer = UnigramTrainer(vocab_size=28, 
                                 special_tokens=[unk_token],
                                 unk_token=unk_token)
unigram_tokenizer.train_from_iterator(training_data, unigram_trainer)
unigram_tokenizer.get_vocab()





{'l': 6,
 '[UNK]': 0,
 'd': 8,
 'walk': 4,
 'r': 11,
 'a': 5,
 'w': 12,
 'walke': 1,
 'e': 2,
 'g': 10,
 'k': 3,
 'n': 9,
 'o': 7}

In [43]:
unigram_tokenizer.encode("walker walked a long walk").tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [44]:
unigram_tokenizer.encode("wlk").tokens


['w', 'l', 'k']

In [45]:
unigram_tokenizer.encode("she walked").tokens


['sh', 'e', 'walke', 'd']

In [46]:
unigram_tokenizer.encode("she walked").ids


[0, 2, 1, 8]

# Implication of Tokenization

# Measusring RAG

`Data from github repository:`https://github.com/wayfair/WANDS/blob/main/dataset/

In [None]:
import pandas as pd

products_df = pd.read_csv(
    f"product.csv", 
    sep="\t", 
    index_col="product_id", 
    keep_default_na=False,  # some products do not have a description
)
products_df.head()

Unnamed: 0_level_0,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


## Encode products name and description


In [None]:
from sentence_transformers import SentenceTransformer
num_products = 5000
model = SentenceTransformer("all-MiniLM-L6-v2")

product_name_embeddings = model.encode(
    products_df["product_name"][:num_products].tolist()
)
product_name_embeddings.shape

(5000, 384)

In [None]:
product_description_embeddings = model.encode(
    products_df["product_description"][:num_products].tolist()
)
product_description_embeddings.shape

(5000, 384)

## Create empty collection in Qdrant

 `use docker to run qdrant`
 **docker run -p 6333:6333 qdrant/qdrant**


In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333")
client.delete_collection("wands-products")
client.create_collection(
    collection_name="wands-products",
    vectors_config={
        "product_name": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
        ),
        "product_description": models.VectorParams(
            size=384,
            distance=models.Distance.COSINE,
        ),
    },
    optimizers_config=models.OptimizersConfigDiff(
        default_segment_number=2,
        indexing_threshold=1000,
    ),
)

True

## Uploading data to Qdrant

In [None]:
client.upload_collection(
    collection_name="wands-products",
    vectors={
        "product_name": product_name_embeddings,
        "product_description": product_description_embeddings,
    },
    payload=products_df.to_dict(orient="records"),
    ids=products_df.index.tolist(),
    batch_size=64,
)


## Read queries and labels

In [None]:
queries_df = pd.read_csv(
    "query.csv", 
    sep="\t", 
    index_col="query_id",
)
queries_df.head()

Unnamed: 0_level_0,query,query_class
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,salon chair,Massage Chairs
1,smart coffee table,Coffee & Cocktail Tables
2,dinosaur,Kids Wall Décor
3,turquoise pillows,Accent Pillows
4,chair and a half recliner,Recliners


In [None]:

labels_df = pd.read_csv(
    "label.csv", 
    sep="\t", 
)
labels_df.sample(n=5)

Unnamed: 0,id,query_id,product_id,label
71166,71317,50,36137,Partial
116183,116357,236,33983,Irrelevant
161280,161847,409,9869,Partial
208030,208597,14,25062,Irrelevant
178536,179103,427,7170,Irrelevant


## Create relevancy scores

In [None]:
relevancy_scores = {
    "Exact": 10,
    "Partial": 5,
    "Irrelevant": 0,
}

labels_df["score"] = labels_df["label"].map(relevancy_scores.get)
labels_df["query_id"] = labels_df["query_id"].map(lambda x: f"query_{x}")
labels_df["product_id"] = labels_df["product_id"].map(lambda x: f"doc_{x}")
labels_df.sample(n=5)

Unnamed: 0,id,query_id,product_id,label,score
223351,223918,query_114,doc_9523,Partial,5
161339,161906,query_409,doc_34138,Partial,5
219431,219998,query_82,doc_40287,Partial,5
52540,52663,query_269,doc_15343,Irrelevant,0
131137,131311,query_304,doc_27083,Irrelevant,0


## Make evaluation metrics

In [None]:
from ranx import Qrels

qrels = Qrels.from_df(
    labels_df.astype({"query_id": "str", "product_id": "str"}),
    q_id_col="query_id",
    doc_id_col="product_id", 
    score_col="score",
)

## Encode queries

In [None]:
queries_df["query_embedding"] = model.encode(
    queries_df["query"].tolist()
).tolist()
queries_df.sample(n=5)


Unnamed: 0_level_0,query,query_class,query_embedding
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
323,hub leaning full length mirror,Wall & Accent Mirrors,"[0.06984295696020126, 0.017407871782779694, 0...."
457,5 gang light switch,Wall Plates,"[-0.026735350489616394, -0.019914403557777405,..."
273,stainless steel free standing shower caddy,Shower & Tub Accessories,"[-0.08695775270462036, 0.08431896567344666, 0...."
33,non slip shower floor tile,Floor & Wall Tile,"[-0.018963497132062912, -0.030709998682141304,..."
6,acrylic clear chair,Dining Chairs,"[-0.03706296160817146, 0.07312846183776855, -0..."


## Search by product name

In [None]:
from collections import defaultdict

name_run_dict = defaultdict(dict)
for id, row in queries_df.iterrows():
    query_id = f"query_{id}"
    
    results = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_name", 
            vector=row["query_embedding"]
        ),
        with_vectors=False,
        with_payload=False,
        limit=100,
    )

    for point in results:
        document_id = f"doc_{point.id}"
        name_run_dict[query_id][document_id] = point.score  
    
name_run_dict

defaultdict(dict,
            {'query_0': {'doc_4410': 0.751218,
              'doc_4034': 0.74417406,
              'doc_251': 0.7269762,
              'doc_2187': 0.72043693,
              'doc_975': 0.69298476,
              'doc_1616': 0.6591316,
              'doc_4444': 0.650861,
              'doc_746': 0.6489469,
              'doc_209': 0.6478479,
              'doc_2638': 0.6400893,
              'doc_1148': 0.63802814,
              'doc_1059': 0.6332264,
              'doc_1372': 0.6321897,
              'doc_308': 0.63199484,
              'doc_603': 0.6241783,
              'doc_1742': 0.61341786,
              'doc_4938': 0.6115166,
              'doc_4330': 0.60891706,
              'doc_1259': 0.6060813,
              'doc_1864': 0.6024617,
              'doc_4329': 0.5979624,
              'doc_1373': 0.5947803,
              'doc_1454': 0.5935928,
              'doc_3604': 0.5926491,
              'doc_187': 0.5895107,
              'doc_206': 0.5892326,
            

## Create Run object

In [None]:
from ranx import Run

product_name_run = Run(name_run_dict, name="product_name")


## Search by product description

In [None]:
description_run_dict = defaultdict(dict)
for id, row in queries_df.iterrows():
    query_id = f"query_{id}"
    
    results = client.search(
        collection_name="wands-products",
        query_vector=models.NamedVector(
            name="product_description", 
            vector=row["query_embedding"]
        ),
        with_vectors=False,
        with_payload=False,
        limit=100,
    )

    for point in results:
        document_id = f"doc_{point.id}"
        description_run_dict[query_id][document_id] = point.score 

product_description_run = Run(
    description_run_dict, 
    name="product_description"
)


## Compare metrics

In [None]:
from ranx import compare

compare(
    qrels=qrels,
    runs=[
        product_name_run, 
        product_description_run
    ],
    metrics=[
        "precision@10", 
        "recall@10", 
        "mrr@10",
        "dcg@10", 
        "ndcg@10",
    ],
)

#    Model                P@10    Recall@10    MRR@10    DCG@10    NDCG@10
---  -------------------  ------  -----------  --------  --------  ---------
a    product_name         0.616ᵇ  0.034ᵇ       0.807ᵇ    19.132ᵇ   0.517ᵇ
b    product_description  0.457   0.024        0.687     14.494    0.388