In [1]:
import torch

# -------------------------
# Small config
# -------------------------
MODEL_NAME = "BAAI/bge-m3"  # multilingual BGE-M3
# Auto-pick device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

NORMALIZE = True  # cosine similarity works best with normalized vecs
BATCH_SIZE = 32  # embedding batch size

CHUNK_SIZE = 800  # characters per chunk
CHUNK_OVERLAP = 150  # characters

In [2]:
from services.embedding_service import EmbeddingService

service = EmbeddingService(
        model_name= MODEL_NAME,
        device=DEVICE,
        normalize=NORMALIZE,
        batch_size=BATCH_SIZE,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print("Query dim:", len(service.embed_query("อธิบาย RAG สั้นๆ")))

Query dim: 1024


In [4]:
vectors = service.embed_query("อธิบาย RAG สั้นๆ")
print(vectors[0:5])

[-0.008919138461351395, 0.0007591982721351087, -0.011482643894851208, 0.0031074178405106068, -0.01438190694898367]


In [5]:
vectors = service.embed_texts(["hello", "สวัสดี"])
print("Batch dims:", [len(v) for v in vectors])

Batch dims: [1024, 1024]


In [3]:
chunks = service.embed_file("data/MachineLearning-Lecture01.pdf")
# print("PDF chunks:", len(chunks), "first vec dim:", len(chunks[0].vector) if chunks else 0)
# rows = service.embed_file("data/articles.csv", csv_text_cols=["title", "summary"])
# print("CSV chunks:", len(rows))

display(chunks[0])

ChunkEmbedding(chunk_text="MachineLearning-Lecture01  \nInstructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \nlearning class. So what I wanna do today is just spend a little time going over the logistics \nof the class, and then we'll start to talk a bit about machine learning.  \nBy way of introduction, my name's Andrew Ng and I'll be instructor for this class. And so \nI personally work in machine learning, and I've worked on it for about 15 years now, and \nI actually think that machine learning is the most exciting field of all the computer \nsciences. So I'm actually always excited about teaching this class. Sometimes I actually \nthink that machine learning is not only the most exciting thing in computer science, but \nthe most exciting thing in all of human endeavor, so maybe a little bias there.", vector=[-0.05867505446076393, -0.03161442652344704, -0.02582080289721489, 0.009673778899013996, -0.002812956692650914, -0.041676703840494156, -0.0556856691837310

In [6]:
print(chunks[0].chunk_text, '\n')

print(chunks[0].vector, '\n')

print(chunks[0].meta, '\n')

print(chunks[0].meta['id'])

MachineLearning-Lecture01  
Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine 
learning class. So what I wanna do today is just spend a little time going over the logistics 
of the class, and then we'll start to talk a bit about machine learning.  
By way of introduction, my name's Andrew Ng and I'll be instructor for this class. And so 
I personally work in machine learning, and I've worked on it for about 15 years now, and 
I actually think that machine learning is the most exciting field of all the computer 
sciences. So I'm actually always excited about teaching this class. Sometimes I actually 
think that machine learning is not only the most exciting thing in computer science, but 
the most exciting thing in all of human endeavor, so maybe a little bias there. 

[-0.05867505446076393, -0.03161442652344704, -0.02582080289721489, 0.009673778899013996, -0.002812956692650914, -0.041676703840494156, -0.05568566918373108, 0.005118017550557852, 0.0479833856225013

In [3]:
pdf_chunks = service.embed_file("data/car_example.pdf")

In [4]:
display(pdf_chunks[0])
print(len(pdf_chunks))

ChunkEmbedding(chunk_text='\'<Ē\x1d HEV PREMIUM LUXURY HEV PREMIUM HEV SMART\n\x07\x1d6\x18\'\x1a&\x1d\x19Ė\n%8\x198$6&\x1d1\x06 &6+\x03Å\x03\x06+ē6\x0c\x03Å\x03.=\x0c %%\x80 ä~\x86â\x82\x03Å\x03á~\x85ä\x82\x03Å\x03á~ääå\n\t+6%&6+\x0fĒ+\x0c)ē1 %%\x80 â~\x85âå\n\t+6%\x06+ē6\x0c\x0fĒ+\x0c)ē1 /\x1dē6\x03\x81\x03/)5\x0c %%\x80 á~å\x85\x82\x03\x81\x03á~å\x86\x82 á~å\x86\x82\x03\x81\x03á~\x83\x82\x82\n\'4&4\x19NI6.<\x18\r6\x06"õJ\x1d %%\x80 áãå\n\'5,%9+\x0cA)9J&+B\t\x1e.<\x18\x03 %\x80 å\x80\x84\n\t+6%\rþ\x1a5\x0c\x1dNJ6%5\x1d \x03)8\x19\' å\x82\nA\t\'éI1\x0c&\x1d\x19Ė\n\'<Ē\x1d\x03\x81\x03B\x1e\x1e \x8eâå\x8e\x7f\x93¥\xa0\x03\x81\x03ä\x03.=\x1e\x03B\x1a+A\'ç&\x0c\x03\x91\x9c\x95\x90\x03á\x83\x03+6)Ė+\x03B\x1e\x1e\x03\x91Â®¹\x03££¡\x7f¶\n\x1f\'æ%6\x19\'\x06\'4\x1e1\x06.=\x1e \x109\x109 â~ä\x85\x84\n\t+6%\x06+ē6\x0c\x06\'4\x1e1\x06.=\x1e\x03Å\x03\'4&4\x0f5\x06 %%\x80 \x85\x84\x80å\x03Å\x03á\x82ã\x80ä\n15\x19\'6.Ē+\x1d\x06N6)5\x0c15\x18 áä\x03\x87\x03á\n\x06N6)5\x0c.=\x0c.<\x18\x03\x92\x92\x90

8


In [10]:
pdf_chunks = service.embed_file("data/hello.pdf")

In [11]:
display(pdf_chunks[0])
print(len(pdf_chunks))

ChunkEmbedding(chunk_text='สวัสดี', vector=[-0.031175674870610237, 0.032977327704429626, -0.04203437641263008, -0.012651350349187851, -0.03258179500699043, -0.03303123265504837, -0.016864867880940437, -0.0403139628469944, 0.0299763772636652, -0.0018812116468325257, 0.00943673960864544, 0.014189145527780056, 0.00822197925299406, -0.02020949497818947, 0.018172690644860268, -0.03090336173772812, 0.011125601828098297, -0.029796971008181572, -0.006700531113892794, -0.04914924502372742, -0.017075464129447937, -0.010638661682605743, 0.04289492592215538, 0.006836464628577232, -0.0018576540751382709, 0.024125734344124794, -0.023568198084831238, 0.00902429036796093, 0.00773134408518672, -0.011815809644758701, 0.032005079090595245, 0.058491986244916916, -0.011185879819095135, -0.028523175045847893, -0.03362651541829109, -0.06271813809871674, 0.0009670351864770055, -0.022999988868832588, -0.06825327128171921, 0.026350347325205803, 0.028646372258663177, 0.0012773246271535754, 0.014218884520232677, 

1


In [5]:
display(pdf_chunks[1])
print(len(pdf_chunks))

ChunkEmbedding(chunk_text='E##Ď6.=\x0c.<\x18\x03\x81\x03\x06N6)5\x0c.=\x0c.<\x18\x03\x81\x03B\'\x0c\x1e8\x18.=\x0c.<\x18\x03\x92\x92\x90\x03»²Á %1A\x191\'Ė\x108\x0cC\t\'\x1d5.B%ĒA/)H\x06\x1a6+\'\x03\x81\x03\x83\x82\x82\x03C+)\x19Ė\x03\x81\x03á\x82\x82\x03\x068C)+5\x19\x19Ė\x03\x81\x03â\x82\x85\x03\x1d8+\x195\x1dA%\x19\'\nB\x1e\x19A\x191\'çIE2\x1e\'æ\x18\n\x0f\x1d8\x18\x03\x81\x03B\'\x0c\x185\x1dE##Ď6\x03\x81\x03\rN6\x1d+\x1dA\x10))Ė\x03\x81\x03\t+6%\rþE##Ď6 \x99¶Áµ¶Âº\x7f¶¼»\x03\x81\x03âåá\x80\x83\x03C+)\x19Ė\x03\x81\x03\x83\x85\x03A\x10))Ė\x03\x81\x03ä\x03B1%B\x1f\'Ė\x03\x7f\x03\x0f5I+C%\x0c\nA\t\'éI1\x0c&\x1d\x19Ė\x03B)4%1A\x191\'ĖE##Ď6\n\x06N6)5\x0c.=\x0c.<\x18 \x068C)+5\x19\x19Ė\x03z\x9d\xa0{ á\x83\x84zââ\x84{\n\'4\x1e\x1e\x075\x1eA\t);I1\x1d\x03B)4\'4\x1e\x1e\x065\x1d.4A\x1b;1\x1d\n\'4\x1e\x1e.Ē\x0c\x06N6)5\x0c 15\x19C\x1d%5\x198\x03\x92\x7f\x90£¡\n15\x19\'6\x1b\x18A#í1\x0c\x1bē6& ã\x80á\x86ã\n\'4\x1e\x1e\x065\x1d.4A\x1b;1\x1d /\x1dē6\x03\x81\x03/)5\x0c 18.\'4B%H\tA#1\'Ė.5\x1d.\x1

8


In [6]:
display(pdf_chunks[2])
print(len(pdf_chunks))

ChunkEmbedding(chunk_text='6+A+1\'ĖE##Ď6\x03\x92\x9d\xa0\n1ğ\x1f\x06\'\x17Ė$6&\x1d1\x06\nE#/\x1dē6 \x99\x92\x91\x03C\x1f\'A\r\tA\x191\'Ė\nE#.Ē1\x0c.+Ē6\x0cA+)6\x06)6\x0c+5\x1d \x99\x92\x91\nE#\x1bē6& \x93Â¹¹\x03\x99\x92\x91\n\'4\x1e\x1e\x1f\'5\x1e\'4\x185\x1eE#/\x1dē6\x03.=\x0c\x7f\x19NI6\x03 •\n\'4\x1e\x1e\t+\x1e\t<%\x066\'A\x1fĉ\x18\x7f\x1fĉ\x18E#/\x1dē615\x19C\x1d%5\x198 •\n\x06\'4\r5\x0c/\x1dē6\x18ē6\x1d\x1e\x1d .9\x18N6A\x0c6 .9\x18N6\n\x06\'4\r5\x0c/\x1dē6\x18ē6\x1d)Ē6\x0c .9A\x1b6A%\x1b5))8\x06 .9\x18N6\n\x06\'4\r\x06%1\x0c\x07ē6\x0c\x03"\'ē1%E#A)9J&+\x03\x99\x92\x91 \x03\x1f\'5\x1eE##Ď6~\x03"5\x1eA\x06H\x1e15\x19C\x1d%5\x198~\n"\'ē1%\'4\x1e\x1e\t+6%\rN6\x03B)4\x03\x9f²Ã²¿À²\x03\x99¶»¸ \x1f\'5\x1eE##Ď6\x03B)4"5\x1eA\x06H\x1e15\x19C\x1d%5\x198\n\x1b9I\x1fė\x18\x1dNJ6!\x1dB\x1e\x1e15\x19C\x1d%5\x198 •\n\x06\'4\r\x06\x1e5\x0c)%/\x1dē6 B\x1e\x1e)\x18A.9&\x0c\'\x1e\x06+\x1d\x03z\x8e°¼ÂÀÁ¶°\x03\x94¹®ÀÀ{\n\x9d®»¼¿®º¶°\x03\x9f¼¼³\x03B\x1e\x1eA\x1fĉ\x18E\x18ē\x03"\'ē1%%Ē6\x1d\x1e5\x0cB\x

8


In [9]:
display(pdf_chunks[3])
print(len(pdf_chunks))
display(pdf_chunks[0].chunk_text)

ChunkEmbedding(chunk_text='~\x03\xa0²®Á\x03£²»Á¶¹®Á¼¿\x03\nB)4A);I1\x1d\x1a1&/)5\x0c15\x19C\x1d%5\x198\x07\x174\nA\x07ē6\x7f11\x06\r6\x06\x195+\'\x1a\n\x85\x03\x1b8,\x1b6\x0c\nA\x1e64\x1d5I\x0c =ēC\x18&.6\'\x18ē6\x1d/\x1dē6B\x1e\x1e\x1f\'5\x1eE##Ď6 \x85\x03\x1b8,\x1b6\x0c\x03"\'ē1%\x1f<č%\x1f\'5\x1e\x07ē6\x0c"\x1d5\x06"ò\x0c\x03\nB)4\x03\xa0²®Á\x03£²»Á¶¹®Á¼¿ ä\x03\x1b8,\x1b6\x0c\n\x1f<č%\x1f\'5\x1e\x185\x1d/)5\x0cE##Ď6 \x18ē6\x1d =ē\x075\x1e\x03B)4 =ēC\x18&.6\'\x18ē6\x1d/\x1dē6 \x18ē6\x1d =ē\x075\x1e\nA\x1e64\x1d5I\x0c =ēC\x18&.6\'\x18ē6\x1d/)5\x0c\x1f\'5\x1e"\x1d5\x06"ò\x0cE##Ď6 • \x7f\n\'<Ē\x1d HEV PREMIUM LUXURY HEV PREMIUM HEV SMART\n\x1b9I+6\x0cB\x07\x1dA\x1e64\x1d5I\x0c =ēC\x18&.6\'\x18ē6\x1d/)5\x0c \x03"\'ē1%\x1b9I+6\x0cB\x06ē+B\x1e\x1e"5\x1eA\x06H\x1eE\x18ē\n\x03B)4B \x0c\t+\x1e\t<%\'4\x1e\x1e.5% 5.B\x1e\x1e\x188\ræ\x1b5) "\'ē1%\x1b9I+6\x0cB\x06ē+\x03B)4!6\x1fĉ\x18\n"+\x0c%6)5&\x1f\'5\x1e\'4\x185\x1eE\x18ē\x03ä\x03\x1b8,\x1b6\x0c \x1f\'5\x1eE##Ď6\x03"\'ē1%\'4\x1e\x1e\t+6%\rN6B)

8


'\'<Ē\x1d HEV PREMIUM LUXURY HEV PREMIUM HEV SMART\n\x07\x1d6\x18\'\x1a&\x1d\x19Ė\n%8\x198$6&\x1d1\x06 &6+\x03Å\x03\x06+ē6\x0c\x03Å\x03.=\x0c %%\x80 ä~\x86â\x82\x03Å\x03á~\x85ä\x82\x03Å\x03á~ääå\n\t+6%&6+\x0fĒ+\x0c)ē1 %%\x80 â~\x85âå\n\t+6%\x06+ē6\x0c\x0fĒ+\x0c)ē1 /\x1dē6\x03\x81\x03/)5\x0c %%\x80 á~å\x85\x82\x03\x81\x03á~å\x86\x82 á~å\x86\x82\x03\x81\x03á~\x83\x82\x82\n\'4&4\x19NI6.<\x18\r6\x06"õJ\x1d %%\x80 áãå\n\'5,%9+\x0cA)9J&+B\t\x1e.<\x18\x03 %\x80 å\x80\x84\n\t+6%\rþ\x1a5\x0c\x1dNJ6%5\x1d \x03)8\x19\' å\x82\nA\t\'éI1\x0c&\x1d\x19Ė\n\'<Ē\x1d\x03\x81\x03B\x1e\x1e \x8eâå\x8e\x7f\x93¥\xa0\x03\x81\x03ä\x03.=\x1e\x03B\x1a+A\'ç&\x0c\x03\x91\x9c\x95\x90\x03á\x83\x03+6)Ė+\x03B\x1e\x1e\x03\x91Â®¹\x03££¡\x7f¶\n\x1f\'æ%6\x19\'\x06\'4\x1e1\x06.=\x1e \x109\x109 â~ä\x85\x84\n\t+6%\x06+ē6\x0c\x06\'4\x1e1\x06.=\x1e\x03Å\x03\'4&4\x0f5\x06 %%\x80 \x85\x84\x80å\x03Å\x03á\x82ã\x80ä\n15\x19\'6.Ē+\x1d\x06N6)5\x0c15\x18 áä\x03\x87\x03á\n\x06N6)5\x0c.=\x0c.<\x18\x03\x92\x92\x90\x03»²Á \x068C)+5\x19\x19Ė

In [13]:
csv_chunks = service.embed_file("data/Service_Booking_Sample_Data__300_rows_.csv")

display(csv_chunks[0])
print(len(csv_chunks))

ChunkEmbedding(chunk_text='CustomerName: Customer_1\nPhone: 0895686362\nEmail: customer1@example.com\nBrand: Kia\nModel: Rio\nLicensePlate: คค-7192\nMileage: 122633\nServiceNeeded: Tire Rotation\nPreferredDate: 2025-01-28\nPreferredTime: 14:30', vector=[-0.050712354481220245, -0.033797454088926315, -0.0381033830344677, -0.013539273291826248, -0.05946635082364082, -0.016109732910990715, -0.001410748460330069, 0.00046176451724022627, -0.04031635448336601, 0.011790184304118156, 0.013072870671749115, -0.0020587521139532328, 0.018134571611881256, 0.017252983525395393, 0.02488251030445099, 0.008702515624463558, -0.03247413784265518, 0.016788752749562263, -0.025224555283784866, -0.014008743688464165, -0.007235677447170019, 0.011257899925112724, -0.010104547254741192, 0.008079729042947292, 0.0074941483326256275, -0.023061323910951614, -0.0071932426653802395, -0.027303475886583328, 0.05629677325487137, -0.0019248125609010458, -0.04528915509581566, 0.017750203609466553, 0.04393812268972397, -0.0

10


In [14]:
documents = service.embed_file("data/test_doc.docx")

display(documents[0])


ChunkEmbedding(chunk_text='Hello This is Testing Word Document', vector=[-0.04036655277013779, 0.005314746871590614, -0.02581852860748768, 0.007884732447564602, -0.015546074137091637, -0.013210175558924675, 0.005793898366391659, 0.034221287816762924, -0.0010798396542668343, -0.015985695645213127, -0.028127457946538925, -0.044203415513038635, 0.024798037484288216, -0.0075069935992360115, 0.00602471549063921, -0.02262422814965248, 0.023450996726751328, -0.027154697105288506, -0.004810687154531479, -0.019172560423612595, -0.08855263888835907, -0.006068907678127289, 0.060158226639032364, 0.01391880214214325, 0.013441473245620728, 0.04077279567718506, -0.02418523095548153, -0.019513297826051712, 0.04482873156666756, -0.0040839374996721745, 0.033257901668548584, -0.02136486954987049, 0.009235276840627193, -0.052684638649225235, -0.017454542219638824, -0.017669038847088814, 0.015550002455711365, -0.015846334397792816, -0.025965487584471703, 0.015428957529366016, 0.030891530215740204, -0.03113

In [17]:
chunks = service.embed_file("data/Book1.xlsx")

display(chunks[0])
print(len(chunks))

display(chunks[1].chunk_text)

ChunkEmbedding(chunk_text='Header', vector=[-0.00044957781210541725, 0.03562842682003975, -0.033085934817790985, 0.025149177759885788, -0.02889089286327362, -0.0004033200384583324, 0.02445019967854023, 0.05156340450048447, -0.010035563260316849, -0.003922163508832455, -0.005445151124149561, -0.003134202677756548, -0.01740400493144989, -0.011392530053853989, 0.009976510889828205, -0.07759977877140045, 0.015940824523568153, -0.0010447002714499831, -0.011498617939651012, -0.03458566591143608, -0.018096577376127243, -0.008812597952783108, 0.023307448253035545, 0.032236143946647644, -0.0003078101435676217, -0.021574078127741814, 0.019574306905269623, -0.013692754320800304, -0.04770903289318085, -0.024580422788858414, 0.02009233646094799, 0.0569976344704628, 0.0433342345058918, -0.06957460194826126, -0.02742459811270237, -0.05544453114271164, 0.03224833309650421, 0.007297030184417963, -0.0915980115532875, 0.02113315649330616, 0.02098984643816948, 0.0503980778157711, -0.0061870152130723, -0.0

2


'Number Alpha 1 A 2 B 3 A 4 B 5 A 6 B 7 A 8 B'