In [2]:
from sentence_transformers import SentenceTransformer
from typing import List

class MyEmbeddings:
    """Thin adapter around a ``SentenceTransformer`` model.

    Exposes ``embed_documents`` and ``embed_query`` so an instance can be
    handed to a vector store as its embedding function. Both methods return
    plain Python lists of floats rather than arrays.
    """

    def __init__(self, model):
        """Load the sentence-transformers model identified by ``model``.

        Args:
            model: Model name or path forwarded to ``SentenceTransformer``.
        """
        # NOTE(review): trust_remote_code=True presumably allows the checkpoint's
        # repository to run its own Python code at load time. Kept to preserve
        # existing behavior; confirm it is needed for the models used here.
        self.model = SentenceTransformer(model, trust_remote_code=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts and return one vector per text, in input order.

        Args:
            texts: The texts to embed.

        Returns:
            A list with one list of floats per input text; ``[]`` for no input.
        """
        batch = list(texts)
        if not batch:
            # Nothing to encode; skip the model call entirely.
            return []
        # A single encode() call over the whole list lets the model batch the
        # work internally instead of running one forward pass per text.
        return self.model.encode(batch).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string.

        Args:
            query: The text to embed.

        Returns:
            The embedding as a flat list of floats.
        """
        return self.model.encode(query).tolist()

  from tqdm.autonotebook import tqdm, trange


In [3]:
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source documents are read from a "texts" folder under the current working
# directory, so the notebook must be launched from the project folder.
dir_path = os.getcwd()
texts_dir = os.path.join(dir_path, "texts")

# Load every .txt file found (recursively) under the texts folder.
loader = DirectoryLoader(texts_dir, glob="**/*.txt", show_progress=True)
data = loader.load()
print("finished loading data")

# Split the loaded documents into chunks of at most 500 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

embeddings = MyEmbeddings("sentence-transformers/all-MiniLM-L6-v2")

# persist_directory expects a path string or None; None requests no on-disk
# persistence (the original passed the bool False for the same intent).
# NOTE(review): the name `chromadb` is also the name of the Chroma client
# package; it is kept because later cells reference this variable.
chromadb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory=None,
)

100%|██████████| 6/6 [00:01<00:00,  5.61it/s]


finished loading data


In [4]:
# Sanity check: embed an arbitrary string and display the resulting vector.
probe_vector = embeddings.embed_query('werjhew')
probe_vector

[-0.054087817668914795,
 0.054764606058597565,
 0.0021965771447867155,
 0.009164744056761265,
 -0.033183492720127106,
 -0.050097860395908356,
 0.04886755347251892,
 0.04547635093331337,
 -0.011244338005781174,
 -0.02189750224351883,
 -5.032909393776208e-05,
 -0.012112320400774479,
 -0.004658358637243509,
 0.02909783087670803,
 0.04339463263750076,
 -0.02069094590842724,
 -0.038302045315504074,
 -0.007020563818514347,
 -0.08805578202009201,
 -0.0676301047205925,
 -0.031486015766859055,
 0.005094621330499649,
 0.0025470375549048185,
 -0.07208281010389328,
 0.03479332476854324,
 -0.02091299556195736,
 0.03974774479866028,
 0.012262613512575626,
 0.029731761664152145,
 -0.007597778923809528,
 0.042864397168159485,
 -0.05497439578175545,
 0.01454886607825756,
 -0.050631046295166016,
 -0.06678640842437744,
 -0.05730041489005089,
 -0.04807291179895401,
 0.019747614860534668,
 0.060588203370571136,
 -0.008551045320928097,
 -0.0833759531378746,
 -0.08547929674386978,
 -0.003925711382180452,
 -0

In [5]:
# Query the vector store for the ten chunks most similar to the question.
question = "lee hsien loong wife"
docs = chromadb.similarity_search(question, k=10)

# Copy the hits into a fresh list and display it as the cell output.
relevant_texts = list(docs)
relevant_texts

[Document(metadata={'source': '/Users/shaoyang/Desktop/Agent/langgraph/rag/texts/leehsienloong.txt'}, page_content='Personal life Lee married his first wife, Wong Ming Yang, a Malaysian-born physician, on 20 May 1978. They have a daughter and a son, Li Xiuqi, born in 1981, and Li Yipeng, born in 1982. Three weeks after giving birth to their son, Wong died of a heart attack on 28 October 1982, at the age of 31. [201]'),
 Document(metadata={'source': '/Users/shaoyang/Desktop/Agent/langgraph/rag/texts/leehsienloong.txt'}, page_content='Lee remarried to Ho Ching in 1985, a promising civil servant who subsequently became the executive director and chief executive officer of Temasek Holdings. [202] They have two sons, Li Hongyi and Li Haoyi. [203] Their elder son, Li Hongyi, was a commissioned officer in the Singapore Armed Forces (SAF),[204] and is the deputy director of the Government Technology Agency. [205][206] Their younger son, Li Haoyi, is a software engineer who authors books on the