In [1]:
from sentence_transformers import SentenceTransformer
from typing import List

class MyEmbeddings:
    """Thin adapter around a ``SentenceTransformer`` model.

    Exposes the two methods ``embed_documents`` and ``embed_query``, each
    returning plain Python lists of floats instead of numpy arrays.
    """

    def __init__(self, model):
        """Load the underlying sentence-transformers model.

        Args:
            model: Model name or path, passed unchanged to
                ``SentenceTransformer``.
        """
        # NOTE(review): trust_remote_code=True permits code shipped with the
        # model repository to run locally -- only pass model ids you trust.
        self.model = SentenceTransformer(model, trust_remote_code=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a collection of texts.

        Args:
            texts: The strings to embed.

        Returns:
            One embedding (a list of floats) per input text, in input order.
            An empty input yields an empty list.
        """
        batch = list(texts)
        if not batch:
            # Nothing to encode; skip the model call entirely.
            return []
        # A single encode() call lets the model batch the inputs internally
        # instead of paying the per-call overhead once for every text.
        return self.model.encode(batch).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string.

        Args:
            query: The text to embed.

        Returns:
            The embedding of ``query`` as a flat list of floats.
        """
        return self.model.encode(query).tolist()

  from tqdm.autonotebook import tqdm, trange


In [2]:
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# The corpus lives in a "texts" folder under the current working directory.
dir_path = os.getcwd()

# Load every file matching **/*.txt below that folder, showing a progress bar.
loader = DirectoryLoader(
    f"{dir_path}/texts",
    glob="**/*.txt",
    show_progress=True,
)
data = loader.load()
print("finished loading data")

# Split the loaded documents with chunk_size=500 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

# Embed the chunks with the MiniLM wrapper and index them in Chroma
# (no persist directory is passed here).
embeddings = MyEmbeddings("sentence-transformers/all-MiniLM-L6-v2")
chromadb = Chroma.from_documents(documents=all_splits, embedding=embeddings)

100%|██████████| 3/3 [00:00<00:00,  5.49it/s]


In [3]:
# Sanity check: embed a short query and display the resulting vector.
query_vector = embeddings.embed_query("hi")
query_vector

[-0.09047619253396988,
 0.040439605712890625,
 0.02390565536916256,
 0.05894799157977104,
 -0.022882331162691116,
 -0.047220051288604736,
 0.04504755884408951,
 0.01578631065785885,
 -0.048199545592069626,
 -0.037794098258018494,
 -0.019077591598033905,
 0.02130882814526558,
 -0.0046830810606479645,
 -0.04330813139677048,
 0.05991479009389877,
 0.05910342559218407,
 -0.028036706149578094,
 -0.05921836942434311,
 -0.12440313398838043,
 -0.03559999540448189,
 -0.006080579478293657,
 0.0324290469288826,
 -0.037800729274749756,
 0.02471097931265831,
 -0.04272431880235672,
 -0.04245390370488167,
 0.045935627073049545,
 0.09862551093101501,
 -0.04999801516532898,
 -0.03523581475019455,
 0.0708397626876831,
 0.03316318988800049,
 0.02658831886947155,
 0.00017322623170912266,
 0.0038816528394818306,
 0.030467234551906586,
 -0.07820264250040054,
 -0.12037952989339828,
 0.018041525036096573,
 0.022829042747616768,
 -0.0017750156112015247,
 -0.023449819535017014,
 0.0030581247992813587,
 0.024355

In [4]:
# Ask the vector store for up to five chunks similar to the question.
question = "kim yong role"
docs = chromadb.similarity_search(question, k=5)

# Keep a separate list of the returned Document objects, in the order returned.
relevant_texts = list(docs)
relevant_texts

[Document(metadata={'source': 'texts/gankimyong.txt'}, page_content="Gan Kim Yong[a] (born 9 February 1959) is a Singaporean politician who has been serving as Deputy Prime Minister of Singapore since 2024 and Minister for Trade and Industry since 2021. A member of the governing People's Action Party, he has been the Member of Parliament (MP) representing the Choa Chu Kang division of Choa Chu Kang GRC since 2011."),
 Document(metadata={'source': 'texts/lawrence.txt'}, page_content='At his swearing-in ceremony, during his first speech as Prime Minister, Wong stated "This is my promise to all Singaporeans: I will serve you with all my heart. I will never settle for the status quo. I will always seek better ways to make tomorrow better than today." He also stated that his mission as Prime Minister was "to continue defying the odds and to sustain this miracle called Singapore. "[77] On 12 June 2024, it was announced that Gan Kim Yong had taken over as Chairman of the Economic'),
 Document