## **Step 1: Initialize an Embedding Model** 

In [2]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

## **Step 2: Initialize the Chroma DB Connection**

In [3]:
from langchain_chroma import Chroma

db = Chroma(collection_name="new_vector_database", 
            embedding_function=embedding_model, 
            persist_directory="./new_chroma_db")

In [4]:
db.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

### **Load a document**

In [5]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

loader = DirectoryLoader(path='example_data/text', glob="*.txt", show_progress=True, loader_cls=TextLoader)

data = loader.load()

100%|██████████| 2/2 [00:00<00:00, 1996.34it/s]


In [6]:
doc_content = [doc.page_content for doc in data]
doc_metadata = [doc.metadata for doc in data]

In [7]:
doc_content, doc_metadata

(['This is pretty much\nwhat\'s happened so far.\n\nRoss was in love\nwith Rachel since forever.\n\nEvery time he tried to tell her,\nsomething got in the way\n\nIike cats, Italian guys.\n\nAnd finally, Chandler was,\nlike, "Forget about her."',
  "These were unbelievely expensive\nand he'll grow out of them\n\nin 20 minutes,\nbut I couldn't resist!\n\nLook at these."],
 [{'source': 'example_data\\text\\file_1.txt'},
  {'source': 'example_data\\text\\file_2.txt'}])

### **Split the document into chunks**

In [8]:
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter

st_text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-mpnet-base-v2", 
                                                         chunk_size=100, 
                                                         chunk_overlap=50)

st_chunks = st_text_splitter.create_documents(doc_content, doc_metadata)

In [9]:
print("Total number of documents inside chunks:", len(st_chunks))
print()
for i, chunk in enumerate(st_chunks, start=1):
    print(f"Document {i} metadata: {chunk.metadata}")
    print(f"Document {i} chunks: {chunk.page_content}")
    print("-" * 100)

Total number of documents inside chunks: 2

Document 1 metadata: {'source': 'example_data\\text\\file_1.txt'}
Document 1 chunks: this is pretty much what ' s happened so far. ross was in love with rachel since forever. every time he tried to tell her, something got in the way iike cats, italian guys. and finally, chandler was, like, " forget about her. "
----------------------------------------------------------------------------------------------------
Document 2 metadata: {'source': 'example_data\\text\\file_2.txt'}
Document 2 chunks: these were unbelievely expensive and he ' ll grow out of them in 20 minutes, but i couldn ' t resist! look at these.
----------------------------------------------------------------------------------------------------


### **Add Chunks to ChromaDB**

In [10]:
db.add_documents(st_chunks)

['e44775ef-e6d4-4363-935a-da991d70c097',
 '0e6c836b-fb64-4e3b-bc36-839eed9d9dff']

In [11]:
db.get()

{'ids': ['0e6c836b-fb64-4e3b-bc36-839eed9d9dff',
  'e44775ef-e6d4-4363-935a-da991d70c097'],
 'embeddings': None,
 'metadatas': [{'source': 'example_data\\text\\file_2.txt'},
  {'source': 'example_data\\text\\file_1.txt'}],
 'documents': ["these were unbelievely expensive and he ' ll grow out of them in 20 minutes, but i couldn ' t resist! look at these.",
  'this is pretty much what \' s happened so far. ross was in love with rachel since forever. every time he tried to tell her, something got in the way iike cats, italian guys. and finally, chandler was, like, " forget about her. "'],
 'uris': None,
 'data': None}

**<---New Vector Database is ready!--->**

## **Step 3: Create a Retriever Object**  

In [41]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 1})

## **Step 4: Initialize a Chat Prompt Template** 

In [42]:
from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
Use only the following context to answer the question:
{context}

Question: {question}

Provide a concise, detailed answer based solely on the context. 
Do not include any information that is not explicitly stated in the context. 
Avoid phrases like "according to the context" or similar.
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

## **Step 5: Initialize a Generator (i.e. Chat Model)**

In [43]:
import os
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")

In [44]:
from langchain_google_genai import ChatGoogleGenerativeAI

chat_model = ChatGoogleGenerativeAI(
    google_api_key=api_key,
    model="gemini-1.5-pro",
    temperature=1
)

## **Step 6: Initialize a Output Parser**

In [45]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

## **Step 7: Define a RAG Chain**

In [46]:
from langchain_core.runnables import RunnablePassthrough

def format_doc(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG Chain
rag_chain = {"context": retriever | format_doc, "question": RunnablePassthrough()} | prompt_template | chat_model | output_parser

## **Step 8: Invoke the Chain**

In [47]:
query = "who is ross?"

rag_chain.invoke(query)

'Ross is someone in love with Rachel. He has repeatedly tried to tell her, but has been thwarted by various obstacles.\n'