## Getting started with housing Chatbot


In [None]:
%pip install chromadb
%pip install tiktoken

In [None]:
%pip show chromadb

### Create the Chroma DB client


In [None]:
import chromadb
from chromadb.config import Settings


# Chroma 0.4+ rejects the legacy Settings(chroma_db_impl="duckdb+parquet", ...)
# configuration; PersistentClient is the supported way to open an on-disk
# store rooted at the given path.
client = chromadb.PersistentClient(path="db/")

In [None]:
%pip install langchain

In [None]:
import os

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Loaders and vector stores are imported from langchain_community; the legacy
# langchain.document_loaders / langchain.vectorstores paths are deprecated
# re-exports of these same classes.
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI

#### Load PDFs

In [None]:
%pip install pypdf

In [18]:
# Directory loader pointed at the relative "pdfs" folder.
loader = PyPDFDirectoryLoader("pdfs")

In [19]:
# Run the loader and keep the resulting documents for splitting.
data = loader.load()

In [None]:
# Peek at the first loaded document to confirm the loader returned content.
data[0]

## Text Splitting 

In [38]:
# Break the loaded documents into chunks of up to 900 characters with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=20,
)
text_chunks = text_splitter.split_documents(data)

In [None]:
# Show only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
text_chunks[:3]

In [None]:
# Number of chunks produced by the splitter.
len(text_chunks)

In [None]:
# Preview the text of one chunk. Index 102 only exists when the PDFs yield at
# least 103 chunks, so fall back to the last chunk for smaller corpora instead
# of raising IndexError.
sample_index = min(102, len(text_chunks) - 1)
print(text_chunks[sample_index].page_content)

### Loading my Environment Variables

In [73]:
from dotenv import load_dotenv

# Load variables via python-dotenv, then read the two API keys this notebook
# uses from the process environment (each is None when the variable is unset).
load_dotenv()
gemini_api_key = os.environ.get('GEMINI_API_KEY')
huggingface_api_key = os.environ.get('HUGGINGFACE_API_KEY')

In [None]:
# Confirm the key was loaded without echoing the secret itself into the
# notebook output, which is saved (and possibly committed) with the file.
"HUGGINGFACE_API_KEY is set" if huggingface_api_key else "HUGGINGFACE_API_KEY is missing"

### My embedder

In [75]:
# Embedding client for the Hugging Face Inference API, authenticated with the
# key read from the environment.
hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=huggingface_api_key,
)

### Create my Chroma Db

In [36]:
# Relative directory passed to Chroma as its persist location.
persist_directory = "db"

In [None]:
# Quick look at the chunks about to be embedded; a slice keeps the output short.
text_chunks[:3]

In [None]:
# Build the Chroma vector store from the chunks, using hf_embeddings as the
# embedder and persist_directory as the storage location.
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=hf_embeddings,
    persist_directory=persist_directory,
)

In [None]:
# Since Chroma 0.4.x, documents are persisted automatically, so the manual
# persistence call is no longer supported and is intentionally left disabled:
# vectordb.persist()

In [50]:
# Re-open the store under persist_directory with the same embedding function.
vectordb = Chroma(
    embedding_function=hf_embeddings,
    persist_directory=persist_directory,
)

In [None]:
# Display the vector store object as a quick sanity check that it was created.
vectordb

### Make Retriever

In [52]:
# Retriever over the vector store, using its default search settings.
retriever = vectordb.as_retriever()

In [57]:
# retriever.get_relevant_documents() has been deprecated in LangChain;
# invoke() is its replacement.
docs = retriever.invoke("What is the website about?")

In [None]:
# First document the retriever returned for the test question.
docs[0]

In [63]:
# Rebuild the retriever, passing k=2 through search_kwargs.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [66]:
# Ask the same question through the k=2 retriever.
docs2 = retriever.invoke("What is the website about?")

In [None]:
# Documents returned by the k=2 retriever for the test question.
docs2

## Make a chain


In [85]:
from langchain.chains import RetrievalQA

In [None]:
# Confirm the key was loaded without echoing the secret itself into the
# notebook output, which is saved (and possibly committed) with the file.
"GEMINI_API_KEY is set" if gemini_api_key else "GEMINI_API_KEY is missing"

In [72]:
# Gemini chat model used to generate answers; temperature 0.5 is passed through
# to the model configuration.
llm = ChatGoogleGenerativeAI(
    api_key=gemini_api_key,
    model="gemini-1.5-flash",
    temperature=0.5,
)

### Create A chain

In [None]:
# Wire the LLM and the retriever into a "stuff" RetrievalQA chain and ask it to
# return the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
)

In [80]:
def process_llm_response(llm_response):
    """Print a chain answer followed by the sources it was drawn from.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain a
            ``'result'`` entry. ``'source_documents'`` is optional; when
            present it is an iterable of objects exposing a ``metadata``
            mapping.

    Returns:
        None. The answer, a ``Source:`` header and one line per source
        document are written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSource:')
    # Tolerate responses without source documents (chain built without
    # return_source_documents=True) and documents whose metadata has no
    # 'source' entry, instead of failing with KeyError after the answer
    # has already been printed.
    for source in llm_response.get("source_documents") or []:
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
query = 'What is real estate?'
# Calling the chain object directly (Chain.__call__) is deprecated in
# LangChain; invoke() with the chain's "query" input key is the supported way
# to run RetrievalQA.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response=llm_response)

### Performing Embedding


In [76]:


# Reuse the hf_embeddings client created in the embedder section earlier in the
# notebook rather than constructing a second, identical instance here.
texts = ["Hello, world!"]
embeddings = hf_embeddings.embed_documents(texts)

In [None]:
# Size of the embed_documents result — presumably one vector per input text;
# TODO confirm against the API response.
len(embeddings)

In [None]:
# Raw output of embed_documents for the sample text.
# NOTE(review): this displays every returned value; consider showing a slice
# before sharing the notebook.
embeddings