#### Simple GenAI App using LangChain

In [None]:
### Pipeline: load data -> documents -> split into chunks -> embed chunks as vectors -> store in a vector DB

In [1]:
# Load settings (API keys, project name) from a local .env file into the process environment

import os 
from dotenv import load_dotenv
load_dotenv()


import warnings


def missing_env_vars(names):
    """Return the entries of ``names`` that are unset or empty in ``os.environ``.

    Args:
        names: Iterable of environment variable names to check.

    Returns:
        A list of the missing names, in the order they were given.
    """
    return [name for name in names if not os.environ.get(name)]


# load_dotenv() has already copied the .env entries into os.environ, so the keys
# need no re-export. Assigning os.getenv(...) back into os.environ raises
# TypeError whenever a variable is unset (the value would be None), so check for
# presence and report the problem by name instead.
if missing_env_vars(['OPENAI_API_KEY']):
    warnings.warn("OPENAI_API_KEY is not set; OpenAI embedding and chat calls will fail.")

# For Langsmith Tracking
if missing_env_vars(['LANGCHAIN_API_KEY']):
    # Without an API key the traces cannot be uploaded, so leave tracing off.
    warnings.warn("LANGCHAIN_API_KEY is not set; LangSmith tracing is left disabled.")
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    if missing_env_vars(['LANGCHAIN_PROJECT']):
        # NOTE(review): LangSmith is documented to fall back to its default project.
        warnings.warn("LANGCHAIN_PROJECT is not set; traces will use the default LangSmith project.")

In [4]:
## Data ingestion -> scrape the text of a web page
# To retrieve data from a website the whole page has to be read first;
# bs4 is imported for the web-scraping support.

from langchain_community.document_loaders import WebBaseLoader
import bs4

source_url = 'https://docs.smith.langchain.com/self_hosting/architectural_overview'
loader = WebBaseLoader(web_path=source_url)
loader

<langchain_community.document_loaders.web_base.WebBaseLoader at 0x1997e9b74c0>

In [5]:
# Run the loader: returns a list of Document objects (page_content plus metadata).
docs = loader.load()
docs

[Document(metadata={'source': 'https://docs.smith.langchain.com/self_hosting/architectural_overview', 'title': 'Architectural overview | 🦜️🛠️ LangSmith', 'description': 'Self-Hosted LangSmith is an add-on to the Enterprise Plan designed for our largest, most security-conscious customers. See our pricing page for more detail, and contact us at sales@langchain.dev if you want to get a license key to trial LangSmith in your environment.', 'language': 'en'}, page_content='\n\n\n\n\nArchitectural overview | 🦜️🛠️ LangSmith\n\n\n\n\n\n\nSkip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!API ReferenceRESTPythonJS/TSSearchRegionUSEUGo to AppGet StartedObservabilityEvaluationPrompt EngineeringDeployment (LangGraph Platform)AdministrationSelf-hostingArchitectural overviewScriptsInstallationConfigurationUsageUpgradesEgress for Subscription Metrics and Operational MetadataOrganization ChartsRelease notes (self-hosted)Frequently asked ques

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into small overlapping chunks so each one can be
# embedded separately; sizes are measured in characters.
splitter_settings = {"chunk_size": 300, "chunk_overlap": 20}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
documents = text_splitter.split_documents(docs)
documents

[Document(metadata={'source': 'https://docs.smith.langchain.com/self_hosting/architectural_overview', 'title': 'Architectural overview | 🦜️🛠️ LangSmith', 'description': 'Self-Hosted LangSmith is an add-on to the Enterprise Plan designed for our largest, most security-conscious customers. See our pricing page for more detail, and contact us at sales@langchain.dev if you want to get a license key to trial LangSmith in your environment.', 'language': 'en'}, page_content='Architectural overview | 🦜️🛠️ LangSmith'),
 Document(metadata={'source': 'https://docs.smith.langchain.com/self_hosting/architectural_overview', 'title': 'Architectural overview | 🦜️🛠️ LangSmith', 'description': 'Self-Hosted LangSmith is an add-on to the Enterprise Plan designed for our largest, most security-conscious customers. See our pricing page for more detail, and contact us at sales@langchain.dev if you want to get a license key to trial LangSmith in your environment.', 'language': 'en'}, page_content='Skip to mai

In [13]:
# Embedding model used to convert the text chunks into vectors
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()


In [None]:
# Embed every chunk in `documents` and index the vectors in a FAISS vector store.
from langchain_community.vectorstores import FAISS
vector_store = FAISS.from_documents(documents , embeddings)

In [None]:
vector_store    # displays the object's repr, something like <langchain_community.vectorstores.FAISS at 0x7f7f3c1b3b50>

In [None]:
## Query the vector store directly
# Look up the stored chunks that are closest to the query text and show the
# content of the first hit.

query = "The LangSmith application consists of several components including 5 LangSmith servers and 3 stateful services"
result = vector_store.similarity_search(query)
best_match = result[0]
best_match.page_content

In [None]:
# Chat model that the chains below use to generate answers.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model = "gpt-4o")

In [None]:
## To answer a question with supporting context, build a "stuff documents" chain:
## the documents passed under "context" are inserted into the prompt, which is
## then sent to the LLM.

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# The template must contain {input}: callers pass the question under the "input"
# key, and without that placeholder the model receives the context but never
# the question it is supposed to answer.
prompt = ChatPromptTemplate.from_template(
    ''' 
        Answer the following question based on the provided context:
        <context>
        {context}
        </context>

        Question: {input}
    '''
)

documents_chain = create_stuff_documents_chain(llm, prompt)
documents_chain      # ChatPromptTemplate -> ChatOpenAI -> StrOutputParser


In [None]:
from langchain_core.documents import Document

# Call the documents chain directly, supplying a hand-made placeholder document
# as the context instead of documents fetched from the vector store.
question = "The LangSmith application consists of several components including 5 LangSmith servers and 3 stateful services"
placeholder_docs = [Document(page_content="rrnkrrcorcnr3cn3")]

documents_chain.invoke({"input": question, "context": placeholder_docs})

In [None]:
# Retriever
#
# The data is stored in the vector store DB. A retriever is the interface that
# takes a query, fetches the matching data from the vector store DB and returns
# it as the response.
#
#   Input ---> Retriever ---> VectorStoreDB ---> Response

retriever = vector_store.as_retriever()    # retriever object backed by vector_store

In [None]:

# Combine the retriever with the documents chain into one retrieval chain,
# so a question can be answered from the documents the retriever finds.
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever , documents_chain)

In [None]:
## Run the retrieval chain and show the generated answer
question = "The LangSmith application consists of several components including 5 LangSmith servers and 3 stateful services"
resp = retrieval_chain.invoke({"input": question})

resp['answer']