## RAG Search Example

In [2]:
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

In [3]:
# Build a tiny vector store from two sample sentences. Each text is embedded
# with OpenAIEmbeddings, so OpenAI credentials must already be configured
# (presumably via the OPENAI_API_KEY environment variable — confirm).
vectorstore = DocArrayInMemorySearch.from_texts(
    ["harrison worked at kensho", "bears like to eat honey"],
    embedding=OpenAIEmbeddings(),
)




In [4]:
# Expose the vector store as a retriever so it can be used as a chain step.
retriever = vectorstore.as_retriever()

# Prompt text with two placeholders, {context} and {question}; both are
# supplied by the RunnableParallel step built in a later cell.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

In [5]:
# The three remaining chain stages: prompt template -> chat model -> parser.
prompt = ChatPromptTemplate.from_template(template)
# Default ChatOpenAI settings (no arguments are overridden here).
model = ChatOpenAI()
# Parser applied as the last chain step, after the model.
output_parser = StrOutputParser()

In [6]:
# Fan the incoming question out into the two inputs the prompt expects:
#   context  -> whatever the retriever returns for the question
#   question -> the question itself, forwarded unchanged
setup_and_retrieval = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)

# Full pipeline: retrieval -> prompt -> chat model -> output parser.
chain = (
    setup_and_retrieval
    | prompt
    | model
    | output_parser
)

In [7]:
# Run the chain on a bare question string; the RunnableParallel step feeds it
# to both the retriever and the {question} placeholder.
result = chain.invoke("where did harrison work?")

In [8]:
# Display the chain's answer.
result

'Harrison worked at Kensho.'

## JSON parser

In [9]:
from typing import List

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

In [10]:
# Rebinds `model` for this section with temperature=0.
model = ChatOpenAI(temperature=0)

In [11]:
# Define your desired data structure.
# Schema for one joke. It is handed to JsonOutputParser via `pydantic_object`
# in a later cell, and the parser's format instructions are injected into the
# prompt there.
class Joke(BaseModel):
    # Question that sets the joke up.
    setup: str = Field(description="question to set up a joke")
    # Answer that resolves the joke.
    punchline: str = Field(description="answer to resolve the joke")

In [12]:
# And a query intended to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=Joke)

# {format_instructions} is pre-filled from the parser as a partial variable,
# so callers only have to supply {query}.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> JSON parser.
chain = prompt | model | parser

# Invoke with a dict keyed by the prompt's single input variable.
result = chain.invoke({"query": joke_query})

In [13]:
# Display the parsed result.
result

{'setup': "Why couldn't the bicycle stand up by itself?",
 'punchline': 'Because it was two tired!'}

## AI ML Assessment

In [125]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma

# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.text_splitter import RecursiveCharacterTextSplitter

In [170]:
# Directory scanned for source files. Defined once so the directory listing
# and the paths handed to the loaders cannot drift apart.
DOCS_DIR = "./docs"

documents = []
# Iterate through files in the docs directory for processing. The listing is
# sorted because os.listdir() returns entries in arbitrary order, which would
# make the order of the loaded documents vary between runs.
for file in sorted(os.listdir(DOCS_DIR)):
    file_path = os.path.join(DOCS_DIR, file)
    # Match extensions case-insensitively so e.g. "CV.PDF" is not skipped.
    lowered_name = file.lower()
    # Load PDF files
    if lowered_name.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
        documents.extend(loader.load())
    # Load Word documents
    # NOTE(review): Docx2txtLoader targets .docx; legacy binary .doc files may
    # fail to parse — confirm before relying on the ".doc" branch.
    elif lowered_name.endswith((".docx", ".doc")):
        loader = Docx2txtLoader(file_path)
        documents.extend(loader.load())
    # Load Txt documents
    # elif lowered_name.endswith(".txt"):
    #     loader = TextLoader(file_path)
    #     documents.extend(loader.load())


In [None]:
# # Split document into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# texts = text_splitter.split_documents(documents)

In [None]:
# #This is used to split text based on chunk_size provided.
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
# all_splits = text_splitter.split_documents(data)# Store

In [171]:
# Split text into chunks for processing
# NOTE(review): `documents` is rebound to the chunked list, so re-running this
# cell on its own splits the already-split chunks again; re-run the loading
# cell first.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
documents = text_splitter.split_documents(documents)

# Create a vector database for document retrieval, embedding every chunk with
# OpenAIEmbeddings. No persist directory is passed here, so presumably nothing
# is written to disk — confirm if persistence is actually required.
vectordb = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

In [172]:
# Rebinds `retriever` to one backed by the Chroma store built from the docs.
retriever = vectordb.as_retriever()

In [173]:
# Chat model for this section, created with temperature=0.
model = ChatOpenAI(temperature=0)
# model = ChatOpenAI()

In [174]:
# Define your desired data structure.
# Earlier joke schema, kept for reference:
# class Joke(BaseModel):
#     setup: str = Field(description="question to set up a joke")
#     punchline: str = Field(description="answer to resolve the joke")

# Schema for an answer plus a follow-up comprehension check.
# NOTE(review): still named `Joke`, so it shadows the joke schema defined
# earlier in the notebook. The parser cell refers to this name, so rename both
# together if this is cleaned up.
class Joke(BaseModel):
    # Answer to the user's question.
    answer: str = Field(description="The answer provided by the system to the question asked")
    # Key details of the answer, as a list (element type is not constrained).
    bullet_points: list = Field(description="A list of bullet points emphasizing key details in the answer to improve understanding")
    # Follow-up question used to check the user's understanding.
    test_question: str = Field(description="Generated question to evaluate if the user understood the answer provided")
    # Reference answer for the follow-up question.
    test_answer: str = Field(description="Generate an answer which will be used to evaluate the user answer for the generated question")

In [175]:
# And a query intended to prompt a language model to populate the data structure.
# NOTE(review): the name is a leftover from the joke example; the value is a
# question about the loaded documents.
joke_query = "Who is the CV about?"

In [159]:
# Set up a parser + inject instructions into the prompt template.
# `Joke` here is whichever class definition ran most recently in the kernel.
parser = JsonOutputParser(pydantic_object=Joke)

# Earlier prompt without a {context} slot, kept for reference:
# prompt = PromptTemplate(
#     template="Answer the user query.\n{format_instructions}\n{query}\n",
#     input_variables=["query"],
#     partial_variables={"format_instructions": parser.get_format_instructions()},
# )

In [214]:
# Prompt with three slots: {context} and {format_instructions} are pre-filled
# as partial variables, {query} is supplied at invoke time.
# NOTE(review): {context} is bound to the entire `documents` list (every
# chunk), not to retriever output, so the full corpus is formatted into each
# prompt and the `retriever` built above is not used by this prompt — confirm
# this is intended.
prompt = PromptTemplate(
    template="Answer the user query.\n{context}\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions(), "context": documents},
)

In [51]:
# vectorstore = DocArrayInMemorySearch.from_texts(
#     ["harrison worked at kensho", "bears like to eat honey"],
#     embedding=OpenAIEmbeddings(),
# )
# retriever = vectorstore.as_retriever()

In [137]:
# Retrieval step: look up context for the input and forward the input itself
# under "question".
# NOTE(review): the PromptTemplate in this section uses {query}, not
# {question}, and the chain assembled further down is `prompt | model | parser`,
# so this step is currently not part of any chain.
setup_and_retrieval = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
)

# chain = setup_and_retrieval | prompt | model | output_parser

In [205]:
# Context-only variant: a single branch that runs the retriever on the input.
# Rebinds `setup_and_retrieval`, replacing any earlier definition.
setup_and_retrieval = RunnableParallel(context=retriever)


In [103]:
# template = """Answer the question based only on the following context:
# {context}

# Question: {question}
# """
# prompt = ChatPromptTemplate.from_template(template)

# parser = StrOutputParser()

In [None]:
# Import from the public packages. The previous submodule paths
# (`langchain_core.runnables.pass_through` and
# `langchain_core.runnables.json_output_parser`) do not exist and would raise
# ModuleNotFoundError.
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Uses the `retriever` created from the vector store in an earlier cell. The
# former `retriever = ...` placeholder was removed because it rebound the name
# to Ellipsis and broke every later use of the retriever.

# Fan the input out into two branches: retrieved context and the untouched
# question. RunnableParallel has no per-branch `output_parsers` option (extra
# keyword arguments are treated as additional branches), so that argument was
# dropped; to parse model output, append a parser as the last chain step,
# e.g. `prompt | model | JsonOutputParser()`.
setup_and_retrieval = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)


In [None]:
# The base class exported by `langchain_core.runnables.base` is `Runnable`;
# there is no `BaseRunnable`, so the previous import raised ImportError.
from langchain_core.runnables.base import Runnable


class RunnablePassthrough(Runnable):
    """Minimal identity runnable: ``invoke`` returns its input unchanged.

    NOTE(review): once this cell runs, the class shadows the
    ``RunnablePassthrough`` imported from ``langchain_core.runnables`` at the
    top of the notebook. Prefer the library class unless a custom one is
    genuinely needed.
    """

    # No custom __init__: the previous one only called super().__init__().

    def invoke(self, input, config=None, **kwargs):
        """Return ``input`` as-is.

        Args:
            input: Value to forward.
            config: Optional run configuration. Accepted so the method can be
                called with or without it; it is ignored.
            **kwargs: Extra keyword arguments; accepted and ignored.

        Returns:
            The same object that was passed as ``input``.
        """
        return input


In [215]:
# Pipeline: prompt -> chat model -> parser. No retrieval step is included;
# the chain starts directly at `prompt`.
chain = prompt | model | parser



In [216]:
# Invoke with a dict keyed by the prompt's "query" input variable.
result = chain.invoke({"query": joke_query})

In [92]:
# Same question passed as a bare string instead of a {"query": ...} dict.
# NOTE(review): this cell's execution count is older than the current chain
# definition, and when run in order it overwrites `result` from the cell
# above with a second model call — confirm it is still needed.
result = chain.invoke("Who is the CV about?")

In [217]:
# Display the parsed result.
result

{'answer': 'The CV is about Chukwuemeka Ezumezu, a Lead Machine Learning Engineer with expertise in deploying machine learning models, conducting data analysis, and software development.',
 'bullet_points': ['Results-driven Lead Machine Learning Engineer with 5+ years of experience',
  'Specializes in cloud computing, data science, MLOps, LLMOps, and algorithm development',
  'Proficient in Python, SQL, and diverse machine learning frameworks',
  'Demonstrated success in optimizing business strategies through data-driven insights'],
 'test_question': 'What is the main expertise of Chukwuemeka Ezumezu?',
 'test_answer': "Chukwuemeka Ezumezu's main expertise is in deploying machine learning models, conducting data analysis, and software development."}

In [36]:
# Display `result` again.
# NOTE(review): the output saved under this cell carries an older execution
# count than the display cell above and shows different content, so it
# presumably comes from an earlier version of the chain — re-run or remove.
result

{'answer': 'The CV is about John Doe, a seasoned software engineer with over 10 years of experience in the industry.',
 'bullet_points': ['John has worked at top tech companies such as Google and Microsoft.',
  'He specializes in backend development and has expertise in Java, Python, and SQL.',
  "John holds a Master's degree in Computer Science from Stanford University."],
 'test_question': "What is John Doe's area of expertise?",
 'test_answer': 'John Doe specializes in backend development and has expertise in Java, Python, and SQL.'}