In [2]:
from dotenv import load_dotenv
import openai
import os
from uuid import uuid4
load_dotenv(".env")
openai_api_key = os.environ.get('OPENAI_API_KEY')
serp_api_key = os.environ.get('SERPER_API_KEY')


unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.environ.get('LANGCHAIN_API_KEY')

In [77]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from dataclasses import dataclass
chroma_directory = 'db3/'

db = Chroma(persist_directory=chroma_directory, embedding_function=OpenAIEmbeddings())
file_path = "JiaHao_Lo.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()


@dataclass
class Document:
    page_content: str
    metadata: dict

def extract_information(documents):
    # Schema
    schema = {
        "properties": {
            "name": {"type": "string"},
            "phone_number": {"type": "string"},
            "email": {"type": "string"},
            "local": {"type": "string"},
            "last role": {"type": "string"},
            "years of experience": {"type": "string"},
            "education level": {"type": "string"},
            "CGPA": {"type": "integer"},
            "University": {"type": "string"},
            "Education Background": {"type": "string"},
            "Data Science Background": {"type": "string"},
            "Relevant experience": {"type": "string"},
        },
        "required": ["name", "height"],
    }

    # Input
    inp = documents

    # Run chain
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125",verbose=True)
    chain = create_extraction_chain(schema, llm,verbose=True)
    result = chain.run(inp)
    return result

results = extract_information(documents)

document_objects = []
# Construct page content from dictionary values
page_content = ""
for key, value in results[0].items():
    page_content += f"{key}: {value}\n"

# Construct metadata
metadata = {'source': file_path, 'page': 0}  # You may adjust the source accordingly

# Create Document object
document = Document(page_content=page_content, metadata=metadata)
document_objects.append(document)

print(document_objects)
db.add_documents(documents=document_objects)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: Extract and save the relevant entities mentioned in the following passage together with their properties.

Only extract the properties mentioned in the 'information_extraction' function.

If a property is not present and is not required in the function parameters, do not include it in the output.

Passage:
[Document(page_content='Jia-Hao Lo   \njiahaolo@u.northwestern.edu | www.linkedin.com/in/jiahaolo | (872) -810-5588  \nEDUCATION \n \nNorthwestern University CGPA: 3. 83/4.00                                                                                        Evanston, Illinois  \nMaster of Science in Analytics                                                                                     Sept 2021 – Dec 202 2 (Expected)  \nPETRONAS Educational Sponsorship Program 2015 ( Full sponsorship for postgraduate study) \n \nUniversity Technology P ETRONAS  CGPA: 3.92/4. 00                          

['703a3752-d410-11ee-a600-40b0765f3723']

In [59]:
file_path = "GooYeJui_CV.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()
db.add_documents(documents=documents)

['efc1cc25-d40d-11ee-a289-40b0765f3723',
 'efc1cc26-d40d-11ee-9dc2-40b0765f3723']

In [68]:
file_path = "JiaHao_Lo.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()
db.add_documents(documents=documents)

['07c8c641-d40f-11ee-a10e-40b0765f3723']

In [69]:
documents

[Document(page_content='Jia-Hao Lo   \njiahaolo@u.northwestern.edu | www.linkedin.com/in/jiahaolo | (872) -810-5588  \nEDUCATION \n \nNorthwestern University CGPA: 3. 83/4.00                                                                                        Evanston, Illinois  \nMaster of Science in Analytics                                                                                     Sept 2021 – Dec 202 2 (Expected)  \nPETRONAS Educational Sponsorship Program 2015 ( Full sponsorship for postgraduate study) \n \nUniversity Technology P ETRONAS  CGPA: 3.92/4. 00                                                                   Malaysia   \nBachelor of Civil Engineering  in Offshore Engineering                                      Sept 2016 – Sept 2020  \nFirst Class Honors , Dean’s List Award (2016 -2020)  \nPETRONAS  Educational Sponsorship Program 2015 ( Top 3% of >10000 applicants)   \n \nRelevant & Expected Coursework :  \nPredictive  Analytics, Databases (SQL) , Data Vis

In [49]:
from langchain.memory import ConversationSummaryMemory
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
def get_doc_search(text_splitter):
    return FAISS.from_texts(text_splitter, OpenAIEmbeddings())

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
memory = ConversationSummaryMemory(
    llm=llm, memory_key="chat_history", return_messages=True
)

db_retriever = db.as_retriever()

qa = ConversationalRetrievalChain.from_llm(llm, retriever=db_retriever, memory=memory)
qa("What are the names of these candidates?")

{'question': 'What are the names of these candidates?',
 'chat_history': [SystemMessage(content='')],
 'answer': "I'm sorry, but the information provided does not include the names of the candidates."}

In [50]:
qa("Who is Ang Teik Hun?")

{'question': 'Who is Ang Teik Hun?',
 'chat_history': [SystemMessage(content='The human asks for the names of the candidates. The AI apologizes and explains that the information provided does not include the names of the candidates.')],
 'answer': 'Ang Teik Hun is a highly motivated and skilled data scientist with a strong background in computer science and statistical analysis. They have experience in using data mining and machine learning techniques to solve complex problems. Ang Teik Hun is proficient in Python, R, and SQL. They have completed a Master of Applied Data Analytics at the Australian National University, majoring in Neural Network and Document Analysis. Additionally, they have certifications in Microsoft Azure Fundamentals and Azure AI Engineer Associate. Ang Teik Hun has also worked as a Data Scientist Intern at Petroliam Nasional Berhad and is currently a tutor at the Australian National University teaching courses on database management and design.'}

In [30]:
from langchain import OpenAI 
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.agents import Tool
import random
from PyPDF2 import PdfReader



# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0.3,
    model_name='gpt-3.5-turbo-0125'
)

# def text_extractor(file_path=""):
#     file_path = "Ang Teik Hun Resume.pdf"
#     pdf_reader = PdfReader(f"{file_path}")

#     # read data from the file and put them into a variable called text
#     text = ''
#     for i, page in enumerate(pdf_reader.pages):
#         extracts = page.extract_text()
#         text += extracts
#     print("final result:",text)
#     return text

def text_extractor(file_path=""):
    file_path = "Ang Teik Hun Resume.pdf"
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)
    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

    # Retrieve and generate using the relevant snippets of the blog.
    retriever = vectorstore.as_retriever()
    return retriever

text_extractor_tool = Tool(
                        name = "text_extractor",
                        func=text_extractor,
                        description="useful for when you need to extract text from a pdf of the resume."
                    )

def extract_information(documents):
    # Schema
    schema = {
        "properties": {
            "name": {"type": "string"},
            "phone_number": {"type": "string"},
            "email": {"type": "string"},
            "local": {"type": "string"},
            "last role": {"type": "string"},
            "years of experience": {"type": "string"},
            "education level": {"type": "string"},
            "CGPA": {"type": "integer"},
            "University": {"type": "string"},
            "Education Background": {"type": "string"},
            "Data Science Background": {"type": "string"},
            "Relevant experience": {"type": "string"},
        },
        "required": ["name", "height"],
    }

    # Input
    inp = documents

    # Run chain
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125",verbose=True)
    chain = create_extraction_chain(schema, llm,verbose=True)
    result = chain.run(inp)
    return result


information_extractor_tool = Tool(
                        name = "extract_information",
                        func=extract_information,
                        description="useful for when you need to extract information into structured python dictonary."
                    )

def educational_background(input):
    return input[0]['Education Background']

educational_background_tool = Tool(
                        name = "educational_background",
                        func=educational_background,
                        description="useful for when you need to answer question about education background."
                    )

In [28]:
from langchain.agents import initialize_agent

tools = [text_extractor_tool, information_extractor_tool, educational_background_tool]

# conversational agent memory
memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=3,
    return_messages=True
)


# create our agent
conversational_agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=turbo_llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=memory
)

conversational_agent("Which university is this candidate from?")

  warn_deprecated(
  warn_deprecated(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "educational_background",
    "action_input": "Which university is this candidate from?"
}
```[0m

TypeError: string indices must be integers, not 'str'

In [29]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

loader = PyPDFLoader('Ang Teik Hun Resume.pdf')

documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)


turbo_llm = ChatOpenAI(
    temperature=0.3,
    model_name='gpt-3.5-turbo-0125'
)

from langchain.chains import RetrievalQA
from langchain.schema import retriever

retriever = db.as_retriever()
chain = RetrievalQA.from_chain_type(llm=turbo_llm, chain_type="stuff", retriever=retriever)

query="When was this candidate graduated?"
out = chain.invoke(query)
print(out['result'])



The candidate, Ang Teik Hun, graduated with a Master of Applied Data Analytics from the Australian National University in Canberra, Australia in December 2022.


In [41]:
from langchain import OpenAI 
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.agents import Tool
from langchain.tools import BaseTool
import random
from PyPDF2 import PdfReader
from langchain.chains import create_extraction_chain


# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0.3,
    model_name='gpt-3.5-turbo-0125'
)

def text_extractor(file_path=""):
    file_path = "Ang Teik Hun Resume.pdf"
    pdf_reader = PdfReader(f"{file_path}")

    # read data from the file and put them into a variable called text
    text = ''
    for i, page in enumerate(pdf_reader.pages):
        extracts = page.extract_text()
        text += extracts
    print("final result:",text)
    return text

text_extractor_tool = Tool(
                        name = "text_extractor",
                        func=text_extractor,
                        description="useful for when you need to extract text from a pdf of the resume."
                    )

def extract_information(documents):
    # Schema
    schema = {
        "properties": {
            "name": {"type": "string"},
            "phone_number": {"type": "string"},
            "email": {"type": "string"},
            "local": {"type": "string"},
            "last role": {"type": "string"},
            "years of experience": {"type": "string"},
            "education level": {"type": "string"},
            "CGPA": {"type": "integer"},
            "University": {"type": "string"},
            "Education Background": {"type": "string"},
            "Data Science Background": {"type": "string"},
            "Relevant experience": {"type": "string"},
        },
        "required": ["name", "height"],
    }

    # Input
    inp = documents

    # Run chain
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125",verbose=True)
    chain = create_extraction_chain(schema, llm,verbose=True)
    result = chain.run(inp)
    return result


information_extractor_tool = Tool(
                        name = "extract_information",
                        func=extract_information,
                        description="useful for when you need to extract information into structured python dictonary."
                    )

def educational_background(input):
    return input[0]['Education Background']

educational_background_tool = Tool(
                        name = "educational_background",
                        func=educational_background,
                        description="useful for when you need to answer question about education background."
                    )

In [6]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [11]:
rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique used to break down complex tasks into smaller and simpler steps. This approach helps agents to plan and execute tasks more efficiently by dividing them into manageable components. Task decomposition can be achieved through various methods such as prompting with specific instructions or utilizing human inputs.'

In [13]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_q_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        "question": "What is meant by large",
    }
)

'What is the definition of "large" in this context?'

In [16]:
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def contextualized_question(input: dict):
    if input.get("chat_history"):
        return contextualize_q_chain
    else:
        return input["question"]


rag_chain = (
    RunnablePassthrough.assign(
        context=contextualized_question | retriever | format_docs
    )
    | qa_prompt
    | llm
)

chat_history = []

question = "What is Task Decomposition?"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])

second_question = "What are common ways of doing it?"
rag_chain.invoke({"question": second_question, "chat_history": chat_history})

AIMessage(content='Task decomposition can be done in common ways such as using prompting techniques like Chain of Thought or Tree of Thoughts to guide models to break down tasks into smaller steps. Additionally, task-specific instructions can be provided to help with decomposition, such as asking for subgoals or outlining steps for achieving a specific task. Human inputs can also be utilized for task decomposition, allowing for a more tailored approach to breaking down complex tasks.')

In [82]:
from operator import itemgetter

from langchain.memory import ConversationBufferMemory
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableLambda

from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)


template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)


DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

def text_extractor():
    chroma_directory = 'db3/'

    db = Chroma(persist_directory=chroma_directory, embedding_function=OpenAIEmbeddings())
    db_retriever = db.as_retriever()

    return db_retriever

memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(temperature=0)
    | StrOutputParser(),
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | text_extractor(),
    "question": lambda x: x["standalone_question"],
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | ChatOpenAI(),
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [83]:
inputs = {"question": "Name these candidates one by one."}
result = final_chain.invoke(inputs)
result

{'answer': AIMessage(content='1. Goo Ye Jui\n2. Jia-Hao Lo\n3. Ang Teik Hun'),
 'docs': [Document(page_content="name: Goo Ye Jui\nphone_number: +6018-4040438\nemail: yjyejui626@gmail.com\nlocal: LORONG JAMBU MADU 2, TAMAN JAMBU MADU, 14000 BUKIT MERTAJAM, PENANG\nlast role: DATA SCIENCE INTERN\nRelevant experience: Assisted in the development of machine learning model contributing to the automation of critical business processes. Collaborated with a team of data professionals to work on live projects, applying data analytics and machine learning to derive actionable insights and drive innovation. Gained hands-on experience in data science methodologies, tools, and technologies, furthering the journey toward becoming a proficient data scientist. Applied techniques like natural language processing and OpenAI GPT-3 text-davinci-003 engine on the Resume Parser App. Integrated a new feature to automate the process of identifying the candidate’s criteria specified by HR. Improved User Experi

In [87]:
# Note that the memory does not save automatically
# This will be improved in the future
# For now you need to save it yourself
memory.save_context(inputs, {"answer": result["answer"].content})

In [88]:
memory.load_memory_variables({})

{'history': [HumanMessage(content='Name these candidates one by one.'),
  AIMessage(content='1. Goo Ye Jui\n2. Jia-Hao Lo\n3. Ang Teik Hun'),
  HumanMessage(content='In the scale of 1-10, how would you rate their skill sets as a Data Scientis?'),
  AIMessage(content='Based on the information provided, Goo Ye Jui has relevant experience as a Data Science Intern and has a Bachelor of Computer Science in Data Engineering with Honours. Jia-Hao Lo has a Master of Science in Analytics and relevant experience in Data Science Department On-Job Training. Ang Teik Hun has a Master of Applied Data Analytics with a background in Neural Network and Document Analysis. \n\nConsidering their education, experience, and skills, Goo Ye Jui would be rated around 7, Jia-Hao Lo around 6, and Ang Teik Hun around 8 as Data Scientists on a scale of 1-10.')]}

In [90]:
inputs = {"question": "In the scale of 1-10, how would you rate their skill sets as a Data Scientis?"}
result = final_chain.invoke(inputs)
result

{'answer': AIMessage(content="Based on the provided information, Goo Ye Jui seems to have a solid foundation in data science with relevant experience in developing machine learning models and working on live projects. Jia-Hao Lo has experience in data science through on-job training and engineering internships. Ang Teik Hun has a strong background in data science with a Master's degree in Applied Data Analytics and relevant experience in using various data mining and machine learning techniques.\n\nOn a scale of 1-10, Goo Ye Jui could be rated around 7, Jia-Hao Lo around 5, and Ang Teik Hun around 8 based on their education level, relevant experience, and skills mentioned in the context."),
 'docs': [Document(page_content="name: Goo Ye Jui\nphone_number: +6018-4040438\nemail: yjyejui626@gmail.com\nlocal: LORONG JAMBU MADU 2, TAMAN JAMBU MADU, 14000 BUKIT MERTAJAM, PENANG\nlast role: DATA SCIENCE INTERN\nRelevant experience: Assisted in the development of machine learning model contribu

In [91]:
inputs = {"question": "Please rank the candidates based on their skill set score."}
result = final_chain.invoke(inputs)
result

{'answer': AIMessage(content="Based on the provided information, the candidates can be ranked based on their skill set score as follows:\n\n1. Ang Teik Hun\n2. Goo Ye Jui\n3. Jia-Hao Lo\n\nAng Teik Hun has a comprehensive skill set that includes a strong background in data analytics, machine learning, neural networks, and various relevant certifications and courses. Goo Ye Jui also has relevant experience in data science and machine learning, but Ang Teik Hun's skill set appears to be more extensive. Jia-Hao Lo has a background in analytics and engineering but may not have as much specific experience in data science and machine learning as the other two candidates."),
 'docs': [Document(page_content="name: Goo Ye Jui\nphone_number: +6018-4040438\nemail: yjyejui626@gmail.com\nlocal: LORONG JAMBU MADU 2, TAMAN JAMBU MADU, 14000 BUKIT MERTAJAM, PENANG\nlast role: DATA SCIENCE INTERN\nRelevant experience: Assisted in the development of machine learning model contributing to the automation 