- This notebook ingests the data source, creates embeddings using OpenAI's embedding model, stores them in a vector database for querying, and then demonstrates conversational retrieval with memory on top of that database
- Refer to the following free course for setup instructions: https://learn.deeplearning.ai/langchain-chat-with-your-data/lesson/1/introduction

In [1]:
# Standard library first, then third-party.
import os
import sys

import openai

# Optional setup kept from the course material (disabled here):
#   sys.path.append('../..')
#   from dotenv import load_dotenv, find_dotenv
#   _ = load_dotenv(find_dotenv())  # read local .env file

# The API key is read from the environment so it is never stored in the notebook;
# a missing OPENAI_API_KEY raises KeyError on this line.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from langchain.document_loaders import NotionDirectoryLoader

# Folder holding the exported Notion pages (relative to the working directory).
notion_export_dir = 'Notion_DB2'
loader = NotionDirectoryLoader(notion_export_dir)
docs = loader.load()  # loaded documents; each exposes page_content and metadata

In [3]:
len(docs)

5

In [4]:
docs[0].page_content

'# Tools\n\nOwner: Arvind Narayan\n\n| Tool | Access URL | Login Credentials | Point of Contact |\n| --- | --- | --- | --- |\n| Jupyter Notebook | https://research.google.com/colaboratory/ | Provided upon completion of IT security training | Kevin Malone\nkmalone@dundermiflin.com |\n| Cloud Platform | https://console.cloud.google.com/ | Provided by IT | Kevin Malone\nkmalone@dundermiflin.com |\n| Data warehouse | https://cloud.google.com/bigquery | Provided by IT | Kevin Malone\nkmalone@dundermiflin.com |\n| Github | https://docs.github.com/en/get-started/quickstart/creating-an-account-on-github | Create your own account | Ryan Howard\nrhoward@dundermiflin.com |\n| Slack | https://slack.com/ | Your LDAP | Ryan Howard\nrhoward@dundermiflin.com |\n| Project Management Tool | https://slack.com/ | Your LDAP | Ryan Howard\nrhoward@dundermiflin.com |'

In [5]:
docs[0].metadata

{'source': 'Notion_DB2/Tools d3bb088d280948eebbd01408ad3ad3b0.md'}

In [6]:
# Gather every loaded document's metadata dict (here just its 'source' path)
# to see which Notion pages were picked up. Despite the name, `topics` holds
# metadata dicts rather than topic strings.
topics = [page.metadata for page in docs]
topics

[{'source': 'Notion_DB2/Tools d3bb088d280948eebbd01408ad3ad3b0.md'},
 {'source': 'Notion_DB2/Team Members 5bc2908dd5de46f8a572a4e78b56f986.md'},
 {'source': 'Notion_DB2/Onboarding 293779f65bd041b1b77e116e5fb7ab6b.md'},
 {'source': 'Notion_DB2/Data Science and Analytics fd5309a796924d6f819829a19164228e.md'},
 {'source': 'Notion_DB2/Projects e6349f4e41024ccca2a0358224bf903e.md'}]

### Split into chunks

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# RecursiveCharacterTextSplitter aids in semantic splitting based on paragraph structure.
# Chunks are capped at 1500 with an overlap of 150 between neighbouring chunks
# (NOTE(review): units are presumably characters under the default length function - confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)

In [9]:
# Break the loaded documents into chunks with the splitter configured above.
# Each chunk is a Document that carries its page's metadata (see splits[0] below).
splits = text_splitter.split_documents(docs)

In [10]:
len(splits)

7

In [11]:
splits[0]

Document(page_content='# Tools\n\nOwner: Arvind Narayan\n\n| Tool | Access URL | Login Credentials | Point of Contact |\n| --- | --- | --- | --- |\n| Jupyter Notebook | https://research.google.com/colaboratory/ | Provided upon completion of IT security training | Kevin Malone\nkmalone@dundermiflin.com |\n| Cloud Platform | https://console.cloud.google.com/ | Provided by IT | Kevin Malone\nkmalone@dundermiflin.com |\n| Data warehouse | https://cloud.google.com/bigquery | Provided by IT | Kevin Malone\nkmalone@dundermiflin.com |\n| Github | https://docs.github.com/en/get-started/quickstart/creating-an-account-on-github | Create your own account | Ryan Howard\nrhoward@dundermiflin.com |\n| Slack | https://slack.com/ | Your LDAP | Ryan Howard\nrhoward@dundermiflin.com |\n| Project Management Tool | https://slack.com/ | Your LDAP | Ryan Howard\nrhoward@dundermiflin.com |', metadata={'source': 'Notion_DB2/Tools d3bb088d280948eebbd01408ad3ad3b0.md'})

In [12]:
splits[0].metadata ## page_content and metadata are available...

{'source': 'Notion_DB2/Tools d3bb088d280948eebbd01408ad3ad3b0.md'}

### Create Embeddings

In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client with default settings; handed to Chroma below to embed the chunks.
# NOTE(review): presumably authenticates via OPENAI_API_KEY from the environment - confirm.
embedding = OpenAIEmbeddings()

In [15]:
from langchain.vectorstores import Chroma

In [16]:
persist_directory = './chroma/'

In [17]:
# Build the vector store only when no persisted index exists yet.
# Calling Chroma.from_documents() again on a directory that already holds the
# index adds the same chunks a second time (duplicate search hits) and pays for
# the embeddings again, so a re-run of the notebook reuses what is on disk.
# Delete the persist_directory folder to force a rebuild after the Notion
# export changes.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # Existing index: reopen it with the same embedding function used to build it.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )
else:
    # First run (or empty folder): embed every chunk and create the index.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory
    )

In [18]:
question = "Who is the manager?"

In [19]:
# Fetch the 3 chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which until now held the documents returned by
# loader.load(); re-running the split cell afterwards would split these search
# results instead. Consider a separate name (and update the cells below that read `docs`).
docs = vectordb.similarity_search(question,k=3)

In [20]:
len(docs)

3

In [21]:
docs[0].page_content

'# Team Members\n\nOwner: Arvind Narayan\n\n| Name | Role | Email Id |\n| --- | --- | --- |\n| Michael Scott | Team Manager | mscott@dundermifflin.com |\n| Jim Halpert | Program Manager | jhalpert@dundermifflin.com |\n| Dwight Schrute | Program Manager | dschrute@dundermifflin.com |\n| Pam Beesly | Sr Data Scientist | pbeesly@dundermifflin.com |\n| Andy Bernard | Sr Data Scientist | abernard@dundermifflin.com |\n| Kelly Kapoor | Data Scientist | kkapoor@dundermifflin.com |\n| Oscar Martinez | Data Engineer | omartinez@dundermifflin.com |'

In [22]:
docs[2].metadata

{'source': 'Notion_DB2/Onboarding 293779f65bd041b1b77e116e5fb7ab6b.md'}

In [23]:
# save the vector db
vectordb.persist()

In [None]:
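# # To reload the persisted vector DB from disk in a later session.
# # persist_directory must be the folder the index was saved to above ('./chroma/').
# persist_directory = './chroma/'
# embedding = OpenAIEmbeddings()
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)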

### Retrieval with memory

In [24]:
import datetime  # left in place so any cell relying on this import keeps working

# This cell used to pick "gpt-3.5-turbo-0301" before 2023-09-02 and
# "gpt-3.5-turbo" afterwards. That cutoff has passed (the recorded output is
# already "gpt-3.5-turbo"), so the date check was a dead branch that only made
# the result depend on the system clock. Pin the model name explicitly.
llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [25]:
from langchain.chat_models import ChatOpenAI

# Chat model used by the retrieval chain below; temperature=0 aims for
# answers that are as repeatable as possible.
llm = ChatOpenAI(temperature=0, model_name=llm_name)

# Smoke test: one round-trip to confirm the key and model name work.
llm.predict("Hello world!")

'Hello! How can I assist you today?'

In [26]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [27]:
## Re-run this cell to clear the chat history: it builds a fresh, empty memory
## object and a new chain bound to it.

# Conversation memory, exposed to the chain under the "chat_history" key and
# stored as message objects rather than one concatenated string.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Retriever view over the vector store built earlier.
retriever = vectordb.as_retriever()

# Conversational QA chain: LLM + retriever + memory.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type='stuff',  # default = 'stuff'
    retriever=retriever,
    memory=memory,
)

In [28]:
### chat bot with history

In [29]:
# First turn of the conversation. Only the question is passed in; the chain was
# built with `memory`, and the result also carries 'answer' and 'chat_history'.
question = "who is the manager?"
result = qa({"question": question})

In [30]:
print(result['answer'])

The manager of the Data Science and Analytics team is Michael Scott.


In [31]:
# Follow-up turn: "his" only makes sense given the previous exchange, so this
# exercises the chain's memory. Overwrites `question` and `result` from the first turn.
question = "what is his emailid?"
result = qa({"question": question})

In [32]:
print(result['answer'])

Michael Scott's email id is mscott@dundermifflin.com.


In [33]:
result['chat_history']

[HumanMessage(content='who is the manager?'),
 AIMessage(content='The manager of the Data Science and Analytics team is Michael Scott.'),
 HumanMessage(content='what is his emailid?'),
 AIMessage(content="Michael Scott's email id is mscott@dundermifflin.com.")]