In [None]:
!pip -q install langchain openai tiktoken chromadb 

In [None]:
# Show the installed langchain version and package metadata
!pip show langchain

In [None]:
# Download the sample article archive and extract it into ./new_articles
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info 
- gpt-3.5-turbo API

## Setting up LangChain 


In [1]:
import os
from getpass import getpass

# SECURITY: an API key must never be hardcoded in a notebook. The key that was
# previously committed in this cell should be treated as leaked and revoked.
# Reuse OPENAI_API_KEY from the environment when it is already set; otherwise
# prompt for it without echoing, so the secret never lands in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader


## Load and process multiple documents

In [7]:
# Load the PDF files in ./Documents/ that match the glob "./*.pdf".
# For a single text file, a TextLoader can be used instead:
# loader = TextLoader('single_text_file.txt')
# NOTE(review): no loader_cls is passed, so DirectoryLoader's default loader
# handles the PDFs — confirm its parsing dependencies are installed.
loader = DirectoryLoader('./Documents/', glob="./*.pdf")

documents = loader.load()

In [8]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200); these chunks are what gets embedded.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [9]:
# Number of chunks produced by the splitter
len(texts)

1488

In [14]:
# Inspect one chunk: its text plus the metadata recording the source file
texts[1000]

Document(page_content='Note:\n\nFor seasonal work see subsection 16A(1).\n\n(2) Carer payment is not payable to the person:\n\n(a) if the person is subject to a seasonal work preclusion period (whether in relation to the claim referred to in subsection (1) or any other claim under this Act) and the Secretary has not made a determination under subsection (3) in relation to the person—for the person’s seasonal work preclusion period; or\n\n(b) if the Secretary has made a determination under\n\nsubsection (3) in relation to the person—for that part (if any) of the person’s seasonal work preclusion period to which the person is subject as a result of the determination.\n\nNote:\n\nFor seasonal work preclusion period see subsection 16A(1).', metadata={'source': 'Documents/Social Security Act 1991.pdf'})

## Create the DB

In [15]:
# Embed the chunks and store them in a Chroma vector store.
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

# OpenAI embeddings are used here; the plan is to swap in local embeddings later
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [16]:
# Persist the DB to disk, then drop the in-memory reference so the next cell
# has to reload the store from persist_directory.
vectordb.persist()
vectordb = None

In [17]:
# Now we can load the persisted database from disk, and use it as normal.
# The embedding function is passed again when reopening the store.
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

## Make a retriever

In [18]:
# Expose the vector store as a retriever with its default search settings
retriever = vectordb.as_retriever()

In [19]:
# Fetch the chunks the retriever considers relevant to a sample question
docs = retriever.get_relevant_documents("Define three rules from social security")

In [20]:
# Number of chunks returned with the default retriever settings
len(docs)

4

In [21]:
# Recreate the retriever with k=2 in search_kwargs so fewer chunks are
# retrieved per query than with the default settings used above.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [22]:
# Search type configured on the retriever
retriever.search_type

'similarity'

In [23]:
# Search keyword arguments configured on the retriever
retriever.search_kwargs

{'k': 2}

## Make a chain

In [27]:
# Build the question-answering chain before first use; without this the cell
# fails on a fresh kernel because qa_chain was never defined in this section.
# chain_type="stuff" matches the chain built later in the notebook, and
# return_source_documents=True keeps the retrieved documents in the response
# so they can be cited.
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),
                                       chain_type="stuff",
                                       retriever=retriever,
                                       return_source_documents=True)


def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each retrieved document.

    Args:
        llm_response: Mapping returned by the chain, with a 'result' entry and
            a "source_documents" entry whose items expose metadata['source'].
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])


# full example
query = "Define three rules from social security"
llm_response = qa_chain(query)
process_llm_response(llm_response)


1. Where a provision of the Social Security Act 1991 refers to the least or lowest of three or more amounts and two or more (but not all) of the amounts are equal and are less than the other amount or other amounts, the provision is taken to refer to one only of those equal amounts.
2. For the purposes of the Social Security Act 1991, a person is taken to be receiving a payment from the earliest day on which the payment is payable to the person, even if the first instalment of the payment is not paid until a later day.
3. For the purposes of the Social Security Act 1991, a person is taken to be receiving a social security payment until the latest day on which the payment is payable to the person, even if the last instalment of the payment is not paid until a later day.


Sources:
Documents/Social Security Act 1991.pdf
Documents/Social Security Act 1991.pdf


In [28]:
# break it down: run the same query again and display the raw response dict
# (query, result, ...) instead of the formatted printout
query = "Define three rules from social security"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'Define three rules from social security',
 'result': ' Rule 1: If a provision of the Social Security Act 1991 refers to the least or lowest of 3 or more amounts, and two or more of the amounts (but not all) are equal and are less than the other amount or amounts, the provision is taken to refer to one only of those equal amounts.\n\nRule 2: A person is taken to be receiving a payment under the Social Security Act 1991 from the earliest day on which the payment is payable to the person, even if the first instalment of the payment is not paid until a later day.\n\nRule 3: A person is taken to be receiving a social security payment until the latest day on which the payment is payable to the person, even if the last instalment of the payment is not paid until a later day.\n\nRule 4: The Secretary may determine that an income stream that meets the requirements of subsection (2) is not an asset-test exempt income stream if the Secretary is satisfied that the person who has purchas

In [None]:
# Ask who led Pando's funding round and print the answer with its sources
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Iron Pillar and Uncorrelated Ventures.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [None]:
# Ask about the Databricks acquisition and print the answer with its sources
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Databricks acquired Okera, a data governance platform with a focus on AI.


Sources:
new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt


In [None]:
# Ask a broader question; the sources printed here come from two different articles
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Generative AI is a type of artificial intelligence that is used to create new content associated with a company, such as content for a website or ads. It can also be used to automate processes and workflows.


Sources:
new_articles/05-04-slack-updates-aim-to-put-ai-at-the-center-of-the-user-experience.txt
new_articles/05-03-nova-is-building-guardrails-for-generative-ai-content-to-protect-brand-integrity.txt


In [None]:
# Ask what the acronym CMA refers to and print the answer with its sources
query = "Who is CMA?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The CMA stands for the Competition and Markets Authority.


Sources:
new_articles/05-04-cma-generative-ai-review.txt
new_articles/05-04-cma-generative-ai-review.txt


In [None]:
# Inspect the search type and the vector store behind the chain's retriever
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7f9f7dc82aa0>)

In [None]:
# Show the prompt template the chain fills with the retrieved {context} and the {question}
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## Deleting the DB

In [None]:
# Archive the persisted db directory as db.zip before it is deleted below
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma-collections.parquet (deflated 50%)
  adding: db/index/ (stored 0%)
  adding: db/index/index_metadata_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 5%)
  adding: db/index/uuid_to_id_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 39%)
  adding: db/index/index_59c51927-205d-4fd7-88d8-c7ba851bd2a5.bin (deflated 17%)
  adding: db/index/id_to_uuid_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 35%)
  adding: db/chroma-embeddings.parquet (deflated 29%)


In [None]:
# To clean up, delete the collection and persist that change
vectordb.delete_collection()
vectordb.persist()

# Delete the on-disk directory as well; db.zip is not touched by this command
!rm -rf db/

## Starting again loading the db

Restart the runtime, then run the cells below to reload the database from the `db.zip` archive.

In [None]:
# Restore the persisted db directory from the archive
!unzip db.zip

Archive:  db.zip
   creating: db/
  inflating: db/chroma-collections.parquet  
   creating: db/index/
  inflating: db/index/index_metadata_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/uuid_to_id_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/index_59c51927-205d-4fd7-88d8-c7ba851bd2a5.bin  
  inflating: db/index/id_to_uuid_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/chroma-embeddings.parquet  


In [None]:
import os
from getpass import getpass

# Do not overwrite OPENAI_API_KEY with an empty placeholder: that clobbers a
# valid key from the environment and makes every OpenAI call below fail.
# Reuse the variable when it is already set; otherwise prompt for the key
# without echoing, so the secret is never written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
# Reopen the Chroma store persisted under 'db', using OpenAI embeddings
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding,
                   )

# Retriever over the reloaded store, with k=2 passed through search_kwargs
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})



In [None]:
# Set up the turbo LLM: the gpt-3.5-turbo chat model with temperature 0
turbo_llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

In [None]:
# Create the chain to answer questions with the chat model and the retriever.
# return_source_documents=True makes the response carry the retrieved
# documents, which process_llm_response reads to list the sources.
qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print a chain response: the answer first, then one source per line.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            "source_documents" entry (an iterable of documents whose
            ``metadata['source']`` identifies where each one came from).
    """
    answer = llm_response['result']
    # A single print with a newline separator emits the answer and the
    # "Sources:" header exactly as two consecutive prints would.
    print(answer, '\n\nSources:', sep='\n')
    for document in llm_response["source_documents"]:
        origin = document.metadata['source']
        print(origin)

In [None]:
# Full example with the chat model: ask a question and print the answer with its sources
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Pando raised $30 million in a Series B round, bringing its total raised to $45 million.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


### Chat prompts

In [None]:
# First message template of the chat prompt: the instructions plus the retrieved {context}
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [None]:
# Second message template of the chat prompt: the {question} placeholder
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}
