# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info
- gpt-3.5-turbo API

## Setting up LangChain


In [29]:
import os
import dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

In [30]:
DOT_ENV_PATH = "./.env"
dotenv.load_dotenv(DOT_ENV_PATH)

False

## Load multiple and process documents

In [31]:
# Load and process the text files
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./data', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [39]:
documents[0]

Document(page_content='Original Article\nMethods Matter: Standard Production\nPlatforms for Recombinant AAV ProduceChemically and Functionally Distinct Vectors\nNeil G. Rumachik,1Stacy A. Malaker,1Nicole Poweleit,2Lucy H. Maynard,3Christopher M. Adams,4Ryan D. Leib,4\nGiana Cirolia,3Dennis Thomas,5Susan Stamnes,6Kathleen Holt,6Patrick Sinn,6Andrew P. May,3\nand Nicole K. Paulk3,7\n1Department of Chemistry, Stanford University, Stanford, CA 94305, USA;2Department of Medicine, University of California San Francisco, San Francisco, CA 94305,\nUSA;3Genome Engineering, Chan Zuckerberg Biohub, San Francisco, CA 94158, USA;4Vincent Coates Foundation Mass Spectrometry Laboratory, Stanford\nUniversity, Stanford, CA 94305, USA;5Cryo-EM Core Facility, Cold Spring Harbor Laboratory, Cold Spring Harbor, NY 11724, USA;6Viral Vector Core, University of\nIowa Carver College of Medicine, Iowa City, IA 52242, USA;7Department of Biochemistry & Biophysics, University of California San Francisco, San Franc

In [41]:
PyPDFLoader('/Users/danielgeorge/Documents/work/ml/hypolab/Synapse/server/data/Xia and Maes - 2013 - The design of artifacts for augmenting intellect.pdf').load()

[Document(page_content='The Design  of Artifacts for Augmenting Intel lect\nCassandra Xia  \nMIT Media Lab  \nCambridge, MA 02139  \n(617) 253 -8321  \nxiac@ media. mit.edu  Pattie Maes  \nMIT Media Lab  \nCambridge, MA 02139  \n(617) 253 -7442  \npattie@media.mit.edu  \nABSTRACT  \nFifty years ago, Doug Engelbart created a conceptual framework \nfor augmenting human intellect in the context of problem -solving . \nWe expand upon Engelbart \'s framework and use his concepts of \nprocess hierarchies and artifact augmentation for the design of  \npersonal  intelligence augmentation  (IA) systems within the \ndomains of memory,  motivation,  decision making,  and mood. \nThis paper propose s a systematic design method ology  for personal \nIA devices,  organize s existing IA  research within a logical  \nframework, and uncover s underexplored areas of IA that could \nbenefit from the invention of new artifacts.  \nCategories and Subject Descriptors  \nH.1.2  [User/Machine Sys tems ] \nGen

In [33]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [34]:
len(texts)

249

In [35]:
texts[3]

Document(page_content='in vitro (p < 0.05 –0.0001), in various mouse tissues in vivo\n(p < 0.03 –0.0001), and in human liver in vivo (p < 0.005). These\ndifferences may have clinical implications for rAAV receptorbinding, traf ﬁcking, expression kinetics, expression durability,\nvector immunogenicity, as well as cost considerations.\nINTRODUCTION\nAdeno-associated virus (AAV) is a single-stranded DNA virus that\nis non-pathogenic to humans, exhibits low immunogenicity but\nhigh transduction ef ﬁciency, and is unable to replicate itself.1Re-combinant AAV (rAAV) can stably express gene products from\neither unintegrated episomes2in quiescent tissues, or via integration\nin actively dividing tissues3when designed with appropriate homol-\nogy arms. Gene therapies and passive vaccines with rAAV arerapidly gaining attention and investment following the ﬁrst rAAV', metadata={'source': 'data/Rumachik et al. - 2020 - Methods Matter Standard Production Platforms for.pdf', 'page': 0})

## create the DB

In [36]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [10]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [11]:
# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)

## Make a retriever

In [12]:
retriever = vectordb.as_retriever()

In [13]:
docs = retriever.get_relevant_documents("How would we use AAV to edit cells?")

In [14]:
len(docs)

4

In [15]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [16]:
retriever.search_type

'similarity'

In [17]:
retriever.search_kwargs

{'k': 2}

## Make a chain

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "How would we use AAV to edit cells?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 AAV can be used to edit cells through a gene therapy or a passive vaccine. It can be produced in HEK293 cells using a platform of transient transfection or using baculovirus infection of Spodoptera frugiperda insect cells.


Sources:
data/data/Rumachik et al. - 2020 - Methods Matter Standard Production Platforms for.pdf
data/data/Rumachik et al. - 2020 - Methods Matter Standard Production Platforms for.pdf


In [None]:
# break it down
query = """Favorite representations
- notations (Leibniz), automata, graphs (Bret victor), car recliner button
- keep reading design of everyday things
- Going from 1 to 0 registers through analogy (Python)
- Mendeleev and the periodic table
"""
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'Favorite representations\n- notations (Leibniz), automata, graphs (Bret victor), car recliner button\n- keep reading design of everyday things\n- Going from 1 to 0 registers through analogy (Python)\n- Mendeleev and the periodic table\n',
 'result': " I don't know.",
 'source_documents': [Document(page_content='during the day. If yesterday you met three new people, and you \nwere made aware of the fact today, you might feel pressure d to \nmeet or exceed yesterday\'s number. If you were not keeping track \nof the daily number, yesterday\'s achievement would have no \npositive bearing on your actions today. Effectively this means that \neven if the artifacts we design for augmenting aspects of cognition \ndo not fun ction perfectly, we may get at least an initial \nimprovement in  functionality purely based on this measurement \nand increased awa reness phenomenon.  \nPopulations  \nUseful parallels with the biological sciences need not end with co -\nevolution. In his 1962 p

In [None]:
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Iron Pillar and Uncorrelated Ventures.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [None]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Databricks acquired Okera, a data governance platform with a focus on AI.


Sources:
new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt


In [None]:
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Generative AI is a type of artificial intelligence that is used to create new content associated with a company, such as content for a website or ads. It can also be used to automate processes and workflows.


Sources:
new_articles/05-04-slack-updates-aim-to-put-ai-at-the-center-of-the-user-experience.txt
new_articles/05-03-nova-is-building-guardrails-for-generative-ai-content-to-protect-brand-integrity.txt


In [None]:
query = "Who is CMA?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The CMA stands for the Competition and Markets Authority.


Sources:
new_articles/05-04-cma-generative-ai-review.txt
new_articles/05-04-cma-generative-ai-review.txt


In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7f9f7dc82aa0>)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## Deleteing the DB

In [None]:
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/chroma-collections.parquet (deflated 50%)
  adding: db/index/ (stored 0%)
  adding: db/index/index_metadata_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 5%)
  adding: db/index/uuid_to_id_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 39%)
  adding: db/index/index_59c51927-205d-4fd7-88d8-c7ba851bd2a5.bin (deflated 17%)
  adding: db/index/id_to_uuid_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl (deflated 35%)
  adding: db/chroma-embeddings.parquet (deflated 29%)


In [None]:
# To cleanup, you can delete the collection
vectordb.delete_collection()
vectordb.persist()

# delete the directory
!rm -rf db/

## Starting again loading the db

restart the runtime

In [None]:
!unzip db.zip

Archive:  db.zip
   creating: db/
  inflating: db/chroma-collections.parquet  
   creating: db/index/
  inflating: db/index/index_metadata_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/uuid_to_id_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/index_59c51927-205d-4fd7-88d8-c7ba851bd2a5.bin  
  inflating: db/index/id_to_uuid_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/chroma-embeddings.parquet  


In [None]:
import os

os.environ["OPENAI_API_KEY"] = ""

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding,
                   )

retriever = vectordb2.as_retriever(search_kwargs={"k": 2})



In [None]:
# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Pando raised $30 million in a Series B round, bringing its total raised to $45 million.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


### Chat prompts

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}


In [42]:
try:
    with open('/Users/danielgeorge/Documents/work/ml/hypolab/Synapse/server/dirty_index/test.txt', 'w') as f:
        f.write('test')
except Exception as e:
    print(e)

[Errno 2] No such file or directory: '/Users/danielgeorge/Documents/work/ml/hypolab/Synapse/server/dirty_index/test.txt'
