In [10]:
import os
from getpass import getpass

# Never store a real key in the notebook. Reuse a key that is already exported
# in the environment, and only prompt (with hidden input) when none is set,
# instead of unconditionally overwriting it with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [2]:
pip install langchain

Collecting langchain
  Downloading langchain-0.0.306-py3-none-any.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.0/1.8 MB[0m [31m15.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.38 (from langchain)
  Downloading langsmith-0.0.41-py3-none-any.whl (39 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langcha

In [19]:
pip install -r requirements.txt


Collecting tiktoken (from -r requirements.txt (line 4))
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.5.1


In [3]:
## Import the LangChain components used in this notebook

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [4]:
## Load the source documents
# Every file in ./news_articles/ matching ./*.txt is read with TextLoader.
# (For a single file instead: loader = TextLoader('single_text_file.txt'))
loader = DirectoryLoader(
    './news_articles/',
    glob="./*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

In [5]:
## Split the loaded documents into overlapping chunks
# Configured with chunk_size=1000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

In [6]:
## Sanity check: how many chunks did the splitter produce?
len(texts)

13

In [8]:
## Spot-check one chunk to confirm the split documents look sensible
texts[3]

Document(page_content="drone which was flown over a packed football stadium in Manchester, England, just over a week ago, resulting in the suspected pilot being arrested . They are consulting with the military and members of its counterterrorism, bomb squad, emergency services and aviation units are working on a plan to counter weaponized drones. The NYPD hasn't received any intelligence indicating there is an imminent threat, but has become increasingly concerned over the last year. Deputy Chief Salvatore DiPace told CBS News:\xa0'We've looked at some people that have jury-rigged these drones to carry guns, to carry different types of explosives if they wanted to; there's just so many possibilities that we're very worried about.' Mr Dipace said police had also seen video showing how accurate an attack from a drone could be:\xa0'We've seen some video where the drone was flying at different targets along the route and very accurately hitting the targets with the paintball. The NYPD now 

**Creating the ChromaDB**

In [20]:
## Embed the chunks and store them in Chroma
# Supplying a persist_directory will store the embeddings on disk; the value
# below is the location and name of that folder.
persist_directory = 'db'

# OpenAI embeddings are used for now; the plan is to swap in local embeddings later.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [22]:
## Persist the database to disk, then drop the in-memory handle so the next
## cell has to re-open it from the persisted copy
vectordb.persist()
vectordb = None

In [23]:
## Re-open the persisted database from disk; it can then be used as normal
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

**Make a Retriever**

In [24]:
## Wrap the vector store in a retriever and try it on a sample question;
## the cell displays how many documents came back
retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("What has the NYPD been doing?")
len(docs)

4

In [25]:
# Rebuild the retriever, this time passing k=2 in its search kwargs.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 2},
)

In [26]:
# Display which search type the retriever is configured with.
retriever.search_type

'similarity'

In [27]:
 # Display the search kwargs currently set on the retriever.
 retriever.search_kwargs

{'k': 2}

**Make a Chain**

In [28]:
## Build the question-answering chain on top of the retriever
# return_source_documents=True is requested so the response also carries the
# documents under 'source_documents'.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [29]:
## Cite Sources
def process_llm_responses(llm_response):
  """Print the chain's answer followed by the source of each supporting document.

  Args:
    llm_response: Dict-like response from the QA chain. Must contain
      'result'; 'source_documents' is optional and, when present, each item
      is expected to expose a `metadata` mapping.

  Raises:
    KeyError: If 'result' is missing from llm_response.
  """
  print(llm_response['result'])
  print('\n\nSources:')
  # Tolerate a response without 'source_documents' and documents whose
  # metadata has no 'source' entry instead of failing with KeyError.
  for source in llm_response.get("source_documents") or []:
    print(source.metadata.get('source', '<unknown source>'))

In [30]:
## End-to-end example: ask a question, then print the answer with its sources
query = " What has the NYPD been doing?"
llm_response = qa_chain(query)
process_llm_responses(llm_response)

 The NYPD has been investigating ways to stop potential attacks by drones, developing technology which will allow them to take control of drones, and scanning the skies for drones before major events.


Sources:
news_articles/article_000000.txt
news_articles/article_000000.txt


In [31]:
## Breaking it down to see what is going on: display the raw response
## instead of the formatted printout
query="How is NYPD like?"
llm_response=qa_chain(query)
# process_llm_responses(llm_response) is deliberately not called here, so that
# the full response (query, result, source documents) is shown as the cell output.
llm_response

{'query': 'How is NYPD like?',
 'result': ' The NYPD is concerned about drones being used as potential weapons. They are investigating ways to stop potential attacks and are consulting with the military and members of the public to develop technology that will allow them to take control of drones and scan the skies for them.',
 'source_documents': [Document(page_content="in New York City in the last year, with 40 recorded. In some cases unmanned aircraft systems or drones had flown into airspace being used by NYPD helicopters. In one incident this summer, a drone which was almost 800 feet off the ground, nearly collided with a police helicopter. NYPD Aviation Unit Member, Sergeant Antonio Hernandez said: 'We're flying in the dark; we have night-vision goggles on, we're trying to get a job done and then the next thing you know we see this drone come up to our altitude.'", metadata={'source': 'news_articles/article_000000.txt'}),
  Document(page_content="New York police are concerned dro

In [32]:
# Display the search type and the vector store of the retriever attached to the chain.
qa_chain.retriever.search_type,qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7c6d1a2e8cd0>)

In [33]:
## This is the prompt template the chain has been given
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


**Deleting the Database**

In [35]:
# Archive the ./db directory recursively into db.zip before it is deleted below.
!zip -r db.zip ./db


  adding: db/ (stored 0%)
  adding: db/5475ee9d-a315-48bc-be2a-00cf722423a3/ (stored 0%)
  adding: db/5475ee9d-a315-48bc-be2a-00cf722423a3/header.bin (deflated 61%)
  adding: db/5475ee9d-a315-48bc-be2a-00cf722423a3/length.bin (deflated 60%)
  adding: db/5475ee9d-a315-48bc-be2a-00cf722423a3/data_level0.bin (deflated 100%)
  adding: db/5475ee9d-a315-48bc-be2a-00cf722423a3/link_lists.bin (stored 0%)
  adding: db/chroma.sqlite3 (deflated 63%)


In [36]:
## To clean up, you can delete the collection
# The collection is deleted first, then persist() is called on the store.
vectordb.delete_collection()
vectordb.persist()

# Delete the on-disk 'db' directory itself (shell command; irreversible).
!rm -rf db/

**Restart the Runtime**

In [37]:
# Extract db.zip into the current directory to restore the db/ folder.
!unzip db.zip

Archive:  db.zip
   creating: db/
   creating: db/5475ee9d-a315-48bc-be2a-00cf722423a3/
  inflating: db/5475ee9d-a315-48bc-be2a-00cf722423a3/header.bin  
  inflating: db/5475ee9d-a315-48bc-be2a-00cf722423a3/length.bin  
  inflating: db/5475ee9d-a315-48bc-be2a-00cf722423a3/data_level0.bin  
 extracting: db/5475ee9d-a315-48bc-be2a-00cf722423a3/link_lists.bin  
  inflating: db/chroma.sqlite3       


In [38]:
import os
from getpass import getpass

# Never store a real key in the notebook. Reuse a key that is already exported
# in the environment, and only prompt (with hidden input) when none is set,
# instead of unconditionally overwriting it with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [39]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA


In [40]:
# Re-open the store from the 'db' folder with OpenAI embeddings, then build a
# retriever over it with k=2 in its search kwargs.
persist_directory = 'db'
embedding = OpenAIEmbeddings()
vectordb2 = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})

In [41]:
## Configure the chat model: gpt-3.5-turbo with temperature 0
turbo_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [42]:
# Build the question-answering chain, this time backed by turbo_llm.
# return_source_documents=True is requested so the response also carries the
# documents under 'source_documents'.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [43]:
## Cite sources
def process_llm_response(llm_response):
  """Print the chain's answer followed by the source of each supporting document.

  Args:
    llm_response: Dict-like response from the QA chain. Must contain
      'result'; 'source_documents' is optional and, when present, each item
      is expected to expose a `metadata` mapping.

  Raises:
    KeyError: If 'result' is missing from llm_response.
  """
  print(llm_response['result'])
  print('\n\nSources:')
  # Tolerate a response without 'source_documents' and documents whose
  # metadata has no 'source' entry instead of failing with KeyError.
  for source in llm_response.get("source_documents") or []:
    print(source.metadata.get("source", "<unknown source>"))

In [44]:
## End-to-end example: ask a question, then print the answer with its sources
query = " Who is Tiger?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Tiger refers to Tiger Woods, a professional golfer who is widely regarded as one of the greatest golfers of all time.


Sources:
news_articles/article_000002.txt
news_articles/article_000002.txt


**Getting The System Prompt**

In [45]:
# Print the template of the first message in the chain's prompt (the system prompt).
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [46]:
# Print the template of the second message in the chain's prompt (the question slot).
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}
