In [2]:
# pip install langchain --upgrade
# Version: 0.0.164

! pip install pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [10]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [11]:
loader = PyPDFLoader("../data/test3.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [12]:
data = loader.load()

Multiple definitions in dictionary at byte 0xd7 for key /Mask
Multiple definitions in dictionary at byte 0x30d for key /Mask
Multiple definitions in dictionary at byte 0x31d for key /Mask
Multiple definitions in dictionary at byte 0x32d for key /Mask
Multiple definitions in dictionary at byte 0x33d for key /Mask
Multiple definitions in dictionary at byte 0x34d for key /Mask
Multiple definitions in dictionary at byte 0x589 for key /Mask
Multiple definitions in dictionary at byte 0x8db for key /Mask
Multiple definitions in dictionary at byte 0x2878 for key /Mask


In [13]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 18 document(s) in your data
There are 1048 characters in your document


### Chunk your data up into smaller documents

In [14]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [9]:
print (f'Now you have {len(texts)} documents')

Now you have 19 documents


### Create embeddings of your documents to get ready for semantic search

In [3]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os

In [4]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', "asia-southeast1-gcp-free") # You may need to switch with your env

In [5]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [6]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchain1" # put in the name of your pinecone index here

In [15]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

### Query those docs to get your answer back

In [16]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [17]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
queries = [
    "Why is almighty great?",
    "What are the little things mentioned in the poem?",
    "How does the poet describe the beauty of flowers?",
    "Which of the things are bright and which of the things are small?"
]

In [22]:
for query in queries:
    docs = docsearch.similarity_search(query)
    result = chain.run(input_documents=docs, question=query)
    print(f"{query = }\n{result = }\n")

query = 'Why is almighty great?'
result = ' The poem suggests that God is great because he has made all things, including the little flowers, birds, mountains, rivers, sunsets, mornings, winds, summer sun, fruit, trees, meadows, and rushes. He has also given us eyes to see them and lips to tell of his greatness.'

query = 'What are the little things mentioned in the poem?'
result = ' The little things mentioned in the poem are flowers, birds, and rushes.'

query = 'How does the poet describe the beauty of flowers?'
result = ' The poet does not directly describe the beauty of flowers, but he does mention the fragrance of flowers and suggests that we have lips to praise the beauty of the creation.'

query = 'Which of the things are bright and which of the things are small?'
result = ' All things bright are beautiful, and all creatures small.'

