In [2]:
# pip install langchain --upgrade
# Version: 0.0.164

! pip install pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [4]:
# Point a PyPDFLoader at the local PDF used for this walkthrough.
loader = PyPDFLoader(file_path="../data/test3.pdf")

# Alternative loaders -- uncomment one of these to use it instead:
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [5]:
# Read the PDF through the loader; `data` is the resulting list of Document objects.
# The "Multiple definitions in dictionary ... for key /Mask" lines printed under this
# cell appear to be parser warnings about the PDF itself -- presumably harmless, but
# verify the extracted text if pages look wrong.
data = loader.load()

Multiple definitions in dictionary at byte 0xd7 for key /Mask
Multiple definitions in dictionary at byte 0x30d for key /Mask
Multiple definitions in dictionary at byte 0x31d for key /Mask
Multiple definitions in dictionary at byte 0x32d for key /Mask
Multiple definitions in dictionary at byte 0x33d for key /Mask
Multiple definitions in dictionary at byte 0x34d for key /Mask
Multiple definitions in dictionary at byte 0x589 for key /Mask
Multiple definitions in dictionary at byte 0x8db for key /Mask
Multiple definitions in dictionary at byte 0x2878 for key /Mask


In [6]:
# Display the loaded documents to eyeball the extracted text.
# NOTE(review): this shows every document in full; consider data[:1] for large PDFs.
data

[Document(page_content='1All Things Bright and Beautiful\n   C.F. Alexander\n A. Lead – in:\nWe see and enjoy many things around us such as beautiful sunrises and sunsets,\ntrees and flowers, rivers and mountains, birds and animals. We smell the fragrance of\nflowers and feel the change of seasons. We have eyes to see all these and lips to\npraise the beauty of the creation. Have you ever thought  who has created such beautiful\nthings for us ? Who has blessed us with the senses of sight, smell and feeling ? Read\nthe poem to know what the poet thinks about these things.\n B. Let’s listen to the Poem :\nThe teacher reads the whole poem aloud. Listen to the teacher only. Don’t\nopen the book.\nOpen your text book when the teacher reads aloud a portion of the poem.\nListen to him / her and mark the manner of reading.\nRead the poem silently to understand. Refer to the glossary if you find some\nwords / expressions difficult.\n C.  The Text\nAll things bright and beautiful,\nAll creatures

In [7]:
# PyPDFLoader has already split the PDF by page, so `data` holds one document per page.
# Report how many documents were loaded and the length of the first one.
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 18 document(s) in your data
There are 1048 characters in your document


### Chunk your data up into smaller documents

In [8]:
# With PyPDFLoader the data is already split by page, so this is a second,
# optional splitting pass -- try it on your own data and keep it only if it helps.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [9]:
# Report how many documents exist after the splitting pass.
print(f'Now you have {len(texts)} documents')

Now you have 19 documents


### Create embeddings of your documents to get ready for semantic search

In [11]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [12]:
# Read credentials from environment variables so that no secrets live in the notebook.
# The two API keys have no fallback here: os.environ.get() yields None when a variable
# is unset, so export OPENAI_API_KEY and PINECONE_API_KEY before starting the kernel.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# Only the Pinecone environment has a default value.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', "asia-southeast1-gcp-free") # You may need to switch with your env

In [13]:
# Create the OpenAI embeddings client, passing the key read from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [14]:
# Connect the Pinecone client. The API key is listed at app.pinecone.io and the
# environment name is shown next to the key in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use -- replace with the name of your own index.
index_name = "langchain1"

In [15]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents is used instead of from_texts on bare page_content strings so that
# each chunk's metadata is stored alongside its text, letting search hits be traced
# back to where they came from (for PyPDFLoader presumably source file and page -- confirm).
# NOTE(review): re-running this cell presumably inserts the same chunks again; confirm
# and clear the index first if duplicates matter.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

### Query those docs to get your answer back

In [19]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [20]:
# OpenAI completion model at temperature 0, wrapped in a "stuff"-type
# question-answering chain.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [25]:
# The question to ask about the indexed PDF.
query = "Why is almighty great?"
# Retrieve the stored chunks that the vector store ranks as most similar to the question.
docs = docsearch.similarity_search(query)

In [26]:
# Run the QA chain over the retrieved chunks; the returned answer string is
# displayed as this cell's output.
chain.run(input_documents=docs, question=query)

' The poem suggests that God is great because he created all things bright and beautiful, all creatures great and small, all things wise and wonderful.'