In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

## Load the document from a PDF

In [2]:
# Create a loader for the PDF; the path is relative, so it resolves against the
# notebook's current working directory.
loader = PyPDFLoader("../LangChainDocs/Findings.pdf")

In [3]:
# Read the PDF; the recorded output of the next cell shows the result is a
# list of Document objects, each carrying its text in `page_content`.
data = loader.load()

In [6]:
# Preview only the first loaded Document instead of dumping the whole list,
# which floods the notebook output and bloats the saved file.
data[:1]

[Document(page_content="Study\n1Study\nOriginal Documentation\n https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/\nMy Findings\nPrompt Engineering, also known as In-Context Prompting, is a method used to guide  \nthe behavior of autoregressive language models (LLMs) without modifying their weights.  \nThe goal is to align the model's output with the desired outcomes by carefully designing  \nprompts. This approach requires experimentation and heuristic methods due to the  \nvariation in the ef fectiveness of prompt engineering across dif ferent models.\nZero-shot learning and few-shot learning are two basic approaches used for prompting  \nLLMs. In zero-shot learning, the task text is directly fed to the model to generate results.  \nFew-shot learning, on the other hand, involves providing a set of high-quality  \ndemonstrations that include input and desired output examples. Few-shot learning  \ngenerally leads to better performance than zero-shot learning, but it cons

## Chunking

In [15]:
# Split the loaded documents into chunks (chunk_size=2000, no overlap between
# consecutive chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [17]:
# Sanity check: the splitter result is a plain list (see recorded output).
type(texts)

list

In [14]:
# Inspect the element type of the chunk list. `texts` (the splitter output) is
# used here because `docs` is only created by the similarity search further
# down, so `docs[0]` raises NameError on a fresh top-to-bottom run.
type(texts[0])

langchain.schema.document.Document

In [18]:
# Report how many chunks the splitter produced.
print(f'Now we have {len(texts)} documents')

Now you have 5 documents


## Creating embeddings and storing them in Pinecone

In [21]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [26]:
import os
from getpass import getpass

# Ask for every credential at run time so no secret is written into the
# notebook source; the typed values appear masked in the recorded output.
os.environ["OPENAI_API_KEY"] = getpass('Enter you openai keys')

# The right-hand side is evaluated left to right, so the two prompts appear
# in this order: API key first, then environment name.
PINECONE_API_KEY, PINECONE_API_ENV = (
    getpass('Enter you pinecone api key'),
    getpass('Enter you pinecone env name'),
)

Enter you openai keys········
Enter you pinecone api key········
Enter you pinecone env name········


In [45]:
# Initialise the Pinecone client with the credentials entered earlier.
# The API key is found at app.pinecone.io; the environment name is shown
# next to the API key in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use: put the name of your own index here.
index_name = "langchaintest"

'us-west4-gcp-free'

In [36]:
# Embedding client used to vectorise the chunks.
# NOTE(review): no key is passed explicitly — presumably it is picked up from
# the OPENAI_API_KEY environment variable set earlier; confirm.
embeddings = OpenAIEmbeddings()

In [46]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents is used instead of from_texts on the bare page_content
# strings so that each chunk's metadata is stored with its vector rather
# than being discarded.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# confirm how ids are assigned before re-executing against a shared index.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.22s/it]


In [47]:
# Confirm the vector store object exists (the recorded repr is a Pinecone
# vector store instance).
docsearch

<langchain.vectorstores.pinecone.Pinecone at 0x146343510>

In [51]:
# Fetch the stored chunks that are most similar to the query string.
# NOTE(review): the number of results is left at the library default — confirm.
docs = docsearch.similarity_search("few-shot learning")

In [52]:
# Show the text of the first returned hit.
docs[0].page_content

"pretrained models. Reinforcement Learning from Human Feedback (RLHF) is a  \ncommon method used for instruction following-style fine-tuning. It improves the model's  \nalignment with human intention and reduces communication costs."

## Query the LLM with the retrieved documents

In [53]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [54]:
# LLM client built with library defaults (no model or parameters are given).
llm = OpenAI()

In [55]:
# Build a question-answering chain of type "stuff" on top of the LLM.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [56]:
# Retrieve context for the question that is actually being asked. The `docs`
# list from the earlier search was fetched for a different query
# ("few-shot learning"), so passing it here would give the chain context that
# was not selected for this question.
question = "what is in context learning"
relevant_docs = docsearch.similarity_search(question)
chain.run(input_documents=relevant_docs, question=question)

' In-Context Prompting, also known as Prompt Engineering, is a method used to guide the behavior of autoregressive language models (LLMs) without modifying their weights.'