In [1]:
# pip install langchain --upgrade
# Version: 0.0.164

! pip install pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [27]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [26]:
loader = PyPDFLoader("../data/UHV Text Book.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [29]:
data = loader.load()

In [None]:
data

In [31]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 314 document(s) in your data
There are 137 characters in your document


### Chunk your data up into smaller documents

In [None]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
texts = text_splitter.split_documents(data)
texts

In [54]:
print (f'Now you have {len(texts)} documents')

Now you have 488 documents


### Create embeddings of your documents to get ready for semantic search

In [4]:
# Why is it used
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os

  from tqdm.autonotebook import tqdm


In [5]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', "us-west1-gcp-free") # You may need to switch with your env

In [6]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [12]:
def create_index(index_name):
    pinecone.create_index(index_name, dimension=1536,  metric='cosine')

def delete_index(index_name):
    pinecone.delete_index(index_name)

In [13]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "uhv-book-index" # put in the name of your pinecone index here

### Helper Functions

In [8]:
# No need to load any pdf, use existing index to query.
def get_docs_from_exsting_index():
    return Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

# Need to load pdf first.
def get_docs_from_text():
    return Pinecone.from_texts(texts=[doc.page_content for doc in texts], metadatas=[doc.metadata for doc in texts], embedding=embeddings, index_name=index_name)

In [14]:
docsearch = get_docs_from_exsting_index()

In [None]:
# docsearch = Pinecone.from_existing_index(index_name="langchain1", embedding=embeddings)

In [45]:
docsearch

<langchain.vectorstores.pinecone.Pinecone at 0x7fecdf9c9780>

### Query those docs to get your answer back

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

In [16]:
prompt_template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Answer the question in 50-90 words. Just write answer don't use "based on given context".

{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template = prompt_template,
    input_variables=["context", "question"]
)
#Load an LLM
# llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",            
)

#Setup chain
chain = load_qa_chain(
    llm=llm,
    chain_type="stuff",
    prompt = prompt # Add custom prompt
)


In [104]:
chain.llm_chain.prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template='\nUse the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer. Answer the question in 50-90 words. Just write answer don\'t use "based on given context".\n\n{context}\n\nQuestion: {question}\nAnswer:\n', template_format='f-string', validate_template=True)

In [None]:
loader = PyPDFLoader("../data/UHV Question Bank_single_line.pdf")
docs = loader.load()
pages ="".join(doc.page_content for doc in docs)
queries = [query.strip() for query in pages.split("\n") if query.strip()]
queries

In [17]:
import json
def save_answer(answers: dict, file_name: str):
    with open(f"../data/{file_name}", "w", encoding="utf-8") as fp:
        json.dump(answers, fp, indent=2, ensure_ascii=True)

def read_answers(file_name: str) -> dict:
    with open(f"../data/{file_name}", "r") as fp:
        answers = json.load(fp)
        return answers

In [37]:
def convert_json_to_md(file_name="UHV_Answers_gpt-3.5-turbo.json"):
    answers = read_answers(file_name)
    lines= []
    for question, answer_details in answers.items():
        answer = answer_details.get("answer")
        pages = answer_details.get("pages")
        lines.append(f"### {question}\n")
        lines.append(f"{answer}\n")
        lines.append(f" - Source: {pages}\n\n\n")
    with open("../data/answers.md", "w") as f:
        f.writelines(lines)

In [51]:
pinecone.describe_index(index_name)

IndexDescription(name='uhv-book-index', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

### Multiple Queries

In [None]:
from collections import defaultdict
file_name = "UHV_Answers_gpt-3.5-turbo.json"
answers_dict = read_answers(file_name=file_name)
sub_queries = queries[53:]
for query in sub_queries:
    docs = docsearch.similarity_search(query)
    details = defaultdict()
    answer = chain.run(input_documents=docs, question=query)
    print(f"{query = }\n{answer = }\n")
    details["answer"] = answer
    details["pages"] = [int(doc.metadata.get("page"))+1 for doc in docs]
    answers_dict[query] = details
    save_answer(answers_dict, file_name = file_name)
convert_json_to_md()

### Single query

In [48]:
from collections import defaultdict
file_name = "UHV_Answers_gpt-3.5-turbo.json"
answers_dict = read_answers(file_name=file_name)


query = "Distinguish between love and affection?"

print("Search similar docs")
docs = docsearch.similarity_search(query)
details = defaultdict()

print("Running LLM")
answer = chain.run(input_documents=docs, question=query)
# print(f"{query = }\n{answer = }\n")

details["answer"] = answer
details["pages"] = [int(doc.metadata.get("page"))+1 for doc in docs]
print(details)
answers_dict[query] = details
# print(answers_dict)
save_answer(answers_dict, file_name = file_name)
convert_json_to_md()

Search similar docs
Running LLM
defaultdict(None, {'answer': 'Love is the feeling of being related to everyone and seeing each and everyone as our relative. It is a feeling of being connected to others and wanting their happiness. Affection, on the other hand, is the recognition of the feeling that the other is related to me. It comes naturally once trust and respect are recognized in a relationship. Love is a broader concept that encompasses affection, as it extends to all beings, while affection is specific to individual relationships.', 'pages': [187, 185, 186, 186]})
