# Project: Question-Answering on Private Documents

In [None]:
%pip install dotenv pypdf langchain langchain-community docx2txt wikipedia tiktoken -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv(), override=True)

True

In [None]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)


    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)

    elif extension == '.docx':
            from langchain.document_loaders import Docx2txtLoader
            print(f'Loading {file}')
            loader = Docx2txtLoader(file)
    else:
        raise ValueError(f'Unsupported file type: {extension}')

    return loader.load()

# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    print(f'Loading {query} from Wikipedia')
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()

In [None]:
def chunk_data(data, chunk_size=256):
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
  chunks = text_splitter.split_documents(data)
  return chunks

In [None]:
def print_embedding_cost(texts):
  import tiktoken
  enc = tiktoken.encoding_for_model("text-embedding-ada-002")
  total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
  print(f"Total embedding tokens: {total_tokens}")
  print(f"Total embedding cost (at $0.0004 / 1K tokens): ${total_tokens * 0.0004 / 1000:.6f}")

### Embedding and Uploading to a  Vector Database (Pinecone)

In [None]:
%pip install openai langchain-openai pinecone langchain-pinecone -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
def insert_or_fetch_embeddings(index_name, chunks):
  import pinecone
  from langchain_pinecone import PineconeVectorStore  # Updated import for new Pinecone API
  from langchain_openai import OpenAIEmbeddings
  from pinecone import ServerlessSpec

  pc = pinecone.Pinecone()
  embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

  if index_name in pc.list_indexes().names():
    print(f'Index {index_name} already exists. Loading embeddings ...', end='')
    vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
    print('Done')
    return vector_store
  else:
    print(f'Creating index {index_name} and embeddings ...', end='')
    pc.create_index(
      name=index_name, 
      dimension=1536, 
      metric='cosine', 
      spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    vector_store = PineconeVectorStore.from_documents(
      documents=chunks, 
      embedding=embeddings, 
      index_name=index_name
    )
    print('Done')
    return vector_store

In [None]:
def delete_pinecone_index(index_name = 'all'):
  import pinecone
  pc = pinecone.Pinecone()

  if index_name == 'all':
    indexes = pc.list_indexes().names()
    print(f'Deleting all indexes: {indexes}')
    for index in indexes:
      pc.delete_index(index)
    print('Done')
  else:
    print(f'Deleting index: {index_name} ...', end='')
    pc.delete_index(index_name)
    print('Done')
  

In [None]:
data = load_document('files/us_constitution.pdf')
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[0].page_content)} characters in the first page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 639 characters in the first page


In [None]:
# data = load_document('files/the_great_gatsby.docx')
# print(f'You have {len(data)} pages in your data')
# print(f'There are {len(data[0].page_content)} characters in the first page')

In [None]:
# data = load_from_wikipedia('GPT-5', 'sk')
# print(data[0].page_content)

In [None]:
chunks = chunk_data(data)
print(f'Now you have {len(chunks)} chunks of data')
print(chunks[0].page_content)

Now you have 224 chunks of data
The United States Constitution 
 W e the People of the United States, in Order to form a more perfect 
 Union, establish Justice, insure domestic T ranquility , provide for the 
 common defence, promote the general W elfare, and secure the


In [None]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name, chunks)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


Index askadocument already exists. Loading embeddings ...Done


### Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q):
  from langchain.chains import RetrievalQA
  from langchain_openai import ChatOpenAI

  llm = ChatOpenAI(model='gpt-4o', temperature=1)

  retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})

  chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

  answer = chain.invoke(q)
  return answer

In [None]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

The document provides information about ChatGPT, a language model developed by OpenAI. It mentions that ChatGPT is a variant of the GPT model and is used in conversational robots and natural language processing applications. Additionally, it discusses the importance of registration for usage, the initial testing phase aimed at identifying strengths and weaknesses, and collecting user feedback for improvements. The document also references related topics like internet bots and offers links to external resources related to chatbots.


In [None]:
import time

i = 1
print('Write Quit or Exit to quit.')

while True:
  q = input('Question #{i}: ')
  i += 1
  if q.lower() in ['quit', 'exit']:
    print('Exiting ... bye bye!')
    time.sleep(2)
    break

  answer = ask_and_get_answer(vector_store, q)
  print(f'\nAnswer: {answer['result']}')
  print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.
Exiting ... bye bye!


In [None]:
data = load_from_wikipedia('ChatGPT', 'sk')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name, chunks)

Loading ChatGPT from Wikipedia
Index chatgpt already exists. Loading embeddings ...Done


In [None]:
q = 'Kedy bol GPT 5 spustený?'
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

Neviem, či bol GPT-5 spustený, pretože vo vašom kontexte sa neuvádzajú žiadne informácie o GPT-5.


In [19]:
delete_pinecone_index()

Deleting all indexes: ['chatgpt', 'askadocument']
Done


### Using Chroma as a Vector DB

In [20]:
%pip install chromadb -q

Note: you may need to restart the kernel to use updated packages.


In [22]:
def create_embeddings_chroma(chunks, persist_directory='.chroma_db'):
  from langchain.vectorstores import Chroma
  from langchain_openai import OpenAIEmbeddings

  embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
  vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
  return vector_store

def load_embeddings_chroma(persist_directory='.chroma_db'):
  from langchain.vectorstores import Chroma
  from langchain_openai import OpenAIEmbeddings

  embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
  vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
  return vector_store

In [23]:
data = load_document('files/rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [25]:
q = 'What is Vertext AI Search?'

answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

Vertex AI Search is a fully-managed platform by Google that enables developers to build AI-powered search applications. It offers capabilities such as customizable answers, search tuning, vector search, grounding, and compliance updates. Vertex AI Search leverages Tensor Processing Units (TPUs) to power large-scale semantic searches with Google-quality performance, providing developers with the tools to create and enhance AI applications without the need to design and build their own advanced search engines.


In [27]:
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'

answer = ask_and_get_answer(db, q)
print(answer['result'])

The Stack Overflow dataset had 8 million pairs of questions and answers.


In [28]:
q = 'Multiply that number by 2.'
answer = ask_and_get_answer(db, q)
print(answer['result'])

I'm sorry, but I need a specific number to multiply by 2. Could you please provide the number you'd like to multiply?


### Adding Memory (Chat History)

In [29]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

llm = ChatOpenAI(model='gpt-4o', temperature=0)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    verbose=True
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [30]:
def ask_question(q, chain):
  result = chain.invoke({"question": q})
  return result

In [31]:
data = load_document('files/rag_powered_by_google_search.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [33]:
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-answer or query-and-

distinctly di erent meanings. Why, then, do you use similarity search to
 nd answers?
Semantic search is not just similarity
search
In the Stack Ove low demo that we introduced in a previous post,

distinctly di erent meanings. Why, then, do you use similarity search to
 nd answer

In [34]:
print(result['answer'])

The StackOverflow dataset had 8 million pairs of questions and answers.


In [35]:
q = 'Multiply that number by 2.'
result = ask_question(q, crc)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many pairs of questions and answers had the StackOverflow dataset?
Assistant: The StackOverflow dataset had 8 million pairs of questions and answers.
Follow Up Input: Multiply that number by 2.
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
simple similarity search was highly e ective because the dataset had 8
million pairs of questions and answers. However, datasets do not
usually contain pre-existing question-and-ans

In [36]:
print(result['answer'])

When you multiply the 8 million pairs of questions and answers by 2, you get 16 million pairs.


In [38]:
for item in result['chat_history']:
  print(item)

content='How many pairs of questions and answers had the StackOverflow dataset?' additional_kwargs={} response_metadata={}
content='The StackOverflow dataset had 8 million pairs of questions and answers.' additional_kwargs={} response_metadata={}
content='Multiply that number by 2.' additional_kwargs={} response_metadata={}
content='When you multiply the 8 million pairs of questions and answers by 2, you get 16 million pairs.' additional_kwargs={} response_metadata={}


### Using a Custom Prompt

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

llm = ChatOpenAI(model='gpt-4o', temperature=0)
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

system_template = r'''
Use the following pieces of context to answer the user's question in Spanish.
If you don't find the answer in the provided context, just respond "I don't know".
-------------------
Context: ```{context}```
'''

user_template = r'''
Question: ```{question}```
Chat History: ```{chat_history}```
'''

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    verbose=True
)

In [None]:
db = load_embeddings_chroma()

q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result['answer'])
