# Document-enhanced QA system using retrieval-augmented generation (RAG) 


Keywords: Embeddings, VectorDB, OpenAI GPT API

### Step 1: Use langchain to chop the input PDF into smaller pieces

In [20]:
# %cd C:\Users\megah\Dropbox\UMMalaya\SQB7027 STATISTICAL CONSULTANCY AND DATA ANALYSIS\Lecture 
# !dir 
from tqdm import tqdm
import pinecone
from pinecone import Pinecone, ServerlessSpec, PodSpec
import os
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from pprint import pprint
##############################################################################################################################
# Credentials and endpoints. The "XXX" values are placeholders that must be
# replaced with real settings before the notebook is run.
os.environ['OPENAI_API_KEY'] = "XXX"
os.environ['OPENAI_BASE_URL'] = "XXX"

# Fixed: the model name used to end with a stray tab character inside the
# string literal. NOTE(review): the embedding calls in this notebook hard-code
# their model name and do not read this variable.
os.environ['EMBEDDING_MODEL'] = "text-embedding-3-large"
os.environ['PINECONE_API_KEY'] = 'XXX'
os.environ['PINECONE_ENVIRONMENT'] = 'XXX'
# Number of nearest chunks requested from Pinecone in the query step.
os.environ['PINECONE_QUERY_TOP_K'] = '15'

##############################################################################################################################
# pdf_path = r"C:\Users\megah\Dropbox\Book\Alexander Shuiskov - Microservices with Go-Packt Publising Pvt Ltd (2022).pdf"

# Folder that holds the source PDFs ("XXXX" is a placeholder for the local path).
folder_path = r"XXXX\PDFs"

# Full paths of every PDF in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
#   file order (and the positional chunk IDs built from it) stable across runs.
# - lower(): the extension check is case-insensitive so "*.PDF" is not skipped.
pdf_paths = sorted(
    os.path.join(folder_path, file_name)
    for file_name in os.listdir(folder_path)
    if file_name.lower().endswith(".pdf")
)

print(f"{len(pdf_paths)} PDFs found in the folder. File list:")
# Plain loop instead of a list comprehension used only for its side effects.
for pdf_path in pdf_paths:
    print(f"\t - {os.path.basename(pdf_path)}")
print()

# Split every PDF into overlapping chunks and collect two parallel lists:
#   main_all_splits_dics[k] -> metadata dict of chunk k (with its text stored
#                              under the 'page_content' key)
#   main_all_splits[k]      -> text of chunk k
main_all_splits_dics = []
main_all_splits = []

# The splitter does not depend on the file, so build it once outside the loop.
# presumably ~2000 tokens at ~4 characters per token — TODO confirm the intent.
chunk_size = 2000 * 4
# 50% overlap, kept as an int (the original expression produced the float 4000.0).
chunk_overlap = chunk_size // 2
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap)

for pdf_path in pdf_paths:
    loader = PyMuPDFLoader(pdf_path)
    PDF_data = loader.load()
    pdf_chunks = text_splitter.split_documents(PDF_data)
    print(f"No. of splits: {len(pdf_chunks)} -> {os.path.basename(pdf_path)}")

    for chunk in pdf_chunks:
        # Copy the metadata and append the chunk text to it, so the text can be
        # stored next to the vector and read back from query results.
        main_all_splits_dics.append(
            {**chunk.metadata, 'page_content': chunk.page_content})
        main_all_splits.append(chunk.page_content)

all_splits_dics = main_all_splits_dics
all_splits = main_all_splits
# The two lists are consumed positionally later on; they must stay aligned.
assert len(all_splits_dics) == len(all_splits)

print("\n\nChopped Example:")
# Last expression of the cell: show the first chunk, or nothing if no PDF
# produced any chunk (avoids an IndexError on an empty folder).
all_splits_dics[0] if all_splits_dics else None

4 PDFs found in the folder. File list:
	 - Benedikt Schack - Wikipedia.pdf
	 - Franz Xaver Gerl - Wikipedia.pdf
	 - Joseph Haydn - Wikipedia.pdf
	 - Wolfgang Amadeus Mozart - Wikipedia.pdf

No. of splits: 4 -> Benedikt Schack - Wikipedia.pdf
No. of splits: 2 -> Franz Xaver Gerl - Wikipedia.pdf
No. of splits: 20 -> Joseph Haydn - Wikipedia.pdf
No. of splits: 22 -> Wolfgang Amadeus Mozart - Wikipedia.pdf


Chopped Example:


{'source': 'C:\\Users\\megah\\Dropbox\\Resume\\Interview-Prep\\Starluck\\PDFs2\\Benedikt Schack - Wikipedia.pdf',
 'file_path': 'C:\\Users\\megah\\Dropbox\\Resume\\Interview-Prep\\Starluck\\PDFs2\\Benedikt Schack - Wikipedia.pdf',
 'page': 0,
 'total_pages': 4,
 'format': 'PDF 1.4',
 'title': 'Benedikt Schack - Wikipedia',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
 'producer': 'Skia/PDF m126',
 'creationDate': "D:20240627103252+00'00'",
 'modDate': "D:20240627103252+00'00'",
 'trapped': '',
 'page_content': 'Benedikt Schack in performance with the\nSchikaneder troupe: the number "O Anton du bist\nmein" from the Singspiel Die Zween Anton.\nSchack is at center, his hands held by soprano\nJosepha Hofer and baritone/impresario Emanuel\nSchikaneder. Click on image for the identities of\nthe other players and the original source.\nBenedikt Schack\nBenedikt Eman

### Step 2: Use Embeddings to convert the text into embeddings, then upload to Pinecone vectorDB

In [21]:
# Pinecone namespace used for both the upload and the later query
# ('XXX' is a placeholder).
cur_namespace = 'XXX'
##############################################################################################################################
# Embed the chunk texts in batches; embeddings[k] is the vector for all_splits[k].
embedding_batch_size = 10
# One client is enough; it was previously re-created for every batch.
client = openai.OpenAI()
embeddings = []
# Iterate over the same list that is sliced below (all_splits).
for i in range(0, len(all_splits), embedding_batch_size):
    print(f"Generating Embeddings for {i} to {i+10}")
    texts = all_splits[i:i + embedding_batch_size]

    response = client.embeddings.create(
        input=texts,
        model="text-embedding-ada-002"
    )
    # Fail loudly if the API returned a different number of vectors than texts
    # sent; a silent mismatch would misalign vectors and chunks downstream.
    assert len(response.data) == len(texts), (
        f"expected {len(texts)} embeddings, got {len(response.data)}")
    embeddings.extend(item.embedding for item in response.data)

##############################################################################################################################
# Pinecone connection settings are read from the environment.
use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"
api_key = os.environ.get('PINECONE_API_KEY')
environment = os.environ.get('PINECONE_ENVIRONMENT')

if use_serverless:
    # Fixed: PINECONE_CLOUD was read but then ignored in favour of a hard-coded
    # 'aws'. It is now honoured, with 'aws' as the default when unset.
    cloud = os.environ.get('PINECONE_CLOUD', 'aws')
    spec = ServerlessSpec(cloud=cloud, region='us-west-2')
else:
    spec = PodSpec(environment=environment)
# NOTE(review): `spec` is built but not passed to anything in this cell; the
# index below is opened by name, so it presumably must already exist — confirm.

index_name = "hello-pinecone"
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

# Best-effort clean-up of the namespace before re-uploading. A failure is
# reported, with context, but does not stop the notebook.
try:
    index.delete(delete_all=True, namespace=cur_namespace)
except Exception as e:
    print(f"Could not clear namespace {cur_namespace!r}: {e}")

##############################################################################################################################
# Chunk k is stored under the string ID "k".
ids = [str(chunk_no) for chunk_no in range(len(all_splits))]

# IDs, vectors and metadata are combined positionally with zip(), which stops
# silently at the shortest input. Check the lengths first so that a mismatch
# cannot drop chunks without notice.
assert len(ids) == len(embeddings) == len(all_splits_dics), (
    f"length mismatch: {len(ids)} ids, {len(embeddings)} embeddings, "
    f"{len(all_splits_dics)} metadata dicts")

upsert_batch_size = 10
for i in range(0, len(ids), upsert_batch_size):
    batch_end = i + upsert_batch_size
    # Materialise the batch as a list of (id, vector, metadata) tuples rather
    # than handing a one-shot zip iterator to the client.
    batch = list(zip(ids[i:batch_end], embeddings[i:batch_end],
                     all_splits_dics[i:batch_end]))
    print(f"Uploading index {i} to {i+9} to PineconeDB")
    index.upsert(vectors=batch, namespace=cur_namespace)

Generating Embeddings for 0 to 10
Generating Embeddings for 10 to 20
Generating Embeddings for 20 to 30
Generating Embeddings for 30 to 40
Generating Embeddings for 40 to 50
Uploading index 0 to 9 to PineconeDB
Uploading index 10 to 19 to PineconeDB
Uploading index 20 to 29 to PineconeDB
Uploading index 30 to 39 to PineconeDB
Uploading index 40 to 49 to PineconeDB


### Step 3: Embed the Custom Questions and find the related chunks in the vectorDB

In [23]:
##############################################################################################################################

# The user question(s); the whole string is embedded as a single query vector.
query = '''
Who are Mozart's best friend? 
How do they know each other and become friends?
What song did they compose or play together?
'''


os.environ['OPENAI_API_KEY'] = "XXX"
os.environ['OPENAI_BASE_URL'] = "XXX"

# Embed the query with the same model name that was used for the chunks.
client = openai.OpenAI()
response = client.embeddings.create(
    input=[query],
    model="text-embedding-ada-002"
)
embedded_query = response.data[0].embedding

# Nearest-neighbour search in the namespace populated earlier; metadata is
# requested because it carries each chunk's text under 'page_content'.
query_result = index.query(vector=embedded_query,
                           top_k=int(os.environ.get(
                               'PINECONE_QUERY_TOP_K', 5)),
                           namespace=cur_namespace,
                           include_metadata=True
                           )

# Keep matches scoring at least 0.2, ordered from best to worst.
query_result['matches'] = sorted(
    (match for match in query_result['matches'] if match['score'] >= 0.2),
    key=lambda match: match['score'], reverse=True)

query_result_content = [match['metadata']['page_content']
                        for match in query_result['matches']]
query_result_score = [match['score'] for match in query_result['matches']]

# Preview at most the first six matches (same cut-off as the previous
# `if i >= 5: break`), 200 characters each.
preview_count = 6
preview_chars = 200
previews = zip(query_result_score[:preview_count],
               query_result_content[:preview_count])
for match_no, (score, content) in enumerate(previews, start=1):
    # Fixed: whitespace runs are collapsed to one space. The old
    # .replace("  ", "") deleted double spaces and glued words together.
    one_line = " ".join(content.split())
    print("Match " + str(match_no) + ":")
    print(f"\tScore: {score}")
    print("\tContent:" + one_line[:preview_chars] + "...")
    print()

Match 1:
	Score: 0.38652575
	Content:Posthumous painting by Barbara Krafft in 1819 ( p p p death)[83] and the Little Masonic Cantata K. 623, premiered on 17 November 1791.[84] Mozart fell ill while in Prague for the premiere, on 6 Septem...

Match 2:
	Score: 0.382590175
	Content:Mozart wearing the badge of the Order of the Golden Spur which he received in 1770 from Pope Clement XIV in Rome. The painting is a 1777 copy of a work now lost.[28] Mozart family, c. 1780 (della Croc...

Match 3:
	Score: 0.381687284
	Content:Fortepiano played by Mozart in 1787, Czech Museum of Music, Prague[70] y , y p y g p p g q six quartets dedicated to Haydn (K. 387, K. 421, K. 428, K. 458, K. 464, and K. 465) date from the period 178...

Match 4:
	Score: 0.381097078
	Content:w.worldcat.org/oclc/594813). Mozart's Letters, Mozart's Life: Selected Letters. Translated by Robert Spaethling. W.W. Norton. 2000. "Mozart, Mozart's Magic Flute and Beethoven" (https://web.archive.or...

Match 5:
	Score: 0.378985524


### Final Step: QA using LLM


In [24]:
os.environ['OPENAI_API_KEY'] = "XXX"
os.environ['OPENAI_BASE_URL'] = "XXX"


# Build the prompt: retrieved chunks as context, followed by the question.
# Fixed: the separator now starts on its own line. Previously each chunk's last
# line ran straight into the dashes ("...text-------------").
context = "\n-------------\n".join(query_result_content)
prompt = "Context: " + context + "\n\nQuestion: " + query + "\nAnswer:"

client = openai.OpenAI()
completion = client.chat.completions.create(
  model="gpt-4-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
  ]
)


# Print the text of the first returned choice.
print(completion.choices[0].message.content)

Mozart's close friends included Gottfried von Jacquin, Count August Hatzfeld, and Sigmund Barisani. These friendships were likely formed through his social and professional circles, as all were connected to the arts or aristocracy, which were central to Mozart's life in Vienna.

1. **Gottfried von Jacquin**: Son of the chemist Nikolaus von Jacquin, a professor whom Mozart met in Vienna. Their friendship was marked by a mutual appreciation of music and intellectual pursuits. Mozart composed several pieces for Jacquin, including the song "Komm, liebe Zither, komm" K. 351.

2. **Count August Hatzfeld**: He was part of the Viennese aristocracy and known for his involvement in music. The specifics of their musical collaborations are not well-documented, but their friendship was likely facilitated by their mutual involvement in Vienna's musical scene.

3. **Sigmund Barisani**: Details about their relationship are less documented, but as with many of Mozart's acquaintances, the connection was

# What's Next?

- Fine-tune the embedding model with domain-specific knowledge
- Use an open-source model for local deployment
  - General: Llama2
  - Language-specific:
    - Malay: Merak-7B-v2
    - Chinese: Chinese-LLaMA-2-7B-hf
    - Cantonese: cantonese-llama-2-7b-oasst-v1
