### API Keys

In [54]:
groq_api_key = 'Your Key'
cohere_api_key = 'Your Key'
pinecone_api_key = 'Your Key'

### install lib

In [55]:
%pip install langchain langchain-community pypdf cohere groq pinecone -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### connect

In [56]:
# for embedding
import cohere
co = cohere.Client(api_key=cohere_api_key)

In [57]:
# use for asking questions
from groq import Groq
client = Groq(api_key=groq_api_key)

In [58]:
# vector database
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("rag-bu")

### load PDF and split text

In [59]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [60]:
Loader = PyPDFLoader('sample.pdf')

In [61]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [62]:
pages = text_splitter.split_documents(Loader.load())

In [63]:
texts = [i.page_content for i in pages]

In [64]:
# clean texts
for text in texts:
    text.replace("\n", " ")

### embed document

In [65]:
embeddings =  co.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_query",
).embeddings

In [66]:
embeddings

[[0.0018177032,
  0.004310608,
  -0.027832031,
  -0.019454956,
  -0.030288696,
  -0.009643555,
  -0.027008057,
  -0.018707275,
  0.015380859,
  0.07305908,
  0.031158447,
  -0.03274536,
  0.035583496,
  -0.013076782,
  -0.009208679,
  -0.025527954,
  0.05432129,
  0.027633667,
  0.06640625,
  0.013031006,
  -0.003868103,
  0.015083313,
  0.021072388,
  -0.02861023,
  -0.022003174,
  0.010719299,
  0.017730713,
  0.009780884,
  -0.018005371,
  -0.011352539,
  -0.00067949295,
  0.040374756,
  0.005832672,
  -0.0067443848,
  -0.015319824,
  0.03265381,
  -0.0625,
  -0.0016679764,
  -0.002702713,
  -0.01525116,
  -0.034942627,
  0.023422241,
  -0.031143188,
  0.003540039,
  -0.06890869,
  -0.034088135,
  0.004550934,
  0.008293152,
  0.055877686,
  0.050842285,
  0.04397583,
  -0.04333496,
  0.0017929077,
  0.04953003,
  -0.033111572,
  0.017089844,
  -0.0053901672,
  0.011672974,
  0.027359009,
  -0.0129470825,
  -0.01574707,
  0.02104187,
  0.013336182,
  -0.009666443,
  0.029266357,
  -

### upsert data to vector database

In [67]:
import uuid

vectors = []
for d, e in zip(texts, embeddings):
    vectors.append({
        "id": str(uuid.uuid4()),
        "values": e,
        "metadata": {'text': d}
    })

In [68]:
vectors

[{'id': '5c0afc61-5b78-4bdb-a808-64f1a8d7df4f',
  'values': [0.0018177032,
   0.004310608,
   -0.027832031,
   -0.019454956,
   -0.030288696,
   -0.009643555,
   -0.027008057,
   -0.018707275,
   0.015380859,
   0.07305908,
   0.031158447,
   -0.03274536,
   0.035583496,
   -0.013076782,
   -0.009208679,
   -0.025527954,
   0.05432129,
   0.027633667,
   0.06640625,
   0.013031006,
   -0.003868103,
   0.015083313,
   0.021072388,
   -0.02861023,
   -0.022003174,
   0.010719299,
   0.017730713,
   0.009780884,
   -0.018005371,
   -0.011352539,
   -0.00067949295,
   0.040374756,
   0.005832672,
   -0.0067443848,
   -0.015319824,
   0.03265381,
   -0.0625,
   -0.0016679764,
   -0.002702713,
   -0.01525116,
   -0.034942627,
   0.023422241,
   -0.031143188,
   0.003540039,
   -0.06890869,
   -0.034088135,
   0.004550934,
   0.008293152,
   0.055877686,
   0.050842285,
   0.04397583,
   -0.04333496,
   0.0017929077,
   0.04953003,
   -0.033111572,
   0.017089844,
   -0.0053901672,
   0.01167

In [69]:
index.upsert(
    vectors=vectors
    )

{'upserted_count': 4}

### Chat

In [79]:
query = 'what is the first sentence of the sample pdf'

In [80]:
query_embeddings =  co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
).embeddings

#### query data

In [81]:
documents = index.query(
    vector=query_embeddings[0],
    top_k=3,
    include_values=False,
    include_metadata=True
)['matches']

print(documents)

[{'id': '5c0afc61-5b78-4bdb-a808-64f1a8d7df4f',
 'metadata': {'text': 'Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\n'
                      'Lorem ipsum dolor sit amet, consectetuer adipiscing '
                      'elit. Phasellus facilisis odio sed mi. \n'
                      'Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum '
                      'ut lectus. Proin aliquam, erat eget \n'
                      'pharetra commodo, eros mi condimentum quam, sed commodo '
                      'justo quam ut velit. \n'
                      'Integer a erat. Cras laoreet ligula cursus enim. Aenean '
                      'scelerisque velit et tellus. \n'
                      'Vestibulum dictum aliquet sem. Nulla facilisi. '
                      'Vestibulum accumsan ante vitae elit. Nulla \n'
                      'erat dolor, blandit in, rutrum quis, semper pulvinar, '
                      'enim. Nullam varius congue risus. \n'
                      'Vivamus sollicitudi

In [82]:
references_for_LLMs = []
for document in documents:
    references_for_LLMs.append(document['metadata']['text'])

In [83]:
references_for_LLMs

['Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\nLorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. \nCurabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget \npharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. \nInteger a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. \nVestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla \nerat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. \nVivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique \naccumsan eros quam et risus. Suspendisse libero odio, mattis sit amet, aliquet eget, \nhendrerit vel, nulla. Sed vitae augue. Aliquam erat volutpat. Aliquam feugiat vulputate nisl. \nSuspendisse quis nulla pretium ante pretium mollis. Proin velit ligula, sagittis at, egestas a, \npulvinar quis, nisl.',
 'Sample PDFThis

#### ask chat

In [84]:
prompt = f"Question: {query}\nContext: {references_for_LLMs}\nAnswer:"

In [85]:
chat_completion = client.chat.completions.create(
    messages=[{"role": "user", 'content': prompt}],
    model="llama-3.3-70b-specdec",
)
res = chat_completion.choices[0].message.content

In [86]:
# Extract the response after the </think> tag
if '</think>' in res:
    res_after_think = res.split('</think>')[1].strip()
else:
    res_after_think = res.strip()

# Preprocess the extracted response
preprocessed_res = res_after_think.replace("\n", " ").replace("  ", " ")

print(preprocessed_res)

The first sentence of the sample PDF is: "This is a simple PDF file. Fun fun fun."
