# Import all necessary libraries

In [None]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai
from langchain_community.embeddings import HuggingFaceEmbeddings


  from .autonotebook import tqdm as notebook_tqdm


# Load environment variables and set up the embedding model and the LLM

In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# API credentials are read from the environment rather than written into the notebook.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY_3")
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Gemini client: register the key, then create the generation model handle.
genai.configure(api_key=GEMINI_API_KEY)
llm_model = genai.GenerativeModel("gemini-2.5-pro")

# Sentence-embedding model used for both the document chunks and the queries.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Load the PDF file, split it into chunks, and create a vector for each chunk

In [None]:
# Location of the source PDF. It can be overridden with the RAG_PDF_PATH
# environment variable so the notebook is not tied to one machine; the original
# absolute path is kept as the fallback.
PDF_PATH = os.getenv(
    "RAG_PDF_PATH",
    r"C:\Users\Abdul\OneDrive\Desktop\learning_projects\FINAL_RAG\ML_Lectures.pdf",
)

loader = PyPDFLoader(PDF_PATH)
pages = loader.load()

# Split the pages into chunks of at most 500 characters with a 100-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(pages)
print(len(chunks))

# Fail here with a clear message instead of with an IndexError in the cell
# that later reads docs_with_vectors[0].
if not chunks:
    raise ValueError(f"No text chunks could be extracted from {PDF_PATH!r}")

# Embed every chunk in one batched call rather than one model call per chunk.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_vectors = embedding_model.embed_documents(chunk_texts)

# One record per chunk, using the "id" / "values" / "metadata" keys that the
# upsert step passes to the index.
docs_with_vectors = [
    {
        "id": f"doc-{i}",
        "values": vector,
        "metadata": {"text": text},
    }
    for i, (text, vector) in enumerate(zip(chunk_texts, chunk_vectors))
]

# Set up Pinecone and create the index if it does not exist

In [None]:
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "updated-rag"

# Use the length of the first embedding as the index dimension.
dimension = len(docs_with_vectors[0]["values"])

# Create the index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless_spec,
    )

In [13]:
# Open a handle to the index named by `index_name`.
index = pc.Index(index_name)

# Fetch the index statistics (printed in the next cell).
stats = index.describe_index_stats()


In [15]:
# Display the index statistics held in `stats`.
print(stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


# Upsert vectors into the vector database

Check whether the index is empty. If it is, insert the documents with their vectors; otherwise, skip the upload.

In [None]:
# tqdm supplies the progress bar used by batch_upsert below.
from tqdm import tqdm

# Handle to the Pinecone index used by the upload step in this cell.
index = pc.Index(index_name)

def is_index_empty(index):
    """Return True when the index reports a total vector count of zero.

    Args:
        index: Object exposing ``describe_index_stats()``, whose result can be
            subscripted with ``"total_vector_count"``.

    Returns:
        bool: ``True`` if the reported count equals 0, otherwise ``False``.
    """
    vector_count = index.describe_index_stats()["total_vector_count"]
    return vector_count == 0

def batch_upsert(index, docs_with_vectors, batch_size=1000):
    """Upload records to the index in consecutive slices, showing a progress bar.

    Args:
        index: Object exposing ``upsert(batch)``.
        docs_with_vectors: Sequence of records to upload; it is sliced in order.
        batch_size: Maximum number of records sent per ``upsert`` call.
            Must be a positive integer.

    Raises:
        ValueError: If ``batch_size`` is not positive. Without this check a
            negative value would silently upload nothing and zero would fail
            with an unrelated ``range()`` error.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    batch_starts = range(0, len(docs_with_vectors), batch_size)
    for start in tqdm(batch_starts, desc="Uploading vectors"):
        batch = docs_with_vectors[start:start + batch_size]
        index.upsert(batch)

# Usage: upload only when the index holds no vectors, so re-running this cell
# does not upload the same documents again.
if not is_index_empty(index):
    print("Index already has data. Skipping upload.")
else:
    print("Index is empty. Uploading vectors...")
    batch_upsert(index, docs_with_vectors)
    print("Upload completed.")


Index is empty. Uploading vectors...


Uploading vectors: 100%|██████████| 2/2 [00:06<00:00,  3.21s/it]

Upload completed.





# Query 1

Query the index for similar documents.

In [24]:
query = "What are different types of machine learning?"
# Embed the question with the same model that embedded the document chunks.
query_vector = embedding_model.embed_query(query)

# Retrieve the 10 closest chunks together with their stored metadata.
# `index` was created from `index_name`, so no index name is passed here;
# `index_name` is not a documented parameter of `Index.query`.
response = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True
)

print(response)

{'matches': [{'id': 'doc-77',
              'metadata': {'text': 'Chapter 1\n'
                                   'Introduction to machine learning\n'
                                   'In this chapter, we consider different '
                                   'deﬁnitions of the term “machine learning” '
                                   'and explain what\n'
                                   'is meant by “learning” in the context of '
                                   'machine learning. We also discuss the '
                                   'various components\n'
                                   'of the machine learning process. There are '
                                   'also brief discussions about different '
                                   'types learning like\n'
                                   'supervised learning, unsupervised learning '
                                   'and reinforcement learning.\n'
                                   '1.1 Introduction\n'
  

# Final response using the prompt template

In [None]:
matches = response.get("matches", [])

# Join the retrieved chunk texts, separated by blank lines, to form the context.
context = "\n\n".join(match["metadata"]["text"] for match in matches)

# Load the prompt template. The encoding is stated explicitly so the read does
# not depend on the platform default (e.g. cp1252 on Windows).
# NOTE(review): assumes prompt_template.txt is saved as UTF-8 — confirm.
with open("prompt_template.txt", "r", encoding="utf-8") as file:
    prompt_template = file.read()

# Fill the template's {context} and {query} placeholders.
final_prompt = prompt_template.format(context=context, query=query)

gemini_response = llm_model.generate_content(final_prompt)

print("\nGemini's Answer:\n")
print(gemini_response.text)


Gemini's Answer:

Machine learning algorithms can be classified into three types:
*   Supervised learning
*   Unsupervised learning
*   Reinforcement learning


# Similarity scores between the retrieved context chunks and the final response

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed the generated answer once; it is compared against every retrieved chunk.
answer_text = gemini_response.text
answer_embedding = embedding_model.embed_query(answer_text)

# Report the cosine similarity between the answer and each chunk, numbered from 1.
for chunk_number, match in enumerate(matches, start=1):
    chunk_embedding = embedding_model.embed_query(match["metadata"]["text"])

    # cosine_similarity returns a 1x1 matrix for a single pair of vectors.
    score_matrix = cosine_similarity([answer_embedding], [chunk_embedding])
    similarity_score = score_matrix[0][0]

    print(f"🔍 Semantic Similarity with Chunk {chunk_number}: {similarity_score:.2f}")


🔍 Semantic Similarity with Chunk 1: 0.73
🔍 Semantic Similarity with Chunk 2: 0.58
🔍 Semantic Similarity with Chunk 3: 0.54
🔍 Semantic Similarity with Chunk 4: 0.55
🔍 Semantic Similarity with Chunk 5: 0.55


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


# Query 2

In [39]:
query = "Name the author of Lecture Notes in Machine Learning"
# Embed the question with the same model that embedded the document chunks.
query_vector = embedding_model.embed_query(query)

# Retrieve the 10 closest chunks together with their stored metadata.
# `index` was created from `index_name`, so no index name is passed here;
# `index_name` is not a documented parameter of `Index.query`.
response = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True
)

print(response)

{'matches': [{'id': 'doc-0',
              'metadata': {'text': 'Lecture Notes in \n'
                                   'MACHINE LEARNING \n'
                                   ' \n'
                                   ' Dr V N Krishnachandran \n'
                                   '  \n'
                                   ' \n'
                                   'Vidya Centre for Artificial Intelligence '
                                   'Research'},
              'score': 0.72664839,
              'values': []},
             {'id': 'doc-1',
              'metadata': {'text': 'Syllabus\n'
                                   'Course code Course Name L - T - P - '
                                   'Credits Year of introduction\n'
                                   'CS467 Machine Learning 3 - 0 - 0 - 3 2016\n'
                                   'Course Objectives\n'
                                   '• To introduce the prominent methods for '
                                   'machin

In [None]:
matches = response.get("matches", [])

# Join the retrieved chunk texts, separated by blank lines, to form the context.
context = "\n\n".join(match["metadata"]["text"] for match in matches)

# Load the prompt template. The encoding is stated explicitly so the read does
# not depend on the platform default (e.g. cp1252 on Windows).
# NOTE(review): assumes prompt_template.txt is saved as UTF-8 — confirm.
with open("prompt_template.txt", "r", encoding="utf-8") as file:
    prompt_template = file.read()

# Fill the template's {context} and {query} placeholders.
final_prompt = prompt_template.format(context=context, query=query)

gemini_response = llm_model.generate_content(final_prompt)

print("\nGemini's Answer:\n")
print(gemini_response.text)


Gemini's Answer:

Dr V N Krishnachandran
