In [2]:
# Mount Google Drive into the Colab runtime; the PDF dataset used later in
# this notebook is read from a folder under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!pip install -q -U langchain-community pinecone-client openai tqdm python-dotenv pypdf

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/290.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

from pinecone.grpc import PineconeGRPC as pinecone
from pinecone import ServerlessSpec

from openai import OpenAI
from tqdm.auto import tqdm

import os
import getpass

In [14]:
# Ask for the Pinecone key interactively so it is never written into the notebook source.
pinecone_api_key = getpass.getpass("Enter Pinecone API key: ")

Enter Pinecone API key: ··········


In [15]:
# Ask for the OpenAI key interactively and export it through the process environment.
openai_api_key = getpass.getpass("Enter OpenAI API key: ")
os.environ['OPENAI_API_KEY'] = openai_api_key
# The client is built without an explicit key, so it relies on OPENAI_API_KEY set on the line above.
openai_client = OpenAI()

Enter OpenAI API key: ··········


In [16]:
# Name of the Pinecone index that holds the embedded book chunks.
index_name = "potteroracle"
# OpenAI embedding model; the same model is used for the chunks and for queries.
model_name = "text-embedding-3-small"

In [17]:
def data_ingestion(directory: str, chunk_size: int = 800, chunk_overlap: int = 50) -> list:
    """
    Load every PDF in a directory and split the text into overlapping chunks.

    Args:
    directory (str): The directory containing PDF documents.
    chunk_size (int): Chunk size handed to the text splitter. Defaults to 800.
    chunk_overlap (int): Overlap between neighbouring chunks handed to the
        text splitter. Defaults to 50.

    Returns:
    list: The document chunks. Each chunk has its newlines replaced by spaces
    and carries two extra metadata entries: 'id' ("chunk_<n>", numbered from 0
    in split order) and 'text' (a copy of the cleaned page content).
    """
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)

    for i, doc in enumerate(docs):
        # Flatten the line breaks that come out of the PDF extraction.
        doc.page_content = doc.page_content.replace("\n", " ")
        doc.metadata['id'] = f"chunk_{i}"
        # Keep the text inside the metadata so it travels with the vector and
        # can be read back from a query match.
        doc.metadata['text'] = doc.page_content
    return docs

In [20]:
# Drive folder that holds the source PDFs (absolute Colab path; adjust it when running elsewhere).
document_directory = '/content/drive/MyDrive/Artificial Intelligence/projects/The Oracle of Hogwarts/datasets'

# Perform data ingestion and get the processed chunks
document_chunks = data_ingestion(directory=document_directory)
# Report the chunk count as a quick sanity check on the split.
print(f"\n********************* Number of chunks created: {len(document_chunks)} ***************************")


********************* Number of chunks created: 9049 ***************************


In [21]:
# Print the first few chunks for verification
num_chunks_to_print = 5
for i, chunk in enumerate(document_chunks[:num_chunks_to_print]):
    print(f"Chunk {i+1}:")
    print(chunk)
    print("\n" + "-"*80 + "\n")

Chunk 1:
page_content='CHAPTER ONE THE OTHER MINISTER It was nearing midnight and the Prime Minister was sitting alone in his office, reading a long memo that was slipping through his brain without leaving the slightest trace of meanin g behind. He was waiting for a call from the President of a far distant country, and between wondering when the wretched man would telephone, and trying to suppress unpleasant memor ies of what had been a very long, tiring, and difficult week, there was not much space in his head for anything else. The more he attempted to focus on the print on the page before  him, the more clearly the Prime Minister could see the gloating face of one of his political opponents. This particular opponent had appeared on the news that very day, not only to' metadata={'source': '/content/drive/MyDrive/Artificial Intelligence/projects/The Oracle of Hogwarts/datasets/book6.pdf', 'page': 0, 'id': 'chunk_0', 'text': 'CHAPTER ONE THE OTHER MINISTER It was nearing midnight and t

In [22]:
def create_serverless_index(name: str, dimension: int, metric: str, cloud: str, region: str, api_key: str) -> pinecone:
    """
    Create a serverless Pinecone index, reusing it when it already exists.

    Creating an index under a name that is already taken is rejected by
    Pinecone, which made this cell fail on every re-run of the notebook. The
    existence check below makes the call safe to repeat.

    Args:
    name (str): The name of the index.
    dimension (int): The dimension of the vectors.
    metric (str): The distance metric to use.
    cloud (str): The cloud provider.
    region (str): The region of the cloud provider.
    api_key (str): The API key for Pinecone.

    Returns:
    pinecone: The Pinecone client instance.
    """
    pinecone_client = pinecone(api_key=api_key)

    # NOTE(review): an existing index is reused as-is; its dimension and metric
    # are not compared with the arguments passed here.
    if name in pinecone_client.list_indexes().names():
        print(f"\n******************* Serverless index '{name}' already exists; reusing it. **********************")
        return pinecone_client

    pinecone_client.create_index(
        name=name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region)
    )
    print(f"\n******************* Serverless index '{name}' created successfully. **********************")
    return pinecone_client


In [23]:
# Create the serverless index
# NOTE(review): 1536 is hard-coded as the vector dimension; it has to equal the
# length of the embeddings returned for `model_name` (1536 is the documented
# default for text-embedding-3-small) - confirm if the model is ever changed.
pinecone_client = create_serverless_index(index_name, 1536, "cosine", "aws", "us-east-1", pinecone_api_key)
# Module-level handle to the index; the upsert and query helpers read it as a global.
index = pinecone_client.Index(index_name)


******************* Serverless index 'potteroracle' created successfully. **********************


In [24]:
def embed(documents: list[str], model_name: str) -> list[list[float]]:
    """
    Generate embeddings for a list of documents using OpenAI.

    Uses the module-level ``openai_client``.

    Args:
    documents (list[str]): A list of documents to embed.
    model_name (str): The name of the OpenAI model to use.

    Returns:
    list[list[float]]: One embedding per entry of the API response; an empty
    list when ``documents`` is empty.
    """
    # Nothing to embed: skip the request, since the embeddings endpoint does
    # not accept an empty input.
    if not documents:
        return []

    response = openai_client.embeddings.create(input=documents, model=model_name)
    document_embeddings = [result.embedding for result in response.data]
    return document_embeddings

In [25]:
def upsert_chunks_in_batches(chunks: list, batch_size: int, model_name: str) -> None:
    """
    Upsert document chunks into the Pinecone index in batches.

    Each batch is embedded with ``embed`` and written to the module-level
    ``index`` as (id, embedding, metadata) tuples, where the id is the
    chunk's ``metadata['id']``.

    Args:
    chunks (list): A list of document chunks.
    batch_size (int): The number of chunks to upsert in each batch. Must be positive.
    model_name (str): The name of the OpenAI model to use for embedding.

    Returns:
    None

    Raises:
    ValueError: If ``batch_size`` is not a positive integer.
    """
    # A zero step makes range() fail with an unrelated-looking message, and a
    # negative one yields no batches at all while still reporting success.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in tqdm(range(0, len(chunks), batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        document_batch = chunks[start:start + batch_size]
        embeddings = embed([chunk.page_content for chunk in document_batch], model_name=model_name)
        vectors_to_upsert = [
            (chunk.metadata['id'], embedding, chunk.metadata)
            for chunk, embedding in zip(document_batch, embeddings)
        ]
        index.upsert(vectors=vectors_to_upsert)
    print("\n****************** Upsert Operation successful*********************")

In [26]:
# Embed and upload every chunk, 100 chunks per batch.
upsert_chunks_in_batches(document_chunks, batch_size=100, model_name=model_name)

  0%|          | 0/91 [00:00<?, ?it/s]


****************** Upsert Operation successful*********************


In [27]:
def get_docs(query: str, top_k: int, model_name: str) -> list[str]:
    """
    Look up the chunks in the Pinecone index that best match a query.

    The query is embedded with ``embed`` and searched for in the module-level
    ``index``.

    Args:
    query (str): The query string.
    top_k (int): The number of top documents to retrieve.
    model_name (str): The name of the OpenAI model used to embed the query.

    Returns:
    list[str]: The 'text' metadata of every match, in the order the index
    returned them.
    """
    print(f"Getting docs with {index_name}")

    # embed() works on batches; send a one-element batch and take its only vector.
    query_vectors = embed([query], model_name=model_name)
    query_vector = query_vectors[0]

    response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    texts = []
    for match in response["matches"]:
        texts.append(match["metadata"]['text'])
    return texts

In [28]:
def execute_sample_query(query: str, top_k: int, model_name: str) -> None:
    """
    Run one query through ``get_docs`` and print what comes back.

    The output starts with a ">>>" line, and every retrieved document is
    followed by another ">>>" line.

    Args:
    query (str): The query string.
    top_k (int): The number of top documents to retrieve.
    model_name (str): The name of the OpenAI model used to embed the query.

    Returns:
    None
    """
    marker = ">>>"
    matches = get_docs(query=query, top_k=top_k, model_name=model_name)
    print(marker)
    for text in matches:
        # Document on one line, separator on the next.
        print(text, marker, sep="\n")

In [29]:
# Execute a sample query
# Smoke test of the retrieval path: embeds the question and prints the five closest chunks.
execute_sample_query(
    query="What is the function of the Marauder's Map?",
    top_k=5,
    model_name=model_name
)

Getting docs with potteroracle
>>>
“What is this thing?” said Moody, drawing the Marauder’s Map out of his pocket and unfolding it.
>>>
And at once, thin ink lines began to spread like a spider’s web from the point that George’s wand had touched. They joined each other, they crisscrossed, they fanned into every corn er of the parchment; then words began to blossom across the top, great, curly green words, that proclaimed: Messrs. Moony, Wormtail, Padfoot, and ProngsPurveyors of Aids to Magical Mischief-Makersare proud to present THE MARAUDER’S MAP It was a map showing every detail of the Hogwarts castle and grounds. But the truly remarkable thing were the tiny ink dots moving around it, each labeled with a name in minuscule w riting. Astounded, Harry bent over it. A labeled dot in the top left corner showed that Professor Dumbledore was pacing his study; the caretaker’s cat, Mrs. Norris, was prowling the
>>>
>>>
“The map,” said Lupin. “The Marauder’s Map. I was in my office examining i