In [1]:
# Patch asyncio so an already-running event loop can be re-entered. This runs
# in the first cell, before any other library is imported.
# NOTE(review): presumably required because the notebook kernel already owns an
# event loop and later llama_index calls use asyncio internally — confirm.
import nest_asyncio
nest_asyncio.apply()

In [2]:
import os
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.embeddings.openai import OpenAIEmbedding
from pinecone import Pinecone, ServerlessSpec
from llama_index.core.node_parser import SimpleNodeParser
import pinecone

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early, with a readable message, when a required credential is absent.
# The previous `os.environ[key] = os.getenv(key)` round-trip was a no-op when
# the key existed and raised an opaque TypeError when it did not.
_missing_env_vars = [
    key for key in ("OPENAI_API_KEY", "PINECONE_API_KEY") if not os.getenv(key)
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Set them in the environment or in a .env file."
    )

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Shared Pinecone client used by the ingestion helpers defined below.
pc = Pinecone(api_key=PINECONE_API_KEY)

  from tqdm.autonotebook import tqdm


In [3]:
def find_subfolders(folder_path):
    """Collect the path of every directory nested anywhere below ``folder_path``.

    The tree is traversed with ``os.walk``; each entry in the result is the
    visited parent path joined with one of its child directory names.
    ``folder_path`` itself is not part of the result.

    Args:
        folder_path (str): Root directory to walk.

    Returns:
        list[str]: Paths of all nested sub-directories, in walk order.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _files in os.walk(folder_path)
        for child in children
    ]

def ingest_pdfs(
    directory_path,
    index_name,
    *,
    dimension=1536,
    metric="cosine",
    cloud="aws",
    region="us-east-1",
):
    """Load documents from a directory, embed them, and store them in Pinecone.

    Despite the name, files with ``.txt``, ``.pdf`` and ``.docx`` extensions
    are all ingested. Documents are read and chunked locally first; only then
    is the Pinecone index looked up (and created if absent) and the chunks
    embedded and written to it.

    Args:
        directory_path (str): Path to the directory containing the documents.
        index_name (str): Name of the Pinecone index to use; it is created as
            a serverless index when it does not exist yet.
        dimension (int): Vector dimension used when creating the index. It has
            to match the embedding model's output size; 1536 is the value the
            notebook uses for OpenAI embeddings.
        metric (str): Similarity metric used when creating the index.
        cloud (str): Cloud provider for a newly created serverless index.
        region (str): Cloud region for a newly created serverless index.

    Returns:
        VectorStoreIndex: Index containing the documents from the directory.

    Raises:
        ValueError: If ``index_name`` is empty or ``None``.
    """
    if not index_name:
        raise ValueError(
            "index_name must be a non-empty string "
            "(is PINECONE_INDEX_NAME set in the environment?)"
        )

    # Read and chunk locally before touching Pinecone, so that a bad path or
    # an unreadable file fails without first creating a remote index.
    documents = SimpleDirectoryReader(
        directory_path,
        required_exts=[".txt", ".pdf", ".docx"],
        exclude=["*.pptx", "*.zip"],
    ).load_data()

    parser = SimpleNodeParser.from_defaults(chunk_overlap=50)
    nodes = parser.get_nodes_from_documents(documents)

    # Mirror the reader-provided "page_label" (when present) under
    # "page_number"; chunks without one get the placeholder "N/A".
    for node in nodes:
        node.metadata["page_number"] = node.metadata.get("page_label", "N/A")

    # Create the index only when it is missing.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    pinecone_index = pc.Index(index_name)

    # Route all vector writes for this index to Pinecone.
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # NOTE(review): calling this again for the same directory appears to add
    # the chunks a second time rather than replace them — confirm whether
    # de-duplication is needed.
    embed_model = OpenAIEmbedding()
    return VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
    )

In [4]:
# Directory of documents to ingest. Set GRANTS_DATA_DIR (in the environment or
# in .env) to point at a different location on another machine; when it is
# unset, the original local path is used unchanged.
main_path = os.getenv(
    "GRANTS_DATA_DIR",
    "/Users/rishub/Desktop/projects/personal/agents-grant/sample_files/HS Grants",
)

In [5]:
# The target index name comes from the environment; stop with a clear message
# instead of handing None to the ingestion helper when it is not configured.
index_name = os.getenv("PINECONE_INDEX_NAME")
if not index_name:
    raise RuntimeError(
        "PINECONE_INDEX_NAME is not set; define it in the environment or in .env"
    )

# Embeds everything under main_path and writes the vectors to Pinecone.
index = ingest_pdfs(main_path, index_name)

Upserted vectors: 100%|██████████| 105/105 [00:10<00:00,  9.58it/s]
