In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.chains import RetrievalQA
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
import os
import openai
from dotenv import load_dotenv

load_dotenv()

# Fail fast with a readable message when a key is absent. The previous
# `os.environ[K] = os.getenv(K)` lines were no-ops when the key was set and
# raised an opaque `TypeError` (environ values must be str) when it was not.
_missing_keys = [
    key_name
    for key_name in ("OPENAI_API_KEY", "PINECONE_API_KEY")
    if not os.getenv(key_name)
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Shared clients used by the cells below.
embed = OpenAIEmbeddings()
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
def load_data(dir_path):
    """Run ``PyPDFDirectoryLoader`` over ``dir_path`` and return what it loads.

    Args:
        dir_path: Directory handed straight to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(dir_path).load()`` returns
        (presumably a list of LangChain ``Document`` objects -- confirm
        against the loader's documentation).
    """
    return PyPDFDirectoryLoader(dir_path).load()

def split_chunks(data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into smaller chunks and collect their text.

    Args:
        data: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50, the value that used to be hard-coded here.

    Returns:
        A ``(chunks, texts)`` tuple: ``chunks`` is the splitter output and
        ``texts`` is the list of each chunk's ``page_content``, in the same
        order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(data)
    texts = [doc.page_content for doc in chunks]
    return chunks, texts

def embed_vectors(texts,chunks,embed, include_vector_in_metadata=True):
    """Embed chunk texts and wrap each embedding in an id/values/metadata record.

    Args:
        texts: List of strings passed to ``embed.embed_documents``;
            ``texts[i]`` must correspond to ``chunks[i]``.
        chunks: Sequence of objects exposing ``.metadata`` (a mapping) and
            ``.page_content``.
        embed: Object exposing ``embed_documents(texts)`` that returns one
            vector per input text.
        include_vector_in_metadata: When true (the default, matching the
            previous behaviour) the embedding is also stored under
            ``metadata["vector"]``. Pass ``False`` to drop that duplicate.
            NOTE(review): the copy is redundant with ``"values"`` and inflates
            every record; confirm against Pinecone's per-record metadata size
            limit before upserting with the default.

    Returns:
        A list of dicts, one per chunk, shaped as
        ``{"id": "id-<i>", "values": vector, "metadata": {...}}`` where the
        metadata holds ``"source"`` (empty string when the chunk has none),
        ``"text"`` and, optionally, ``"vector"``.

    Raises:
        ValueError: If ``texts`` and ``chunks`` differ in length, or the
            embedder returns a different number of vectors than there are
            chunks.
    """
    # Check alignment up front so a mismatch fails before any embedding call
    # instead of surfacing as an IndexError (or silently mislabelled records).
    if len(texts) != len(chunks):
        raise ValueError(
            f"texts and chunks must have the same length "
            f"(got {len(texts)} texts and {len(chunks)} chunks)"
        )

    embeddings = embed.embed_documents(texts)
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"embedder returned {len(embeddings)} vectors for {len(chunks)} chunks"
        )

    vectors = []
    for i, (chunk, vector) in enumerate(zip(chunks, embeddings)):
        metadata = {
            "source": chunk.metadata.get("source", ""),
            # Keep the raw chunk text alongside the vector so it can be read
            # back from the stored record.
            "text": chunk.page_content,
        }
        if include_vector_in_metadata:
            metadata["vector"] = vector
        vectors.append({"id": f"id-{i}", "values": vector, "metadata": metadata})
    return vectors

def format_vectors(embedded_vectors):
    """Rebuild each record with list/dict metadata values turned into strings.

    Args:
        embedded_vectors: Iterable of dicts carrying ``'id'``, ``'values'``
            and a ``'metadata'`` mapping.

    Returns:
        A new list of ``{'id', 'values', 'metadata'}`` dicts. ``'id'`` and
        ``'values'`` are passed through untouched; in ``'metadata'`` every
        value that is a ``list`` or ``dict`` is replaced by ``str(value)``
        and all other values are kept as they are. The input records are not
        modified.
    """
    def _as_scalar(meta_value):
        # Only containers are stringified; scalars pass through unchanged.
        if isinstance(meta_value, (list, dict)):
            return str(meta_value)
        return meta_value

    records = []
    for entry in embedded_vectors:
        record = {
            'id': entry['id'],
            'values': entry['values'],
            'metadata': {},
        }
        for field, meta_value in entry['metadata'].items():
            record['metadata'][field] = _as_scalar(meta_value)
        records.append(record)
    return records

In [None]:
if __name__ == "__main__":
    # End-to-end ingestion: load PDFs -> split -> embed -> reshape records.
    data = load_data("Data/")
    print("Data loaded successfully")
    chunks, texts = split_chunks(data)
    print("Chunks completed successfully")
    embedded_vectors = embed_vectors(texts,chunks,embed)
    print("Embedding completed successfully")
    pc_vectors = format_vectors(embedded_vectors)
    print("Formatted vectors to Pinecone DB format")
    # Report the embedding length, not the whole first record (which would
    # dump the full vector and chunk text), and guard against an empty run.
    if embedded_vectors:
        print(f"Dimension of vector: {len(embedded_vectors[0]['values'])}")
    else:
        print("No vectors were produced; check that Data/ contains readable PDFs")

Data loaded successfully


In [8]:
# Preview only the first couple of loaded pages; displaying the whole list
# floods the notebook output with every page's text.
data[:2]

[Document(metadata={'producer': 'PDFKit.NET 2.0.10.0', 'creator': '', 'creationdate': 'D:20240229033610', 'moddate': 'D:20240229152246', 'author': '', 'keywords': '', 'title': '', 'subject': '', 'source': 'Data\\finance.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='THE\nSURPRISINGL\nYSIMPLE W\nAY T\nO LAUNCHA 7\n-FIGUREBUSINESS IN\n 48 HOURS\nMI\nLLION DOLL\nAR WE\nEKENDM\nILLION DOLLAR WEEKENDN\nOAH KAGANwi\nth TAHL RAZFOUNDER \nAND CEO OF APPSUMON\nOAH KAGANw\nith TAHL RAZ‘Noah \nKagan changed my life. I created a business because of him. That business went on to sell for tens of millions of dollars.\n Million Dollar Weekend is the real deal.’                         SAM P\nARR, COFOUNDER OF HAMPTON, FOUNDER OF THE HUSTLE , HOST OF MY FIRST MILLIONEntrepreneurial success has a simple formula, which you can do\n in one life-changing weekend.\n It really is THAT easy. N\nOW is the best time in history to start a profitable business. Entrepreneurship doesn’t have