In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader


# Load PDFs from a directory
def load_pdf_directory(data_dir):
    loader = PyPDFDirectoryLoader(data_dir)
    documents = loader.load()
    return documents

# Load a single PDF file
def load_pdf_file(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    return documents

# Split the text into smaller chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

# Download the embeddings from huggingface
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


In [2]:
from flask import Flask, render_template, jsonify, request
import os
from werkzeug.utils import secure_filename
from src.helper import text_split, download_hugging_face_embeddings
from langchain_pinecone import PineconeVectorStore
from langchain_together import ChatTogether
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from src.prompt import *
from langchain.document_loaders import PyPDFLoader
from pinecone.grpc import PineconeGRPC as Pinecone
import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import uuid
from dotenv import load_dotenv
from werkzeug.utils import secure_filename
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from src.helper import text_split, download_hugging_face_embeddings
from langchain_community.document_loaders import PyPDFLoader

In [4]:
# Cell 2: Load keys and init
load_dotenv()  
API_KEY = os.getenv("PINECONE_API_KEY")
assert API_KEY, "PINECONE_API_KEY not set in .env!"

# set for pinecone-client
os.environ["PINECONE_API_KEY"] = API_KEY
pc = Pinecone(api_key=API_KEY)

# Choose your test index name
INDEX_NAME = "pdfbot"

In [5]:
# Cell 3: port your helper function
def process_uploaded_pdfs(file_paths, session_id):
    # load + chunk
    docs = []
    for fp in file_paths:
        loader = PyPDFLoader(fp)
        docs.extend(loader.load())
    chunks = text_split(docs)
    
    # embeddings
    emb = download_hugging_face_embeddings()
    
    # ensure index exists
    existing = pc.list_indexes()  # returns List[str]
    if INDEX_NAME not in existing:
        pc.create_index(name=INDEX_NAME, dimension=384, metric="cosine")
        print(f"➡️ Created index {INDEX_NAME}")
    else:
        print(f"✅ Index {INDEX_NAME} already exists")
    
    # upsert
    vs = PineconeVectorStore.from_documents(
        documents=chunks,
        index_name=INDEX_NAME,
        embedding=emb,
        namespace=session_id
    )
    return vs


In [7]:
# Cell 4: point to your PDF(s)
uploaded = ["Data/Brown-Giving-PsychSci-2003.pdf"]  # adjust path as needed
session = str(uuid.uuid4())
vector_store = process_uploaded_pdfs(uploaded, session)

# Check how many vectors you now have
print("Vectors upserted:", vector_store.client.describe_index(INDEX_NAME).namespaces[session].vector_count)


ValueError: File path Data/Brown-Giving-PsychSci-2003.pdf is not a valid file or url

In [12]:
# Cell 3: port your helper logic, now using load_pdf_file
from src.helper import load_pdf_directory, text_split, download_hugging_face_embeddings
from langchain_pinecone import PineconeVectorStore

def process_uploaded_pdfs(data_dir, session_id):
    """
    1) Loads all PDFs from a directory via load_pdf_file
    2) Splits into chunks via text_split
    3) Embeds with download_hugging_face_embeddings
    4) Ensures the INDEX_NAME exists in Pinecone
    5) Upserts into PineconeVectorStore under namespace=session_id
    """
    # 1. Load & chunk
    documents    = load_pdf_directory(data_dir)       # returns List[Document]
    text_chunks  = text_split(documents)         # returns List[DocumentChunk]

    # 2. Get embeddings instance
    embeddings   = download_hugging_face_embeddings()

    # 3. Ensure index exists
    existing = pc.list_indexes()                 # returns List[str]
    if INDEX_NAME not in existing:
        pc.create_index(
            name=INDEX_NAME,
            dimension=384,                       # must match your embedder
            metric="cosine",
            spec=ServerlessSpec(
                cloud = "aws",
                region = "us-east-1"
    )
        )
        print(f"➡️ Created index '{INDEX_NAME}'")
    else:
        print(f"✅ Index '{INDEX_NAME}' already exists")

    # 4. Upsert chunks into Pinecone under this session namespace
    vs = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=INDEX_NAME,
        embedding=embeddings,
        namespace=session_id
    )

    return vs


In [14]:
from pinecone import ServerlessSpec

In [15]:
# Cell 4: point to your PDF directory instead of individual files
session = str(uuid.uuid4())
vs = process_uploaded_pdfs(data_dir="Data/", session_id=session)

# confirm
print("Vectors in namespace:", 
      vs.client.describe_index(INDEX_NAME)
         .namespaces[session]
         .vector_count
)


➡️ Created index 'pdfbot'


AttributeError: 'PineconeVectorStore' object has no attribute 'client'

In [16]:
vs

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x227f2d9e9b0>

In [17]:
existing_indexes = pc.list_indexes()
existing_indexes

[
    {
        "name": "medicalbot",
        "metric": "cosine",
        "host": "medicalbot-m2e5ynz.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "pdfbot",
        "metric": "cosine",
        "host": "pdfbot-m2e5ynz.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    }
]