In [None]:
pip install pinecone-client langchain langchain-community langchain-huggingface huggingface-hub


In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# =========================
# Step 1: Load Environment
# =========================
# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(override=True)

# The key is read from PINECONE_API; PINECONE_API_KEY (the variable the
# Pinecone client itself falls back to) is accepted as an alternative.
PINECONE_API_KEY = os.getenv("PINECONE_API") or os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail fast: without this check a missing key is only noticed when the
    # Pinecone client is constructed, after the PDFs were loaded and embedded.
    raise RuntimeError(
        "Pinecone API key not found: set PINECONE_API in the environment "
        "or in the .env file."
    )

PINECONE_REGION = "us-east-1"  # region handed to ServerlessSpec
INDEX_NAME = "medical-chatbot-index"
EMBED_DIM = 1024  # index dimension; must equal the embedding vector length

# =========================
# Step 2: Load & Split PDFs
# =========================
# Read every file matching "*.pdf" in the "data" directory with PyPDFLoader.
loader = DirectoryLoader(path="data", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=150).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
text_chunks = splitter.split_documents(documents)

# Plain text of each chunk, in chunk order.
texts = [piece.page_content for piece in text_chunks]

# =========================
# Step 3: Generate Embeddings
# =========================
EMBED_MODEL_NAME = "intfloat/e5-large-v2"

# Embed every entry of `texts` with the Hugging Face model named above.
# NOTE(review): the e5 model card recommends prefixing inputs with
# "passage: " (documents) / "query: " (queries) — confirm whether the query
# side applies a prefix before changing anything here.
embedding_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
embeddings = embedding_model.embed_documents(texts=texts)

# =========================
# Step 4: Initialize Pinecone v3
# =========================
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only if it does not exist yet. names() is the client's
# public accessor for the index names; it replaces reaching into the
# response's internal `index_list["indexes"]` payload.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBED_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_REGION),
    )

# Handle used for all data operations on the index.
index = pc.Index(INDEX_NAME)

# =========================
# Step 5: Upload Vectors
# =========================
# Pinecone caps the size of a single upsert request, so the vectors are sent
# in fixed-size batches instead of one request holding the whole corpus.
UPSERT_BATCH_SIZE = 100

# Every text needs exactly one embedding; a mismatch (e.g. notebook cells run
# out of order) would attach vectors to the wrong text.
if len(embeddings) != len(texts):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(texts)} text chunks; "
        "re-run the embedding step."
    )

# One record per chunk: id "chunk-<position>", the embedding as values, and
# the chunk text stored as metadata.
vectors = [
    {
        "id": f"chunk-{i}",
        "values": values,
        "metadata": {"text": text},
    }
    for i, (text, values) in enumerate(zip(texts, embeddings))
]

# With no vectors the loop body never runs and no request is sent.
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

print(f"✅ Uploaded {len(vectors)} vectors to Pinecone index.")
