In [None]:
pip install pinecone-client langchain sentence-transformers tqdm pinecone

Collecting pinecone-client
  Using cached pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-6.0.0-py3-none-any.whl (6.7 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-client, pinecone
Successfully installed pinecone-6.0.2 pinecone-client-6.0.0 pinecone-plugin-interface-0.0.7


In [None]:
!pip install --upgrade transformers
!pip install langchain_huggingface

Collecting transformers
  Downloading transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.3
    Uninstalling transformers-4.50.3:
      Successfully uninstalled transformers-4.50.3
Successfully installed transformers-4.51.0
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloadin

In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-community)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain-community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [None]:
from langchain.vectorstores import Pinecone as LangchainPinecone

In [None]:
import os
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import pinecone
from langchain.docstore.document import Document

In [None]:
import os
from getpass import getpass

# Never paste a real key into the notebook: it would be saved and shared with the
# file. Use the PINECONE_API_KEY environment variable when it is already set,
# otherwise ask for the key interactively without echoing it.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# Re-export so anything that reads the key from the environment sees the same value.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# Region handed to ServerlessSpec when the index is created.
PINECONE_ENV = "us-east-1"
# Name of the Pinecone index this notebook creates and writes to.
INDEX_NAME = "legal-db"

In [None]:
# Create the Pinecone client, authenticating with the configured API key.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Create the index only when no index with this name exists yet, so re-running
# the cell is harmless. `dimension` has to equal the length of the vectors that
# are upserted into the index.
if not any(existing_name == INDEX_NAME for existing_name in pc.list_indexes().names()):
    pc.create_index(
        name=INDEX_NAME,
        spec=ServerlessSpec(cloud="aws", region=PINECONE_ENV),
        metric="cosine",
        dimension=384,
    )

In [None]:
# Get a handle to the index; `upload_to_pinecone` upserts through this
# module-level name.
index = pc.Index(INDEX_NAME)

In [None]:
# Load Q&A pairs
def load_qa_pairs(json_paths):
    """Read every JSON file in ``json_paths`` and merge their contents into one list.

    Args:
        json_paths: Iterable of paths to UTF-8 encoded JSON files. The top-level
            value of each file is iterated and its items are appended in order.

    Returns:
        list: The items of all files, in the order the paths were given.
    """
    merged = []
    for json_path in json_paths:
        with open(json_path, mode='r', encoding='utf-8') as handle:
            merged += json.load(handle)
    return merged

In [None]:
# Embed and upsert
def upload_to_pinecone(pairs, batch_size=100, namespace="rag-legal",
                       model_name="all-MiniLM-L6-v2"):
    """Embed Q&A pairs and upsert them into the module-level Pinecone ``index``.

    Every pair becomes one record. The embedded text is the question and the
    answer joined as a ``Q: ...`` line followed by an ``A: ...`` line; the raw
    question and answer are stored as metadata. Record ids are
    ``qa-<position in pairs>``.

    Args:
        pairs: Iterable of mappings with ``"question"`` and ``"answer"`` keys.
        batch_size: Number of records embedded and upserted per request.
        namespace: Pinecone namespace the records are written to.
        model_name: SentenceTransformer model used to produce the embeddings.
            Its output length must match the dimension of the target index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so any iterable is accepted; slicing needs a sequence.
    pairs = list(pairs)
    embed_model = SentenceTransformer(model_name)

    with tqdm(total=len(pairs)) as progress:
        for start in range(0, len(pairs), batch_size):
            batch = pairs[start:start + batch_size]
            texts = [f"Q: {pair['question']}\nA: {pair['answer']}" for pair in batch]
            # One encode() call per batch instead of one per document; the
            # model's own progress bar is silenced so only the outer bar shows.
            embeddings = embed_model.encode(texts, show_progress_bar=False).tolist()
            vectors = [
                {
                    "id": f"qa-{start + offset}",
                    "values": values,
                    "metadata": {
                        "question": pair["question"],
                        "answer": pair["answer"],
                    },
                }
                for offset, (pair, values) in enumerate(zip(batch, embeddings))
            ]
            index.upsert(vectors=vectors, namespace=namespace)
            progress.update(len(batch))

    print(f"✅ Uploaded {len(pairs)} documents.")

In [None]:
# Entry point: load the three Q&A datasets (paths are relative to the current
# working directory) and upload them to Pinecone.
if __name__ == "__main__":
    json_files = [
        "constitution_qa.json",
        "crpc_qa.json",
        "ipc_qa.json",
    ]
    qa_pairs = load_qa_pairs(json_files)
    upload_to_pinecone(qa_pairs)

100%|██████████| 14543/14543 [02:30<00:00, 96.51it/s] 


✅ Uploaded 14543 documents.
