In [None]:
!pip install sentence-transformers faiss-cpu
!pip install llama-index faiss-cpu openai



In [None]:
import fitz
import json
import os

In [None]:
def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, joined with no
        separator between pages.
    """
    # The context manager closes the document when the block exits, including
    # when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # A single join avoids rebuilding the string on every page.
        return "".join(page.get_text() for page in doc)

In [None]:
def chunk_text(text, chunk_size=500, overlap=0):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The string to split. Words are found with ``str.split()``, so
            any run of whitespace is a separator and is replaced by a single
            space in the output.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < chunk_size``. The default of 0
            produces back-to-back, non-overlapping chunks.

    Returns:
        A list of chunk strings in document order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap!r}"
        )

    words = text.split()
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat words already emitted, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
# Short source label -> location of the PDF that label refers to.
pdf_files = {
    "CPA2019": "/content/CPA2019.pdf",
    "MVA": "/content/MVA.pdf",
    "RTI": "/content/rti-act.pdf"
}

# Start from an empty list so that running this cell rebuilds the PDF part of
# the corpus from scratch.
corpus = []

for label, path in pdf_files.items():
    # One record per 500-word chunk, each tagged with the label of its PDF.
    corpus.extend(
        {"text": piece, "metadata": {"source": label}}
        for piece in chunk_text(extract_pdf_text(path), chunk_size=500)
    )

In [None]:
# Load the question/answer dataset; it is iterated below as a sequence of
# records that each carry a "question" and an "answer".
with open("/content/IndicLegalQA Dataset_10K_Revised.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# Drop QA entries left by an earlier run of this cell, so that re-running it
# does not append a second copy of the whole dataset to `corpus`.
corpus[:] = [
    entry for entry in corpus
    if entry.get("metadata", {}).get("source") != "IndicLegalQA"
]

for qa in qa_data:
    corpus.append({
        # Question and answer are stored together as one text entry.
        "text": f"Q: {qa['question']} A: {qa['answer']}",
        "metadata": {
            "source": "IndicLegalQA",
            # Optional fields: fall back to placeholders when a record lacks them.
            "case_name": qa.get("case_name", "Unknown"),
            "judgement_date": qa.get("judgement_date", "")
        }
    })



In [None]:
# Report how many entries `corpus` holds after the PDF chunks and the QA pairs
# have been added by the cells above.
print(f"Legal corpus created with {len(corpus)} entries.")


Legal corpus created with 10190 entries.


In [None]:
# Preview only the first few entries: printing the whole corpus would write
# every one of its thousands of records into the notebook output.
corpus[:3]



In [None]:
import json
import faiss
from llama_index.indices.vector_store.faiss import FaissVectorStore
from llama_index.vector_stores import VectorStoreQuery
from llama_index import ServiceContext, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from openai import OpenAI
from llama_index.llms import OpenAI as LlamaOpenAI

# Load the saved corpus metadata and the saved FAISS index from disk.
# NOTE(review): this rebinds `corpus`, replacing the in-memory corpus built by
# the earlier cells with whatever legal_metadata.json contains. Neither that
# file nor legal_index.faiss is written anywhere in the visible notebook, so
# both are presumably produced by a separate embedding step — confirm they
# exist before running this cell on a fresh kernel.
with open("legal_metadata.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

index = faiss.read_index("legal_index.faiss")

# Wrap the FAISS index in a LlamaIndex vector store. The `docstore` argument is
# a dict mapping each entry's position in `corpus` to {"text": <entry text>}.
# NOTE(review): assumes entry i of legal_metadata.json corresponds to vector i
# of the FAISS index — TODO confirm. Also verify against the installed
# llama-index release that FaissVectorStore is importable from the module path
# used in this cell's imports and that its constructor accepts `docstore`.
vector_store = FaissVectorStore(faiss_index=index, docstore={i: {"text": chunk["text"]} for i, chunk in enumerate(corpus)})

# Set up the OpenAI chat model used to generate answers.
llm = LlamaOpenAI(model="gpt-3.5-turbo", temperature=0.3)  # Requires OPENAI_API_KEY

# NOTE(review): only `llm` is passed here, so the embedding model is left at
# the library default; it presumably has to match the model that produced
# legal_index.faiss — confirm.
service_context = ServiceContext.from_defaults(llm=llm)

# Create the final queryable index object from the vector store and service context.
rag_index = GPTVectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)
