In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [11]:
!pip install faiss-cpu --quiet


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import dotenv
dotenv.load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is not set in the environment variables.")

In [4]:
embeddings = GoogleGenerativeAIEmbeddings(
    model="model/embedding-001",
    google_api_key=GOOGLE_API_KEY
)

In [5]:
pdf_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=250,
)

In [9]:
root_dir = r"C:\Users\bhuva\Desktop\projects_2025\AI agents\Agentic_RAG\dataset\supreme_court_judgments"
output_dir = r"C:\Users\bhuva\Desktop\projects_2025\AI agents\Agentic_RAG\output_embeddings"

In [7]:
os.makedirs(output_dir, exist_ok=True)


In [None]:
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue  

    print(f"\nProcessing folder: {folder_name}")
    all_chunks = []

    pdf_count = 0
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".PDF") or file_name.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file_name)
            try:
                loader = PyPDFLoader(pdf_path)
                docs = loader.load()
                chunks = pdf_splitter.split_documents(docs)
                all_chunks.extend(chunks)
                pdf_count += 1
                print(f"Successfully processed {file_name} - {len(chunks)} chunks")
            except Exception as e:
                print(f"Failed to load {file_name}: {e}")
    
    print(f"Total PDFs processed in {folder_name}: {pdf_count}")
    print(f"Total chunks from {folder_name}: {len(all_chunks)}")

    if len(all_chunks) > 0:
        try:
            print(f"Creating vector store for {len(all_chunks)} chunks...")
            vectorstore = FAISS.from_documents(all_chunks, embeddings)
            save_path = os.path.join(output_dir, f"{folder_name}_vectorstore")
            vectorstore.save_local(save_path)
            print(f"Saved FAISS store for {folder_name} at {save_path}")
        except Exception as e:
            print(f"Error creating vector store for {folder_name}: {str(e)}")
            print(f"First chunk content preview: {all_chunks[0].page_content[:200] if all_chunks else 'No chunks'}")
    else:
        print(f"Skipping vector store creation for {folder_name} - no valid chunks found")


Processing folder: 2000
Successfully processed Abdul_Karim_Etc_Etc_vs_State_Of_Karnataka_Others_Etc_Etc_on_7_November_2000_1.PDF - 53 chunks
Successfully processed Abdul_Nazar_Madani_vs_State_Of_Tamil_Nadu_Anr_on_5_May_2000_1.PDF - 20 chunks
Successfully processed Abdul_Rashid_Ibrahim_Mansurl_vs_State_Of_Gujarat_on_1_February_2000_1.PDF - 21 chunks
Successfully processed Abdul_Wahab_Ansari_vs_State_Of_Bihar_Anr_on_17_October_2000_1.PDF - 16 chunks
Successfully processed Agricultural_Produce_Market_Committee_vs_Shri_Ashok_Harikuni_Anr_Etc_on_22_September_2000_1.PDF - 48 chunks
Successfully processed Air_India_vs_Cochin_International_Airport_Ltd_on_31_January_2000_1.PDF - 23 chunks
Successfully processed Aligarh_Muslim_University_And_Ors_vs_Mansoor_Ali_Khan_on_28_August_2000_1.PDF - 25 chunks
Successfully processed Allahabad_Bank_vs_Canara_Bank_Another_on_10_April_2000_1.PDF - 74 chunks
Successfully processed Almitra_H_Patel_And_Anr_Petitioners_vs_Union_Of_India_And_Ors_Respondents_on_1

In [15]:
len(all_chunks)

12722

In [None]:
embedding_model = GoogleGenerativeAIEmbeddings(
    model="embedding-001", 
    google_api_key=GOOGLE_API_KEY
)


In [None]:
print(f"Total chunks in folder '{folder_name}': {len(all_chunks)}")

print(f"Sample chunk:\n{all_chunks[0].page_content[:500]}")


try:
    vectorstore = FAISS.from_documents(all_chunks, embedding_model)

    save_path = os.path.join(output_dir, f"{folder_name}_vectorstore")
    vectorstore.save_local(save_path)

    print(f"Saved vector store for '{folder_name}' at: {save_path}")
except Exception as e:
    print(f"Error creating vector store for '{folder_name}': {e}")


Total chunks in folder 'script.py': 12722
Sample chunk:
Abdul Nassar vs The State Of Kerala on 7 January, 2025
Author: Vikram Nath
Bench: Vikram Nath, Sanjay Karol
        2025 INSC 35
                                                      REPORTABLE
                             IN THE SUPREME COURT OF INDIA
                            CRIMINAL APPELLATE JURISDICTION
                         CRIMINAL APPEAL NO(S). 1122-1123 OF 2018
         ABDUL NASSAR                                               ..APPELLANT(S)
                                      
✅ Saved vector store for 'script.py' at: C:\Users\bhuva\Desktop\projects_2025\AI agents\Agentic_RAG\output_embeddings\script.py_vectorstore
