In [1]:
import os
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the BNS sections table; the path is relative to the working directory.
print("Loading BNS Dataset...")
df = pd.read_csv(filepath_or_buffer="bns_sections.csv")

# Preview the first five rows.
df.head(n=5)

Loading BNS Dataset...


Unnamed: 0,Chapter,Chapter_name,Chapter_subtype,Section,Section _name,Description
0,1,Preliminary,Preliminary,1,"Short title, commencement and application",(1) This Act may be called the Bharatiya Nyaya...
1,1,Preliminary,Preliminary,2,Definitions.,"In this Sanhita, unless the context otherwise ..."
2,1,Preliminary,Preliminary,3,General explanations,(1) Throughout this Sanhita every definition o...
3,2,Of punishments,Of punishments,4,Punishments,The punishments to which offenders are liable ...
4,2,Of punishments,Of punishments,5,Commutation of sentence,"The appropriate Government may, without the co..."


In [3]:
# Build the text used as each row's document content: section number,
# section title and description, separated by newlines.
#
# Missing titles/descriptions are replaced with "" before concatenating:
# in pandas, str + NaN evaluates to NaN, so a single missing field would
# otherwise turn the entire combined_text value for that row into NaN.
# Note: the 'Section _name' column name really does contain a space.
df['combined_text'] = (
    "Section: " + df['Section'].astype(str) + "\n"  # type: ignore
    + "Offense: " + df['Section _name'].fillna("").astype(str) + "\n"
    + "Full Law: " + df['Description'].fillna("").astype(str)
)

df.head()

Unnamed: 0,Chapter,Chapter_name,Chapter_subtype,Section,Section _name,Description,combined_text
0,1,Preliminary,Preliminary,1,"Short title, commencement and application",(1) This Act may be called the Bharatiya Nyaya...,"Section: 1\nOffense: Short title, commencement..."
1,1,Preliminary,Preliminary,2,Definitions.,"In this Sanhita, unless the context otherwise ...",Section: 2\nOffense: Definitions.\nFull Law: I...
2,1,Preliminary,Preliminary,3,General explanations,(1) Throughout this Sanhita every definition o...,Section: 3\nOffense: General explanations\nFul...
3,2,Of punishments,Of punishments,4,Punishments,The punishments to which offenders are liable ...,Section: 4\nOffense: Punishments\nFull Law: Th...
4,2,Of punishments,Of punishments,5,Commutation of sentence,"The appropriate Government may, without the co...",Section: 5\nOffense: Commutation of sentence \...


In [4]:
# Turn the DataFrame into documents, taking each document's text from the
# "combined_text" column.
loader = DataFrameLoader(
    data_frame=df,
    page_content_column="combined_text",
)
documents = loader.load()

In [5]:
# Split the loaded documents into overlapping chunks.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(documents)

In [6]:
print("Creating Vector Database...")

# Embedding function handed to the vector store in a later cell.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

Creating Vector Database...


In [7]:
# Embed and persist the *split* chunks produced by the text splitter.
# The previous version passed the unsplit `documents`, which left `chunks`
# unused and indexed every section as a single text regardless of length.
# NOTE(review): long sections may exceed the embedding model's maximum input
# length when indexed unsplit - confirm against the model's documentation.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./bns_vector_db"
)

In [8]:
# Confirmation message; the name matches the persist_directory
# ("./bns_vector_db") used when the vector store was built.
print("Vector Database 'bns_vector_db' created.")

Vector Database 'bns_vector_db' created.


In [11]:
from sentence_transformers import SentenceTransformer

# Load the same model name that was given to HuggingFaceEmbeddings earlier
# and write a copy of it to a local folder.
print("Downloading embedding model for offline use...")

model = SentenceTransformer("all-MiniLM-L6-v2")
model.save("./my_offline_model")

# Two status lines, emitted with a single print call.
print(
    "Success! Model saved to folder: ./my_offline_model",
    "It will now work without an internet connection as well.",
    sep="\n",
)

Downloading embedding model for offline use...
Success! Model saved to folder: ./my_offline_model
It will now work without an internet connection as well.
