In [1]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain xformers==0.0.21 \
bitsandbytes==0.41.1 sentence-transformers==2.2.2 chromadb==0.4.12 google-generativeai

Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl.metadata (17 kB)
Collecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl.metadata (12 kB)
Collecting langchain
  Downloading langchain-0.1.7-py3-none-any.whl.metadata (13 kB)
Collecting xformers==0.0.21
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl.metadata (9.8 kB)
Collecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ do

In [2]:
import json
import os

import google.generativeai as genai  # Import the Gemini API
import torch
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader  # Use TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import pipeline

2024-02-15 10:25:12.305428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 10:25:12.305546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 10:25:12.474083: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Initialize Gemini with an API key taken from the GOOGLE_API_KEY environment
# variable, so the secret is never stored in (or committed with) the notebook.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    # Fail here with a clear message instead of at the first generation call.
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

In [4]:
# Path to the JSON dataset that process_json_file() reads. It points at a
# Kaggle input mount; replace it with the path to your own JSON file when
# running elsewhere.
json_file_path = "/kaggle/input/indian-laws/combined.json"

def process_json_file(json_file_path):
    """Load a JSON file and turn each record into a LangChain ``Document``.

    The file is read as a sequence of objects, each looked up for optional
    ``"title"`` and ``"description"`` keys. A document's text is the title and
    description joined by a newline, or whichever of the two is present.
    Records with neither are skipped.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        A ``(documents, document_ids)`` tuple of equal-length lists: the
        ``Document`` objects, and for each one the title of the record it was
        built from (``None`` when that record has no title).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    documents = []
    document_ids = []
    for record in data:
        title = record.get("title")
        description = record.get("description")

        # Keep whichever parts exist, so a description without a title is not
        # lost and an empty record never yields a Document with no text.
        parts = [str(part) for part in (title, description) if part]
        if not parts:
            continue
        content = "\n".join(parts)

        # Pass the fields by keyword and carry the title as metadata so it
        # stays attached to every chunk split from this document.
        metadata = {"title": str(title)} if title else {}
        documents.append(Document(page_content=content, metadata=metadata))
        document_ids.append(title)

    return documents, document_ids

In [5]:
# Load documents from the JSON file
documents, document_ids = process_json_file(json_file_path)

# Text splitter for large documents (adjust chunk size if needed)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [6]:
# Sentence-transformers model used to embed the chunks for retrieval
# (Gemini is only used for answer generation, not for embeddings).
model_name = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when one is available and fall back to CPU otherwise, so this
# cell also runs on CPU-only kernels.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

# Create embedding model and vector database
# NOTE(review): because persist_directory is set, re-running this cell appears
# to add the same chunks to the stored collection again — confirm, and delete
# "chroma_db" before a re-run if so.
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

# Retriever view over the vector store, for use with LangChain chains
retriever = vectordb.as_retriever()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [7]:
def generate_with_retrieval(prompt, k=3, gemini_model='gemini-pro'):
    """Answer ``prompt`` with Gemini, using chunks retrieved from ``vectordb`` as context.

    Args:
        prompt: The question to answer; also used as the similarity query.
        k: Number of most-similar chunks to retrieve (default 3).
        gemini_model: Name of the Gemini model used for generation
            (default ``'gemini-pro'``).

    Returns:
        The generated text with leading and trailing whitespace removed.
    """
    # Retrieve the k chunks most similar to the prompt
    top_docs = vectordb.similarity_search(prompt, k=k)

    # Join the text of the retrieved chunks into one context string
    retrieved_context = " ".join(doc.page_content for doc in top_docs)

    # Combine prompt and retrieved context for generation
    combined_prompt = prompt + "\n" + retrieved_context

    # Generate response using Gemini
    model = genai.GenerativeModel(gemini_model)
    response = model.generate_content(combined_prompt)
    # NOTE(review): response.text presumably raises when Gemini returns no text
    # (e.g. a blocked prompt) — confirm and handle it here if that matters.
    return response.text.strip()

In [8]:
# Run one example question through retrieval + generation and show the result
prompt = "how do driving licenses work?"
response = generate_with_retrieval(prompt)
print(f"Prompt: {prompt}")
print(f"Response: {response}")

Prompt: how do driving licenses work?
Response: **Issuance of Driving Licenses:**

* The Central Government can issue driving licenses valid throughout India to individuals over 18 years of age for driving vehicles used for government defense purposes.

**Purpose and Restrictions:**

* Driving licenses specify the class or description of vehicle the holder is allowed to drive.
* Licenses are only valid for the specified period indicated on the license.
* Holders are not permitted to drive any vehicles other than those authorized by their license.

**State Registers of Driving Licenses:**

* Each State Government maintains a register of driving licenses issued within their jurisdiction.
* Printed copies or other specified formats of these registers are supplied to the Central Government.
* State Governments must promptly notify the Central Government of any changes or additions to their registers.

**National Register of Driving Licenses:**

* The Central Government maintains a National