# Installation part

In [1]:
# !pip install accelerate -U -qq

In [2]:
# !pip install bitsandbytes -qq

In [3]:
# !pip install chromadb -qq

In [4]:
# !pip install langchain llama-index llama_hub -qq

In [5]:
# !pip install sentence-transformers -qq

In [6]:
# !pip install langchain-community -qq

In [7]:
# pip install -U bitsandbytes

# Testing my data

In [8]:
# Import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
import torch
from transformers import BitsAndBytesConfig
from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import textwrap

In [9]:
# Checkpoint identifier plus the bitsandbytes options used when loading it:
# 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
name = "NousResearch/Llama-2-7b-chat-hf"
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
quantization_config = BitsAndBytesConfig(**bnb_options)


In [10]:
# Load tokenizer and model
# Downloaded files for both are kept under the same local cache directory.
model_cache_dir = "./model/"
tokenizer = AutoTokenizer.from_pretrained(name, cache_dir=model_cache_dir)
# NOTE(review): torch_dtype is float16 here while quantization_config asks for
# bfloat16 compute -- confirm the mixed dtypes are intentional.
model = AutoModelForCausalLM.from_pretrained(name, cache_dir=model_cache_dir,
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             quantization_config=quantization_config)

# temperature and top_p are sampling parameters; they only influence the
# output when sampling is enabled. do_sample=True is passed explicitly so the
# behaviour does not depend on the checkpoint's generation_config.json.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.4,
                top_p=0.95,
                repetition_penalty=1.15)
# Expose the transformers pipeline through LangChain's LLM wrapper
llm = HuggingFacePipeline(pipeline=pipe)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=pipe)


In [11]:
# Chat prompt for context-grounded answers: the system message carries the
# instruction together with the question and the retrieved context, and the
# user message repeats the question.
system_template = "Answer the question using only the context\n\nQuestion: {question}\n\nContext: {context}"
user_template = "{question}"
context_prompt = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", user_template)]
)

In [12]:
# Read the project's CSV file from the Drive-mounted path into a DataFrame
csv_path = "/content/drive/MyDrive/Sem 5/DL_FINAL_PROJ/data.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)


In [13]:
# Combine relevant text fields (adjust based on dataset structure)
def _row_to_text(row) -> str:
    """Join the non-missing cells of one DataFrame row into a single string.

    Missing cells (NaN/None) are skipped; stringifying them would put the
    literal text "nan" into the documents that are embedded and retrieved.
    """
    return ' '.join(str(value) for value in row.values if pd.notna(value))


# One space-separated string per CSV row
text_data = df.apply(_row_to_text, axis=1).tolist()

In [14]:
from langchain.schema import Document

# Turn every row string into a Document so the splitter can consume it
documents = list(map(lambda text: Document(page_content=text), text_data))

# Cut the documents into 100-character pieces that overlap by 10 characters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)
chunks = text_splitter.split_documents(documents)


In [15]:
# Embed the chunks with a sentence-transformers model and index them in a
# Chroma store whose persist directory is "db".
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
vectorsdb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="db",
)


  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
def _format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        The ``page_content`` values joined with blank lines between them.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Create chain with context prompt
chain = (
    {
        # Flatten the retrieved documents to plain text before they are
        # interpolated into the prompt; without this step the {context} slot
        # receives the repr of a list of Document objects, metadata included.
        'context': vectorsdb.as_retriever() | _format_docs,
        # The raw query string is forwarded unchanged as the question
        'question': lambda x: x
    }
    | context_prompt
    | llm
    | StrOutputParser()
)

In [38]:
# # Define function for querying and extracting only the system's last answer
# def output(query):
#     result = chain.invoke(query)

#     # Ensure the answer is captured only from the final "System:" tag
#     if "System:" in result:
#         # Extract the text from the last occurrence of "System:" onward
#         answer_start = result.rfind("System:") + len("System:")
#         answer = result[answer_start:].strip()

#         # Split out any additional text (like "Human:") following the answer
#         answer = answer.split("Human:")[0].strip()

#         # If the answer still contains "Context:", remove everything before and including it
#         if "Context:" in answer:
#             answer = answer.split("Context:")[-1].strip()

#         return answer  # Return only the final answer text

#     return result  # Fallback if "System:" is not found


In [49]:
def _clean_answer(result):
    """Extract the final answer text from the raw string produced by the chain.

    Args:
        result: Full text returned by the chain.

    Returns:
        The cleaned answer when a "System:" tag is present; otherwise
        ``result`` unchanged.
    """
    if "System:" not in result:
        return result  # Fallback if "System:" is not found

    # Keep only the text that follows the last "System:" tag
    answer = result[result.rfind("System:") + len("System:"):].strip()

    # Drop any follow-up turn that appears after the answer
    answer = answer.split("Human:")[0].strip()

    # If a "Context:" section is still present, keep only what follows it
    if "Context:" in answer:
        answer = answer.split("Context:")[-1].strip()

    # Remove the introductory phrase
    intro = "Based on the provided context,"
    if answer.startswith(intro):
        answer = answer[len(intro):].strip()
        # The remainder starts mid-sentence ("it appears ..."), so restore
        # the capital letter; slicing keeps this safe for an empty string.
        answer = answer[:1].upper() + answer[1:]

    # Final check: cut off any document/metadata repr that leaked into the answer
    if "Document(" in answer:
        answer = answer.split("Document(")[0].strip()

    return answer


def output(query):
    """Run ``query`` through the RAG chain and return only the answer text.

    Args:
        query: Question passed to ``chain.invoke``.

    Returns:
        The chain output with prompt echoes, follow-up turns, the
        introductory phrase and document reprs removed, or the raw chain
        output when it contains no "System:" tag.
    """
    return _clean_answer(chain.invoke(query))


In [50]:
# Sanity check: show the first five chunks produced by the splitter
divider = "\n" + "-"*50 + "\n"
for number, sample in enumerate(chunks[:5], start=1):
    print(f"Chunk {number}:")
    print(sample.page_content)
    print(divider)

Chunk 1:
https://doj.gov.in/ 









DEPARTMENT OF JUSTICE
Latest News

--------------------------------------------------

Chunk 2:
https://doj.gov.in/# 









DEPARTMENT OF JUSTICE
Latest News

--------------------------------------------------

Chunk 3:
https://doj.gov.in/history/ Last updated: 13-01-2023

--------------------------------------------------

Chunk 4:
As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law

--------------------------------------------------

Chunk 5:
of Law & Justice, Government of India. It is one of the oldest Ministries of the Government of

--------------------------------------------------



In [51]:
# Probe: overall takeaway from the dataset
query = "What is the key insight from the DOJ data?"
result = output(query=query)
print(result)

The key insight from the DOJ data is that the Department of Justice (DOJ) in India has a variety of documents and information available on its website, including press releases, organization charts, and historical information.


In [52]:
# Probe: mission / vision statement
query = "What is the mission or vision of the Department of Justice"
result = output(query=query)
print(result)

it appears that the mission or vision of the Department of Justice is to facilitate the administration of justice in India through various means such as appointing, removing, and resigning officials, raising awareness about government schemes and programs, and enhancing public trust in the department.


In [53]:
# Probe: how the department supports justice administration
query = "What does the Department of Justice do to support justice administration?"
result = output(query=query)
print(result)

the Department of Justice supports justice administration by facilitating the administration of justice through various means such as appointing, resigning, or removing officials, enforcing contracts, and providing a nodal point for related activities.


In [57]:
# Probe: sections and topics present on the website
query = "What are the different sections or topics covered on the DOJ website?"
result = output(query=query)
print(result)

the following sections or topics are covered on the DOJ website:

* Press releases
* Guidelines for submission
* Digital library
* Release of state booklets highlighting dedicated state schemes, activities of DoJ, and beneficiaries.


In [None]:
# Probe: history of the department
query = "How has the Department of Justice evolved over time?"
result = output(query=query)
print(result)

In [59]:
# Probe: contents of the digital library
query = "What resources or documents are available in the DOJ’s digital library?"
result = output(query=query)
print(result)

the following resources or documents are available in the DOJ's digital library:

* Press releases
* State booklets highlighting dedicated state schemes, activities of DoJ, and beneficiaries
* Guidelines for submission

Note: The system is unable to provide more detailed information without additional context or access to external databases.


In [60]:
# Probe: support for justice administration at the state level
query = "How does the DOJ support justice administration at the state level?"
result = output(query=query)
print(result)

The DOJ supports justice administration at the state level through various means, including providing legal guidance and resources to state agencies and officials, offering training and technical assistance programs for state law enforcement and judicial personnel, and collaborating with state authorities on criminal investigations and prosecutions.


In [64]:
# Probe: most recent guideline updates
query = "What are the recent updates on the guidelines provided by the DOJ?"
result = output(query=query)
print(result)

The latest update to the guidelines provided by the DOJ was on March 14, 2022, for grievance submission. (Last updated: 04-09-2024)


In [65]:
import pickle
from transformers import AutoTokenizer, AutoModelForCausalLM

# Root folder that receives the saved artifacts
save_path = "/content/drive/MyDrive/Sem 5/DL_FINAL_PROJ/Saved_model"
model_dir = f"{save_path}/model"
tokenizer_dir = f"{save_path}/tokenizer"
config_file_path = f"{save_path}/rag_pipelineV2.pkl"

# Step 1: write the model and the tokenizer into their own sub-folders
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

# Step 2: ask Chroma to persist the vector store to its persist_directory
vectorsdb.persist()

# Step 3: collect the settings describing how the pipeline was built.
# NOTE(review): "db" is a relative path, unlike save_path -- confirm the
# vector store directory is kept wherever this config is loaded later.
rag_config = {
    "embedding_model": embedding_model,
    "quantization_config": quantization_config,
    "vector_store_path": "db",
}

# Step 4: pickle the configuration dict (the chain itself is not serialized)
with open(config_file_path, "wb") as config_file:
    pickle.dump(rag_config, config_file)

  vectorsdb.persist()  # This will persist to the original directory set up in `persist_directory`
