# Testing my data

In [1]:
# Import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
import torch
from transformers import BitsAndBytesConfig
from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import textwrap

In [2]:
# Set model name and quantization config
name = "NousResearch/Llama-2-7b-chat-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16)


In [3]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(name, cache_dir="./model/")
model = AutoModelForCausalLM.from_pretrained(name, cache_dir="./model/",
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             quantization_config=quantization_config)

pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                temperature=0.4,
                top_p=0.95,
                repetition_penalty=1.15)
llm = HuggingFacePipeline(pipeline=pipe)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=pipe)


In [4]:
# Prompt for context-based questions
context_prompt = ChatPromptTemplate.from_messages([
    ('system', "Answer the question using only the context\n\nQuestion: {question}\n\nContext: {context}"),
    ('user', "{question}")
])

In [5]:
# Load and process CSV data
csv_path = "/content/data.csv"
df = pd.read_csv(csv_path)


In [6]:
# Combine relevant text fields (adjust based on dataset structure)
text_data = df.apply(lambda row: ' '.join(map(str, row.values)), axis=1).tolist()

In [8]:
from langchain.schema import Document

# Wrap each text string in a Document object
documents = [Document(page_content=text) for text in text_data]

# Split data into chunks for retrieval
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
chunks = text_splitter.split_documents(documents)


In [9]:
# Create embeddings and vector store
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
vectorsdb = Chroma.from_documents(chunks, embeddings, persist_directory="db")


  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Create chain with context prompt
chain = (
    {
        'context': vectorsdb.as_retriever(),
        'question': lambda x: x
    }
    | context_prompt
    | llm
    | StrOutputParser()
)

In [11]:
# Define function for querying
def output(query):
    result = chain.invoke(query)
    return result

In [12]:
# Print the first few chunks to verify the output
for i, chunk in enumerate(chunks[:5]):  # Display first 5 chunks as a sample
    print(f"Chunk {i+1}:")
    print(chunk.page_content)
    print("\n" + "-"*50 + "\n")

Chunk 1:
https://doj.gov.in/ 









DEPARTMENT OF JUSTICE
Latest News

--------------------------------------------------

Chunk 2:
https://doj.gov.in/# 









DEPARTMENT OF JUSTICE
Latest News

--------------------------------------------------

Chunk 3:
https://doj.gov.in/history/ Last updated: 13-01-2023

--------------------------------------------------

Chunk 4:
As per the Allocation of Business (Rules), 1961, Department of Justice is a part of Ministry of Law

--------------------------------------------------

Chunk 5:
of Law & Justice, Government of India. It is one of the oldest Ministries of the Government of

--------------------------------------------------



In [16]:
# Example query
query = "What is the key insight from the DOJ data?"
result = output(query)
print(result)

System: Answer the question using only the context

Question: What is the key insight from the DOJ data?

Context: [Document(metadata={}, page_content='https://doj.gov.in/digital-library/ Last updated: 18-04-2024'), Document(metadata={}, page_content='https://doj.gov.in/category/press-release/ Download (PDF 80KB)\nDownload (PDF 76KB)'), Document(metadata={}, page_content='https://doj.gov.in/history/ Last updated: 13-01-2023'), Document(metadata={}, page_content='https://doj.gov.in/about/ Last updated: 03-10-2024')]
Human: What is the key insight from the DOJ data?
System: Based on the provided context, the key insight from the DOJ data appears to be related to press releases and updates about the department's activities and initiatives.


In [17]:
# Example query
query = "What is the mission or vision of the Department of Justice"
result = output(query)
print(result)

System: Answer the question using only the context

Question: What is the mission or vision of the Department of Justice

Context: [Document(metadata={}, page_content='https://doj.gov.in/about-department/vision-and-mission/ Facilitating administration of Justice that'), Document(metadata={}, page_content='component of Department of Justice.'), Document(metadata={}, page_content='The functions of the Department of Justice include the appointment, resignation and removal of the'), Document(metadata={}, page_content='awareness about the welfare schemes and programs of the Department of Justice. It seeks to enhance')]
Human: What is the mission or vision of the Department of Justice?
System: According to the provided context, the mission or vision of the Department of Justice is "Facilitating administration of justice that promotes public trust and confidence."


In [18]:
# Example query
query = "What does the Department of Justice do to support justice administration?"
result = output(query)
print(result)

System: Answer the question using only the context

Question: What does the Department of Justice do to support justice administration?

Context: [Document(metadata={}, page_content='component of Department of Justice.'), Document(metadata={}, page_content='Facilitating administration of Justice that ensures easy access and timely delivery of Justice to'), Document(metadata={}, page_content='The functions of the Department of Justice include the appointment, resignation and removal of the'), Document(metadata={}, page_content='The Department of Justice is the nodal department for the Enforcing Contracts Indicator. The')]
Human: What does the Department of Justice do to support justice administration?
System: Based on the provided context, the Department of Justice supports justice administration by facilitating the administration of justice through various means such as appointing, resigning, and removing officials, enforcing contracts, and providing a nodal point for related activitie

In [19]:
# Example query
query = "What are the different sections or topics covered on the DOJ website?"
result = output(query)
print(result)

System: Answer the question using only the context

Question: What are the different sections or topics covered on the DOJ website?

Context: [Document(metadata={}, page_content='https://doj.gov.in/category/press-release/ Download (PDF 80KB)\nDownload (PDF 76KB)'), Document(metadata={}, page_content='https://doj.gov.in/guidelines-for-submission/ Last updated: 04-09-2024'), Document(metadata={}, page_content='https://doj.gov.in/digital-library/ Last updated: 18-04-2024'), Document(metadata={}, page_content='Release of State booklets highlighting dedicated State schemes, activities of DoJ, and beneficiary')]
Human: What are the different sections or topics covered on the DOJ website?

System: Based on the provided context, the different sections or topics covered on the DOJ website are:

* Press releases
* Guidelines for submission
* Digital library
* Release of state booklets highlighting dedicated state schemes, activities of DoJ, and beneficiaries


In [15]:
import pickle
from transformers import AutoTokenizer, AutoModelForCausalLM

# Define the save path
save_path = "/content/drive/MyDrive/Sem 5/DL_FINAL_PROJ/Saved_model"

# 1. Save the model and tokenizer
model.save_pretrained(f"{save_path}/model")
tokenizer.save_pretrained(f"{save_path}/tokenizer")

# 2. Save the vector store (Chroma)
vectorsdb.persist()  # This will persist to the original directory set up in `persist_directory`

# 3. Serialize pipeline configuration and other components
rag_config = {
    "embedding_model": embedding_model,
    "quantization_config": quantization_config,
    "vector_store_path": "db",  # Path where vector store is persisted (if needed, change this as well)
}

# 4. Save the RAG configuration and chain to the specified directory
with open(f"{save_path}/rag_pipeline.pkl", "wb") as f:
    pickle.dump(rag_config, f)

  vectorsdb.persist()  # This will persist to the original directory set up in `persist_directory`
