In [1]:
import random
import pandas as pd
import numpy as np
import torch

In [None]:
# Prefer the GPU when torch can see one, otherwise run everything on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the table of text chunks together with their precomputed embeddings
constitution = pd.read_csv("constitution_embeddings.csv")

# Saving to CSV turned every embedding into a string; strip the surrounding
# brackets and parse the space-separated numbers back into a NumPy array
constitution["embedding"] = constitution["embedding"].apply(
    lambda raw: np.fromstring(raw.strip("[]"), sep=" ")
)

# One dictionary per row (orient="records"); used later to map the indices of the
# top-scoring embeddings back to their text
constitution_and_chunks = constitution.to_dict(orient="records")

# Stack all embeddings into one float32 tensor on the chosen device
# (NumPy arrays default to float64, torch tensors to float32)
embeddings = torch.tensor(
    np.array(constitution["embedding"].tolist()),
    dtype=torch.float32,
).to(device)
embeddings.shape

In [None]:
# Preview the first rows of the loaded table as a quick sanity check
constitution.head()

## Semantic Search

In [None]:
# Load the sentence-embedding model used to encode queries
from sentence_transformers import util, SentenceTransformer  # util provides dot_score

# Alternative checkpoint that was tried: "all-mpnet-base-v2"
# NOTE(review): this should presumably be the same model that produced
# constitution_embeddings.csv - confirm against the embedding-creation notebook
embedding_model = SentenceTransformer(
    model_name_or_path="multi-qa-mpnet-base-dot-v1",
    device=device,
)

In [None]:
query = "elections"
print(f'Query : {query}')

# Encode the query text as a tensor so it can be scored against `embeddings`
q_embed = embedding_model.encode(query, convert_to_tensor=True)

# High-resolution clock, used only to time the scoring step
from time import perf_counter as timer

start_time = timer()
# Dot-product similarity; [0] takes the scores belonging to the single query
dot_scores = util.dot_score(a=q_embed, b=embeddings)[0]
end_time = timer()

print(f'Time taken to get scores on {len(embeddings)} embeddings : {end_time-start_time:.5f} seconds.')

# Keep the five highest-scoring embeddings (values and their indices)
top_results = torch.topk(dot_scores, k=5)
top_results


In [None]:
import textwrap

# Prints (does not return) the given text re-flowed to a fixed line width
def print_wrapped(text, wrap_length=80):
    """Print `text` wrapped so that no line is longer than `wrap_length` characters."""
    print(textwrap.fill(text, width=wrap_length))

In [None]:
print(f"Query: '{query}'\n")
print("Results:")

# top_results holds (values, indices); walk both in lockstep
for score,idx in zip(top_results[0],top_results[1]) :
    print(f"Score: {score:.4f}")
    print("Text:")
    # Map the embedding index back to its source record and print the wrapped text
    print_wrapped(constitution_and_chunks[idx]["text"])
    # Print the section as well. Single quotes are used inside the f-string because
    # reusing the outer double quotes is a SyntaxError before Python 3.12.
    print(f"section : {constitution_and_chunks[idx]['section']}")
    print("\n")


## Functionizing the semantic search pipeline

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Args:
        query: Text to search for.
        embeddings: Tensor of precomputed embeddings to score the query against.
        model: Model used to encode the query (defaults to the notebook's embedding_model).
        n_resources_to_return: Maximum number of results; capped at the number of
            available embeddings.
        print_time: Whether to print how long the scoring step took.

    Returns:
        A (scores, indices) tuple as produced by torch.topk on the dot-product scores.
    """
    # Imported locally so the function does not depend on the `timer` alias that an
    # earlier demo cell happens to leave in the notebook namespace
    from time import perf_counter

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings ([0] selects the scores for the single query)
    start_time = perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = perf_counter()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    # torch.topk raises when k exceeds the number of candidates, so cap k
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores,
                                 k=k)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.Tensor,
                                 constitution_and_chunks: list[dict]=constitution_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: each entry of constitution_and_chunks must be a dictionary with at least the
    "text" and "section" keys, and its position in the list must match the row of the
    corresponding embedding.

    Args:
        query: Text to search for.
        embeddings: Tensor of precomputed embeddings to score the query against.
        constitution_and_chunks: Records that the returned indices are mapped back to.
        n_resources_to_return: Number of results to print.
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indices
    for score, index in zip(scores, indices):
        # Use this iteration's own index (not a leftover global) to look up the record
        chunk = constitution_and_chunks[int(index)]
        print(f"Score: {score:.4f}")
        # Print relevant text chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(chunk["text"])
        # Print the section as well; single quotes keep the f-string valid before Python 3.12
        print(f"section : {chunk['section']}")
        print("\n")



In [None]:
# Test 1: retrieve only the top scores and indices for a sample query
query = "panchayat"

# Returns the (scores, indices) pair for the best-matching embeddings
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

In [None]:
# Test 2: print the text and section of the top-scoring chunks for the same query
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

## LLM 

In [None]:
# Shell out to nvidia-smi to show the GPU(s) and their current memory usage
!nvidia-smi

In [None]:
# Total memory of GPU 0 in bytes. On a CPU-only machine there is no device to query,
# so report 0 instead of crashing (the `device` setup already allows a CPU fallback).
if torch.cuda.is_available():
    gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
else:
    gpu_memory_bytes = 0

# Convert bytes to whole gibibytes (2**30 bytes each)
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

In [None]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
# Pick a recommended model and whether to quantize, based on the GPU memory (in GB).
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    use_quantization_config = True
    model_id = None
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # Plain `else` (rather than `> 19.0`) so that exactly 19 GB is handled too;
    # otherwise both variables would be left undefined for that value.
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

In [None]:
# Authenticate with the Hugging Face Hub; login() is called without arguments,
# so no token is stored in the notebook source
from huggingface_hub import login
login()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# --- Settings ---
# NOTE: this assignment replaces whatever model_id the GPU-memory check selected,
# and use_quantization_config is not consulted below: the 4-bit config is always passed.
model_id = "google/gemma-2b-it"   # instruction-tuned 2B
attn_implementation = "eager"     # or "sdpa" if your stack supports it

# 4-bit quantization config (saves VRAM)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
)

print(f"[INFO] Loading {model_id} in 4-bit...")

# Tokenizer matching the chosen model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Model, loaded with the 4-bit quantization config defined above.
# NOTE(review): no device_map argument is passed here, so weight placement is left to
# the library's defaults for quantized loading - confirm this is the intended behaviour.
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

In [None]:
# Show which device the model ended up on, followed by the model's repr
llm_model.device ,llm_model

In [None]:
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of elements across all parameters of `model`."""
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total

# Count the parameters of the loaded LLM
get_model_num_params(llm_model)

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Report how much memory the parameters and buffers of a PyTorch model occupy.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822

    Returns:
        A dict with the total size in bytes ("model_mem_bytes"), and the same size
        in megabytes ("model_mem_mb") and gigabytes ("model_mem_gb"), each rounded
        to two decimals.
    """
    def _total_bytes(tensors):
        # element count times bytes per element, summed over every tensor
        return sum(t.nelement() * t.element_size() for t in tensors)

    # Parameters and buffers both take up memory
    model_mem_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    return {
        "model_mem_bytes": model_mem_bytes,
        "model_mem_mb": round(model_mem_bytes / (1024**2), 2),
        "model_mem_gb": round(model_mem_bytes / (1024**3), 2),
    }

# Memory footprint of the loaded LLM (parameters plus buffers)
get_model_mem_size(llm_model)

In [None]:
input_text = "how is the vice president elected and what role does the parliament in electing the vice president?"
print(f"Input text:\n{input_text}")

# A single-turn conversation: the question is sent as the user's message
dialogue_template = [{"role": "user", "content": input_text}]

# Render the conversation with the tokenizer's chat template.
# tokenize=False keeps the result as text; add_generation_prompt=True asks the
# template to append the generation prompt.
prompt = tokenizer.apply_chat_template(
    conversation=dialogue_template,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
%%time

# Tokenize the input text (turn it into numbers) and move it to the device the model
# lives on, rather than a hard-coded "cuda", so the inputs always match the model
input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs based on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

In [None]:
# Decode the first generated sequence back to text (the decoded string still
# contains the prompt; it is stripped in the next cell)
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

In [None]:
# Nicer formatting: remove the prompt and the <bos>/<eos> markers from the decoded text
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

In [None]:
# Sample questions about the Constitution of India, used below to exercise the
# retrieval and RAG steps
query_list = [
    "What does the Preamble of the Indian Constitution declare about justice, liberty, equality, and fraternity?",
    "How can Parliament form new states or alter the boundaries of existing states under Part I?",
    "What were the provisions regarding citizenship at the commencement of the Constitution?",
    "Which Fundamental Rights are guaranteed under the Right to Equality?",
    "What Directive Principles guide the State in securing equal pay for equal work?",
    "Why is the Preamble considered the soul of the Constitution of India?",
    "How do Articles 5 to 11 reflect the framers' approach towards citizenship?",
    "In what ways do Fundamental Rights ensure limitations on the power of the State?",
    "How are Fundamental Duties different from Directive Principles of State Policy?",
    "What role does Article 32 play in making the Fundamental Rights enforceable?",

]

In [None]:
import random
# Pick one sample question at random.
# NOTE(review): no random seed is set in the visible cells, so the chosen query
# (and everything downstream) can differ between runs.
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Build the final LLM prompt for `query`, augmented with retrieved context.

    The "text" field of every item in `context_items` is rendered as a bullet
    list, inserted together with the query into a few-shot instruction template,
    and the result is wrapped with the tokenizer's chat template.

    Returns:
        The chat-formatted prompt as a plain string (not tokenized).
    """
    # Instruction template with three worked examples of the desired answer style.
    # (This could equally live in a .txt file and be read in.)
    base_prompt = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are clear, detailed, and explanatory, using examples from the Constitution wherever possible.
    Use the following examples as reference for the ideal answer style.

    Example 1:
    Query: What does the Preamble of the Indian Constitution declare?
    Answer: The Preamble declares India to be a Sovereign, Socialist, Secular, Democratic Republic. It secures to all citizens Justice—social, economic, and political; Liberty of thought, expression, belief, faith, and worship; Equality of status and opportunity; and Fraternity assuring the dignity of the individual and the unity and integrity of the Nation. It was adopted on 26 November 1949, reflecting the vision of the Constituent Assembly.

    Example 2:
    Query: How can new states be created or existing states altered under the Constitution?
    Answer: Articles 2 and 3 empower Parliament to admit new states, establish states, or alter existing states’ boundaries, names, or areas. For such changes, a bill must be introduced on the recommendation of the President, and if it affects any state’s area, boundaries, or name, the President must refer it to the concerned state legislature for its views. However, Parliament is not bound to accept the state’s opinion, ensuring flexibility in India’s federal structure.

    Example 3:
    Query: What are Fundamental Rights, and why are they important?
    Answer: Fundamental Rights, enshrined in Part III (Articles 12–35), guarantee essential freedoms like equality before law, freedom of speech, protection of life and liberty, and the right to constitutional remedies. They act as limitations on state power and safeguard individual dignity. For instance, Article 32 empowers citizens to directly approach the Supreme Court for enforcement of rights, making these provisions justiciable and enforceable.

    Now use the following context items to answer the user query:
    {context}

    Relevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""

    # One "- " bullet per retrieved chunk
    bullet_lines = "\n- ".join(item["text"] for item in context_items)
    context = "- " + bullet_lines

    # Fill the template's {context} and {query} placeholders
    filled_prompt = base_prompt.format(context=context, query=query)

    # Send the filled template as a single user turn and render it with the chat
    # template; tokenize=False so the caller gets text rather than token ids
    return tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": filled_prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    


In [None]:
query = random.choice(query_list)
print(f"Query: {query}")

# Retrieve the best-matching embeddings for the query
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Look up the record behind each returned index
context_items = [constitution_and_chunks[chunk_idx] for chunk_idx in indices]

# Build the augmented prompt; it is plain text (tokenize=False inside
# prompt_formatter), so it can be printed directly for inspection
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

In [None]:

%%time

# Tokenize the prompt and move it to the device the model lives on, rather than a
# hard-coded "cuda", so the inputs always match the model
input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

# Remove the prompt and the <bos>/<eos> markers, the same clean-up the earlier
# plain-generation cell applies, so only the answer text is printed
answer_text = output_text.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')

print(f"Query: {query}")
print(f"RAG answer:\n{answer_text}")
