In [1]:
import random
import pandas as pd
import numpy as np
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"



constitution = pd.read_csv("constitution_embeddings.csv")


# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
constitution["embedding"] = constitution["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert to a list of dictionaries- this is later needed once we find those embedding indices with top similarity to the input to map back to the text.
constitution_and_chunks = constitution.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(constitution["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([294, 768])

In [3]:
constitution.head()

Unnamed: 0,text,section,chapter,token_count,embedding
0,"PREAMBLE\r\nWE, THE PEOPLE OF INDIA, having so...",Preamble,,117,"[-0.00540214172, -0.0082473075, -0.0230496284,..."
1,PART I\r\nTHE UNION AND ITS TERRITORY\r\n1. Na...,Part I.—Union and its territory,,186,"[0.00117035874, -0.0688068941, -0.0183506459, ..."
2,3. Formation of new States and alteration of a...,Part I.—Union and its territory,,455,"[0.00985680893, -0.0103680845, -0.021209022, 0..."
3,PART II\r\nCITIZENSHIP\r\n5. Citizenship at th...,Part II.—Citizenship,,343,"[-0.00939577073, 0.0101611437, -0.0181463622, ..."
4,7. Rights of citizenship of certain migrants t...,Part II.—Citizenship,,475,"[-0.0027527248, 0.00307379779, -0.0164747369, ..."


## Semantic Search

In [4]:
#call the model
from sentence_transformers import util, SentenceTransformer 


embedding_model = SentenceTransformer(model_name_or_path="multi-qa-mpnet-base-dot-v1",device = device)

In [5]:
#testing 
query = "elections"
print(f'Query : {query}')

#embed the query
q_embed = embedding_model.encode(query,convert_to_tensor = True)

#get similarity scores with dot product
##to time this 
from time import perf_counter as timer 

start_time = timer()
dot_scores = util.dot_score(a= q_embed,b= embeddings)[0]
end_time = timer()

print(f'Time taken to get scores on {len(embeddings)} embeddings : {end_time-start_time:.5f} seconds.')

#get top k=5 results
top_results = torch.topk(dot_scores,k=5)
top_results


Query : elections
Time taken to get scores on 294 embeddings : 0.02659 seconds.


torch.return_types.topk(
values=tensor([3.2965, 3.2340, 3.0563, 2.9260, 2.9213], device='cuda:0'),
indices=tensor([ 56,  69, 143, 139,  71], device='cuda:0'))

In [6]:
import textwrap

#return a wrapped version of the text
def print_wrapped(text,wrap_length = 80)  :
    wrapped_text = textwrap.fill(text,wrap_length)
    print(wrapped_text)

In [7]:
print(f"Query: '{query}'\n")
print("Results:")

#zip them together and loop over to print
for score,idx in zip(top_results[0],top_results[1]) :
    print(f"Score: {score:.4f}")
    print("Text:")
    #get the text from dict and print as wrapped
    print_wrapped(constitution_and_chunks[idx]["text"])
    #print the part as well as well
    print(f"section : {constitution_and_chunks[idx]["section"]}")
    print("\n")


Query: 'elections'

Results:
Score: 3.2965
Text:
243ZK. Election of members of board.—(1) Notwithstanding anything   contained in
any law made by the Legislature of a State, the election of a board   shall be
conducted before the expiry of the term of the board so as to ensure   that the
newly elected members of the board assume office immediately on the   expiry of
the office of members of the outgoing board.  (2) The superintendence, direction
and control of the preparation of   electoral rolls for, and the conduct of, all
elections to a co-operative society   shall vest in such an authority or body,
as may be provided by the Legislature   of a State, by law:  Provided that the
Legislature of a State may, by law, provide for the   procedure and guidelines
for the conduct of such elections.
section : Part IXB.—Co-operative Societies


Score: 3.2340
Text:
PART XV  ELECTIONS  324. Superintendence, direction and control of elections to
be vested   in an Election Commission.—(1) The super

## Functionizing the semantic search pipline

In [8]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 constitution_and_chunks: list[dict]=constitution_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Since constitution_and_chunks is a list of dictionaries, it requires constitution_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant text chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(constitution_and_chunks[idx]["text"])
        #print the part as well as well
        print(f"section : {constitution_and_chunks[idx]["section"]}")
        print("\n")



In [9]:
#testing the function-1
query = "panchayat"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 294 embeddings: 0.00007 seconds.


(tensor([4.4927, 4.3322, 4.2765, 4.2324, 3.8440], device='cuda:0'),
 tensor([34, 35, 38, 37, 40], device='cuda:0'))

In [10]:
#test function-2 : Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 294 embeddings: 0.00006 seconds.
Query: panchayat

Results:
Score: 4.4927
329. Bar to interference by courts in electoral matters.—  1[Notwithstanding
anything in this Constitution 2***—]  (a) the validity of any law relating to
the delimitation of   constituencies or the allotment of seats to such
constituencies, made or   purporting to be made under article 327 or article
328, shall not be called   in question in any court;  (b) no election to either
House of Parliament or to the House or   either House of the Legislature of a
State shall be called in question   except by an election petition presented to
such authority and in such   manner as may be provided for by or under any law
made by the   appropriate Legislature.  3329A. [Special provision as to
elections to Parliament in the case of   Prime Minister and Speaker.].—Omitted
by the Constitution (Forty-fourth   Amendment) Act, 1978, s. 36 (w.e.f.
20-6-1979).
section : Part XV.—Elections


Score

## LLM 

In [11]:
!nvidia-smi

Thu Sep  4 17:08:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.52                 Driver Version: 576.52         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| 31%   38C    P2             19W /  125W |    1146MiB /   6144MiB |     21%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [12]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 6 GB


In [13]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    use_quantization_config = True 
    model_id = None
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 6 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [None]:
#log in to hugging face using the token id to download the model
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# --- Settings ---
model_id = "google/gemma-2b-it"   # instruction-tuned 2B
attn_implementation = "eager"     # or "sdpa" if your stack supports it

# 4-bit quantization config (saves VRAM)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16
)

print(f"[INFO] Loading {model_id} in 4-bit...")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)


llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

[INFO] Loading google/gemma-2b-it in 4-bit...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
llm_model.device ,llm_model

(device(type='cuda', index=0),
 GemmaForCausalLM(
   (model): GemmaModel(
     (embed_tokens): Embedding(256000, 2048, padding_idx=0)
     (layers): ModuleList(
       (0-17): 18 x GemmaDecoderLayer(
         (self_attn): GemmaAttention(
           (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
           (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
           (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
           (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
         )
         (mlp): GemmaMLP(
           (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
           (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
           (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
           (act_fn): GELUActivation()
         )
         (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
         (post_attention_layernorm): Gem

In [16]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

1515268096

In [17]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2039632384, 'model_mem_mb': 1945.15, 'model_mem_gb': 1.9}

In [18]:
#testing

input_text = "how is the vice president elected and what role does the parliament in electing the vice president?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
how is the vice president elected and what role does the parliament in electing the vice president?

Prompt (formatted):
<bos><start_of_turn>user
how is the vice president elected and what role does the parliament in electing the vice president?<end_of_turn>
<start_of_turn>model



In [19]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   1139,    603,    573,  16832,
           9456,  17710,    578,   1212,   4731,   1721,    573,  28191,    575,
         182690,    573,  16832,   9456, 235336,    107,    108,    106,   2516,
            108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]], device='cuda:0')}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1139,    603,    573,  16832,
          9456,  17710,    578,   1212,   4731,   1721,    573,  28191,    575,
        182690,    573,  16832,   9456, 235336,    107,    108,    106,   2516,
           108,  21404, 235269,   1517, 235303, 235256,    476,  25497,    576,
          1368,    573,  17939,   6021,    603,  17710,    578,    573,   4731,
           576,    573,  17685,    575, 182690,    573,  17939,   6021, 235292,
           109,    688, 235274, 23526

In [20]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
how is the vice president elected and what role does the parliament in electing the vice president?<end_of_turn>
<start_of_turn>model
Sure, here's a breakdown of how the Vice President is elected and the role of the Parliament in electing the Vice President:

**1. Nomination by the President:**

- The President of India nominates a person to become the Vice President.
- The nomination requires the President's written approval and is subject to the advice of the Prime Minister and the Cabinet.

**2. Election by the Parliament:**

- The Vice President is elected by the Lok Sabha (the lower house of the Indian Parliament) by a majority vote.
- A simple majority is required for a successful election.
- The Vice President is elected for a five-year term, with the possibility of a maximum of two terms.

**3. Secret Ballot:**

- The Vice President is elected in a secret ballot, meaning that the identity of the voter is not revealed.
- The 

In [21]:
#format better 
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: how is the vice president elected and what role does the parliament in electing the vice president?

Output text:
Sure, here's a breakdown of how the Vice President is elected and the role of the Parliament in electing the Vice President:

**1. Nomination by the President:**

- The President of India nominates a person to become the Vice President.
- The nomination requires the President's written approval and is subject to the advice of the Prime Minister and the Cabinet.

**2. Election by the Parliament:**

- The Vice President is elected by the Lok Sabha (the lower house of the Indian Parliament) by a majority vote.
- A simple majority is required for a successful election.
- The Vice President is elected for a five-year term, with the possibility of a maximum of two terms.

**3. Secret Ballot:**

- The Vice President is elected in a secret ballot, meaning that the identity of the voter is not revealed.
- The voting takes place in a designated chamber in the Parliament.


In [22]:
query_list = [
    "What does the Preamble of the Indian Constitution declare about justice, liberty, equality, and fraternity?",
    "How can Parliament form new states or alter the boundaries of existing states under Part I?",
    "What were the provisions regarding citizenship at the commencement of the Constitution?",
    "Which Fundamental Rights are guaranteed under the Right to Equality?",
    "What Directive Principles guide the State in securing equal pay for equal work?",
    "Why is the Preamble considered the soul of the Constitution of India?",
    "How do Articles 5 to 11 reflect the framers' approach towards citizenship?",
    "In what ways do Fundamental Rights ensure limitations on the power of the State?",
    "How are Fundamental Duties different from Directive Principles of State Policy?",
    "What role does Article 32 play in making the Fundamental Rights enforceable?",
]

In [23]:
import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: How can Parliament form new states or alter the boundaries of existing states under Part I?
[INFO] Time taken to get scores on 294 embeddings: 0.00006 seconds.


(tensor([4.2529, 3.8914, 3.6986, 3.6867, 3.6737], device='cuda:0'),
 tensor([  2, 151, 112,  26, 201], device='cuda:0'))

In [24]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["text"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are clear, detailed, and explanatory, using examples from the Constitution wherever possible.
    Use the following examples as reference for the ideal answer style.

    Example 1:
    Query: What does the Preamble of the Indian Constitution declare?
    Answer: The Preamble declares India to be a Sovereign, Socialist, Secular, Democratic Republic. It secures to all citizens Justice—social, economic, and political; Liberty of thought, expression, belief, faith, and worship; Equality of status and opportunity; and Fraternity assuring the dignity of the individual and the unity and integrity of the Nation. It was adopted on 26 November 1949, reflecting the vision of the Constituent Assembly.

    Example 2:
    Query: How can new states be created or existing states altered under the Constitution?
    Answer: Articles 2 and 3 empower Parliament to admit new states, establish states, or alter existing states’ boundaries, names, or areas. For such changes, a bill must be introduced on the recommendation of the President, and if it affects any state’s area, boundaries, or name, the President must refer it to the concerned state legislature for its views. However, Parliament is not bound to accept the state’s opinion, ensuring flexibility in India’s federal structure.

    Example 3:
    Query: What are Fundamental Rights, and why are they important?
    Answer: Fundamental Rights, enshrined in Part III (Articles 12–35), guarantee essential freedoms like equality before law, freedom of speech, protection of life and liberty, and the right to constitutional remedies. They act as limitations on state power and safeguard individual dignity. For instance, Article 32 empowers citizens to directly approach the Supreme Court for enforcement of rights, making these provisions justiciable and enforceable.

    Now use the following context items to answer the user query:
    {context}

    Relevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,#to get the output prompt of this function as a text and not a tokenized version
                                          add_generation_prompt=True)
    return prompt
    


In [27]:
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
    
# Create a list of context items
context_items = [constitution_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)#this is why we gave tokenize = False in the prompt_formatter function

Query: What were the provisions regarding citizenship at the commencement of the Constitution?
[INFO] Time taken to get scores on 294 embeddings: 0.00007 seconds.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are clear, detailed, and explanatory, using examples from the Constitution wherever possible.
    Use the following examples as reference for the ideal answer style.

    Example 1:
    Query: What does the Preamble of the Indian Constitution declare?
    Answer: The Preamble declares India to be a Sovereign, Socialist, Secular, Democratic Republic. It secures to all citizens Justice—social, economic, and political; Liberty of thought, expression, belief, faith, and worship; Equality of status and opportunity; and Fraternity assuring the dignity of the

In [28]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, 
                             max_new_tokens=500) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: What were the provisions regarding citizenship at the commencement of the Constitution?
RAG answer:
<bos>Sure, here's the answer to the user's query:

The relevant passage from the context is:
> **PART II.—Citizenship**

According to the passage, every person who has his domicile in the territory of India and—
(a) who was born in the territory of India; or
(b) either of whose parents or any of his grand-parents was born in India as defined in the Government of India Act, 1935 (as originally enacted); or
(c) who has been ordinarily resident in the territory of India for not less than five years immediately preceding the date of his application.<eos>
CPU times: total: 31.7 s
Wall time: 31.7 s
