In [37]:
from pathlib import Path
import chromadb
import pickle
import os
from dotenv import load_dotenv
import torch

load_dotenv()

multiquery_rag_output_path = "../RAG Results/multiquery_rag_results.txt"
Relative_Database_path = "./chroma_Data_with_Fine_tuned_BERT"
Absolute_Database_path = Path(Relative_Database_path).resolve()
file_path = "../Chunking/Chunk_files/harry_potter_chunks_semantic.pkl"
# Create a new collection with a unique name
collection_name = "HP_Chunks_BERT_Finetuned_collection"
# Set API key
# os.environ["GOOGLE_API_KEY"] = os.environ.get("GEMINI_API_KEY")


In [38]:
# Tokenizer and embedding helper definitions (required before initializing embedding_fn)
class SimpleTokenizer:
    def __init__(self, stoi, itos, max_length=512):
        self.stoi = stoi
        self.itos = itos
        self.max_length = max_length
        self.pad_token_id = stoi.get('[PAD]', 0)
        self.cls_token_id = stoi.get('[CLS]', 1)
        self.sep_token_id = stoi.get('[SEP]', 2)
        self.unk_token_id = stoi.get('[UNK]', 3)
    
    def tokenize(self, text: str) -> list:
        tokens = text.strip().split()
        ids = [self.stoi.get(tok, self.unk_token_id) for tok in tokens]
        ids = ids[:self.max_length - 2]
        ids = [self.cls_token_id] + ids + [self.sep_token_id]
        return ids
    
    def __call__(self, texts, padding=True, max_length=None):
        if max_length is None:
            max_length = self.max_length
        all_ids = [self.tokenize(text) for text in texts]
        if padding:
            max_len = min(max(len(ids) for ids in all_ids), max_length)
            padded_ids = []
            attention_masks = []
            for ids in all_ids:
                ids = ids[:max_len]
                pad_len = max_len - len(ids)
                padded_ids.append(ids + [self.pad_token_id] * pad_len)
                attention_masks.append([1] * len(ids) + [0] * pad_len)
            return {'input_ids': torch.tensor(padded_ids, dtype=torch.long), 'attention_mask': torch.tensor(attention_masks, dtype=torch.long)}
        else:
            return {'input_ids': torch.tensor(all_ids, dtype=torch.long)}

class MyBERTEmbeddingFunction:
    def __init__(self, model, tokenizer, device, batch_size=16):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.batch_size = batch_size
        # Ensure model is on the correct device and in eval mode to avoid device-mismatch errors
        try:
            self.model.to(self.device)
        except Exception:
            pass
        try:
            self.model.eval()
        except Exception:
            pass
    
    def _embed_texts(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        all_embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i+self.batch_size]
            encoded = self.tokenizer(batch_texts, padding=True, max_length=MAX_SEQ_LEN)
            # Move inputs to the device where the model's parameters live to avoid device mismatch
            param_device = next(self.model.parameters()).device
            input_ids = encoded['input_ids'].to(param_device)
            attention_mask = encoded.get('attention_mask', (input_ids!=0).long()).to(param_device)
            with torch.no_grad():
                emb = self.model.get_pooled_embeddings(input_ids, mask=attention_mask, exclude_special=True, normalize=True)
            all_embeddings.extend(emb.cpu().numpy().tolist())
        return all_embeddings
    
    def __call__(self, input):
        return self._embed_texts(input)
    
    def embed_query(self, input=None, **kwargs):
        if input is None and 'input' in kwargs:
            input = kwargs['input']
        if input is None:
            raise ValueError('No input provided to embed_query')
        if isinstance(input, str):
            input = [input]
        return self._embed_texts(input)

# Initialize tokenizer and embedding function now that model is present
tokenizer = SimpleTokenizer(stoi, itos, max_length=MAX_SEQ_LEN)
embedding_fn = MyBERTEmbeddingFunction(model, tokenizer, DEVICE, batch_size=16)

















# os.environ["GOOGLE_API_KEY"] = os.environ.get("GEMINI_API_KEY")# Set API keycollection_name = "HP_Chunks_BERT_Embeddings_collection"# Create a new collection with a unique namefile_path = "../Chunking/Chunk_files/harry_potter_chunks_semantic.pkl"Absolute_Database_path = Path(Relative_Database_path).resolve()Relative_Database_path = "./chroma_Data_with_Fine_tuned_BERT"multiquery_rag_output_path = "../RAG Results/multiquery_rag_results.txt"load_dotenv()from dotenv import load_dotenvimport osimport pickleimport chromadbfrom pathlib import Path[SUCCESS] Tokenizer and embedding function defined')

### Chroma Setup and Chunk Loading
Sets up persistant client and loads previously computed chunks

In [39]:
# Initialize the persistent client
client = chromadb.PersistentClient(path=Absolute_Database_path)
print(f"[INFO] ChromaDB client initialized at: {Absolute_Database_path}")

# List existing collections
existing_collections = client.list_collections()
print(f"Existing collections: {[c.name for c in existing_collections]}")

[INFO] ChromaDB client initialized at: /home/tanish/ANLP_Proj/RAG_for_research_papers/VectorDB/chroma_Data_with_Fine_tuned_BERT
Existing collections: ['HP_Chunks_BERT_Embeddings_collection']


In [40]:

# No need for fitz or RecursiveCharacterTextSplitter here, as we are loading from a file.


loaded_docs = []

try:
    with open(file_path, "rb") as f: # 'rb' mode for reading in binary
        loaded_docs = pickle.load(f)
    print(f"Successfully loaded {len(loaded_docs)} chunks from '{file_path}'.")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"Error loading file: {e}")

# Now you can inspect the loaded documents to verify.
print("\nHere is the metadata of a loaded chunk:")
if loaded_docs:
    print(loaded_docs[0].metadata)

Successfully loaded 4014 chunks from '../Chunking/Chunk_files/harry_potter_chunks_semantic.pkl'.

Here is the metadata of a loaded chunk:
{'source': '../harrypotter.pdf', 'page_number': 14, 'c': 'semantic', 'ischunk': True}


### Set up Embedding Function
Will use custom pre-trained BERT model to generate embeddings. Location for BERT is ../Encoder/saved_bert_encoder_moe_pooling

#### Recreate BERT Model 

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
from typing import List, Union
import numpy as np

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Load vocab
# Set MODEL_DIR to the directory that contains vocab.json and any .pt checkpoints
# Prefer the saved model directory (not the .pt file path)
MODEL_DIR = "../Encoder/saved_bert_encoder_moe_pooling"
with open(f"../Encoder/saved_bert_encoder_moe_pooling/vocab.json", "r") as f:
    vocab_data = json.load(f)
    stoi = vocab_data["stoi"]
    itos = vocab_data["itos"]
    
vocab_size = len(itos)
print(f"Loaded vocab with {vocab_size} tokens")

# Special tokens
PAD_TOKEN = "[PAD]"
CLS_TOKEN = "[CLS]"
SEP_TOKEN = "[SEP]"
MASK_TOKEN = "[MASK]"
UNK_TOKEN = "[UNK]"
SPECIAL_TOKENS = [PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN, UNK_TOKEN]

# Model configuration (must match training config)
HIDDEN_SIZE = 768
NUM_LAYERS = 12
NUM_HEADS = 12
FFN_DIM = 3072
DROPOUT = 0.1
MAX_SEQ_LEN = 512  # Changed from 1024 to 512 to match saved model
MAX_POSITION_EMBEDDINGS = 512  # This is what the saved model was trained with

# -------------------------
# Recreate Model Architecture
# -------------------------

class MoE(nn.Module):
    def __init__(self, hidden_size, ffn_dim, num_experts=5, k=2, noise_std=1.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.ffn_dim = ffn_dim
        self.num_experts = num_experts
        self.k = k
        self.noise_std = noise_std
        
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_size, ffn_dim),
                nn.GELU(),
                nn.Linear(ffn_dim, hidden_size)
            ) for _ in range(num_experts)
        ])
        
        self.router = nn.Linear(hidden_size, num_experts)
    
    def forward(self, x, mask=None):
        B, S, H = x.size()
        logits = self.router(x)
        probs_all = F.softmax(logits, dim=-1)
        importance = probs_all.sum(dim=(0, 1))
        total_tokens = float(B * S)
        aux_loss = (self.num_experts * (importance / total_tokens).pow(2).sum())
        
        if self.training:
            noise = torch.randn_like(logits) * self.noise_std
            logits_noisy = logits + noise
        else:
            logits_noisy = logits
        
        topk_vals, topk_idx = torch.topk(logits_noisy, self.k, dim=-1)
        topk_weights = F.softmax(topk_vals, dim=-1)
        
        expert_outs = []
        for e in range(self.num_experts):
            expert_outs.append(self.experts[e](x))
        expert_stack = torch.stack(expert_outs, dim=2)
        
        device = x.device
        gating = torch.zeros(B, S, self.num_experts, device=device, dtype=x.dtype)
        flat_idx = topk_idx.view(-1, self.k)
        flat_w = topk_weights.view(-1, self.k)
        gating_flat = gating.view(-1, self.num_experts)
        rows = torch.arange(gating_flat.size(0), device=device).unsqueeze(1).expand(-1, self.k)
        gating_flat.scatter_(1, flat_idx, flat_w)
        gating = gating_flat.view(B, S, self.num_experts)
        
        out = torch.einsum('bse,bseh->bsh', gating, expert_stack)
        return out, aux_loss

class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_heads, ffn_dim, dropout=0.1, moe_experts=5, moe_k=2):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout, batch_first=True)
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ffn_moe = MoE(hidden_size, ffn_dim, num_experts=moe_experts, k=moe_k)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        key_padding_mask = (mask == 0)
        attn_out, _ = self.self_attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.ln1(x + self.dropout(attn_out))
        ffn_out, aux_loss = self.ffn_moe(x, mask)
        x = self.ln2(x + self.dropout(ffn_out))
        return x, aux_loss

class BertEncoderModel(nn.Module):
    def __init__(self, vocab_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, num_heads=NUM_HEADS, 
                 ffn_dim=FFN_DIM, max_position_embeddings=512, pad_token_id=0, moe_experts=5, moe_k=2):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.hidden_size = hidden_size
        self.token_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.segment_embeddings = nn.Embedding(2, hidden_size)
        self.emb_ln = nn.LayerNorm(hidden_size)
        self.emb_dropout = nn.Dropout(0.1)
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(hidden_size, num_heads, ffn_dim, dropout=DROPOUT, 
                                   moe_experts=moe_experts, moe_k=moe_k) 
            for _ in range(num_layers)
        ])
        self.nsp_classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), 
            nn.Tanh(), 
            nn.Linear(hidden_size, 2)
        )
        self.mlm_bias = nn.Parameter(torch.zeros(vocab_size))
    
    def encode(self, ids, tt=None, mask=None):
        if tt is None:
            tt = torch.zeros_like(ids)
        if mask is None:
            mask = (ids != self.pad_token_id).long()
        pos = torch.arange(ids.size(1), device=ids.device).unsqueeze(0)
        x = self.token_embeddings(ids) + self.position_embeddings(pos) + self.segment_embeddings(tt)
        x = self.emb_dropout(self.emb_ln(x))
        total_aux = 0.0
        for layer in self.layers:
            x, aux = layer(x, mask)
            total_aux = total_aux + aux
        return x, total_aux
    
    def get_pooled_embeddings(self, ids, mask=None, exclude_special=True, normalize=True):
        """
        Generate embeddings with mask-aware mean pooling
        """
        seq_out, _ = self.encode(ids, tt=None, mask=mask)
        
        if mask is None:
            mask = (ids != self.pad_token_id).long()
        
        # Mask-aware mean pooling
        mask_float = mask.unsqueeze(-1).to(seq_out.dtype)
        
        if exclude_special:
            # Exclude special tokens from pooling
            special_upper = len(SPECIAL_TOKENS)
            special_flags = (ids < special_upper).to(seq_out.dtype)
            mask_float = mask_float * (1.0 - special_flags.unsqueeze(-1))
        
        summed = (seq_out * mask_float).sum(dim=1)
        denom = mask_float.sum(dim=1).clamp(min=1e-9)
        pooled = summed / denom
        
        if normalize:
            pooled = F.normalize(pooled, p=2, dim=1)
        
        return pooled

# Load model with matching max_position_embeddings
print("Loading BERT model...")
model = BertEncoderModel(vocab_size, max_position_embeddings=MAX_POSITION_EMBEDDINGS, moe_experts=5, moe_k=2)
# Robust checkpoint loader: handle different key naming (q/k/v vs in_proj, LoRA wrappers, o_proj/out_proj)
import os, re
from collections import OrderedDict, defaultdict
# Resolve checkpoint path: support MODEL_DIR being either a file or a directory
ckpt_path = None
if os.path.isfile(MODEL_DIR):
    ckpt_path = MODEL_DIR
elif os.path.isdir(MODEL_DIR):
    # look for common checkpoint filenames first
    candidates = [os.path.join(MODEL_DIR, 'bert_encoder_moe_pooling.pt'), os.path.join(MODEL_DIR, 'bert_encoder.pt'), os.path.join(MODEL_DIR, 'lora_bert.pt')]
    found = [p for p in candidates if os.path.exists(p)]
    if found:
        ckpt_path = found[0]
    else:
        # fallback: first .pt file in directory
        pfiles = [os.path.join(MODEL_DIR, f) for f in os.listdir(MODEL_DIR) if f.endswith('.pt') or f.endswith('.pth')]
        if pfiles:
            ckpt_path = pfiles[0]
else:
    # treat MODEL_DIR as a path candidate (in case user passed a filename)
    ckpt_path = MODEL_DIR if os.path.exists(MODEL_DIR) else None
if ckpt_path is None:
    raise FileNotFoundError(f"No checkpoint found under MODEL_DIR={MODEL_DIR}")
print(f"[INFO] Loading checkpoint from {ckpt_path}")
raw = torch.load(ckpt_path, map_location='cpu')
# extract state dict if wrapped
if isinstance(raw, dict) and ('model_state_dict' in raw or 'state_dict' in raw):
    sd = raw.get('model_state_dict', raw.get('state_dict'))
else:
    sd = raw

Using device: cuda
Loaded vocab with 45706 tokens
Loading BERT model...
Loaded vocab with 45706 tokens
Loading BERT model...
[INFO] Loading checkpoint from ../Encoder/saved_bert_encoder_moe_pooling/bert_encoder_moe_pooling.pt
[INFO] Loading checkpoint from ../Encoder/saved_bert_encoder_moe_pooling/bert_encoder_moe_pooling.pt


In [42]:
# Initialize the custom BERT embedding function
embedding_fn = MyBERTEmbeddingFunction(model, tokenizer, DEVICE, batch_size=16)

# Test the embedding function
print("Testing embedding function...")
test_texts = ["This is a test sentence.", "Another example text."]
test_embeddings = embedding_fn(test_texts)
print(f"Generated {len(test_embeddings)} embeddings")
print(f"Embedding shape: {len(test_embeddings[0])} dimensions")
print(f"First embedding (first 5 values): {test_embeddings[0][:5]}")
print("\n[SUCCESS] Embedding function ready for ChromaDB!")


Testing embedding function...
Generated 2 embeddings
Embedding shape: 768 dimensions
First embedding (first 5 values): [0.03645609691739082, -0.05256899073719978, -0.0013257176615297794, 0.0320611335337162, -0.00854380801320076]

[SUCCESS] Embedding function ready for ChromaDB!


### Create Collection with BERT Embeddings

In [43]:
from datetime import datetime

# FORCE DELETE the collection if it exists
try:
    client.delete_collection(name=collection_name)
    print(f"[INFO] Deleted existing collection '{collection_name}'")
except Exception as e:
    print(f"[INFO] No existing collection named '{collection_name}' to delete.")

# Create a FRESH collection with BERT embedding function
collection = client.create_collection(
    name=collection_name,
    embedding_function=embedding_fn,
    metadata={
        "description": "Harry Potter Chunks with custom BERT embeddings (MoE + Mask-aware pooling)",
        "created": str(datetime.now()),
        "model": "Custom BERT with MoE",
        "embedding_dim": HIDDEN_SIZE
    }
)

print(f"[SUCCESS] Fresh collection '{collection_name}' created successfully")
print(f"Current count in collection: {collection.count()}")


[INFO] No existing collection named 'HP_Chunks_BERT_Finetuned_collection' to delete.
[SUCCESS] Fresh collection 'HP_Chunks_BERT_Finetuned_collection' created successfully
Current count in collection: 0
[SUCCESS] Fresh collection 'HP_Chunks_BERT_Finetuned_collection' created successfully
Current count in collection: 0


### Add Documents to Collection
Prepare and add all chunks with BERT-generated embeddings

In [44]:
# Prepare data for ChromaDB
documents = []
metadatas = []
ids = []

for idx, doc in enumerate(loaded_docs):
    documents.append(doc.page_content)
    metadatas.append(doc.metadata)
    ids.append(f"hp_chunk_{idx}")

print(f"[INFO] Prepared {len(documents)} documents for embedding")
print(f"Sample document: {documents[0][:100]}...")
print(f"Sample metadata: {metadatas[0]}")


[INFO] Prepared 4014 documents for embedding
Sample document: . yes, that would be it. The traffic moved on and a few minutes
later, Mr. Dursley arrived in the Gr...
Sample metadata: {'source': '../harrypotter.pdf', 'page_number': 14, 'c': 'semantic', 'ischunk': True}


In [45]:
# Add documents to collection in batches
# ChromaDB will automatically call our embedding_fn to generate embeddings
batch_size = 500
total_batches = (len(documents) + batch_size - 1) // batch_size

print(f"[INFO] Adding documents in {total_batches} batches...")

for i in range(0, len(documents), batch_size):
    batch_docs = documents[i:i+batch_size]
    batch_metas = metadatas[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]
    
    collection.add(
        documents=batch_docs,
        metadatas=batch_metas,
        ids=batch_ids
    )
    
    batch_num = (i // batch_size) + 1
    print(f"  Batch {batch_num}/{total_batches} added ({len(batch_docs)} documents)")

print(f"\n[SUCCESS] All documents added!")
print(f"Total documents in collection: {collection.count()}")


[INFO] Adding documents in 9 batches...
  Batch 1/9 added (500 documents)
  Batch 1/9 added (500 documents)
  Batch 2/9 added (500 documents)
  Batch 2/9 added (500 documents)
  Batch 3/9 added (500 documents)
  Batch 3/9 added (500 documents)
  Batch 4/9 added (500 documents)
  Batch 4/9 added (500 documents)
  Batch 5/9 added (500 documents)
  Batch 5/9 added (500 documents)
  Batch 6/9 added (500 documents)
  Batch 6/9 added (500 documents)
  Batch 7/9 added (500 documents)
  Batch 7/9 added (500 documents)
  Batch 8/9 added (500 documents)
  Batch 8/9 added (500 documents)
  Batch 9/9 added (14 documents)

[SUCCESS] All documents added!
Total documents in collection: 4014
  Batch 9/9 added (14 documents)

[SUCCESS] All documents added!
Total documents in collection: 4014


### Test the Collection
Query the collection to verify embeddings are working correctly

In [46]:
# Test query
test_query = "Who is Harry Potter?"

print(f"Test Query: '{test_query}'")
print("\nSearching with BERT embeddings...")

results = collection.query(
    query_texts=[test_query],
    n_results=5    
)

print(f"\nTop 5 Results:")
for idx, (doc, distance) in enumerate(zip(results['documents'][0], results['distances'][0])):
    print(f"\n{idx+1}. Distance: {distance:.4f}")
    print(f"   Text: {doc[:150]}...")

print("\n[SUCCESS] Collection is working correctly with BERT embeddings!")


Test Query: 'Who is Harry Potter?'

Searching with BERT embeddings...

Top 5 Results:

1. Distance: 0.2702
   Text: Raising an arm in cheery farewell, he headed out of the front doors into the
darkness. Harry and Ron looked at each other....

2. Distance: 0.2729
   Text: “Yeh shouldn’ve come!” Hagrid whispered. He stood back, then shut the
door quickly. “This is the weirdest thing we’ve ever done,” Harry said fervently...

3. Distance: 0.2745
   Text: There you go, Sirius, Harry thought dully. Nothing rash. Kept my nose
clean. Exactly the opposite of what you’d have done . ....

4. Distance: 0.2758
   Text: “The new password’s ‘Fortuna Major’!”
“Oh no,” said Neville Longbottom sadly. He always had trouble
remembering the passwords....

5. Distance: 0.2825
   Text: “Did you see his face, the great lump?”
The other Slytherins joined in. “Shut up, Malfoy,” snapped Parvati Patil. ...

[SUCCESS] Collection is working correctly with BERT embeddings!


### Add queries to collection

In [None]:
import pandas as pd

# Load the generated questions from CSV
questions_csv_path = "../LLM Caller/generated_pairs_without_commas.csv"  # Adjust path as needed
df_questions = pd.read_csv(questions_csv_path)

print(f"Loaded {len(df_questions)} questions from {questions_csv_path}")

# Prepare questions for ChromaDB
question_ids = []
question_documents = []
question_metadatas = []

# Process each question
for idx, row in df_questions.iterrows():
    
    # Get the question text and chunk id
    question_text, chunk_id = row['query'], row['chunk_id']
    question_id = f"query_{idx}_{chunk_id}"
    
    # Create metadata
    metadata = {
        "id": question_id,
        "ischunk": False,
        "chunk_id": chunk_id
    }
    
    question_ids.append(question_id)
    question_documents.append(question_text)
    question_metadatas.append(metadata)

# Add questions to collection in batches
batch_size = 500
total_questions_added = 0

for i in range(0, len(question_ids), batch_size):
    end_idx = min(i + batch_size, len(question_ids))
    
    
    collection.add(
        ids=question_ids[i:end_idx],
        documents=question_documents[i:end_idx],
        metadatas=question_metadatas[i:end_idx]
    )
    
    total_questions_added += end_idx - i
    print(f"Added question batch: {i} to {end_idx-1} ({end_idx-i} questions)")

print(f"\nSuccessfully added {total_questions_added} questions to collection '{collection_name}'")
print(f"Total documents in collection now: {collection.count()}")

Loaded 19901 questions from ../LLM Caller/generated_pairs_without_commas.csv
Added question batch: 0 to 499 (500 questions)
Added question batch: 0 to 499 (500 questions)
Added question batch: 500 to 999 (500 questions)
Added question batch: 500 to 999 (500 questions)
Added question batch: 1000 to 1499 (500 questions)
Added question batch: 1000 to 1499 (500 questions)
Added question batch: 1500 to 1999 (500 questions)
Added question batch: 1500 to 1999 (500 questions)
Added question batch: 2000 to 2499 (500 questions)
Added question batch: 2000 to 2499 (500 questions)
Added question batch: 2500 to 2999 (500 questions)
Added question batch: 2500 to 2999 (500 questions)
Added question batch: 3000 to 3499 (500 questions)
Added question batch: 3000 to 3499 (500 questions)
Added question batch: 3500 to 3999 (500 questions)
Added question batch: 3500 to 3999 (500 questions)
Added question batch: 4000 to 4499 (500 questions)
Added question batch: 4000 to 4499 (500 questions)
Added question ba

: 