<a href="https://colab.research.google.com/github/Arshnoor7/CODERAG/blob/main/CODERAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

##############################
#  Install All Dependencies
##############################

!pip install -q transformers peft datasets accelerate sentence-transformers
!pip install -q faiss-cpu rank-bm25
!pip install -q google-generativeai gradio
!pip install -q scikit-learn tqdm


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.6/23.6 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
##############################
# Imports, API Key, and Global Setup
##############################

import pandas as pd
import numpy as np
import faiss
import torch
import re
import random
from tqdm.auto import tqdm
import warnings
import google.generativeai as genai
import gradio as gr

from google.colab import userdata
from datasets import Dataset, DatasetDict
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model, PeftModel
from torch.distributions import Categorical

warnings.filterwarnings("ignore")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel("gemini-2.5-flash-preview-09-2025")
    print("Gemini API configured successfully.")
except Exception as e:
    print(f"CRITICAL: Error configuring Gemini API. Make sure 'GEMINI_API_KEY' is set in Colab Secrets (üîë icon). {e}")
    gemini_model = None


Using device: cuda
Gemini API configured successfully.


In [12]:
##############################
# Load and Prepare Data
##############################

TRAIN_PATH = "/content/train.csv"
VALID_PATH = "/content/valid.csv"
TEST_PATH  = "/content/test.csv"

try:
    train_df = pd.read_csv(TRAIN_PATH)
    valid_df = pd.read_csv(VALID_PATH)
    test_df  = pd.read_csv(TEST_PATH)
except FileNotFoundError:
    print("Data files not found. Please upload train.csv, valid.csv, and test.csv")
    data = {
        'Hinglish': ['RAG kya hota hai?', 'mujhe machine learning ke baare mein batao', 'India ka capital kya hai?', 'Python ek achi language hai'],
        'English': ['What is RAG?', 'Tell me about machine learning', 'What is the capital of India?', 'Python is a good language'],
        'Hindi': ['RAG ‡§ï‡•ç‡§Ø‡§æ ‡§π‡•ã‡§§‡§æ ‡§π‡•à?', '‡§Æ‡•Å‡§ù‡•á ‡§Æ‡§∂‡•Ä‡§® ‡§≤‡§∞‡•ç‡§®‡§ø‡§Ç‡§ó ‡§ï‡•á ‡§¨‡§æ‡§∞‡•á ‡§Æ‡•á‡§Ç ‡§¨‡§§‡§æ‡§ì', '‡§≠‡§æ‡§∞‡§§ ‡§ï‡•Ä ‡§∞‡§æ‡§ú‡§ß‡§æ‡§®‡•Ä ‡§ï‡•ç‡§Ø‡§æ ‡§π‡•à?', '‡§™‡§æ‡§Ø‡§•‡§® ‡§è‡§ï ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§≠‡§æ‡§∑‡§æ ‡§π‡•à']
    }
    train_df = pd.DataFrame(data)
    valid_df = pd.DataFrame(data)
    test_df = pd.DataFrame(data)

for df in [train_df, valid_df, test_df]:
    df.fillna("", inplace=True)

docs_hinglish = train_df["Hinglish"].tolist()
docs_english  = train_df["English"].tolist()
docs_hindi    = train_df["Hindi"].tolist()

print(f"Data Loaded: Train: {len(train_df)}, Valid: {len(valid_df)}, Test: {len(test_df)}")

def detect_lang(q):
    hi = len(re.findall(r'[\u0900-\u097F]', q))
    en = len(re.findall(r'[a-zA-Z]', q))
    if hi > 0 and en > 0: return "cs" # Code-switched
    if hi > 0: return "hi"
    return "en"


Data Loaded: Train: 2766, Valid: 395, Test: 791


In [13]:
##############################
# Initialize Hybrid Retriever (Encoder, FAISS, BM25)
##############################

print("Initializing retriever...")

print("Loading SentenceTransformer encoder...")
encoder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    device=device
)

print("Encoding documents for dense index...")
emb = encoder.encode(docs_hinglish, batch_size=32, show_progress_bar=True)
emb = np.array(emb).astype("float32")
faiss.normalize_L2(emb)

print("Building FAISS index...")
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

print("Building BM25 index...")
tokenized_corpus = [d.lower().split() for d in docs_hinglish]
bm25 = BM25Okapi(tokenized_corpus)

print("‚úÖ Retriever initialized.")

def retrieve_hybrid(query, top_k=5, w_dense=0.5, w_sparse=0.5):

    q_emb = encoder.encode([query]).astype("float32")
    faiss.normalize_L2(q_emb)
    D_dense, I_dense = index.search(q_emb, top_k * 5) # Retrieve more to re-rank

    bm_scores = bm25.get_scores(query.lower().split())

    results = {}
    for score, idx in zip(D_dense[0], I_dense[0]):
        if idx not in results:
            results[idx] = {"dense": 0.0, "sparse": 0.0}
        results[idx]["dense"] = float(score)

    max_bm_score = np.max(bm_scores)
    min_bm_score = np.min(bm_scores)
    if max_bm_score == min_bm_score: max_bm_score += 1e-6

    for idx in I_dense[0]:
        if idx < len(bm_scores):
            norm_sparse = (bm_scores[idx] - min_bm_score) / (max_bm_score - min_bm_score)
            if idx not in results:
                 results[idx] = {"dense": 0.0, "sparse": 0.0}
            results[idx]["sparse"] = float(norm_sparse)

    final_scores = []
    for idx, scores in results.items():
        final = (w_dense * scores["dense"]) + (w_sparse * scores["sparse"])
        final_scores.append((final, idx))

    return sorted(final_scores, reverse=True)[:top_k]


Initializing retriever...
Loading SentenceTransformer encoder...
Encoding documents for dense index...


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Building FAISS index...
Building BM25 index...
‚úÖ Retriever initialized.


In [14]:
##############################
# (Improvement 1a) Create RAG Dataset
##############################

def create_rag_dataset(df, docs_corpus_hinglish, docs_corpus_english, encoder_model, k=10):
    print(f"Creating RAG dataset from {len(df)} samples...")
    new_data = []

    print("Pre-encoding all English answers for context matching...")
    english_answer_embeddings = encoder_model.encode(docs_corpus_english, batch_size=32, show_progress_bar=True, convert_to_tensor=True, device=device)

    for i, row in tqdm(df.iterrows(), total=len(df), desc="Building RAG triplets"):
        query = row['Hinglish']
        gold_answer = row['English']

        tokenized_query = query.lower().split()
        doc_scores = bm25.get_scores(tokenized_query)

        top_k_indices = np.argsort(doc_scores)[::-1]
        candidate_indices = [idx for idx in top_k_indices if idx != i and idx < len(english_answer_embeddings)][:k] # Added boundary check

        if not candidate_indices:
            new_data.append({"question": query, "context": "No relevant context found.", "answer": gold_answer})
            continue

        gold_answer_embedding = encoder_model.encode([gold_answer], convert_to_tensor=True, device=device)

        candidate_answer_embeddings = english_answer_embeddings[candidate_indices]

        sims = torch.nn.functional.cosine_similarity(gold_answer_embedding, candidate_answer_embeddings)
        best_candidate_idx_in_list = torch.argmax(sims).item()
        best_doc_corpus_idx = candidate_indices[best_candidate_idx_in_list]

        best_context = docs_corpus_hinglish[best_doc_corpus_idx]
        new_data.append({"question": query, "context": best_context, "answer": gold_answer})

    return Dataset.from_list(new_data)

rag_train_dataset = create_rag_dataset(train_df, docs_hinglish, docs_english, encoder)
rag_valid_dataset = create_rag_dataset(valid_df, docs_hinglish, docs_english, encoder)

print("\n--- RAG Dataset Sample ---")
print(rag_train_dataset[0])

Creating RAG dataset from 2766 samples...
Pre-encoding all English answers for context matching...


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Building RAG triplets:   0%|          | 0/2766 [00:00<?, ?it/s]

Creating RAG dataset from 395 samples...
Pre-encoding all English answers for context matching...


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Building RAG triplets:   0%|          | 0/395 [00:00<?, ?it/s]


--- RAG Dataset Sample ---
{'question': 'module , ek program hoti hai , jismen ya to source code ya machine language ke form men instructions nihit hote hain.\n', 'context': 'madyool, ek snchika hoti hai, jismen ya to srot code or machine bhasha ke roop men anudesh nihit hote hain.\n', 'answer': 'Program module is a file that contains instructions which are either in the form of source code or machine language.\n'}


In [15]:
print("Initializing RAG-LoRA model fine-tuning...")

LORA_MODEL_PATH = "/content/rag_lora_adapter"
model_name = "google/mt5-small"

lora_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def preprocess_rag_function(batch):
    inputs = [f"Context: {c} Question: {q}" for c, q in zip(batch['context'], batch['question'])]

    model_inputs = lora_tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    labels = lora_tokenizer(text_target=batch["answer"], max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_rag_train_ds = rag_train_dataset.map(preprocess_rag_function, batched=True)
tokenized_rag_valid_ds = rag_valid_dataset.map(preprocess_rag_function, batched=True)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none"
)
lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="/content/rag_lora_out",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
    eval_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_rag_train_ds,
    eval_dataset=tokenized_rag_valid_ds
)

print("Starting RAG-LoRA model training...")
trainer.train()

print(f"Saving RAG-LoRA adapter to {LORA_MODEL_PATH}")
lora_model.save_pretrained(LORA_MODEL_PATH)
lora_tokenizer.save_pretrained(LORA_MODEL_PATH)

print("‚úÖ RAG-LoRA model training complete.")

Initializing RAG-LoRA model fine-tuning...


Map:   0%|          | 0/2766 [00:00<?, ? examples/s]

Map:   0%|          | 0/395 [00:00<?, ? examples/s]

trainable params: 344,064 || all params: 300,520,832 || trainable%: 0.1145
Starting RAG-LoRA model training...


Epoch,Training Loss,Validation Loss
1,37.5866,27.636684
2,25.7366,18.736238
3,20.6347,16.820719


Saving RAG-LoRA adapter to /content/rag_lora_adapter
‚úÖ RAG-LoRA model training complete.


In [16]:
##############################
# (Improvement 2a) Create Reward Model Dataset
#
##############################

def create_reward_dataset(df):
    print(f"Creating Reward dataset from {len(df)} samples...")
    data = []
    all_answers = df['English'].tolist()
    num_samples = len(all_answers)

    for i, row in tqdm(df.iterrows(), total=len(df), desc="Building Reward Pairs"):
        query = row['Hinglish']
        good_answer = row['English']


        data.append({"query": query, "answer": good_answer, "label": 1.0})


        random_idx = random.randint(0, num_samples - 1)

        while random_idx == i:
            random_idx = random.randint(0, num_samples - 1)

        bad_answer = all_answers[random_idx]
        data.append({"query": query, "answer": bad_answer, "label": 0.0})

    return Dataset.from_list(data).shuffle(seed=42)

reward_ds_train = create_reward_dataset(train_df)
reward_ds_valid = create_reward_dataset(valid_df)

print("\n--- Reward Dataset Sample ---")
print(reward_ds_train[0])
print(reward_ds_train[1])

Creating Reward dataset from 2766 samples...


Building Reward Pairs:   0%|          | 0/2766 [00:00<?, ?it/s]

Creating Reward dataset from 395 samples...


Building Reward Pairs:   0%|          | 0/395 [00:00<?, ?it/s]


--- Reward Dataset Sample ---
{'query': 'fine rasool apni biviyon se life do ki agar o  release  duniyavi zindgi aur uski finery v zinat ki khvah ho to udhar aao i o logon ko kuchh sazo saman de wives aur unvane shaista se roosat kar wives\n', 'answer': "O Prophet, say to your wives: 'If you seek this life and its finery, come, I will release you with a fine release.\n", 'label': 1.0}
{'query': 'user ka nam bataen agar aap current user nahi hai\n', 'answer': 'Additionally, 23 post-graduate scholarships for courses in hydel power and water resources management at IIT, Roorkee have been offered to Nepali engineers and experts this year.\n', 'label': 0.0}


In [17]:
##############################
# (Improvement 2b) Fine-Tune Custom Reward Model
#############################

print("Initializing Reward Model fine-tuning...")

REWARD_MODEL_PATH = "/content/reward_model"
rm_model_name = "microsoft/deberta-v3-small"

rm_tokenizer = AutoTokenizer.from_pretrained(rm_model_name)

rm_model = AutoModelForSequenceClassification.from_pretrained(rm_model_name, num_labels=1)

def preprocess_reward_function(batch):
    model_inputs = rm_tokenizer(batch['query'], batch['answer'], max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = [float(l) for l in batch["label"]]
    return model_inputs

tokenized_rm_train_ds = reward_ds_train.map(preprocess_reward_function, batched=True)
tokenized_rm_valid_ds = reward_ds_valid.map(preprocess_reward_function, batched=True)

rm_training_args = TrainingArguments(
    output_dir="/content/rm_out",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
    eval_strategy="epoch",
    load_best_model_at_end=True
)

rm_trainer = Trainer(
    model=rm_model,
    args=rm_training_args,
    train_dataset=tokenized_rm_train_ds,
    eval_dataset=tokenized_rm_valid_ds
)

print("Starting Reward Model training...")
rm_trainer.train()

print(f"Saving Reward Model to {REWARD_MODEL_PATH}")
rm_model.save_pretrained(REWARD_MODEL_PATH)
rm_tokenizer.save_pretrained(REWARD_MODEL_PATH)

print("‚úÖ Reward Model training complete.")

class CustomRewardModel:
    def __init__(self, model_path, device):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.device = device
        self.model.eval()
        print(f"Custom Reward Model loaded from {model_path}")

    @torch.no_grad()
    def compute_reward(self, query, answer):
        if not query or not answer:
            return 0.0

        inputs = self.tokenizer(query, answer, return_tensors='pt', max_length=256, truncation=True, padding="max_length").to(self.device)
        logits = self.model(**inputs).logits

        score = torch.sigmoid(logits[0]).item()
        return score


Initializing Reward Model fine-tuning...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5532 [00:00<?, ? examples/s]

Map:   0%|          | 0/790 [00:00<?, ? examples/s]

Starting Reward Model training...


Epoch,Training Loss,Validation Loss
1,0.0437,0.034659
2,0.0235,0.034203
3,0.0121,0.032983


Saving Reward Model to /content/reward_model
‚úÖ Reward Model training complete.


In [18]:
##############################
# Load Trained Models (RAG-LoRA & Reward Model)
#
##############################

print("Loading all fine-tuned models...")

base_lm = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
qa_model = PeftModel.from_pretrained(base_lm, LORA_MODEL_PATH).to(device)
qa_tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_PATH)
qa_model.eval()
print("RAG-LoRA Model (qa_model) loaded.")

reward_model = CustomRewardModel(REWARD_MODEL_PATH, device)
print("Custom Reward Model (reward_model) loaded.")

print("\n--- Test Reward Model ---")
q_test = "RAG kya hota hai?"
good_a_test = "What is RAG?"
bad_a_test = "Python is a good language"
print(f"Score (Good): {reward_model.compute_reward(q_test, good_a_test):.4f}")
print(f"Score (Bad):  {reward_model.compute_reward(q_test, bad_a_test):.4f}")


Loading all fine-tuned models...
RAG-LoRA Model (qa_model) loaded.
Custom Reward Model loaded from /content/reward_model
Custom Reward Model (reward_model) loaded.

--- Test Reward Model ---
Score (Good): 0.7411
Score (Bad):  0.4949


In [19]:
##############################
#  (RL) Tune Retriever Weights
##############################

print("\nüîß Starting RL Optimization (REINFORCE)...")

class WeightPolicy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.wd_raw = torch.nn.Parameter(torch.tensor(0.5))
        self.ws_raw = torch.nn.Parameter(torch.tensor(0.5))

    def forward(self):
        probs = torch.softmax(torch.stack([self.wd_raw, self.ws_raw]), dim=0)
        return probs[0], probs[1]
policy = WeightPolicy().to(device)
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)

rl_pairs = list(zip(train_df.Hinglish.tolist(), train_df.English.tolist()))
if len(rl_pairs) > 200:
    rl_pairs = random.sample(rl_pairs, 200)

baseline = 0.5
baseline_alpha = 0.05
num_steps = 100
top_k = 5

for step in range(num_steps)
    q, gold = random.choice(rl_pairs)

    w_dense_t, w_sparse_t = policy()

    q_emb = encoder.encode([q]).astype("float32")
    faiss.normalize_L2(q_emb)
    D_dense, I_dense = index.search(q_emb, top_k)
    bm_scores_all = bm25.get_scores(q.lower().split())

    dense_scores_np = D_dense[0]
    candidate_indices = I_dense[0]

    sparse_scores_np = []
    valid_candidate_indices = []
    for idx in candidate_indices:
        if idx < len(bm_scores_all):
            sparse_scores_np.append(bm_scores_all[idx] / 100.0)
            valid_candidate_indices.append(idx)

    if not valid_candidate_indices:
        continue

    candidate_indices = valid_candidate_indices
    dense_scores_np = dense_scores_np[:len(candidate_indices)]


    dense_tensor = torch.tensor(dense_scores_np, device=device, dtype=torch.float32)
    sparse_tensor = torch.tensor(sparse_scores_np, device=device, dtype=torch.float32)

    final_scores = w_dense_t * dense_tensor + w_sparse_t * sparse_tensor
    probs = torch.softmax(final_scores, dim=0)

    cat = Categorical(probs)
    sampled_idx_in_candidates = cat.sample()
    log_prob = cat.log_prob(sampled_idx_in_candidates)

    chosen_doc_corpus_idx = candidate_indices[sampled_idx_in_candidates.item()]
    context = docs_hinglish[chosen_doc_corpus_idx]

    prompt = f"Context: {context} Question: {q}"
    tokens = qa_tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = qa_model.generate(**tokens, max_length=120, num_beams=4, early_stopping=True)
    pred = qa_tokenizer.decode(out[0], skip_special_tokens=True)

    r = reward_model.compute_reward(q, pred)

    baseline = baseline * (1 - baseline_alpha) + r * baseline_alpha # Update baseline
    advantage = r - baseline
    advantage_t = torch.tensor(advantage, device=device, dtype=torch.float32)

    loss = -advantage_t * log_prob

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 10 == 0:
        wd_val = float(w_dense_t.detach().cpu().numpy())
        ws_val = float(w_sparse_t.detach().cpu().numpy())
        print(f"Step {step:3d} | reward={r:.4f} | adv={advantage:.4f} | loss={loss.item():.4f} | wd={wd_val:.4f} ws={ws_val:.4f}")

final_wd, final_ws = policy()
final_wd_v = float(final_wd.detach().cpu().item())
final_ws_v = float(final_ws.detach().cpu().item())
print(f"\n‚úÖ RL Tuning done. Final weights: Dense={final_wd_v:.4f}, Sparse={final_ws_v:.4f}")
if 'final_wd_v' not in locals():
    print("RL loop did not run or failed. Using default weights.")
    final_wd_v = 0.5
    final_ws_v = 0.5




üîß Starting RL Optimization (REINFORCE)...
Step   0 | reward=0.4895 | adv=-0.0099 | loss=-0.0115 | wd=0.5000 ws=0.5000
Step  10 | reward=0.4913 | adv=-0.0058 | loss=-0.0098 | wd=0.5020 ws=0.4980
Step  20 | reward=0.4920 | adv=-0.0078 | loss=-0.0131 | wd=0.5004 ws=0.4996
Step  30 | reward=0.4883 | adv=-0.0083 | loss=-0.0103 | wd=0.4987 ws=0.5013
Step  40 | reward=0.4801 | adv=-0.0191 | loss=-0.0339 | wd=0.4976 ws=0.5024
Step  50 | reward=0.4894 | adv=-0.0092 | loss=-0.0111 | wd=0.4954 ws=0.5046
Step  60 | reward=0.4920 | adv=-0.0091 | loss=-0.0074 | wd=0.4950 ws=0.5050
Step  70 | reward=0.4888 | adv=-0.0143 | loss=-0.0245 | wd=0.4962 ws=0.5038
Step  80 | reward=0.4972 | adv=-0.0018 | loss=-0.0024 | wd=0.4968 ws=0.5032
Step  90 | reward=0.4947 | adv=-0.0105 | loss=-0.0183 | wd=0.4971 ws=0.5029

‚úÖ RL Tuning done. Final weights: Dense=0.4968, Sparse=0.5032


In [20]:

##############################
#  Final Inference Function
###############################

def answer_query_final(query, top_k=3):

    retrieved = retrieve_hybrid(
        query,
        top_k=top_k,
        w_dense=final_wd_v,
        w_sparse=final_ws_v
    )

    if not retrieved:
        return "Sorry, I couldn't find any relevant context for your query."

    context = ""
    for (_, idx) in retrieved:
        if idx < len(docs_hinglish):
            context += docs_hinglish[idx] + "\n"


    if not context:
         return "Sorry, I found context indices but could not retrieve the text."


    prompt_gemini = f"Context:\n{context}\nQuestion: {query}\nAnswer:"
    if gemini_model:
        try:
            r = gemini_model.generate_content(prompt_gemini)
            if r and r.text:
                print("Generated by LoRA and optimised by Gemini")
                return r.text.strip()
        except Exception as e:
            print(f"Gemini generation failed: {e}. Falling back to RAG-LoRA model.")
            pass
    else:
        print("Gemini model not available. Falling back to RAG-LoRA model.")


    try:
        prompt_lora = f"Context: {context} Question: {query}"
        tokens = qa_tokenizer(prompt_lora, return_tensors="pt").to(device)
        out = qa_model.generate(**tokens, max_length=150, num_beams=4, early_stopping=True)
        ans = qa_tokenizer.decode(out[0], skip_special_tokens=True)
        ans = ans.replace("<extra_id_0>", "").replace("</s>", "").strip()

        if ans:

            return ans
    except Exception as e:
        print(f"RAG-LoRA model failed: {e}. Falling back to top document.")
        pass


    print("--- All generators failed. Returning top retrieved doc. --- (Fallback)")
    idx = retrieved[0][1]
    lang = detect_lang(query)
    if idx < len(docs_hindi) and lang == "hi": return docs_hindi[idx]
    if idx < len(docs_hinglish) and lang == "cs": return docs_hinglish[idx]
    if idx < len(docs_english): return docs_english[idx]

    return "Sorry, I could not generate an answer."



In [21]:

##############################
# Launch Gradio UI
##############################

print("Launching Gradio UI...")

with gr.Blocks(title="Advanced RAG Assistant", theme=gr.themes.Soft()) as ui:
    gr.Markdown("<h1 style='text-align:center;'>üöÄ CODERAG</h1>")
    gr.Markdown("This assistant uses a hybrid retriever (FAISS + BM25) with weights tuned by Reinforcement Learning. It generates answers using **Gemini 1.5 Flash**, and falls back to a custom-trained **RAG-LoRA model**.")
    with gr.Row():
        inp = gr.Textbox(label="Enter your question (Hinglish, Hindi, or English)", lines=3, scale=3, placeholder="RAG kya hota hai?")
        out = gr.Textbox(label="Answer", lines=5, scale=3, interactive=False)
    btn = gr.Button("Ask", variant="primary")
    btn.click(answer_query_final, inp, out)

    gr.Examples(
        ["RAG kya hota hai",
         "mujhe machine learning ke baare mein batao",
         "What is the capital of India?",
         "‡§™‡§æ‡§Ø‡§•‡§® ‡§è‡§ï ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§≠‡§æ‡§∑‡§æ ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§π‡•à?"],
        inputs=inp
    )

ui.launch(share=True, debug=True)

Launching Gradio UI...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://04d9f457354d8d8537.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Generated by LoRA and optimised by Gemini
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://04d9f457354d8d8537.gradio.live




In [None]:
##############################
#  Inference Function
#
##############################

def answer_query_final(query, top_k=3):

    retrieved = retrieve_hybrid(
        query,
        top_k=top_k,
        w_dense=final_wd_v,
        w_sparse=final_ws_v
    )

    if not retrieved:
        return "Sorry, I couldn't find any relevant context for your query."

    context = ""
    for (_, idx) in retrieved:
        if idx < len(docs_hinglish):
            context += docs_hinglish[idx] + "\n"

    if not context:
         return "Sorry, I found context indices but could not retrieve the text."

    try:
        prompt_lora = f"Context: {context}\nQuestion: {query}\nAnswer:"
        tokens = qa_tokenizer(prompt_lora, return_tensors="pt").to(device)
        out = qa_model.generate(**tokens, max_length=200, num_beams=4, early_stopping=True)
        lora_answer = qa_tokenizer.decode(out[0], skip_special_tokens=True)
        lora_answer = lora_answer.replace("<extra_id_0>", "").replace("</s>", "").strip()
    except Exception as e:
        print(f"LoRA generation failed: {e}. Falling back to top doc.")
        lora_answer = None


    if gemini_model and lora_answer:
        prompt_gemini = f"""
You are an expert rewriting and improvement model.

Below is:
1) The retrieved context
2) The user query
3) The answer generated by a RAG-LoRA model

Your task:
- Improve the answer
- Correct it if needed
- Add missing useful knowledge
- Keep meaning aligned with the query
- Use a clean, professional, fluent format

### Context:
{context}

### Query:
{query}

### LoRA Generated Answer:
{lora_answer}

### Improved Final Answer:
"""
        try:
            r = gemini_model.generate_content(prompt_gemini)
            if r and r.text:
                print("Generated by Gemini (Enhanced using LoRA answer + context)")
                return r.text.strip()
        except Exception as e:
            print(f"Gemini enhancement failed: {e}. Falling back to LoRA answer.")


    if lora_answer:
        print("--- Answered by RAG-LoRA --- (Gemini fallback)")
        return lora_answer

    print("--- All generators failed. Returning top retrieved doc. --- (Fallback)")
    idx = retrieved[0][1]
    lang = detect_lang(query)
    if idx < len(docs_hindi) and lang == "hi": return docs_hindi[idx]
    if idx < len(docs_hinglish) and lang == "cs": return docs_hinglish[idx]
    if idx < len(docs_english): return docs_english[idx]

    return "Sorry, I could not generate an answer."
