# **FULL MULTILINGUAL RAG SYSTEM**

### **Setup Environment**

In [5]:
!pip install transformers sentence-transformers faiss-cpu sacrebleu faster-whisper accelerate bitsandbytes



### **STEP 1 — INPUT CAPTURE (Audio OR Text)**

In [6]:
from google.colab import files


In [7]:
# Step 1: User Input (Audio or Text)

def load_audio():
    """Prompt for an audio upload and return the name of the first uploaded file.

    Returns:
        The first key of the mapping returned by ``files.upload()``.
    """
    print("Upload Arabic audio file:")
    uploaded_files = files.upload()
    # Iterating the mapping yields its keys (the uploaded file names).
    uploaded_names = list(uploaded_files)
    return uploaded_names[0]

def load_text():
    """Read one line of Arabic or English text typed by the user and return it."""
    prompt_message = "Enter Arabic or English text: "
    typed_text = input(prompt_message)
    return typed_text


### **STEP 2 — ARABIC SPEECH → TEXT (ASR)**

In [8]:
# Step 2 imports
from faster_whisper import WhisperModel


In [9]:
# Load the Whisper ASR model once at module level; transcribe_arabic_audio() reuses it.
# The "medium" checkpoint works well on a Colab GPU.
# NOTE(review): device="cuda" with float16 presumably requires a GPU runtime — confirm
# the runtime type before running this cell.
asr_model = WhisperModel("medium", device="cuda", compute_type="float16")

def transcribe_arabic_audio(audio_path):
    """Transcribe an Arabic audio file with the module-level ``asr_model``.

    Args:
        audio_path: Path of the audio file handed to ``asr_model.transcribe``.

    Returns:
        The text of every returned segment, joined with single spaces.
    """
    # The second element of the result (transcription info) is not needed here.
    segment_stream, _info = asr_model.transcribe(audio_path, language="ar", beam_size=5)
    pieces = []
    for segment in segment_stream:
        pieces.append(segment.text)
    return " ".join(pieces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### **STEP 3 — TRANSLATE ARABIC → ENGLISH**

In [10]:
# Step 3 imports
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch


In [11]:
# Load translation model
# The tokenizer and the M2M100 (418M) checkpoint are loaded once at module level and
# shared by translate_ar_to_en(). The model is moved to "cuda" here, so the inputs
# built in translate_ar_to_en() are moved to the same device.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
translation_model = M2M100ForConditionalGeneration.from_pretrained(
    "facebook/m2m100_418M"
).to("cuda")

def translate_ar_to_en(text):
    """Translate Arabic ``text`` to English with the module-level M2M100 model.

    Args:
        text: Arabic text to translate.

    Returns:
        The first decoded sequence produced by ``translation_model.generate``.
    """
    # The source language is configured on the shared tokenizer before encoding.
    tokenizer.src_lang = "ar"
    model_inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Forcing the first generated token to the English language id selects the target language.
    english_bos_id = tokenizer.get_lang_id("en")
    output_ids = translation_model.generate(**model_inputs, forced_bos_token_id=english_bos_id)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]


### **STEP 4 — BUILD KNOWLEDGE BASE + RETRIEVER (FAISS)**

In [12]:
# Step 4 imports
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import glob


In [6]:
# Load and embed knowledge base
def load_documents(path="/content/drive/MyDrive/medical_kb_chunks/*.txt"):
    """Read every file matching a glob pattern and return their contents.

    Args:
        path: Glob pattern selecting the knowledge-base chunk files.

    Returns:
        A list with the full UTF-8 text of each matching file, ordered by
        sorted file path. Two progress lines are printed along the way.
    """
    chunk_paths = glob.glob(path)
    chunk_paths.sort()
    print("FILES FOUND:", len(chunk_paths))

    def _read_chunk(chunk_path):
        # Read one chunk file completely; the handle is closed on exit.
        with open(chunk_path, "r", encoding="utf-8") as handle:
            return handle.read()

    docs = [_read_chunk(chunk_path) for chunk_path in chunk_paths]
    print("TOTAL DOCS LOADED:", len(docs))
    return docs


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!ls /content/drive/MyDrive/medical_kb_v2 | head


In [12]:
!ls /content/drive/MyDrive


 10222-Divyansh-10B-English.pdf
 10222-Divyansh-10B-Hindi.pdf
 10222-Divyansh-10B-Science.pdf
 10222-Divyansh-10B-Social.pdf
 17277083709786614342570311523601.jpg
'23B2204_Electrical_SLOT 2 (1).pdf'
'23B2204_Electrical_SLOT 2 (2).pdf'
'23B2204_Electrical_SLOT 2.pdf'
'23B2204_Mechanical_SLOT 1.pdf'
'2D Image.jpg'
 49_CodeTitans_The_Clash_of_the_Algorithms_SOC25_Final.gsheet
 50_Companies.gsheet
'5th_sem_Software_2pg (1).pdf'
'5th_sem_Software_2pg (2).pdf'
 5th_sem_Software_2pg.pdf
'Airframe & Recovery Assignment.pdf'
'AL ML project demos'
'Arduino Attendence 24.gsheet'
 ASM_2096.pdf
 Assignment_24B3034
'Colab Notebooks'
 Contracts.gdoc
 Contracts_RnD_Phase.gdoc
'Copy of Interview Interpretation .gsheet'
'Course mapping.gsheet'
'CP SOC '
 DB2.gsheet
 DB_GoogleSheet.gsheet
 DB.gsheet
 Divyansh_23B2204
'Divyansh Agrawal_23B2204.jpg'
'Divyansh Agrawal Resume (1).pdf'
'Divyansh_Agrawal_Resume (1).pdf'
'Divyansh_Agrawal_Resume (2).pdf'
'Divyansh_Agrawal_Resume (3).pdf'
'Divyansh_Agrawal_Resum

In [8]:
!ls /content/drive/MyDrive/medical_kb_v2 | head


In [2]:
!ls /content/drive


MyDrive


In [3]:
!ls -l /content/drive/MyDrive/medical_kb_chunks


total 0


In [4]:
kb_docs = load_documents("/content/drive/MyDrive/medical_kb_chunks/*.txt")


NameError: name 'load_documents' is not defined

In [13]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fail fast with a clear message: with no documents the code below would break on
# doc_embeddings.shape[1] (or build an unusable index) with a far less helpful error.
if not kb_docs:
    raise ValueError(
        "kb_docs is empty - check that Drive is mounted and the knowledge-base "
        "folder contains .txt chunks before building the index."
    )

# Build FAISS index
# Embed every chunk, then L2-normalise the matrix in place so that the inner-product
# index below ranks by cosine similarity.
doc_embeddings = embed_model.encode(kb_docs, convert_to_numpy=True, show_progress_bar=True)
faiss.normalize_L2(doc_embeddings)

index = faiss.IndexFlatIP(doc_embeddings.shape[1])
index.add(doc_embeddings)

def retrieve_top_k(query, k=5):
    """Return the knowledge-base chunks most similar to ``query``.

    Args:
        query: Query text; embedded with the module-level ``embed_model``.
        k: Maximum number of chunks to return. Fewer are returned when the
            index holds fewer than ``k`` vectors.

    Returns:
        A list of ``(document_text, score)`` tuples in the order FAISS
        returns them. Empty when ``k`` is not positive or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads the missing
    # slots with id -1, and kb_docs[-1] would silently return the *last* document.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], convert_to_numpy=True)
    # Normalise the query in place so inner product matches cosine similarity.
    faiss.normalize_L2(q_emb)
    scores, doc_ids = index.search(q_emb, k)

    # Defensive filter: drop any padded (-1) result slot that still slipped through.
    return [
        (kb_docs[doc_id], float(score))
        for doc_id, score in zip(doc_ids[0], scores[0])
        if doc_id >= 0
    ]

def limit_contexts(contexts, max_chunks=2, max_chars=1200):
    """Keep only the leading chunks and cap the length of each one.

    Args:
        contexts: Sequence of ``(text, score)`` pairs.
        max_chunks: Number of leading pairs to keep.
        max_chars: Maximum number of characters kept from each text.

    Returns:
        A list of truncated texts; the scores are discarded.
    """
    return [chunk_text[:max_chars] for chunk_text, _score in contexts[:max_chunks]]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

### **STEP 5 — GENERATION (LOCAL LLM — NO API)**

In [14]:
# Step 5 imports
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint used for answer generation; both the tokenizer and the model load from it.
model_name = "Qwen/Qwen1.5-1.8B-Chat"

# Loaded once at module level and shared by generate_rag_answer().
llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # NOTE(review): the captured output of this cell logs
    # "`torch_dtype` is deprecated! Use `dtype` instead!" — consider switching to
    # `dtype=` once the transformers version is pinned.
    torch_dtype=torch.float16
)

def generate_rag_answer(question, contexts):
    """Generate an answer to ``question`` grounded in the retrieved ``contexts``.

    Args:
        question: The (English) question to answer.
        contexts: Sequence of ``(text, score)`` pairs as returned by
            ``retrieve_top_k``; trimmed with ``limit_contexts`` before use.

    Returns:
        Only the text newly generated by the model, with surrounding whitespace
        removed. The prompt (instructions, context and question) is NOT included.
    """
    contexts = limit_contexts(contexts)

    context_text = "\n\n".join([f"[DOC] {c}" for c in contexts])

    prompt = f"""
You are an AI assistant. Answer ONLY using the context provided.

CONTEXT:
{context_text}

QUESTION:
{question}

ANSWER (in English):
"""

    # The model is loaded with device_map="auto", so follow the model's own device
    # instead of assuming "cuda".
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    output = llm_model.generate(
        **inputs,
        max_new_tokens=250,
        temperature=0.2
    )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Decoding the whole sequence would echo the prompt (context
    # included) back as the "answer" and inflate any grounding metric, so keep
    # only the tokens after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[0][prompt_length:]
    return llm_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

### **STEP 6 — VALIDATION (BLEU + Grounding)**

In [16]:
# Step 6 imports
from sacrebleu import corpus_bleu
import re

In [17]:
def bleu_score(hypothesis, reference):
    """Return the BLEU score of one hypothesis against one reference.

    Both strings are wrapped into the list shapes passed to ``corpus_bleu``:
    a list of hypotheses and a list of reference lists.
    """
    hypotheses = [hypothesis]
    reference_streams = [[reference]]
    result = corpus_bleu(hypotheses, reference_streams)
    return result.score

def grounding_score(answer, contexts):
    """Fraction of the answer's words that also occur in the retrieved contexts.

    Matching is done on whole, lower-cased words (``\\w+`` tokens), so "in" is
    not counted as grounded merely because a context contains "insulin", and
    trailing punctuation on an answer word does not prevent a match.

    Args:
        answer: Generated answer text.
        contexts: Sequence of ``(text, score)`` pairs; only the text is used.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when the answer contains no words
        (instead of dividing by zero).
    """
    context_text = " ".join(text for text, _score in contexts).lower()
    context_vocab = set(re.findall(r"\w+", context_text))

    answer_words = re.findall(r"\w+", answer.lower())
    if not answer_words:
        return 0.0

    grounded = sum(1 for word in answer_words if word in context_vocab)
    return grounded / len(answer_words)

### **FULL END-TO-END PIPELINE FUNCTION (CONNECTING ALL 6 STEPS)**

In [18]:
def multilingual_rag_pipeline(audio_path=None, text=None, language="ar", include_contexts=False):
    """Run the full pipeline: input -> (ASR) -> (translation) -> retrieval -> generation -> scoring.

    Args:
        audio_path: Path to an Arabic audio file. When given it takes precedence
            over ``text`` and is transcribed first.
        text: Input text, used when ``audio_path`` is not given.
        language: ``"ar"`` translates the input to English before retrieval;
            any other value uses the input as the English query unchanged.
        include_contexts: When True, the retrieved ``(text, score)`` pairs are
            added to the result under ``"contexts"``. Off by default because the
            chunks are long and swamp the displayed result.

    Returns:
        A dict with ``"arabic_input"`` (the raw input text, whatever its
        language), ``"english_query"``, ``"answer"`` and ``"grounding_score"``,
        plus ``"contexts"`` when requested.

    Raises:
        ValueError: If neither ``audio_path`` nor ``text`` is provided.
    """
    if not audio_path and not text:
        raise ValueError("Provide either audio_path or text.")

    # STEP 1/2 → Capture input (transcribe when audio is supplied)
    if audio_path:
        ar_text = transcribe_arabic_audio(audio_path)
    else:
        ar_text = text

    # STEP 3 → Translate if Arabic
    if language == "ar":
        en_query = translate_ar_to_en(ar_text)
    else:
        en_query = ar_text

    # STEP 4 → Retrieve relevant docs
    contexts = retrieve_top_k(en_query, k=5)

    # STEP 5 → Generate answer using local LLM
    answer = generate_rag_answer(en_query, contexts)

    # STEP 6 → Validation
    g_score = grounding_score(answer, contexts)

    result = {
        "arabic_input": ar_text,
        "english_query": en_query,
        # The answer is the point of the pipeline; a score alone is not useful.
        "answer": answer,
        "grounding_score": g_score
    }
    if include_contexts:
        result["contexts"] = contexts
    return result

### **EXAMPLE USAGE**

### Text Input

In [1]:
# Example 1 — text query based on texts in the pdf (source: pages 11–14).
# The Arabic question is translated to English inside the pipeline (language="ar").
response1 = multilingual_rag_pipeline(text="ما هي العلامات السريرية الأساسية للصدمة كما يذكر الدليل، مثل انخفاض الضغط ونقص التروية؟", language="ar")
# English: What are the main clinical signs of shock mentioned in the guideline, such as low blood pressure and tissue hypoperfusion?
# pages 11–14
# Bare last expression so the notebook displays the result dict.
response1

In [None]:
# Example 2 — text query based on figures (abbreviations table).
# Fix: the comma between the `text` and `language` arguments was missing (SyntaxError).
response2 = multilingual_rag_pipeline(text="يوجد جدول في الدليل يحتوي على اختصارات طبية مثل BP وCRT وSpO₂. ماذا تعني هذه الاختصارات؟", language="ar")
# English: There is a table containing medical acronyms such as BP, CRT, and SpO₂. What do these acronyms mean?
# Pages 7–9 — Abbreviations table with many medical acronyms
response2

In [None]:
# Example 3 — text query based on figures (benzodiazepine dosing table).
# Fixes: the comma between the `text` and `language` arguments was missing (SyntaxError),
# and the result now has its own name instead of overwriting `response2`.
response3 = multilingual_rag_pipeline(text="في جدول جرعات البنزوديازبينات لعلاج نوبة الصرع، ما الجرعة المقترحة لدواء ميدازولام لطفل وزنه 10 كغم؟", language="ar")
# English: According to the benzodiazepine dosing table, what is the recommended midazolam dose for a child weighing 10 kg?
# Page 24 — Benzodiazepines dosing table (table with ages/weights)
response3

### Audio Input

In [20]:
# Audio example 1 — based on written text.
# Fix: the file name must be a string; the bare `text_based1.mp3` is parsed as an
# attribute access on an undefined name and raises NameError.
# The audio file is expected in the current working directory.
response = multilingual_rag_pipeline(audio_path="text_based1.mp3")
# English: What are the clinical signs of severe hypoglycemia that require immediate treatment?
# Page 32–33
response


Upload Arabic audio file:


Saving arabic_sample3.mp3 to arabic_sample3.mp3


ERROR:libav.mp3float:Header missing


{'arabic_input': ' هل يسمح للطالب بتأجيل الفصل الدراسي؟ وما هي الوثائق المطلوبة للموافقة؟',
 'english_query': 'Is the student permitted to delay the semester? and what documents are required for approval?',
 'contexts': [('year) onwards. The break will be allowed/ approved after second year. Such approval can be availed by the student before fee payment and registration for a semester. Once the classes have started, the students will not be eligible to apply for a break for that semester. d) Approval chain of such semester break - Faculty Advisor – DUGC – – Dean (Academic Pro- gramme)/ Convener, UGAPEC. e) The student would not be eligible for hostel/ Medical facilities during approved planned break. f) In case of approved break, the period of break will not be counted for the prescribed dura- tion of the programme. Hence, s/he will be eligible for the award of minor and Honours, if credits requirements are completed in the prescribed duration of the programme excluding approved break.

In [None]:
# Audio example 2 — based on figures (antibiotic choices table).
# Fix: the file name must be a string; the bare `figures_based1.mp3` raises NameError.
# The audio file is expected in the current working directory.
response = multilingual_rag_pipeline(audio_path="figures_based1.mp3")
# English: According to the guideline, what antibiotic is recommended for sepsis with a cutaneous source such as cellulitis?
# Page 16 — Antibiotic choices table
response


In [None]:
# Audio example 3 — based on figures (fluid resuscitation table).
# Fix: the file name must be a string; the bare `figures_based2.mp3` raises NameError.
# The audio file is expected in the current working directory.
response = multilingual_rag_pipeline(audio_path="figures_based2.mp3")
# English: In the hypovolemic shock treatment table, what is the recommended Ringer Lactate volume for a child under one year old?
# Page 18–19 — Fluid resuscitation table
response
