<a href="https://colab.research.google.com/github/EllouziMedAmin/DSWithPytorch/blob/main/quran_vector_db_FAISS_%26_Quran_RAG_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu sentence-transformers numpy pandas

In [2]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [3]:
# Load embedding model (efficient & accurate).
# The same `model` object is used to encode both the verses (when the index is
# built) and the search queries, so both are embedded by the same encoder.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load Quran text from the formatted file
def load_quran(file_path):
    """Read a pipe-delimited Quran text file into a DataFrame.

    Each data line is expected to look like ``chapter|verse|text``.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the
            pipe-delimited file.

    Returns:
        A DataFrame with string columns ``chapter``, ``verse``, ``text`` and a
        derived ``reference`` column formatted as ``"chapter:verse"``. Rows
        are re-indexed from 0 so positional lookups line up with the row order.
    """
    import csv  # local import: only needed for the quoting constant below

    df = pd.read_csv(
        file_path,
        sep="|",
        names=["chapter", "verse", "text"],
        dtype=str,
        # The file is plain text, not quoted CSV: keep `"` characters inside
        # verses literally instead of letting the parser strip or merge them.
        quoting=csv.QUOTE_NONE,
    )
    # Lines without all three fields (e.g. trailing comment/footer lines) have
    # no verse text and would reach the encoder as NaN; drop them.
    df = df.dropna(subset=["verse", "text"]).reset_index(drop=True)
    df["reference"] = df["chapter"] + ":" + df["verse"]
    return df

# Convert Quranic verses to vector embeddings
def embed_texts(texts, model):
    """Encode ``texts`` with ``model`` and return the result as a NumPy array.

    Args:
        texts: The strings to embed.
        model: An object exposing ``encode(texts, convert_to_numpy=True)``.

    Returns:
        A NumPy array built from whatever ``model.encode`` returns.
    """
    vectors = model.encode(texts, convert_to_numpy=True)
    return np.array(vectors)

# Store embeddings in FAISS
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``. It is
            converted to a C-contiguous ``float32`` array before being added.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` containing every row of
        ``embeddings`` in the same order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D.
    """
    # FAISS works on contiguous float32 matrices; coerce here so float64 or
    # non-contiguous input is handled explicitly rather than left to the binding.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim), got shape {vectors.shape}"
        )

    d = vectors.shape[1]  # Dimension of embeddings
    index = faiss.IndexFlatL2(d)
    index.add(vectors)
    return index

# Query FAISS for semantic search
def search_faiss(query, model, index, df, k=5):
    """Return the verses whose embeddings are closest to ``query``.

    Args:
        query: Free-text search string.
        model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)`` arrays with one row per query.
        df: DataFrame with ``reference`` and ``text`` columns whose row order
            matches the order in which vectors were added to ``index``.
        k: Maximum number of neighbours to request.

    Returns:
        A list of ``(reference, text, distance)`` tuples in the order returned
        by ``index.search``. It can be shorter than ``k`` when the index has
        fewer than ``k`` vectors.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, k)

    results = []
    for distance, row_pos in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with index -1; `df.iloc[-1]` would
        # silently return the last verse as a bogus hit, so skip those slots.
        if row_pos < 0:
            continue
        row = df.iloc[row_pos]  # single positional lookup per hit
        results.append((row["reference"], row["text"], distance))
    return results

# Load Quran, Embed, and Create FAISS Index
# `quran.txt` is resolved relative to the working directory and is parsed by
# load_quran as pipe-separated chapter|verse|text lines.
quran_file = "quran.txt"
quran_df = load_quran(quran_file)
# One embedding per row of quran_df, in row order; search_faiss relies on this
# alignment when it maps index positions back to rows with df.iloc.
embeddings = embed_texts(quran_df["text"].tolist(), model)
index = create_faiss_index(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Example Query
query = "mercy of Allah"
# Uses the default k=5, so up to five (reference, text, distance) tuples come back.
results = search_faiss(query, model, index, quran_df)

# Display Results
# "Score" is the distance reported by the IndexFlatL2 search, so a smaller
# value means a closer match.
for ref, text, score in results:
    print(f"Reference: {ref} | Verse: {text} \nScore: {score}\n")

Reference: 044:042 | Verse: Except those on whom Allah has mercy. He is the Mighty, the Merciful. 
Score: 0.5019750595092773

Reference: 024:020 | Verse: Allah extends His grace and mercy upon you. Allah is Kind and Merciful. 
Score: 0.548178493976593

Reference: 003:132 | Verse: And obey Allah and the Messenger that you may obtain mercy. 
Score: 0.5951741933822632

Reference: 001:001 | Verse: In the name of Allah, the Gracious, the Merciful. 
Score: 0.6009649038314819

Reference: 003:074 | Verse: He selects for His mercy whom He wills. And indeed, Allah is of great bounty. 
Score: 0.6239426732063293



In [5]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook (it would be saved and
# shared with the file). Read it from the HF_TOKEN environment variable, or
# prompt for it without echoing when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import torch

# Load sentence embedding model (efficient & powerful)
# NOTE(review): this re-creates the same "all-MiniLM-L6-v2" encoder that was
# already assigned to `model` earlier in the notebook; the existing FAISS index
# was built with that model name, so queries stay compatible.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the local instruction-tuned LLM (Mistral-7B-Instruct v0.1) with its tokenizer.
llm_model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# Weights are loaded in float16; device placement is delegated to
# device_map="auto" (the recorded run reports cuda:0).
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_name, torch_dtype=torch.float16, device_map="auto")
# Text-generation pipeline called by generate_answer.
llm_pipeline = pipeline("text-generation", model=llm_model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [8]:
def generate_answer(question, retrieved_verses):
    """Answer ``question`` with the LLM, using the retrieved verses as context.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        retrieved_verses: Iterable of ``(reference, text)`` pairs or of
            ``(reference, text, distance)`` triples as returned by
            ``search_faiss``; any items after the first two are ignored.

    Returns:
        The text generated after the last ``[/INST]`` marker, stripped of
        surrounding whitespace.
    """
    # search_faiss returns 3-tuples, so a strict `ref, text` unpack raises
    # ValueError; `*_` accepts both pairs and triples.
    context = "\n".join(f"{ref}: {text}" for ref, text, *_ in retrieved_verses)
    prompt = f"<s>[INST] Using the following Quranic verses, answer the question:\n\n{context}\n\nQuestion: {question} [/INST]"

    # Sampling is enabled, so the answer can differ between runs.
    response = llm_pipeline(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
    # The pipeline output includes the prompt; keep only what follows it.
    return response[0]["generated_text"].split("[/INST]")[-1].strip()


In [12]:
def generate_answer(question, retrieved_verses):
    """Generate an LLM answer to ``question`` from retrieved verses.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        retrieved_verses: Iterable of ``(reference, text, score)`` triples;
            the score is discarded before the prompt is built.

    Returns:
        The text generated after the last ``[/INST]`` marker, stripped of
        surrounding whitespace.
    """
    # Build one "reference: text" line per verse, ignoring the similarity score.
    context_lines = []
    for ref, text, _score in retrieved_verses:
        context_lines.append(f"{ref}: {text}")
    context = "\n".join(context_lines)

    prompt = f"<s>[INST] Using the following Quranic verses, answer the question:\n\n{context}\n\nQuestion: {question} [/INST]"

    outputs = llm_pipeline(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
    generated = outputs[0]["generated_text"]
    # Keep only what follows the final instruction marker.
    return generated.rsplit("[/INST]", 1)[-1].strip()


In [None]:
# Example RAG Query
user_question = "What does the Quran say about mercy?"
# Retrieve the closest verses (default k=5) as (reference, text, distance) tuples.
retrieved_verses = search_faiss(user_question, model, index, quran_df)
# generate_answer samples from the LLM (do_sample=True), so the wording of
# `answer` can change from run to run.
answer = generate_answer(user_question, retrieved_verses)


In [15]:
# Inspect the (reference, text, distance) tuples that were passed to the LLM as context.
retrieved_verses

[('007:204',
  'When the Quran is read, then listen to it attentively, so you may receive mercy.',
  np.float32(0.50745606)),
 ('044:042',
  'Except those on whom Allah has mercy. He is the Mighty, the Merciful.',
  np.float32(0.6246931)),
 ('035:002',
  'None can withhold the mercy Allah opens for people, and none can release it after He has withheld it. He is the All-Wise, the All-Powerful.',
  np.float32(0.6936531)),
 ('036:045',
  'When they are told, “Guard yourselves from what’s before you and what’s behind you, so you may receive mercy.”',
  np.float32(0.7499194)),
 ('057:029',
  'The People of the Scripture should know that they have no monopoly whatsoever over Allah’s mercy and grace, and that all grace and mercy are in Allah’s hands. He grants them to whom He wills. Allah is the possessor of vast bounty.',
  np.float32(0.77249426))]

In [14]:
# Show the generated answer so it can be checked against the retrieved verses.
answer

"The Quran mentions mercy several times and describes it as a blessing that comes from Allah. It says that when the Quran is read, one should listen to it attentively in order to receive mercy (7:204). It also states that Allah has mercy on people, and that this mercy cannot be withheld or released by anyone else (35:2 and 36:45). The People of the Scripture are reminded that they have no monopoly on Allah's mercy and grace, and that all grace and mercy are in Allah's hands (57:29). Overall, the Quran portrays mercy as a gift from Allah that is available to all people."

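**Assessment of the generated answer**\
✅ Good structure & clarity\
✅ Mostly accurate references (7:204, 35:2 and 57:29 match the retrieved verses)\
⚠ Slight overgeneralization: the closing claim that mercy is "available to all people" is not stated in any of the retrieved verses\
⚠ One minor misinterpretation: 36:45 is cited for mercy that no one else can withhold or release, but that verse is about guarding oneself in order to receive mercy; only 35:2 supports the claim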