In [31]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
import re
from nltk.tokenize import word_tokenize
import os
import nltk
# Fetch the NLTK 'punkt' data package (presumably what word_tokenize needs in
# preprocess() — confirm the package name for the installed NLTK version).
nltk.download('punkt')
# Load the 'all-mpnet-base-v2' sentence-transformers model; this global is the
# encoder that encode_text_to_embedding() calls.
model = SentenceTransformer('all-mpnet-base-v2')
def preprocess(text):
    """Normalize text: lowercase it, drop punctuation, and re-join its tokens with single spaces."""
    lowered = text.lower()
    # Anything that is neither a word character nor whitespace is removed.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return ' '.join(word_tokenize(without_punctuation))
def encode_text_to_embedding(text):
    """Encode ``text`` with the global ``model`` and return the embedding as a tensor."""
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding
# Build the verse table: one row per non-empty line of every 'Quran/*.txt' file.
#   surah_name     - file name up to its first '.'
#   verse_id       - 1-based line number inside the file
#   processed_text - output of preprocess() for that line
#   embedding      - tensor produced by the global `model` for processed_text
quran_dir = 'Quran'
data = []
# os.listdir() returns entries in arbitrary order; sorting (lexicographically)
# makes the row order reproducible across runs and machines.
for filename in sorted(os.listdir(quran_dir)):
    if not filename.endswith('.txt'):
        continue
    surah_name = filename.split('.')[0]
    with open(os.path.join(quran_dir, filename), 'r', encoding='utf-8') as file:
        # Enumerate every line so verse_id keeps matching the line number even
        # when a blank line is skipped below.
        for verse_id, verse_text in enumerate(file, start=1):
            processed_text = preprocess(verse_text.strip())
            if not processed_text:
                # Blank lines would only add rows with a meaningless embedding.
                continue
            data.append([surah_name, verse_id, processed_text])
df = pd.DataFrame(data, columns=['surah_name', 'verse_id', 'processed_text'])

# Encode all verses in one batched call instead of one model.encode() call per
# row; values may differ from per-row encoding only by floating-point noise.
embedding_column = np.empty(len(df), dtype=object)
if len(df):
    embeddings = model.encode(df['processed_text'].tolist(), convert_to_tensor=True)
    for row_pos, embedding in enumerate(embeddings):
        # clone() gives each row its own tensor (as per-verse encoding did)
        # rather than a view into the full batch matrix; filling the object
        # array element-wise keeps each tensor stored as a single cell value.
        embedding_column[row_pos] = embedding.clone()
df['embedding'] = embedding_column

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abdullah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
def find_top_related_verses(input_text, top_k=5):
    """Return the rows of ``df`` whose embeddings are most similar to ``input_text``.

    Args:
        input_text: Query string; it is passed through ``preprocess`` and then
            ``encode_text_to_embedding`` before comparison.
        top_k: Maximum number of rows to return (default 5).

    Returns:
        A slice of the global ``df`` with at most ``top_k`` rows, ordered from
        highest to lowest cosine similarity to the query.
    """
    if df.empty:
        # Nothing to rank; torch.stack() below would fail on an empty list.
        return df.iloc[0:0]
    input_embedding = encode_text_to_embedding(preprocess(input_text))
    # Stack the per-row tensors into one matrix so the similarity against every
    # verse is computed in a single call instead of one call per row.
    verse_embeddings = torch.stack(df['embedding'].tolist())
    # Result has one row (the query) and one column per verse.
    similarities = util.pytorch_cos_sim(input_embedding, verse_embeddings)[0]
    similarities = similarities.detach().cpu().numpy()
    # Negate so argsort's ascending order yields the highest similarity first.
    top_indices = np.argsort(-similarities)[:top_k]
    return df.iloc[top_indices]

# Example query: look up the closest verses (default top_k) and print one per line.
input_text = "take us on the straight path"
top_related_verses = find_top_related_verses(input_text)
for row in top_related_verses.to_dict('records'):
    print(f"Surah {row['surah_name']}, Verse {row['verse_id']}: {row['processed_text']}")


Surah 1, Verse 6: 6 guide us to the straight path
Surah 37, Verse 119: 119 and we guided them upon the straight path
Surah 43, Verse 44: 44 so adhere to what is revealed to you you are upon a straight path
Surah 36, Verse 62: 62 and that you shall serve me this is a straight path
Surah 23, Verse 74: 74 you are inviting them to a straight path


In [54]:

# Second example query; the text is lowercased here before the lookup.
input_text = "O disbelievers, I do not worship what you worship".lower()
top_related_verses = find_top_related_verses(input_text)
for row in top_related_verses.to_dict('records'):
    print(f"Surah {row['surah_name']}, Verse {row['verse_id']}: {row['processed_text']}")


Surah 109, Verse 3: 3 i do not worship what you worship
Surah 109, Verse 4: 4 nor do you worship what i worship
Surah 40, Verse 67: 67 say i was prohibited from worshiping those you invoke besides god now that clear revelations have come to me from my lord and i was commanded to submit to the lord of the worlds
Surah 6, Verse 2: 2 praise be to god who created the heavens and the earth and made the darkness and the light yet those who disbelieve ascribe equals to their lord
Surah 46, Verse 5: 5 say have you considered those you worship instead of god show me which portion of the earth they have created or do they own a share of the heavens bring me a scripture prior to this one or some trace of knowledge if you are truthful


In [46]:
# Persist the current model under the local path "model" (the same location the
# embeddings pickle is written into later in this notebook).
model.save("model")

In [48]:
# Reload the model from the local "model" path, rebinding the global `model`
# that encode_text_to_embedding() uses.
model = SentenceTransformer("model")

In [53]:
import joblib

# Create the target directory if needed so this cell works even when the
# model.save("model") cell has not been run in this session.
os.makedirs('model', exist_ok=True)
# NOTE: this pickles the whole DataFrame, including the tensor-valued
# 'embedding' column; pickle files should only be loaded from trusted sources.
joblib.dump(df, 'model/quran_embeddings.pkl')

['model/quran_embeddings.pkl']