In [None]:
!pip install -q faiss-cpu sentence-transformers


In [None]:
# Fetch the LK-Hadith-Corpus repository; with no target argument, git places
# it in a `LK-Hadith-Corpus` directory under the current working directory.
!git clone https://github.com/ShathaTm/LK-Hadith-Corpus.git

In [None]:
import pandas as pd
import glob

In [None]:
# Column names assigned, in order, to the columns of each corpus CSV when it
# is read with `names=colnames`.
colnames = [
    # chapter
    'Chapter_Number', 'Chapter_English', 'Chapter_Arabic',
    # section
    'Section_Number', 'Section_English', 'Section_Arabic',
    # hadith text (English)
    'Hadith_number', 'English_Hadith', 'English_Isnad', 'English_Matn',
    # hadith text (Arabic)
    'Arabic_Hadith', 'Arabic_Isnad', 'Arabic_Matn', 'Arabic_Comment',
    # grading
    'English_Grade', 'Arabic_Grade',
]

In [None]:
# Location of the cloned corpus, relative to the working directory the
# `git clone` cell ran in, so the notebook is not tied to a `/content` root.
path = 'LK-Hadith-Corpus'
# Recursively collect every CSV under the corpus; sorted() keeps the load
# order (and therefore row order downstream) deterministic across runs.
files = sorted(glob.glob(path + '/**/*.csv', recursive=True))

In [None]:
import re

def clean_text(text):
    """Normalize text to lowercase ASCII letters, digits and single spaces.

    The text is lowercased, every character that is not ``a-z``, ``0-9`` or
    whitespace is deleted, each run of whitespace is collapsed to one space,
    and leading/trailing whitespace is trimmed.

    Args:
        text: Value to normalize. ``None`` and float NaN are treated as
            missing; any other non-string value is converted with ``str()``.

    Returns:
        The normalized string, or ``''`` when the input is missing.
    """
    # `text != text` is true only for NaN, which is how missing cells show up
    # when a pandas column is passed through without prior conversion.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # After lower() no uppercase ASCII letters remain, so a-z is sufficient.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Removing punctuation can leave stray spaces at either end; strip them
    # after collapsing internal whitespace runs.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Columns carried into the combined dataset, in output order
# ('Clean_Hadith' is derived inside the loop below).
kept_columns = ['Chapter_Number', 'Chapter_English', 'Section_Number',
                'Section_English', 'Hadith_number', 'English_Hadith',
                'Arabic_Hadith', 'Clean_Hadith', 'English_Grade']

# Accumulates one list per hadith row across all corpus files.
all_hadith = []
for file in files:
    # `names=colnames` fixes the column set, so 'English_Hadith' is always
    # present and needs no membership check.
    df = (
        pd.read_csv(file, names=colnames, skiprows=1)
        # Rows with no English text would otherwise be turned into the literal
        # string 'nan' by astype(str) and be embedded as if it were content.
        .dropna(subset=['English_Hadith'])
        # assign() returns a new frame, avoiding chained-assignment warnings.
        .assign(Clean_Hadith=lambda frame: frame['English_Hadith'].astype(str).apply(clean_text))
    )
    all_hadith.extend(df[kept_columns].values.tolist())

In [None]:
# Assemble the collected rows into one DataFrame and write it to disk.
hadith_columns = [
    'Chapter_Number',
    'Chapter_English',
    'Section_Number',
    'Section_English',
    'Hadith_number',
    'English_Hadith',
    'Arabic_Hadith',
    'Clean_Hadith',
    'English_Grade',
]
hadith_df = pd.DataFrame(all_hadith, columns=hadith_columns)
# index=False: the positional row index is not written as a CSV column.
hadith_df.to_csv('cleaned_hadith_data.csv', index=False)


In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Instantiate the SentenceTransformer that the notebook uses to embed both
# the corpus texts and the search queries.
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

In [None]:

# Encode the normalized text of every row in hadith_df into a numpy array.
clean_texts = hadith_df['Clean_Hadith'].values
embeddings = model.encode(
    clean_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)


In [None]:

import numpy as np
# Persist the embeddings to disk, then rebind `embeddings` to the array read
# back from that same file.
embeddings_file = 'hadith_embeddings.npy'
np.save(embeddings_file, embeddings)
embeddings = np.load(embeddings_file)


In [None]:
import faiss

# Build a flat L2 FAISS index sized to the embedding width (second axis of
# the embeddings array), add every vector, and save the index to disk.
index_file = "faiss_index.faiss"
dimensions = embeddings.shape[1]

faiss_index = faiss.IndexFlatL2(dimensions)
faiss_index.add(embeddings)
faiss.write_index(faiss_index, index_file)


In [None]:
def get_similar_hadith(query, model, faiss_index, count=3, corpus=None):
    """Return the hadith closest to a free-text query according to the index.

    Args:
        query: Query text; it is passed to the model exactly as given.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``.
        faiss_index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)`` arrays with one row per query vector.
        count: Maximum number of results to return.
        corpus: DataFrame with 'Arabic_Hadith' and 'English_Hadith' columns
            whose row positions correspond to the ids returned by the index.
            Defaults to the module-level ``hadith_df``.

    Returns:
        A list of at most ``count`` dicts with keys 'arabic', 'english' and
        'distance', in the order the index returned them. Empty when
        ``count`` is not positive.
    """
    if count <= 0:
        return []
    if corpus is None:
        # Backward-compatible fallback to the notebook-level DataFrame.
        corpus = hadith_df

    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, count)

    results = []
    # Only one query vector was submitted, so row 0 holds all of its hits.
    for row_id, dist in zip(indices[0], distances[0]):
        # The index pads with id -1 when it has fewer than `count` hits;
        # iloc[-1] would silently return the last corpus row instead.
        if row_id < 0:
            continue
        row = corpus.iloc[int(row_id)]
        results.append({
            'arabic': row['Arabic_Hadith'],
            'english': row['English_Hadith'],
            'distance': float(dist),
        })

    return results


In [None]:
# Demo: fetch the five closest hadith for a sample question and print each
# hit's Arabic text, English text and distance.
sample_query = "How many times we should pray?"
results = get_similar_hadith(sample_query, model, faiss_index, 5)
for hit in results:
    arabic_text, english_text = hit['arabic'], hit['english']
    print("🕌 Arabic:", arabic_text)
    print("📖 English:", english_text)
    print("📏 Distance:", hit['distance'], '\n')
