## Scraper With All References of Each Hadith on Sunnah.com

In [9]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os

In [10]:


# Site root, and the collection index page that get_chapters() downloads.
BASE_URL = "https://sunnah.com"
BOOK_URL = f"{BASE_URL}/malik"

# Ensure output folder exists; scrape_malik_book() writes one JSON file per
# book into it.
os.makedirs("malik_hadiths", exist_ok=True)


In [11]:

def get_chapters():
    """Fetch the collection index page and list the books linked from it.

    Returns:
        A list of dicts, one per usable book link found on ``BOOK_URL``, with
        the keys ``book_number`` (last path segment of the link, as a string),
        ``english_name``, ``arabic_name`` and ``url`` (absolute URL).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(BOOK_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the index.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    chapters = []
    for link in soup.select("a[href^='/malik/']"):
        href = link.get("href")
        english = link.select_one(".english_book_name")
        arabic = link.select_one(".arabic_book_name")
        # A matching link without both name elements cannot be described as a
        # book entry; skip it rather than crash on ``None.text``.
        if english is None or arabic is None:
            continue
        chapters.append({
            "book_number": href.split("/")[-1],
            "english_name": english.text.strip(),
            "arabic_name": arabic.text.strip(),
            "url": BASE_URL + href,
        })
    return chapters

In [12]:



def get_hadiths_from_chapter(chapter_url):
    """Download one book page and extract every hadith container on it.

    Args:
        chapter_url: Absolute URL of the book page to scrape.

    Returns:
        A list of dicts, one per ``div.actualHadithContainer``, with the keys
        ``english`` and ``arabic`` (strings, empty when the element is
        missing) and ``references`` (dict mapping the first cell to the second
        cell of every two-cell row in the hadith's reference table).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(chapter_url, timeout=30)
    # An error page contains no hadith containers; raising here avoids
    # silently returning an empty list for a book that actually has content.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    hadiths = []
    for div in soup.select("div.actualHadithContainer"):
        eng_node = div.select_one(".english_hadith_full p")
        eng_text = eng_node.get_text(" ", strip=True) if eng_node else ""

        arabic_container = div.select_one(".arabic_hadith_full")
        arabic_text = arabic_container.get_text(" ", strip=True) if arabic_container else ""

        references = {}
        for row in div.select("table.hadith_reference tr"):
            cells = row.select("td")
            # Only label/value rows are kept; any other row shape is ignored.
            if len(cells) == 2:
                references[cells[0].text.strip()] = cells[1].text.strip()

        hadiths.append({
            "english": eng_text,
            "arabic": arabic_text,
            "references": references,
        })

    return hadiths

In [13]:



def scrape_malik_book():
    """Scrape every book of the collection and save each one as a JSON file.

    Lists the books with ``get_chapters()``, downloads each book's hadiths
    with ``get_hadiths_from_chapter()`` and writes the result to
    ``malik_hadiths/book_<book_number>.json``. A book whose download fails is
    reported and skipped so the remaining books are still scraped.

    Raises:
        requests.RequestException: If the initial chapter listing cannot be
            fetched (nothing can be scraped without it).
    """
    print("📥 Fetching chapters…")
    chapters = get_chapters()
    print(f"✅ Chapters found: {len(chapters)}")

    failed = []
    for chapter in chapters:
        print(f"🔍 Scraping: {chapter['english_name']} (Book {chapter['book_number']})")
        try:
            hadiths = get_hadiths_from_chapter(chapter['url'])
        except requests.RequestException as err:
            # One unreachable page must not discard the rest of the run.
            print(f"❌ Failed to scrape Book {chapter['book_number']}: {err}")
            failed.append(chapter["book_number"])
        else:
            chapter_data = {
                "book_number": chapter["book_number"],
                "english_name": chapter["english_name"],
                "arabic_name": chapter["arabic_name"],
                "hadiths": hadiths,
            }

            file_path = f"malik_hadiths/book_{chapter['book_number']}.json"
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(chapter_data, f, ensure_ascii=False, indent=2)

        time.sleep(1)  # Be polite to the server

    if failed:
        # Do not claim success when some books are missing from the output.
        print(f"⚠️ {len(failed)} chapter(s) could not be scraped: {', '.join(map(str, failed))}")
    else:
        print("🎉 Scraping all chapters completed and saved to separate files!")


# Start the scrape when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_malik_book()


📥 Fetching chapters…
✅ Chapters found: 61
🔍 Scraping: The Times of Prayer (Book 1)
🔍 Scraping: Purity (Book 2)
🔍 Scraping: Prayer (Book 3)
🔍 Scraping: Forgetfulness in Prayer (Book 4)
🔍 Scraping: Jumu'a (Book 5)
🔍 Scraping: Prayer in Ramadan (Book 6)
🔍 Scraping: Tahajjud (Book 7)
🔍 Scraping: Prayer in Congregation (Book 8)
🔍 Scraping: Shortening the Prayer (Book 9)
🔍 Scraping: The Two 'Ids (Book 10)
🔍 Scraping: The Fear Prayer (Book 11)
🔍 Scraping: The Eclipse Prayer (Book 12)
🔍 Scraping: Asking for Rain (Book 13)
🔍 Scraping: The Qibla (Book 14)
🔍 Scraping: The Qur'an (Book 15)
🔍 Scraping: Burials (Book 16)
🔍 Scraping: Zakat (Book 17)
🔍 Scraping: Fasting (Book 18)
🔍 Scraping: I'tikaf in Ramadan (Book 19)
🔍 Scraping: Hajj (Book 20)
🔍 Scraping: Jihad (Book 21)
🔍 Scraping: Vows and Oaths (Book 22)
🔍 Scraping: Sacrificial Animals (Book 23)
🔍 Scraping: Slaughtering Animals (Book 24)
🔍 Scraping: Game (Book 25)
🔍 Scraping: The 'Aqiqa (Book 26)
🔍 Scraping: Fara'id (Book 27)
🔍 Scraping: Marriag

### Scraper Code With Arabic Text Correction


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os
import re

# Site root, and the collection index page that get_chapters() downloads.
BASE_URL = "https://sunnah.com"
BOOK_URL = f"{BASE_URL}/malik"

# Create the output folder up front; scrape_malik_book() writes one JSON file
# per book into it.
os.makedirs("../Results/Muwatta Malik", exist_ok=True)

def clean_arabic_text(text):
    """Return *text* with invisible bidirectional formatting characters removed.

    The characters dropped are U+200E, U+200F, U+202A through U+202E and
    U+2066 through U+2069; everything else is left untouched.
    """
    bidi_marks = re.compile(r'[\u200f\u200e\u202a-\u202e\u2066-\u2069]')
    return bidi_marks.sub('', text)

def get_chapters():
    """Fetch the collection index page and list the books linked from it.

    Returns:
        A list of dicts, one per usable book link found on ``BOOK_URL``, with
        the keys ``book_number`` (last path segment of the link, as a string),
        ``english_name``, ``arabic_name`` (passed through
        ``clean_arabic_text``) and ``url`` (absolute URL).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(BOOK_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the index.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    chapters = []
    for link in soup.select("a[href^='/malik/']"):
        href = link.get("href")
        english = link.select_one(".english_book_name")
        arabic = link.select_one(".arabic_book_name")
        # A matching link without both name elements cannot be described as a
        # book entry; skip it rather than crash on ``None.text``.
        if english is None or arabic is None:
            continue
        chapters.append({
            "book_number": href.split("/")[-1],
            "english_name": english.text.strip(),
            "arabic_name": clean_arabic_text(arabic.text.strip()),
            "url": BASE_URL + href,
        })
    return chapters

def get_hadiths_from_chapter(chapter_url):
    """Download one book page and extract every hadith container on it.

    Args:
        chapter_url: Absolute URL of the book page to scrape.

    Returns:
        A list of dicts, one per ``div.actualHadithContainer``, with the keys
        ``english`` and ``arabic`` (strings, empty when the element is
        missing; the Arabic text is passed through ``clean_arabic_text``) and
        ``references`` (dict mapping the first cell to the second cell of
        every two-cell row in the hadith's reference table).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    res = requests.get(chapter_url, timeout=30)
    # An error page contains no hadith containers; raising here avoids
    # silently returning an empty list for a book that actually has content.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    hadiths = []
    for div in soup.select("div.actualHadithContainer"):
        eng_node = div.select_one(".english_hadith_full p")
        eng_text = eng_node.get_text(" ", strip=True) if eng_node else ""

        arabic_container = div.select_one(".arabic_hadith_full")
        arabic_text = arabic_container.get_text(" ", strip=True) if arabic_container else ""
        arabic_text = clean_arabic_text(arabic_text)

        references = {}
        for row in div.select("table.hadith_reference tr"):
            cells = row.select("td")
            # Only label/value rows are kept; any other row shape is ignored.
            if len(cells) == 2:
                references[cells[0].text.strip()] = cells[1].text.strip()

        hadiths.append({
            "english": eng_text,
            "arabic": arabic_text,
            "references": references,
        })

    return hadiths

def scrape_malik_book():
    """Scrape every book of the collection and save each one as a JSON file.

    Lists the books with ``get_chapters()``, downloads each book's hadiths
    with ``get_hadiths_from_chapter()`` and writes the result to
    ``../Results/Muwatta Malik/book_<book_number>.json``. A book whose
    download fails is reported and skipped so the remaining books are still
    scraped.

    Raises:
        requests.RequestException: If the initial chapter listing cannot be
            fetched (nothing can be scraped without it).
    """
    print("📥 Fetching chapters…")
    chapters = get_chapters()
    print(f"✅ Chapters found: {len(chapters)}")

    failed = []
    for chapter in chapters:
        print(f"🔍 Scraping: {chapter['english_name']} (Book {chapter['book_number']})")
        try:
            hadiths = get_hadiths_from_chapter(chapter['url'])
        except requests.RequestException as err:
            # One unreachable page must not discard the rest of the run.
            print(f"❌ Failed to scrape Book {chapter['book_number']}: {err}")
            failed.append(chapter["book_number"])
        else:
            chapter_data = {
                "book_number": chapter["book_number"],
                "english_name": chapter["english_name"],
                "arabic_name": chapter["arabic_name"],
                "hadiths": hadiths,
            }

            file_path = f"../Results/Muwatta Malik/book_{chapter['book_number']}.json"
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(chapter_data, f, ensure_ascii=False, indent=2)

        time.sleep(1)  # Pause between requests to avoid hammering the server

    if failed:
        # Do not claim success when some books are missing from the output.
        print(f"⚠️ {len(failed)} chapter(s) could not be scraped: {', '.join(map(str, failed))}")
    else:
        print("🎉 Scraping all chapters completed and saved to separate files!")

# Start the scrape when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_malik_book()


### Fix JSON Issues in Muwatta Malik Scraped Results


In [1]:
import json
import os
import re
from pathlib import Path

def clean_text(text):
    """Normalize the whitespace of *text* and return the result.

    Falsy input (``None``, ``""``) is returned unchanged. Otherwise:

    * literal backslash-n sequences and real newlines become spaces;
    * U+00A0, U+2002, U+2003, U+2009 and U+200B become spaces;
    * every run of whitespace collapses to a single space;
    * leading and trailing whitespace is removed.
    """
    if not text:
        return text

    # Each of these is swapped for a plain space before collapsing runs.
    space_like = (
        '\\n',     # Literal backslash + "n" left over from escaped text
        '\n',      # Real newline
        '\u00A0',  # Non-breaking space
        '\u2002',  # En space
        '\u2003',  # Em space
        '\u2009',  # Thin space
        '\u200B',  # Zero-width space
    )
    for token in space_like:
        text = text.replace(token, ' ')

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def clean_reference_text(text):
    """Clean a reference value and drop any leading colon/space clutter.

    Falsy input is returned unchanged. Otherwise the text goes through
    ``clean_text`` and then loses every leading colon and whitespace
    character, so a value such as ": Book 1, Hadith 2" becomes
    "Book 1, Hadith 2".
    """
    if not text:
        return text

    # General whitespace cleanup first, then trim leading colons and spaces.
    normalized = clean_text(text).lstrip(': ')

    # Second pass for any other leading colon / space-like characters.
    return re.sub(r'^[:\s\u00A0\u2002\u2003\u2009\u200B]+', '', normalized)

def fix_json_file(file_path):
    """Clean the hadith texts and references stored in one JSON file.

    The file is loaded, every hadith's ``english`` and ``arabic`` values go
    through ``clean_text`` and every non-empty reference value through
    ``clean_reference_text``. The file is rewritten in place only when at
    least one value actually changed.

    Returns:
        True when the file was rewritten; False when nothing changed or when
        any error occurred (the error is printed, not raised).
    """
    print(f"Processing: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as src:
            data = json.load(src)

        changes_made = False
        hadith_list = data['hadiths'] if 'hadiths' in data else []

        for hadith in hadith_list:
            # The English and Arabic bodies get the same generic cleanup.
            for field in ('english', 'arabic'):
                if field not in hadith or not hadith[field]:
                    continue
                before = hadith[field]
                hadith[field] = clean_text(before)
                if hadith[field] != before:
                    changes_made = True

            # Reference values get the stricter reference cleanup.
            refs = hadith['references'] if 'references' in hadith else None
            if isinstance(refs, dict):
                for ref_key, ref_value in refs.items():
                    if not ref_value:
                        continue
                    refs[ref_key] = clean_reference_text(ref_value)
                    if refs[ref_key] != ref_value:
                        changes_made = True

        # Leave untouched files alone so they are not rewritten needlessly.
        if not changes_made:
            print(f"ℹ️  No formatting issues found in: {file_path}")
            return False

        with open(file_path, 'w', encoding='utf-8') as dst:
            json.dump(data, dst, ensure_ascii=False, indent=2)
        print(f"✅ Fixed formatting issues in: {file_path}")
        return True

    except Exception as err:
        print(f"❌ Error processing {file_path}: {str(err)}")
        return False

def main(folder=None):
    """Run ``fix_json_file`` on every ``*.json`` file in a results folder.

    Args:
        folder: Directory holding the scraped JSON files, as a ``str`` or
            ``Path``. When omitted, the original hard-coded Muwatta Malik
            location is used, so existing ``main()`` calls behave as before.

    Returns:
        None. Progress and a final summary are printed. The function returns
        early, after printing a message, when the folder does not exist or
        contains no JSON files.
    """
    # The default is an absolute path on the author's machine; passing
    # ``folder`` lets the same code run anywhere else.
    if folder is None:
        muwatta_folder = Path(r"D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik")
    else:
        muwatta_folder = Path(folder)

    if not muwatta_folder.exists():
        print(f"❌ Folder not found: {muwatta_folder}")
        return

    # Find all JSON files in the folder
    json_files = list(muwatta_folder.glob("*.json"))

    if not json_files:
        print(f"❌ No JSON files found in: {muwatta_folder}")
        return

    print(f"Found {len(json_files)} JSON files to process...")
    print("=" * 60)

    # fix_json_file returns True only for files it actually rewrote.
    fixed_count = 0
    total_count = len(json_files)

    for json_file in sorted(json_files):
        if fix_json_file(json_file):
            fixed_count += 1
        print("-" * 60)

    print("=" * 60)
    print("✅ Processing complete!")
    print(f"📊 Files processed: {total_count}")
    print(f"🔧 Files fixed: {fixed_count}")
    print(f"ℹ️  Files unchanged: {total_count - fixed_count}")

# Run the clean-up when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Found 61 JSON files to process...
Processing: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_1.json
ℹ️  No formatting issues found in: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_1.json
------------------------------------------------------------
Processing: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_10.json
ℹ️  No formatting issues found in: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_10.json
------------------------------------------------------------
Processing: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_11.json
ℹ️  No formatting issues found in: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_11.json
------------------------------------------------------------
Processing: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_12.json
ℹ️  No formatting issues found in: D:\nexusberry\IImiAI\hadith-data\Results\Muwatta Malik\book_12.json
-----------------------------------------------------