In [1]:
import json
import os
import re
import unicodedata
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from sacremoses import MosesPunctNormalizer

In [3]:
# Raw per-surah downloads go to DOWNLOAD_DIR; the merged, cleaned
# per-translation files go to PROCESSED_DIR.
DOWNLOAD_DIR = Path("../data/raw/quran")
PROCESSED_DIR = Path("../data/processed/")

# Create both directories up front so later cells can write without checking.
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Translation codes handled by the scraping and processing cells.
LANGUAGE_CODES = [
    'lez-yamen',
    'ru-kuliev',
    'ru-abu-adel',
    'az-musayev',
    'en-saheeh-international',
]

## Scraping

In [4]:
# API documentation: https://quranacademy.gitbook.io/digital-quran/api/getting-started
# The token is read from the QURAN_ACADEMY_ACCESS_TOKEN environment variable so a real
# credential never has to be saved inside the notebook. The placeholder below is only
# the fallback used when that variable is not set.
ACCESS_TOKEN = os.environ.get("QURAN_ACADEMY_ACCESS_TOKEN", "your-access-token")

API_URL = "http://digital-quran.quranacademy.org"
# Passed as `verify=` to every request in this notebook, i.e. TLS certificate
# verification is switched off.
# NOTE(review): this also applies to the https:// endpoint used when scraping —
# confirm whether it is still required and re-enable verification if not.
VERIFY = False

In [5]:
# Numeric translation ids, keyed by translation code; the scraping cell sends
# them as the `translation_id` query parameter.
# To find the id for another code, open the site in a browser and look at the
# requests in the Inspector / Network tabs of the web developer tools.
lang_code2id = dict(
    [
        ('lez-yamen', 69),
        ('ru-kuliev', 3),
        ('ru-abu-adel', 4),
        ('az-musayev', 54),
        ('en-saheeh-international', 75),
    ]
)

In [11]:
# Ask the API for the list of available translations. The response is kept in
# `translations_response` because the scraping cell iterates over its "data" list.
translations_response = requests.get(
    f"{API_URL}/translations",
    # params={"language": 'ru'},
    headers={
        "Access-Token": ACCESS_TOKEN,
        "Language": "lez",
    },
    verify=VERIFY,
    # requests has no default timeout; without one a stalled server hangs the kernel.
    timeout=30,
)
# Surface HTTP errors (bad token, server error) here instead of as a confusing
# JSON/KeyError further down.
translations_response.raise_for_status()
translations_response.json()



{'data': [{'code': 'alouddin-mansur',
   'name': 'Алауддин Мансур',
   'language': 'uz'},
  {'code': 'az-musayev', 'name': 'Алихан Мусаев', 'language': 'az'},
  {'code': 'bashkir-translate',
   'name': 'Башкирский перевод',
   'language': 'ba'},
  {'code': 'bg-theophanov', 'name': 'Цветан Теофанов', 'language': 'bg'},
  {'code': 'ce-adam-ibragimov', 'name': 'Ибрагимов Адам', 'language': 'ce'},
  {'code': 'ce-magomed', 'name': 'Магомед Магомедов', 'language': 'ce'},
  {'code': 'de-abu-rida',
   'name': 'Абу Рида Мухаммад ибн Ахмад',
   'language': 'de'},
  {'code': 'de-bubenheim-elyas',
   'name': 'Бубенгейм и Эльяс',
   'language': 'de'},
  {'code': 'de-denfer', 'name': 'Ахмад фон Денфер', 'language': 'de'},
  {'code': 'de-zaidan', 'name': 'Амир Заидан', 'language': 'de'},
  {'code': 'en-saheeh-international',
   'name': 'Сахих Интернешенал',
   'language': 'en'},
  {'code': 'en-transliteration', 'name': 'Транслит', 'language': 'en'},
  {'code': 'es-cortes', 'name': 'Хулио Кортес', 'la

In [None]:
# For every translation listed in LANGUAGE_CODES, download each surah into its own
# file named `<surah number>.<translation code>`: the surah name on the first line,
# followed by one ayah per line. Files that already exist are skipped, so the cell
# can be re-run to resume an interrupted download.

# Seconds to wait for the server; requests waits forever when no timeout is given.
REQUEST_TIMEOUT = 30

translations = list()
for translation in translations_response.json()["data"]:
    lang = translation["language"]
    code = translation["code"]
    if code not in LANGUAGE_CODES:
        continue
    print(translation)
    # The surah list is requested with the translation's own language header —
    # presumably so that surah["name"]["translation"] comes back in that language.
    response = requests.get(
        f"{API_URL}/surahs",
        headers={
            "Access-Token": ACCESS_TOKEN,
            "Language": lang,
        },
        verify=VERIFY,
        timeout=REQUEST_TIMEOUT,
    )
    # Stop on an HTTP error rather than failing later on a missing "data" key.
    response.raise_for_status()

    surahs = response.json()["data"]
    for surah in surahs:
        surah_number = surah["number"]
        ayah_count = surah["ayah_count"]
        surah_name = surah["name"]["translation"]

        output_file_path = DOWNLOAD_DIR / f"{surah_number}.{code}"
        if output_file_path.exists():
            continue
        # The ayah texts come from the website's JS endpoint, which identifies a
        # translation by its numeric id (see lang_code2id).
        response = requests.get(
            "https://lez.quranacademy.org/quran/js-api/ayat-texts",
            params={
                "sura": surah_number,
                "start_ayat": 1,
                "end_ayat": ayah_count,
                "translation_id": lang_code2id[code],
            },
            verify=VERIFY,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        # Order the ayahs numerically: ayahId is converted with int() before sorting.
        ayahs = sorted(response.json(), key=lambda ayah: int(ayah['ayahId']))
        texts = [ayah['text'].strip() for ayah in ayahs]
        assert len(texts) == ayah_count, (
            f"surah {surah_number} ({code}): expected {ayah_count} ayahs, got {len(texts)}"
        )

        # Write UTF-8 explicitly: the platform default encoding may be unable to
        # represent the Cyrillic / Azerbaijani text on some systems.
        output_file_path.write_text('\n'.join([surah_name] + texts), encoding="utf-8")

## Processing

In [135]:
def clean_html(raw_html):
    """Return the plain text of ``raw_html`` without its div, sup and span elements.

    The markup is parsed with BeautifulSoup's ``html.parser``; every ``div``,
    ``sup`` and ``span`` element is removed together with its contents, and the
    text that remains in the document is returned.
    """
    unwanted_tags = ('div', 'sup', 'span')
    document = BeautifulSoup(raw_html, features="html.parser")
    for tag_name in unwanted_tags:
        matches = document.find_all(tag_name)
        for element in matches:
            element.decompose()
    return document.get_text()

def remove_extra_whitespaces(text):
    """Tidy the spacing of ``text`` line by line.

    Steps, in order:
      1. every ``*`` is replaced by a space;
      2. any run of spaces directly before ``.``, ``,``, ``!`` or ``?`` is removed;
      3. remaining runs of spaces are collapsed to a single space;
      4. leading/trailing whitespace is stripped from every line.

    Line breaks are preserved. Returns the cleaned string.
    """
    text = text.replace('*', ' ')
    # Remove the whole run of spaces in front of punctuation. Replacing only the
    # literal ' .' (one space) left "word ." behind whenever two or more spaces
    # preceded the mark, e.g. after a '*' had just been turned into a space.
    text = re.sub(r' +([.,!?])', r'\1', text)
    text = re.sub(r' +', ' ', text)
    return '\n'.join(line.strip() for line in text.split('\n'))

def strip_accents(s):
    """Return ``s`` in NFD form with all nonspacing marks (category ``Mn``) dropped.

    Note that the result stays decomposed; it is not recomposed to NFC.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

In [144]:
# Merge the downloaded per-surah files of each translation into a single cleaned
# file `quran.<translation code>`, with the surahs in numeric order.
for language_code in LANGUAGE_CODES:
    output_file = PROCESSED_DIR / f"quran.{language_code}"
    # if output_file.exists():
    #     continue
    output_texts = []

    normalizer = MosesPunctNormalizer(
        # Codes look like 'ru-kuliev': the language is the part before the first
        # hyphen. Splitting on '_' never matched, so the full code was passed as
        # `lang` and the normalizer's language-specific rules could not apply.
        lang=language_code.split('-')[0],
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
        perl_parity=False,
    )

    # File names are `<surah number>.<code>`; sort on the integer surah number so
    # that surah 10 does not come before surah 2.
    surah_files = sorted(
        DOWNLOAD_DIR.glob(f"*.{language_code}"),
        key=lambda path: int(path.stem.split('.')[0]),
    )
    for file in surah_files:
        # Read/write UTF-8 explicitly instead of relying on the platform default.
        text = file.read_text(encoding="utf-8")
        text = clean_html(text)
        text = remove_extra_whitespaces(text)
        text = normalizer.normalize(text)
        # text = strip_accents(text)
        output_texts.append(text)
    output_file.write_text('\n'.join(output_texts), encoding="utf-8")

In [142]:
# Sanity check: print the line count of every processed file. Each file holds one
# line per surah name or ayah, so the counts should be identical across translations.
!wc -l {PROCESSED_DIR}/*.*

    6349 ../data/processed/quran/quran.az-musayev
    6349 ../data/processed/quran/quran.en-saheeh-international
    6349 ../data/processed/quran/quran.lez-yamen
    6349 ../data/processed/quran/quran.ru-abu-adel
    6349 ../data/processed/quran/quran.ru-kuliev
   31745 total
