In [None]:
!pip install beautifulsoup4 numpy sacremoses

In [182]:
import os
import re
from urllib.request import urlopen
from pathlib import Path
import time
from collections import defaultdict

import numpy as np
from bs4 import BeautifulSoup
from sacremoses import MosesPunctNormalizer

In [2]:
# Raw scraped chapters are stored here, one "<chapter_id>.<lang>" file per chapter.
DOWNLOAD_DIR = Path("../data/raw/bible")
# parents=True: on a fresh checkout "../data/raw" may not exist yet.
DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)

# Aligned, normalized "bible.<lang>" files are written here.
PROCESSED_DIR = Path("../data/processed/")
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# Site root WITHOUT a trailing "/": it is concatenated with the chapter URLs
# below, which all start with "/".
SITE_URL = "https://www.bible.com"

# Scraping entry point for every translation, keyed by the suffix used for the
# saved files.
#   "id"        - number that also appears in the start URL path
#                 (presumably the bible.com version id; it is not read by the
#                 code in this notebook).
#   "start_url" - site-relative URL of the first chapter. The leading "/" is
#                 required, and the original author notes that the
#                 "?parallel=..." query parameter must be kept.
lang_meta = {
    "az": {
        "id": 2324,
        "start_url": "/bible/2324/GEN.1.AZJ08?parallel=840",
    },
    "lez": {
        "id": 2193,
        "start_url": "/bible/2193/GEN.1.%25D0%259B%25D0%2595%25D0%2597%25D0%259F%25D0%259A?parallel=2193",
    },
    "ru_oriental": {
        "id": 385,
        "start_url": "/bible/385/GEN.1.CARS?parallel=2193",
    },
    "ru_oriental_allah": {
        "id": 840,
        "start_url": "/bible/840/GEN.1.CARS-A?parallel=2193",
    },
    "en_standart_vesrion_2016": {
        "id": 59,
        "start_url": "/bible/59/GEN.1.ESV?parallel=2193",
    },
}

## Scraping

In [22]:
def start_parsing(chapter_url, lang_id):
    """Download one translation chapter by chapter, following "Next Chapter" links.

    Starting at ``chapter_url``, every page is fetched and parsed. If
    ``DOWNLOAD_DIR / "<chapter_id>.<lang_id>"`` does not exist yet, the chapter
    is saved there as plain text:

    * first line: the text of the single ``<h1>`` inside the reader block;
    * ``HEADING | <text>`` for every non-empty heading span;
    * ``<data-usfm id> | <text>`` for every labelled verse. Verse spans
      without a label (or whose label text is ``#``) do not start a new line;
      their content is appended to the current one.

    The loop stops when a page has no link whose text is "Next Chapter".

    Args:
        chapter_url: Site-relative URL of the first chapter (starts with "/");
            it is appended to ``SITE_URL``.
        lang_id: Suffix of the saved files.

    Raises:
        AssertionError: If the reader block does not contain exactly one ``<h1>``.
    """
    # Compile the class patterns once instead of for every page and element.
    chapter_class = re.compile(r"ChapterContent_chapter.*")
    reader_class = re.compile(r"ChapterContent_reader.*")
    # A group, not a character class: "[heading|verse]" matched any single
    # character out of "headingvrs|" and therefore many unrelated spans.
    element_class = re.compile(r"ChapterContent_(heading|verse).*")
    label_class = re.compile(r"ChapterContent_label.*")
    content_class = re.compile(r"ChapterContent_content.*")
    add_class = re.compile(r"ChapterContent_add.*")

    next_chapter_exists = True
    while next_chapter_exists:
        # Close the HTTP response as soon as the body has been read.
        with urlopen(SITE_URL + chapter_url) as page:
            html = page.read().decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")

        # e.g. data-usfm "GEN.1" -> file stem "GEN_1"
        chapter_id = soup.find("div", {"class": chapter_class}).get('data-usfm').replace('.', '_')
        save_file_path = DOWNLOAD_DIR / f"{chapter_id}.{lang_id}"
        if not save_file_path.exists():
            html_reader = soup.find("div", {"class": reader_class})
            html_chapter_elements = html_reader.find_all("span", {"class": element_class})

            headers = html_reader.find_all('h1')
            assert len(headers) == 1
            chapter_text = headers[0].get_text()

            for html_chapter_element in html_chapter_elements:
                element_text = html_chapter_element.get_text()
                if element_text == '':
                    continue

                # Only the first class of the element decides how it is handled.
                element_class_name = html_chapter_element.get('class')[0]
                if 'heading' in element_class_name:
                    chapter_text += '\nHEADING | ' + element_text
                elif 'verse' in element_class_name:
                    verse_id = html_chapter_element.get('data-usfm')
                    label_element = html_chapter_element.find("span", {"class": label_class})
                    # Only a labelled verse starts a new "<id> |" line.
                    if label_element and label_element.get_text() != '#':
                        chapter_text += '\n' + verse_id + ' |'

                    for html_content in html_chapter_element.find_all("span", {"class": content_class}):
                        # Content nested inside a "ChapterContent_add" span is left out.
                        if html_content.find_parent("span", {"class": add_class}) is not None:
                            continue
                        chapter_text += ' ' + html_content.get_text().strip()
            save_file_path.write_text(chapter_text)
        else:
            # NOTE(review): the pause only happens for chapters that are already
            # saved, although the page is fetched on every iteration - confirm
            # whether the delay was meant to apply to every request.
            time.sleep(3)

        # Follow the "Next Chapter" link; if several match, the last one wins.
        next_chapter_exists = False
        for a in soup.find_all('a', href=True):
            if a.get_text() == "Next Chapter":
                chapter_url = a.get('href')
                print(chapter_url)
                next_chapter_exists = True

In [None]:
# Scrape every configured translation, starting from its first chapter.
# (The previous version referenced the undefined names `start_chapter` and
# `lang_id` and raised NameError on a fresh kernel.)
for lang, meta in lang_meta.items():
    print(lang)
    start_parsing(meta["start_url"], lang)

## Processing

In [3]:
# Chapter ids (file stems) of all downloaded "*.lez" files; the alignment below
# iterates over these chapters.
lez_chapters = [lez_file.stem for lez_file in DOWNLOAD_DIR.glob("*.lez")]

In [177]:
def _verse_number(verse_id):
    """Return the integer after the last '.' of a verse id.

    For a '+'-joined id this is the number of its last part.
    """
    return int(verse_id.rpartition('.')[2])


def _with_next_verse(verse_id):
    """Return verse_id extended by '+<prefix>.<n + 1>', n being its last verse number."""
    prefix, _, num = verse_id.rpartition('.')
    return verse_id + '+' + prefix + '.' + str(int(num) + 1)


def _read_verses(path):
    """Parse a downloaded chapter file into a list of (verse_id, verse_text) pairs."""
    verses = []
    # The first line is the chapter name; HEADING lines carry no verse.
    for line in path.read_text().split('\n')[1:]:
        if line.startswith('HEADING'):
            continue
        # maxsplit=1: a ' |' inside the verse text must not break the unpacking.
        verse_id, verse_text = line.split(' |', 1)
        verses.append((verse_id, verse_text))
    return verses


# lang -> list of text lines; after every chapter all languages must have the
# same number of lines, so that line N is the same passage in every language.
bible_by_lang = defaultdict(list)
for chapter in sorted(lez_chapters):
    # Parse every translation of this chapter once: lang -> [(verse_id, text), ...]
    chapter_verses = {
        file.suffix[1:]: _read_verses(file)
        for file in DOWNLOAD_DIR.glob(f"{chapter}.*")
    }

    # Working copy of the ids; these lists are modified below, the parsed
    # chapter_verses are not. Languages without any verse are left out.
    verses_ids_by_lang = {
        lang: [verse_id for verse_id, verse_text in verses]
        for lang, verses in chapter_verses.items()
        if verses
    }

    # add missing verses
    # A language whose highest verse number is below the chapter maximum gets
    # its last verse marked as merged with the following number.
    max_verse_id_by_lang = {
        lang: max(_verse_number(v) for v in verses)
        for lang, verses in verses_ids_by_lang.items()
    }
    chapter_max = max(max_verse_id_by_lang.values(), default=0)
    for lang, verses in verses_ids_by_lang.items():
        if max_verse_id_by_lang[lang] < chapter_max:
            verses[-1] = _with_next_verse(verses[-1])

    # A gap in the numbering marks the verse before the gap as merged with the
    # following number.
    # NOTE(review): verses[-1] may already have been extended above; its number
    # is then the appended one, so the verse before it is marked as well. This
    # order-dependent behavior is kept unchanged - confirm it is intended.
    for verses in verses_ids_by_lang.values():
        for i in range(len(verses) - 1):
            if _verse_number(verses[i + 1]) - _verse_number(verses[i]) > 1:
                verses[i] = _with_next_verse(verses[i])

    # get merged verses
    # Ids that are part of a merged ('+') verse in ANY language vs. ids that
    # stand alone everywhere.
    merged_verses_ids = set()
    good_verses = set()
    for verses in verses_ids_by_lang.values():
        for v in verses:
            if '+' in v:
                merged_verses_ids.update(v.split('+'))
            else:
                good_verses.add(v)
    good_verses -= merged_verses_ids
    merged_verses_ids = sorted(merged_verses_ids, key=_verse_number)

    # Join runs of consecutive merged ids into '+'-separated spans.
    final_merged_verses_ids = []
    if merged_verses_ids:
        span = merged_verses_ids[0]
        prev = _verse_number(span)
        for merged_id in merged_verses_ids[1:]:
            num = _verse_number(merged_id)
            if num - prev == 1:
                span += '+' + merged_id
            else:
                final_merged_verses_ids.append(span)
                span = merged_id
            prev = num
        final_merged_verses_ids.append(span)

    # Ids that become an output line of their own (only used for membership).
    final_verses = good_verses | set(final_merged_verses_ids)

    for lang, verses in chapter_verses.items():
        # Text of verses that are not in final_verses is collected here and
        # emitted as one line before the next stand-alone verse / at the end.
        pending_text = ''
        for verse_id, verse_text in verses:
            if verse_id in final_verses:
                if pending_text:
                    bible_by_lang[lang].append(pending_text)
                    pending_text = ''
                bible_by_lang[lang].append(verse_text.strip())
            else:
                pending_text += verse_text.strip() + ' '
        if pending_text:
            bible_by_lang[lang].append(pending_text)

    # Stop at the first chapter after which the languages are out of sync and
    # print its id for inspection.
    if len({len(lines) for lines in bible_by_lang.values()}) != 1:
        print(chapter)
        break

In [179]:
# Number of aligned lines per language, next to the language keys (same order).
[len(lines) for lines in bible_by_lang.values()], list(bible_by_lang.keys())

([13617, 13617, 13617, 13617, 13617],
 ['lez', 'ru_oriental', 'az', 'ru_oriental_allah', 'en_standart_vesrion_2016'])

In [185]:
def remove_extra_whitespaces(text):
    """Tidy the spacing of a newline-separated block of text.

    * every ``*`` is replaced by a space;
    * spaces directly before ``.``, ``,``, ``!`` or ``?`` are removed;
    * runs of spaces are collapsed into a single space;
    * leading/trailing whitespace is stripped from every line.

    Args:
        text: Text with lines separated by ``\\n``.

    Returns:
        The cleaned text with the same number of lines.
    """
    text = text.replace('*', ' ')
    # Remove the WHOLE run of spaces before punctuation. Replacing only ' .'
    # left one space behind whenever several spaces preceded the mark, which is
    # common once '*' has been turned into a space ("word * ." -> "word .").
    text = re.sub(r' +([.,!?])', r'\1', text)
    text = re.sub(r' +', ' ', text)
    return '\n'.join(line.strip() for line in text.split('\n'))

In [186]:
# Write one cleaned, punctuation-normalized file per language: line N of every
# "bible.<lang>" file comes from entry N of that language's list.
for lang, verses in bible_by_lang.items():
    output_file = PROCESSED_DIR / f"bible.{lang}"

    normalizer = MosesPunctNormalizer(
        # Keys such as "ru_oriental" carry a variant suffix; only the part
        # before the first "_" is passed as the language.
        lang=lang.split('_')[0],
        penn=True,
        norm_quote_commas=True,
        norm_numbers=True,
        pre_replace_unicode_punct=False,
        post_remove_control_chars=False,
        perl_parity=False,
    )

    text = '\n'.join(verses)

    text = remove_extra_whitespaces(text)
    text = normalizer.normalize(text)

    # Explicit encoding: the text is non-ASCII and the platform default
    # encoding is not UTF-8 everywhere.
    output_file.write_text(text, encoding="utf-8")

In [187]:
# Line count of every processed file. The files are written from '\n'.join(...)
# without a trailing newline, so `wc -l` (which counts newline characters)
# presumably reports one less than the number of verses per language.
!wc -l {PROCESSED_DIR}/bible.*

   13616 ../data/processed/bible.az
   13616 ../data/processed/bible.en_standart_vesrion_2016
   13616 ../data/processed/bible.lez
   13616 ../data/processed/bible.ru_oriental
   13616 ../data/processed/bible.ru_oriental_allah
   68080 total
