### Imports

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager 
from tqdm import tqdm
from bs4 import BeautifulSoup
import time 
import os
from pathlib import Path 

### Languages / Data Sources

In [24]:
new_bible_languages_url = {
    "english": "https://www.bible.com/bible/12/MAT.1.ASV",              #1
    "spanish": "https://www.bible.com/bible/89/MAT.1.LBLA",             #2
    "adasen": "https://www.bible.com/bible/2812/MAT.1.YBT",             #3
    "chavacano": "https://www.bible.com/bible/1129/MAT.1.CBKNT",        #4
    "paranan": "https://www.bible.com/bible/438/MAT.1.PRF",             #5  
    "tausug": "https://www.bible.com/bible/1319/MAT.1.TSG",             #6
    "romblomanon": "https://www.bible.com/bible/2244/MAT.1.BKR",        #7
    "masbatenyo": "https://www.bible.com/bible/1222/MAT.1.MSB",         #8
    "kinaray-a": "https://www.bible.com/bible/1489/MAT.1.KRJNT",        #9
    "yami": "https://www.bible.com/bible/2364/MAT.1.SNT",               #10
    "tagalog": "https://www.bible.com/bible/2195/MAT.1.ABTAG01",        #11
    "cebuano": "https://www.bible.com/bible/562/MAT.1.RCPV",            #12
    "ilokano": "https://www.bible.com/bible/782/MAT.1.RIPV",            #13
    "ilonggo": "https://www.bible.com/bible/2190/MAT.1.MBBHIL12",       #14
    "waray": "https://www.bible.com/bible/2198/MAT.1.MBBSAM",           #15
    "bikolano": "https://www.bible.com/bible/890/MAT.1.MBBBIK92",       #16
}

new_bible_books = ["MAT", "MRK", "LUK"]

### Web Scraping

In [None]:
print("Setting up WebDriver...")
options = Options()
options.add_argument("--headless")  # run chrome without opening a visual window
options.add_argument("--log-level=3")  # suppress unnecessary logs
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# use WebDriver Manager to handle driver installation/updates automatically
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
print("WebDriver ready.")

# data structures for storing scraped data and statistics
bible_data_new = {}  # {lang: {book: {chapter: [verses]}}}
word_counts_new = {}  # {lang: count}
total_words_new = 0
total_verses_new = 0
total_chapters_new = 0

# progress bar setup
total_iterations = len(new_bible_languages_url) * len(new_bible_books)
pbar = tqdm(total=total_iterations, desc="Overall Progress", unit="book")

try:
    for lang, root_url in new_bible_languages_url.items():
        bible_data_new[lang] = {}
        word_counts_new[lang] = 0

        # extract the parts of the URL
        try:
            parts = root_url.split("/")
            base_bible_url = f"https://www.bible.com/bible/{parts[4]}"
            version = root_url.split(".")[-1]
        except IndexError:
            print(f"Skipping invalid URL format for {lang}: {root_url}")
            pbar.update(len(new_bible_books))  
            continue  # skip to next language if error occurs

        for book in new_bible_books:
            pbar.set_description(f"Scraping {lang} - {book}")
            bible_data_new[lang][book] = {}

            ch = 1
            while True:
                url = f"{base_bible_url}/{book}.{ch}.{version}"
                driver.get(url)
                time.sleep(1)  # wait for page to load

                soup = BeautifulSoup(driver.page_source, "html.parser")
                
                # checks if end of chapters reached using "not available" marker
                not_available = soup.find("span", class_="ChapterContent_not-avaliable-span__WrOM_")
                if not_available:
                    break

                # if not yet end of chapters, extract
                chapter_content = soup.find_all("span", {"data-usfm": True})

                # mark as empty if no content found
                if not chapter_content:
                    bible_data_new[lang][book][ch] = ["MISSING"]
                else:
                    chapter_verses_new = []
                    chapter_word_count_new = 0

                    for verse in chapter_content:
                        # remove footnotes within the verse
                        for note in verse.find_all("span", class_=lambda x: x and x.startswith("ChapterContent_note")):
                            note.decompose()

                        # extract clean verse text
                        verse_text = verse.get_text(" ", strip=True)
                        if verse_text:
                            chapter_verses_new.append(verse_text)
                            verse_words_new = len(verse_text.split())
                            chapter_word_count_new += verse_words_new
                            total_verses_new += 1

                    # add verses
                    bible_data_new[lang][book][ch] = (
                        chapter_verses_new if chapter_verses_new else ["MISSING"]
                    )

                    # update word counts
                    word_counts_new[lang] += chapter_word_count_new
                    total_words_new += chapter_word_count_new
                    total_chapters_new += 1

                    pbar.set_postfix({
                        'Words': f"{total_words_new:,}",
                        'Verses': f"{total_verses_new:,}",
                        'Chapters': total_chapters_new
                    })

                ch += 1       # next chapter
                if ch > 100:  # safety cap
                    break

            pbar.update(1)

finally:
    driver.quit()
    pbar.close()
    print("Scraping complete.")

# summary statistics
print("\n" + "="*60)
if total_verses_new > 0 and total_chapters_new > 0:
    print(f"Total Words: {total_words_new:,}")
    print(f"Total Verses: {total_verses_new:,}")
    print(f"Total Chapters: {total_chapters_new}")
    print(f"Average Words per Verse: {total_words_new/total_verses_new:.1f}")
    print(f"Average Words per Chapter: {total_words_new/total_chapters_new:.1f}")
else:
    print("No data scraped or processed.")

print("\nWord Count by Language:")
print("-" * 30)
if total_words_new > 0:
    for lang_key, count in word_counts_new.items():
        if count > 0:
            percentage = (count / total_words_new) * 100
            print(f"{lang_key:12}: {count:8,} words ({percentage:.1f}%)")
        else:
            print(f"{lang_key:12}: {count:8,} words (0.0%) - Check availability/URL")
else:
    print("No words counted.")

Setting up WebDriver...
WebDriver ready.


Scraping bikolano - LUK: 100%|██████████| 48/48 [30:31<00:00, 38.16s/book, Words=1,200,405, Verses=49,672, Chapters=1088]

Scraping complete.

Total Words: 1,200,405
Total Verses: 49,672
Total Chapters: 1088
Average Words per Verse: 24.2
Average Words per Chapter: 1103.3

Word Count by Language:
------------------------------
english     :   66,890 words (5.6%)
spanish     :   62,683 words (5.2%)
adasen      :   92,218 words (7.7%)
chavacano   :   97,055 words (8.1%)
paranan     :   74,123 words (6.2%)
tausug      :   90,659 words (7.6%)
romblomanon :   75,652 words (6.3%)
masbatenyo  :   71,226 words (5.9%)
kinaray-a   :   77,204 words (6.4%)
yami        :   86,152 words (7.2%)
tagalog     :   67,231 words (5.6%)
cebuano     :   68,897 words (5.7%)
ilokano     :   59,445 words (5.0%)
ilonggo     :   74,632 words (6.2%)
waray       :   71,798 words (6.0%)
bikolano    :   64,540 words (5.4%)





In [26]:
bible_data_new

{'english': {'MAT': {1: ['1 The book of the generation of Jesus Christ, the son of David, the son of Abraham.',
    '2 Abraham begat Isaac; and Isaac begat Jacob; and Jacob begat Judah and his brethren;',
    '3 and Judah begat Perez and Zerah of Tamar; and Perez begat Hezron; and Hezron begat Ram;',
    '4 and Ram begat Amminadab; and Amminadab begat Nahshon; and Nahshon begat Salmon;',
    '5 and Salmon begat Boaz of Rahab; and Boaz begat Obed of Ruth; and Obed begat Jesse;',
    '6 and Jesse begat David the king.',
    'And David begat Solomon of her that had been the wife of Uriah;',
    '7 and Solomon begat Rehoboam; and Rehoboam begat Abijah; and Abijah begat Asa;',
    '8 and Asa begat Jehoshaphat; and Jehoshaphat begat Joram; and Joram begat Uzziah;',
    '9 and Uzziah begat Jotham; and Jotham begat Ahaz; and Ahaz begat Hezekiah;',
    '10 and Hezekiah begat Manasseh; and Manasseh begat Amon; and Amon begat Josiah;',
    '11 and Josiah begat Jechoniah and his brethren, at the t

### Converting to Text File

In [None]:

# set path where the raw .txt files will be saved
raw_output_path = Path("../data/raw")
raw_output_path.mkdir(parents=True, exist_ok=True)
print(f"\nWriting raw text files to: {raw_output_path.resolve()}")

# ensure data exists before writing
if 'bible_data_new' in locals() and bible_data_new:
    count = 0
    total_verses_written = 0

    # iterate through each language in the scraped data
    for lang in tqdm(bible_data_new, desc="Writing Raw Files"):
        filename = raw_output_path / f"{lang}_raw.txt"
        all_verses_text_list = [] 

        # iterate through books
        for book in bible_data_new[lang]:
            # iterate through chapters (sort numerically for consistency)
            try:
                sorted_chapters = sorted(bible_data_new[lang][book].keys(), key=int)
            except ValueError:
                sorted_chapters = sorted(bible_data_new[lang][book].keys()) 

            for chapter in sorted_chapters:
                verses = bible_data_new[lang][book][chapter]
                # filter out missing verses or empty verses
                valid_verses = [v for v in verses if v and not v.startswith("MISSING")]
                if valid_verses:
                    all_verses_text_list.extend(valid_verses)
                    total_verses_written += len(valid_verses)

        # join all collected verse strings with a single space
        final_text_block = " ".join(all_verses_text_list)
        # normalize whitespace (replace multiple spaces/newlines with a single space) : good for future n-grams
        final_text_block = ' '.join(final_text_block.split())

        # write into .txt file
        with open(filename, "w", encoding="utf-8") as f:
            f.write(final_text_block)
        count += 1 

    print(f"Wrote {count} raw text files containing text from approx {total_verses_written:,} verses.")
    print(f"\nRaw text files saved in '{raw_output_path.resolve()}' directory.")

else:
    print("Scraping data ('bible_data_new') not found or is empty. Skipping file writing.")

print("\nWriting process finished.")


Writing raw text files to: C:\Users\Dawson\OneDrive\ドキュメント\i am a college student bruh\3rd Year\1st Term\NLP1000\PH-Language-Similarity-Analysis\data\raw


Writing Raw Files: 100%|██████████| 16/16 [00:00<00:00, 72.36it/s]

Wrote 16 raw text files containing text from approx 49,672 verses.

Raw text files saved in 'C:\Users\Dawson\OneDrive\ドキュメント\i am a college student bruh\3rd Year\1st Term\NLP1000\PH-Language-Similarity-Analysis\data\raw' directory.

Writing process finished.



