# Scrape Bible

In [1]:
##loading or importing libraries
import requests
import csv
import re


from itertools import zip_longest
from bs4 import BeautifulSoup as bs

# Source pages for the two translations. Each URL points at a single
# "print_all.html" page; both are downloaded with requests and parsed
# with BeautifulSoup in the following cells.
eng_bible_url = "https://cdn.door43.org/u/WycliffeAssociates/en_ulb/3fe47367e0/print_all.html"
nagamese_bible_url = "https://cdn.door43.org/u/Door43-Catalog/nag_isv/master/print_all.html"

In [2]:

## BIBLE ENGLISH Version
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error (4xx/5xx) instead of letting
# an error page be parsed as if it were the Bible text.
eng = requests.get(eng_bible_url, timeout=60)
eng.raise_for_status()
## BIBLE Nagamese Version
naga = requests.get(nagamese_bible_url, timeout=60)
naga.raise_for_status()

In [3]:


# Scraping: parse the downloaded HTML into BeautifulSoup objects.
# The parser is named explicitly so the resulting tree does not depend on
# which optional parser libraries happen to be installed (and so bs4 does not
# emit GuessedAtParserWarning). "html.parser" ships with Python itself.
bs_eng = bs(eng.content, "html.parser")
bs_naga = bs(naga.content, "html.parser")


In [4]:

# Function to parse and clean text from a specific div
def extract_text(soup,div):
    """Return the cleaned plain text of one ``<div>`` of a parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to search. The matched div is edited in place
        (heading strings and footnote divs are removed from it), so the
        caller's soup is modified by this call.
    div : str
        Value of the ``id`` attribute of the ``<div>`` to extract.

    Returns
    -------
    str
        The div's text with ``[...]`` spans removed, every run of whitespace
        reduced to a single space, and leading/trailing whitespace stripped.

    Raises
    ------
    ValueError
        If the document contains no ``<div>`` with the requested id.
    """
    bible_content = soup.find("div", attrs={"id": div})
    if bible_content is None:
        # Fail with a message naming the missing id rather than an
        # AttributeError on None a few lines further down.
        raise ValueError(f"No <div id={div!r}> found in the parsed document")

    # Remove heading text. Only headings that expose a single string via
    # `.string` are cleared; a heading for which `.string` is None (bs4
    # returns None when the tag has several children) is left untouched.
    for heading_tag in bible_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        if heading_tag.string:
            heading_tag.string.extract()

    # Remove footnotes or other unwanted divs
    for footnote in bible_content.find_all('div', class_='footnotes'):
        footnote.decompose()

    # Extract and clean text
    text = bible_content.get_text(separator=' ')
    # Collapse whitespace first so a bracketed span that contains line breaks
    # sits on one line and is matched by the `.` in the next pattern.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)         # Remove text inside square brackets
    # Deleting a bracketed span leaves the spaces that surrounded it side by
    # side; collapse again so the result never contains double spaces.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Split corpus by verse numbers
def split_corpus(corpus):
    """Split cleaned text into verses, each starting with its verse number.

    A verse marker is any run of digits followed by one whitespace character
    (e.g. the "1 " in "1 In the beginning..."). Note that a number inside the
    verse text that is followed by whitespace is treated as a marker too.

    Parameters
    ----------
    corpus : str
        Text to split.

    Returns
    -------
    list of str
        One stripped string per marker, made of the marker and the text that
        follows it up to the next marker. Non-blank text that precedes the
        first marker is returned as its own leading entry. Empty input gives
        an empty list.
    """
    # With a capturing group, re.split returns
    #   [text_before_first_marker, marker_1, text_1, marker_2, text_2, ...]
    # so markers sit at the odd indices and the text belonging to each marker
    # at the even index right after it.
    parts = re.split(r'(\d+\s)', corpus)

    preamble = parts[0].strip()
    verses = [preamble] if preamble else []

    # Pair every marker with the text that FOLLOWS it. Pairing from index 0
    # would instead attach each verse number to the end of the previous verse.
    for marker, body in zip(parts[1::2], parts[2::2]):
        verses.append((marker + body).strip())
    return verses

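# Example: extract and split one book for each language
# (extract_text needs the id of the book's div as its second argument)
# eng_text = extract_text(bs_eng, "41-MAT.html")
# naga_text = extract_text(bs_naga, "41-MAT.html")
# eng_verses = split_corpus(eng_text)
# naga_verses = split_corpus(naga_text)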

# Save both languages to a CSV
def save_to_csv(path, headers, eng_data, naga_data):
    """Write two parallel sequences to a two-column UTF-8 CSV file.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten.
    headers : sequence
        Values written as the first row.
    eng_data, naga_data : iterable
        Column values. Row *i* holds the *i*-th item of each; when one
        iterable is shorter, its missing cells are written as empty strings.

    Prints a confirmation message once the file has been written.
    """
    with open(path, "w", newline="", encoding="utf-8") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(headers)
        # Pair the columns lazily; zip_longest pads the shorter one with "".
        csv_out.writerows(
            [left, right]
            for left, right in zip_longest(eng_data, naga_data, fillvalue="")
        )
    print(f"Data saved successfully to {path}")


In [5]:
# ids of the per-book <div> containers to extract, one entry per book
# (41-MAT through 67-REV). Each value is passed to extract_text as the div id.
files = ['41-MAT.html', '42-MRK.html', '43-LUK.html', '44-JHN.html', '45-ACT.html',
         '46-ROM.html', '47-1CO.html', '48-2CO.html', '49-GAL.html', '50-EPH.html',
         '51-PHP.html', '52-COL.html', '53-1TH.html', '54-2TH.html', '55-1TI.html',
         '56-2TI.html', '57-TIT.html', '58-PHM.html', '59-HEB.html', '60-JAS.html',
         '61-1PE.html', '62-2PE.html', '63-1JN.html', '64-2JN.html', '65-3JN.html',
         '66-JUD.html', '67-REV.html']

# Initialize lists to collect English and Nagamese verses
all_eng_verses = []
all_naga_verses = []

# Iterate through each book div
for div in files:
    # The per-book text gets its own names. Reusing `eng` / `naga` here would
    # overwrite the HTTP responses held under those names by the download
    # cell, and re-running the parsing cell (which reads `eng.content`) would
    # then fail because the names would point at plain strings.
    eng_text = extract_text(bs_eng, div)  # Fetch English text for the given div
    naga_text = extract_text(bs_naga, div)  # Fetch Nagamese text for the given div

    eng_verses = split_corpus(eng_text)  # Split English text into verses
    naga_verses = split_corpus(naga_text)  # Split Nagamese text into verses

    # Only keep a book when both languages split into the same number of
    # verses, so that row i of each column refers to the same verse.
    # Books that do not line up are reported and left out of the dataset.
    if len(eng_verses) == len(naga_verses):
        all_eng_verses.extend(eng_verses)
        all_naga_verses.extend(naga_verses)
    else:
        print(f"Verse count mismatch in {div}: English({len(eng_verses)}) != Nagamese({len(naga_verses)})")

# Save all collected verses to a single CSV file
save_to_csv("/content/bible_verses.csv", ["English", "Nagamese"], all_eng_verses, all_naga_verses)


Verse count mismatch in 41-MAT.html: English(1072) != Nagamese(1075)
Verse count mismatch in 42-MRK.html: English(667) != Nagamese(679)
Verse count mismatch in 45-ACT.html: English(1011) != Nagamese(1009)
Verse count mismatch in 48-2CO.html: English(258) != Nagamese(257)
Verse count mismatch in 67-REV.html: English(412) != Nagamese(414)
Data saved successfully to /content/bible_verses.csv


In [None]:
# Mount Google Drive at /content/drive so the next cell can move the CSV there.
# The google.colab module is imported here, in the cell that uses it; it is
# presumably only importable inside a Colab runtime -- TODO confirm before
# moving this import to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Move the generated CSV from /content into the MyDrive folder of the mounted
# Drive. This needs the Drive mount from the previous cell and the
# bible_verses.csv file written by the scraping cell.
!mv /content/bible_verses.csv /content/drive/MyDrive/