In [1]:
import requests
import re
import os
from bs4 import BeautifulSoup
  
# URL = "https://www.biblia.fo/bible.php?l=fo&b=19&c=149"
# r = requests.get(URL)

# Directory (relative to the current working directory) that holds the saved
# HTML pages; only entries whose name ends in '.html' are picked up below.
folder_path = './site/'

# Text file that receives the formatted verse lines. It is opened in append
# mode by write_formatted_output, so existing content is kept.
output_file_path = 'biblia.txt'


# Function to parse HTML content from a file using Beautiful Soup
def parse_html_file(file_path):
    """Read a UTF-8 encoded HTML file and return it parsed by Beautiful Soup.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A BeautifulSoup object built with the built-in 'html.parser' backend.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        markup = handle.read()
    # The whole document is already in memory, so parse after the file is closed.
    return BeautifulSoup(markup, 'html.parser')
    
# List all files in the folder.
# os.listdir() returns entries in arbitrary order, so sort them to make every
# run visit the files in the same sequence.
file_list = sorted(os.listdir(folder_path))

# Filter only the HTML files
html_files = [name for name in file_list if name.endswith('.html')]


# soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib

# Function to add a tab delimiter between 'v?' and the text using regular expressions
def add_delimiter_with_regex(sentence):
    """Insert a tab right after the first verse marker in ``sentence``.

    A verse marker is the letter ``v`` followed by one or more digits
    (e.g. ``v12``). Only the first marker found is changed; text without a
    marker is returned unchanged.

    Args:
        sentence: Verse text that may contain a ``v<digits>`` marker.

    Returns:
        The text with a tab character added after the first marker.
    """
    # Pass ``count`` by keyword: giving it positionally is deprecated since
    # Python 3.13 and is easy to confuse with the ``flags`` argument.
    return re.sub(r'(v[0-9]+)', r'\1\t', sentence, count=1)

def extract_numbers_from_url(url):
    """Pull the book and chapter numbers out of a ``b=<n>&c=<n>`` fragment.

    Args:
        url: Text (URL or file name) to search for the fragment.

    Returns:
        A ``(book_number, chapter_number)`` tuple of ints when the fragment is
        present, otherwise ``None``.
    """
    found = re.search(r"b=(\d{1,2})&c=(\d{1,3})", url)
    if not found:
        return None
    # Both captured groups are digit strings; convert them in one pass.
    return tuple(int(digits) for digits in found.groups())

# Function to format and write output to a file
def write_formatted_output(file_path, formatted_output):
    """Append ``formatted_output`` to the UTF-8 text file at ``file_path``.

    The file is opened in append mode, so it is created when missing and
    earlier content is left in place.

    Args:
        file_path: Destination file.
        formatted_output: Text to add at the end of the file.
    """
    with open(file_path, mode='a', encoding='utf-8') as sink:
        sink.write(formatted_output)


In [2]:
# Book titles used in the first column of the output.
# The processing loop maps the book number parsed from a file name (the "b="
# value) to the title at that 1-based position in this list, so the order of
# the entries must not be changed.
booksSorted = [
'Fyrsta Mósebók',
'Onnur Mósebók',
'Triðja Mósebók',
'Fjórða Mósebók',
'Fimta Mósebók',
'Jósvabók',
'Dómarabókin',
'Rutarbók',
'Fyrra Sámuelsbók',
'Seinna Sámuelsbók',
'Fyrra Kongabók',
'Seinna Kongabók',
'Fyrra Krýnikubók',
'Seinna Krýnikubók',
'Ezra',
'Nehemia',
'Esterarbók',
'Jobsbók',
'Sálmarnir',
'Orðtøk Sálomons',
'Prædikarin',
'Hásongurin',
'Jesaja',
'Jeremia',
'Harmljóðini',
'Ezekiel',
'Dánjal',
'Hósea',
'Jóel',
'Ámos',
'Óbadia',
'Jónas',
'Mika',
'Náhum',
'Hábakkuk',
'Sefanja',
'Haggai',
'Zakarja',
'Málaki',
'Evangeliið eftir Matteus',
'Evangeliið eftir Markus',
'Evangeliið eftir Lukas',
'Evangeliið eftir Jóhannes',
'Ápostlasøgan',
'Bræv Paulusar ápostuls til Rómverja',
'Fyrra bræv Paulusar ápostuls til Korintmanna',
'Seinna bræv Paulusar ápostuls til Korintmanna',
'Bræv Paulusar ápostuls til Galatamanna',
'Bræv Paulusar ápostuls til Efesusmanna',
'Bræv Paulusar ápostuls til Filippimanna',
'Bræv Paulusar ápostuls til Kolossumanna',
'Fyrra bræv Paulusar ápostuls til Tessalónikumanna',
'Seinna bræv Paulusar ápostuls til Tessalónikumanna',
'Fyrra bræv Paulusar ápostuls til Timoteusar',
'Seinna bræv Paulusar ápostuls til Timoteusar',
'Bræv Paulusar ápostuls til Titusar',
'Bræv Paulusar ápostuls til Filemons',
'Brævið til Hebreara',
'Hitt almenna bræv Jákups',
'Hitt fyrra almenna bræv Pæturs',
'Hitt seinna almenna bræv Pæturs',
'Hitt fyrsta almenna bræv Jóhannesar',
'Annað bræv Jóhannesar',
'Triðja bræv Jóhannesar',
'Hitt almenna bræv Judasar',
'Jóhannesar opinbering',
]


In [4]:
# Process each HTML file using Beautiful Soup and append one line per verse
# ("<book title>\t<chapter>\t<verse marker>\t<verse text>") to the output file.
#
# NOTE: the output file is opened in append mode, so running this cell again
# adds the same lines a second time; remove the file first for a clean export.

# Pair every file with the (book, chapter) numbers encoded in its name.
chapters = []
for html_file in html_files:
    numbers = extract_numbers_from_url(html_file)
    if numbers is None:
        # Skip the file: carrying on would reuse the numbers of the previous
        # file (or raise NameError when this is the very first file).
        print(f"URL format does not match: {html_file}")
        continue
    chapters.append((numbers, html_file))

# Directory listings carry no ordering guarantee, so sort by (book, chapter)
# to make the output deterministic and in reading order.
for (book_number, chapter_number), html_file in sorted(chapters):
    # Book numbers are 1-based positions in booksSorted.
    if not 1 <= book_number <= len(booksSorted):
        print(f"Unknown book number {book_number} in {html_file}; skipping.")
        continue
    selectedBook = booksSorted[book_number - 1]

    file_path = os.path.join(folder_path, html_file)
    soup = parse_html_file(file_path)

    chapter_lines = []
    for verse in soup.find_all('span', class_="verse"):
        # Strip layout whitespace coming from the HTML source before the
        # single tab delimiter is inserted after the verse marker.
        cleanVerse = verse.text.replace('\t', '').replace('\n', '')
        cleanedup = add_delimiter_with_regex(cleanVerse)
        chapter_lines.append(f"{selectedBook}\t{chapter_number}\t{cleanedup}\n")

    # Write the chapter in one go instead of reopening the file for every
    # verse; nothing is written (and no file is created) for an empty page.
    if chapter_lines:
        write_formatted_output(output_file_path, ''.join(chapter_lines))