<a href="https://colab.research.google.com/github/davidbaines/eBible/blob/main/eBible_Extract_projects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the corpus paths used by this notebook
# all live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Define base folder

In [2]:
# Root folder of the eBible corpus on the mounted shared drive. The directory
# paths in the configuration cell are built from it, and the extraction cell
# exports it as SIL_NLP_DATA_PATH.
base = "/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible"

# Import modules, define the rewrite flag, the directory paths and the log directory

In [3]:
from pathlib import Path
from datetime import date, datetime
from os import listdir, makedirs, environ
from os.path import exists
from glob import iglob
import pandas as pd
import shutil
import warnings
import xml.etree.ElementTree as ET
import re
import codecs

# Suppress FutureWarning messages so they do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# tells if the project is overwritten when it already exists
rewrite = False

# Every location below hangs off the corpus root.
corpus = Path(base)
ebible_projects = corpus / 'projects'
ebible_metadata = corpus / 'metadata'
ebible_translations_csv = ebible_metadata / 'translations.csv'
ebible_copyright_csv = ebible_metadata / 'copyrights.csv'
ebible_redistributable = corpus / "redistributable/projects"
ebible_extractions = corpus / "MT/scripture"
ebible_logs = corpus / "logs"

# Echo the configuration, one path per line, so a run can be checked at a glance.
print(
    ebible_projects,
    ebible_metadata,
    ebible_translations_csv,
    ebible_copyright_csv,
    ebible_redistributable,
    ebible_extractions,
    ebible_logs,
    sep="\n",
)
print(f"rewrite = {rewrite}")

/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/projects
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/metadata
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/metadata/translations.csv
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/metadata/copyrights.csv
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/MT/scripture
/content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/logs
rewrite = False


# Install packages and clone the silnlp repo

In [4]:
# Install the Python packages into the Colab runtime (versions are not pinned).
!pip install python-dotenv
!pip install sil-machine
!pip install boto3
!pip install s3path
!pip install requests

# Fetch the silnlp sources; the extraction cell adds /content/silnlp to
# PYTHONPATH and runs silnlp.common.extract_corpora.
!git clone https://github.com/sillsdev/silnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sil-machine
  Downloading sil_machine-0.7.4-py3-none-any.whl (205 kB)
[K     |████████████████████████████████| 205 kB 6.5 MB/s 
Collecting regex<2022.0.0,>=2021.7.6
  Downloading regex-2021.11.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 65.7 MB/s 
Installing collected packages: regex, sil-machine
  Attempting uninstall: regex
    Found existing installation: regex 2022.6.2
    Uninstalling regex-2022.6.2:
      Successfully uninstalled regex-2022.6.2
Successfully installed regex-2021.11.10 sil-machine-0.7.4
Looking in indexes: https:

# Define methods

In [5]:
from pandas.core.groupby import groupby
# Columns are easier to use if they are valid python identifiers:
def improve_column_names(df):
    """Rewrite the column names of ``df`` in place as python-friendly identifiers.

    Each name is stripped of surrounding whitespace, lower-cased, has quotes
    and parentheses removed, and has spaces replaced by underscores.

    Args:
        df: pandas DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        None. The frame is modified in place.
    """
    cleaned = df.columns.str.strip().str.lower()
    # regex=False makes every replacement a literal one regardless of the
    # pandas version; a bare '(' or ')' is not a valid regular expression.
    for unwanted in ('"', "'", '(', ')'):
        cleaned = cleaned.str.replace(unwanted, '', regex=False)
    df.columns = cleaned.str.replace(' ', '_', regex=False)


def log_and_print(s, type='info'):
    """Write a timestamped message to the open log file and echo it to stdout.

    The log line has the form ``LEVEL: YYYY-MM-DD HH:MM:SS message``; only the
    bare message is printed.

    Args:
        s: The message to record.
        type: Severity label; it is upper-cased for the log line. The default
            is plain ASCII 'info' (it used to contain an accented 'í', which
            produced 'ÍNFO' in the log).

    Relies on the module-level ``log_file`` being an open, writable text file.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_file.write(f"{type.upper()}: {timestamp} {s}\n")
    print(s)


def get_extracted_projects(dir_extracted):
    """List the project names that already have an extraction file.

    An entry of ``dir_extracted`` counts as an extraction when its name looks
    like ``<prefix>-<project>.txt``. The text after the last hyphen, without
    the extension, is taken as the project name.

    Args:
        dir_extracted: Directory holding the extracted text files.

    Returns:
        A list with one project name per matching entry, in ``listdir`` order.
    """
    # The dot before "txt" is escaped so that only a real ".txt" extension
    # matches; unescaped it accepted any character (e.g. "en-badtxt").
    # NOTE(review): the leading ".+" is greedy, so a project name that itself
    # contains a hyphen is reduced to the part after its last hyphen - confirm
    # whether that is intended.
    pattern = re.compile(r".+-(.+)\.txt$")
    matches = (pattern.search(entry) for entry in listdir(dir_extracted))
    return [m.group(1) for m in matches if m]


def get_books_type(files):
    """Classify a project by the book files it contains.

    Args:
        files: Iterable of file names belonging to one project.

    Returns:
        "OT+NT" when any name contains "GEN" or "JON", otherwise "NT".
    """
    has_ot_marker = any(re.search(r".*GEN|JON.*", name) for name in files)
    return "OT+NT" if has_ot_marker else "NT"


def get_conclusion(versification):
    """Return the detected versification code, defaulting to "4" (English).

    Args:
        versification: Versification code, or "" when none was detected.

    Returns:
        ``versification`` unchanged unless it equals "", in which case "4".
    """
    english = "4"
    return english if versification == "" else versification


def conclude_versification_from_OT(dan_3, dan_5, dan_13):
    """Derive the versification code from the last verse numbers in Daniel.

    Args:
        dan_3: Last verse number found in Daniel 3.
        dan_5: Last verse number found in Daniel 5.
        dan_13: Last verse number found in Daniel 13.

    Returns:
        One of the codes "1".."6" as a string, or "" when the numbers do not
        match any known scheme.
    """
    if dan_3 == 30:
        return "4"  # English
    if dan_3 == 33:
        if dan_5 == 30:
            return "1"  # Original
        if dan_5 == 31:
            return "5"  # Russian Protestant
        return ""
    if dan_3 == 97:
        return "2"  # Septuagint
    if dan_3 == 100:
        # Vulgate when Daniel 13 ends at verse 65, Russian Orthodox otherwise
        return "3" if dan_13 == 65 else "6"
    return ""

def conclude_versification_from_NT(jhn_6, act_19, rom_16):
    """Derive the versification code from New Testament last verse numbers.

    The checks are tried in order and the first one that matches wins.

    Args:
        jhn_6: Last verse number found in John 6.
        act_19: Last verse number found in Acts 19.
        rom_16: Last verse number found in Romans 16.

    Returns:
        "3", "4", "6" or "1" as a string, or "" when nothing matches.
    """
    if jhn_6 == 72:
        return "3"  # Vulgate
    if act_19 == 41:
        return "4"  # English
    if rom_16 == 24:
        return "6"  # Russian Orthodox (same as Russian Protestant)
    if jhn_6 == 71 and act_19 == 40:
        return "1"  # Original (Same as Septuagint)
    return ""


def get_last_verse(project, book, chapter):
    """Find the number of the last verse marker in one chapter of one book.

    The first file under ``project`` whose name contains ``book`` and that can
    be opened is scanned line by line. A chapter marker (backslash-c followed
    by a number) switches the chapter on or off, and every verse marker
    (backslash-v followed by a number) seen while the chapter is on overwrites
    the result.

    Args:
        project: Directory of the project (string or path).
        book: Book identifier expected inside the file name, e.g. "DAN".
        chapter: Chapter number to inspect.

    Returns:
        The number of the last verse marker seen in the chapter, 0 when the
        chapter or its verse markers are absent, or None when no matching file
        could be opened or reading failed.
    """
    ch = str(chapter)

    for book_file in iglob(f"{project}/*{book}*"):
        last_verse = "0"
        try:
            f = codecs.open(book_file, "r", encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"Could not open {book_file}, reason:  {e}")
            continue
        try:
            # The with-block closes the file on success and on error alike;
            # previously the handle was never closed.
            with f:
                in_chapter = False
                for line in f:
                    chapter_marker = re.search(r"\\c ? ?([0-9]+).*", line)
                    if chapter_marker:
                        in_chapter = chapter_marker.group(1) == ch

                    verse_marker = re.search(r"\\v ? ?([0-9]+).*", line)
                    if verse_marker and in_chapter:
                        last_verse = verse_marker.group(1)
        except Exception as e:
            print(f"Something went wrong in reading {book_file}, reason:  {e}")
            return None
        # last_verse is "0" or a run of ASCII digits captured by the regex,
        # so the conversion cannot fail. Only the first readable file is used.
        return int(last_verse)


def get_checkpoints_OT(project):
    """Collect the last verse numbers of Daniel 3, 5 and 13 for ``project``.

    Returns:
        A 3-tuple ``(dan_3, dan_5, dan_13)`` of ``get_last_verse`` results.
    """
    return tuple(get_last_verse(project, "DAN", chapter) for chapter in (3, 5, 13))


def get_checkpoints_NT(project):
    """Collect the last verse numbers of John 6, Acts 19 and Romans 16.

    Returns:
        A 3-tuple ``(jhn_6, act_19, rom_16)`` of ``get_last_verse`` results.
    """
    checkpoints = (("JHN", 6), ("ACT", 19), ("ROM", 16))
    return tuple(get_last_verse(project, book, chapter) for book, chapter in checkpoints)


def get_versification(project):
    """Detect the versification code of a project from checkpoint chapters.

    When the project is classified as "OT+NT", the Daniel checkpoints are
    tried first. The New Testament checkpoints are used when the project is
    "NT" or when Daniel gave no answer.

    Args:
        project: Directory of the project.

    Returns:
        The versification code as a string, or "" when it is undetermined.
    """
    if get_books_type(listdir(project)) == "OT+NT":
        from_ot = conclude_versification_from_OT(*get_checkpoints_OT(project))
        if from_ot:
            return from_ot

    return conclude_versification_from_NT(*get_checkpoints_NT(project))


def add_settings_file(project, language_code):
    """Write a minimal ``Settings.xml`` into a project folder.

    The file records the detected versification (falling back to the default
    from ``get_conclusion``), the language code and the naming scheme of the
    book files.

    Args:
        project: Path object of the project folder. Its ``name`` is used as
            the file-name suffix in the naming element.
        language_code: Language code written into ``LanguageIsoCode``.
    """
    versification = get_conclusion(get_versification(project))
    setting_file_stub = f"""<ScriptureText>
    <Versification>{versification}</Versification>
    <LanguageIsoCode>{language_code}:::</LanguageIsoCode>
    <Naming BookNameForm="41-MAT" PostPart="{project.name}.usfm" PrePart="" />
</ScriptureText>"""

    settings_file = project / 'Settings.xml'
    # The stub has no XML declaration, so readers assume UTF-8; write it as
    # UTF-8 explicitly and let the with-block close the file even on error.
    with open(settings_file, "w", encoding="utf-8") as f:
        f.write(setting_file_stub)


def copy_to_working_directory(project, language_code):
    """Copy one project into the redistributable folder and add its settings.

    An existing copy is left alone unless the module-level ``rewrite`` flag is
    set, in which case it is deleted and copied afresh.

    Args:
        project: Path object of the source project folder.
        language_code: Language code passed on to ``add_settings_file``.

    Returns:
        1 when the project was copied, 0 when an existing copy was kept.
    """
    target = ebible_redistributable / project.name

    if exists(target):
        if not rewrite:
            return 0
        shutil.rmtree(target)

    log_and_print(f"copying {project.name} to {ebible_redistributable}")
    shutil.copytree(project, target)
    add_settings_file(target, language_code)
    return 1


def get_redistributable_projects():
    """Build a mapping of redistributable translation ids to language codes.

    The translations and copyrights CSV files are read and their column names
    normalised. The two tables are then left-joined on ``translationid`` (the
    copyrights ``id`` column is renamed to match). A row is kept when its
    ``redistributable`` value is truthy and either its ``licence_type`` is one
    of the accepted licences or its ``copyright_holder`` is "Public Domain".

    Returns:
        dict mapping ``translationid`` to ``languagecode``.
    """
    allowed_licences = ["by-nc-nd", "by-nd", "by-sa"]

    translations_info = pd.read_csv(ebible_translations_csv)
    improve_column_names(translations_info)

    copyright_info = pd.read_csv(ebible_copyright_csv)
    improve_column_names(copyright_info)
    copyright_info = copyright_info.rename(columns={'id': 'translationid'})

    merged = pd.merge(translations_info, copyright_info, on='translationid', how='left')

    result = {}
    for _, record in merged.iterrows():
        if not record["redistributable"]:
            continue
        if record["licence_type"] in allowed_licences or record["copyright_holder"] == "Public Domain":
            result[record["translationid"]] = record["languagecode"]

    return result


# Prepare redistributable projects to be extracted. 

In [None]:
# Prepare the redistributable projects: copy each one into the working
# directory and give it a Settings.xml, logging progress to today's run log.

# Make sure the log folder exists before opening the log file inside it.
makedirs(ebible_logs, exist_ok=True)

# Opened in append mode so that several runs on one day share a log file.
# log_and_print() writes to this module-level handle.
log_file = open(ebible_logs / f"run_{date.today()}.log", "a", encoding="utf-8")
try:
    log_and_print("Starting converting eBible projects for extracting...")

    # Create target directory if it doesn't exist already
    makedirs(ebible_redistributable, exist_ok=True)

    # Make dictionary of redistributable projects in eBible.
    redistributable = get_redistributable_projects()

    # Copy redistributable eBible projects into working directory, and add settings files
    copied = 0
    for project in ebible_projects.iterdir():
        if project.name in redistributable:
            copied += copy_to_working_directory(project, redistributable[project.name])

    log_and_print(f"Number of eBible projects: {len(listdir(ebible_projects))}")
    log_and_print(f"Number of redistributable eBible projects: {len(redistributable)}")
    log_and_print(f"{copied} projects copied to {ebible_redistributable}")
    log_and_print(f"Rewrite {rewrite}")
finally:
    # Close (and flush) the log even when a step above raises.
    log_file.close()

Starting converting eBible projects for extracting...
copying ziw to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying aoj to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying sri to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying caoNT to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying priNT to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying sbe to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying ctuBl to /content/drive/Shareddrives/Partnership for Applied Biblical NLP/Data/Corpora/ebible/redistributable/projects
copying paoNT to /content/drive/Shareddrives/Partnership for Appl

# Extract projects

In [None]:
# Run the silnlp extraction for every project in the redistributable folder
# that has no extraction file yet (or for every project when rewrite is True).

# Append to today's run log; log_and_print() writes to this module-level handle.
log_file = open(ebible_logs / f"run_{date.today()}.log", "a")
log_and_print(f"Starting extracting eBible projects...")

# Tell the SIL NLP tools where to find the resources
environ['SIL_NLP_DATA_PATH'] = base

# Tell Python where to find our repo
environ['PYTHONPATH'] = "/env/python:/content/silnlp"

# Remember how many extractions existed before this run so the number of new
# ones can be reported at the end.
extracted = get_extracted_projects(ebible_extractions)
nr_extracted = len(extracted)

for project in ebible_redistributable.glob("*"):
    # Skip projects that already have an extraction, unless rewrite is set.
    if not project.name in extracted or rewrite:
        log_and_print(f"extracting {project}")
        # IPython shell escape; {project} is filled in with the Python value.
        !python -m silnlp.common.extract_corpora "{project}"

# The extraction folder is listed again to count the files added by this run.
log_and_print(f"{len(get_extracted_projects(ebible_extractions)) - nr_extracted} new eBible projects extracted")
log_and_print(f"Rewrite {rewrite}")
log_file.close()