In [1]:
from zipfile import ZipFile
import re
import pandas as pd
import datetime

# Lookup table built once when this cell/module runs: books.csv (resolved
# relative to the current working directory) is indexed by its 'BookName'
# column and the 'BookID' column is kept, giving a Series of name -> ID.
# get_book_order() returns this ID as the sort key for a book and
# sanitize_book_name() uses the index to detect unknown names.
# NOTE(review): presumably BookID is the canonical Bible book order — confirm
# against books.csv.
booknames = pd.read_csv('books.csv').set_index('BookName')['BookID']

def get_output_zip(output_dir='/Users/davidmorton/Downloads'):
    """Create a new, timestamped EPUB archive opened for exclusive writing.

    Args:
        output_dir: Directory in which the archive is created. The default
            is the location this script has always written to, so existing
            zero-argument callers are unaffected.

    Returns:
        A ``ZipFile`` opened in ``'x'`` mode whose file name is
        ``output_<YYYYMMDDHHMMSS>.epub`` (local time). The caller is
        responsible for closing it.

    Raises:
        FileExistsError: If a file with the same timestamped name already
            exists; ``'x'`` mode never overwrites.
    """
    # Same digits the ISO string yields once '-', 'T', ':' and the fractional
    # seconds are stripped, produced directly.
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_file_name = f'{output_dir}/output_{timestamp}.epub'
    return ZipFile(output_file_name, 'x')

def get_book_order(book:str):
    """Return the ordering value stored for *book* in ``booknames``.

    Args:
        book: Book name to look up.

    Returns:
        The value ``booknames`` holds for *book*, or ``0`` when the name is
        not one of its keys.
    """
    is_known = book in booknames.keys()
    return booknames[book] if is_known else 0

def sanitize_book_name(book:str):
    """Expand a known abbreviation to the full book name.

    Args:
        book: Book name or abbreviation as it appeared in the text.

    Returns:
        The full name when *book* is a recognised abbreviation, otherwise
        *book* unchanged. A warning is printed when the resulting name is
        not a key of ``booknames``; the name is returned either way.
    """
    abbreviations = {
        'Ps': 'Psalms',
        'Psalm': 'Psalms',
        'Isa': 'Isaiah',
        'Eccl': 'Ecclesiastes',
        'Dan': 'Daniel',
        '1 Cor': '1 Corinthians',
        'Gen': 'Genesis',
        'Rev': 'Revelation',
        'Rom': 'Romans',
        '1 Tim': '1 Timothy',
        'Matt': 'Matthew',
        '1 Pet': '1 Peter',
        '2 Cor': '2 Corinthians',
        'Ge': 'Genesis',
    }

    full_name = abbreviations.get(book, book)

    if full_name not in booknames.keys():
        print(f'Warning! {full_name} is not in booknames')
    return full_name

def divide_file(content:str):
    """Split *content* into plain-text runs and scripture-reference records.

    Args:
        content: Text to scan for references of the form
            ``Book C:V`` or ``Book C:V-V`` (hyphen or en dash).

    Returns:
        A list that alternates between strings and tuples: the text before
        each match, then a tuple ``(matched_text, book_order, book, chapter,
        verse_start, verse_end, reference)``, and finally the text after the
        last match. With no matches the list holds just *content*.
    """
    regex = '((?:[123] )?[a-zA-Z]+) (\\d+)\\:(\\d+)(?:[-–](\\d+))?'
    pieces = []
    cursor = 0

    for hit in re.finditer(regex, content):
        raw_book, raw_chapter, raw_first, raw_last = hit.groups()
        book = sanitize_book_name(raw_book)
        book_order = get_book_order(book)
        chapter = int(raw_chapter)
        verse_start = int(raw_first)
        # A reference without a range covers a single verse.
        verse_end = int(raw_last) if raw_last is not None else verse_start

        if verse_end == verse_start:
            reference = f"{book} {chapter}:{verse_start}"
        else:
            reference = f"{book} {chapter}:{verse_start}-{verse_end}"

        begin, finish = hit.span()
        pieces.append(content[cursor:begin])
        pieces.append((content[begin:finish], book_order, book, chapter, verse_start, verse_end, reference))
        cursor = finish

    # Whatever follows the last match (or the whole text when nothing matched).
    pieces.append(content[cursor:])

    return pieces

def refigure_file(filename, divisions):
    """Rebuild a file's text, tagging every scripture reference with an anchor.

    Args:
        filename: Name of the file the divisions came from; used as the
            prefix of each link target (``<filename>#ref<N>``).
        divisions: Sequence mixing plain strings and 7-tuples
            ``(matched_text, book_order, book, chapter, verse_start,
            verse_end, reference)``.

    Returns:
        A tuple ``(output, references)``. *output* is the reassembled text
        in which each tuple is replaced by an empty ``scripture-ref`` anchor
        followed by its normalised reference. *references* lists one
        ``(target, book_order, book, chapter, verse_start, verse_end,
        reference)`` tuple per anchor, numbered from 1 in order of
        appearance.
    """
    parts = []
    references = []
    for division in divisions:
        if isinstance(division, str):
            parts.append(division)
            continue

        # The originally matched text is intentionally dropped: the
        # normalised reference string is emitted in its place.
        _, book_order, book, chapter, verse_start, verse_end, reference = division
        anchor_id = f'ref{len(references) + 1}'
        parts.append(f"<a class='scripture-ref' id='{anchor_id}'/>{reference}")
        references.append((f'{filename}#{anchor_id}', book_order, book, chapter, verse_start, verse_end, reference))

    # Single join instead of repeated string concatenation.
    return ''.join(parts), references
        
def update_content_opf(content):
    """Register the scripture index page in the package document text.

    Args:
        content: Text of the ``content.opf`` file.

    Returns:
        *content* with a manifest ``<item>`` inserted before each
        ``</manifest>`` and a spine ``<itemref>`` inserted before each
        ``</spine>``.
    """
    manifest_close = '</manifest>'
    spine_close = '</spine>'
    manifest_item = '    <item href="sections/scripture_index.xhtml" id="scripture-index" media-type="application/xhtml+xml"/>'
    spine_item = '<itemref idref="scripture-index"/>'

    with_manifest = content.replace(manifest_close, manifest_item + manifest_close)
    return with_manifest.replace(spine_close, spine_item + spine_close)

def update_navmap(content):
    """Add a navigation point for the scripture index to the NCX text.

    Args:
        content: Text of the ``toc.ncx`` file.

    Returns:
        *content* with the scripture-index ``<navPoint>`` inserted before
        each ``</navMap>``.
    """
    closing_tag = '</navMap>'
    nav_point = '<navPoint class="document" id="scripture-index" playOrder="17"><navLabel><text>Appendix E: Scripture Index</text></navLabel><content src="sections/scripture_index.xhtml"/></navPoint>'
    return content.replace(closing_tag, nav_point + closing_tag)

def update_toc(content):
    """Add a link to the scripture index to the table-of-contents page.

    Args:
        content: Text of the table-of-contents XHTML page.

    Returns:
        *content* with the scripture-index paragraph inserted before each
        ``</body>``.
    """
    closing_tag = '</body>'
    toc_entry = '<p class="toc-para"><span class="toc-entry"><a href="scripture_index.xhtml">Appendix E: Scripture Index</a></span></p>'
    return content.replace(closing_tag, toc_entry + closing_tag)

def update_styleshee(content):
    """Append the CSS rules used by the scripture index to a stylesheet.

    Args:
        content: Existing stylesheet text.

    Returns:
        *content* followed by the rules for the index page classes and the
        ``scripture-ref`` anchors.
    """
    index_rules = '''
.idx-book-p {
  background-color: transparent;
  line-height: 100%;
  margin: .2in 0 0.1in 0;
  text-align: justify;
}
.idx-book-span {
  font-family: "Open Sans", sans-serif;
  font-size: 12pt;
  font-weight: bold;
}
.idx-ref-p {
  background-color: transparent;
  line-height: 100%;
  margin: 0 0 0.05in .2in;
  text-align: justify;
}
.idx-ref-span {
  font-family: "Open Sans", sans-serif;
  font-size: 10pt;
  width: 200px;
  display: inline-block;
}
.idx-ref-scr-span {
  font-family: "Open Sans", sans-serif;
  font-size: 10pt;
  padding-right: 10px;
}
.scripture-ref {
  color: inherit;
}'''
    return content + index_rules

def add_index_page(output_zip, references):
    """Build the scripture index page and write it into the output archive.

    Entries whose order is ``0`` are left out. Books are listed by ascending
    order value; within a book, references are listed by chapter, then
    starting verse, then ending verse. Each distinct reference gets one line
    with numbered links (``[1]``, ``[2]``, ...) to every place it occurs.

    Args:
        output_zip: Writable ``ZipFile``; the page is stored as
            ``OEBPS/sections/scripture_index.xhtml``.
        references: Iterable of 7-tuples ``(file, order, book, chapter,
            start, end, reference)`` where *file* is the link target
            including its ``#ref<N>`` fragment.
    """
    refs = pd.DataFrame(references, columns=['file', 'order', 'book', 'chapter', 'start', 'end', 'reference'])
    refs = refs[refs['order'] != 0]
    refs = refs.sort_values(by=['order', 'chapter', 'start', 'end'])
    # The index page is written under OEBPS/sections/, so links to files in
    # that folder are made relative to it. regex=False pins the literal
    # replacement regardless of the pandas version's default.
    refs['file'] = refs['file'].str.replace('OEBPS/sections/', '', regex=False)

    parts = [
        '<?xml version="1.0" encoding="UTF-8"?><html xmlns="http://www.w3.org/1999/xhtml"><head><link href="../styles/stylesheet.css" rel="stylesheet" type="text/css"/></head><body class="body0" xmlns:epub="http://www.idpf.org/2007/ops">',
        '<p class="para2"><span class="span4">Appendix E: Scripture Index</span></p>',
    ]
    for (_, book), book_rows in refs.groupby(by=['order', 'book']):
        parts.append(f"<p class='idx-book-p'><span class='idx-book-span'>{book}</span></p>")
        # sort=False keeps the chapter/verse order established by
        # sort_values above; the default would re-sort the reference strings
        # alphabetically and put e.g. "10:1" ahead of "2:1".
        for reference, ref_rows in book_rows.groupby(by='reference', sort=False):
            label = reference.replace(book + ' ', '')
            parts.append(f"<p class='idx-ref-p'><span class='idx-ref-span'>{label}</span>")
            for number, row in enumerate(ref_rows.itertuples(), start=1):
                parts.append(f"<span class='idx-ref-scr-span'><a href='{row.file}'>[{number}]</a></span>")
            parts.append('</p>\n')

    parts.append('</body></html>')

    output_zip.writestr('OEBPS/sections/scripture_index.xhtml', ''.join(parts).encode('utf-8'))


def add_scripture_index_to_file(source):
    """Write a copy of an EPUB with scripture references linked and indexed.

    Every ``.xhtml`` entry is scanned for scripture references, which are
    tagged with anchors. The package document (``content.opf``), the NCX
    navigation map (``toc.ncx``), the table-of-contents page
    (``section0002.xhtml``) and the stylesheet are patched so that they
    include the new index page. All other entries are copied unchanged, and
    the generated index page is added last.

    Args:
        source: Path of the EPUB to read (anything ``ZipFile`` accepts).

    Returns:
        The file name of the newly written EPUB.
    """
    references = []

    # Context managers guarantee both archives are closed even when
    # processing an entry raises.
    with ZipFile(source) as z, get_output_zip() as output_zip:
        for item in z.infolist():
            name = item.filename
            bcontent = z.read(item)
            if name.endswith('.xhtml'):
                content, new_refs = refigure_file(name, divide_file(bcontent.decode('utf-8')))
                bcontent = content.encode('utf-8')
                references += new_refs
            # The checks below are deliberately independent: one entry may
            # match more than one of them (the table-of-contents page is
            # also an .xhtml file).
            if 'content.opf' in name:
                bcontent = update_content_opf(bcontent.decode('utf-8')).encode('utf-8')
            if 'toc.ncx' in name:
                bcontent = update_navmap(bcontent.decode('utf-8')).encode('utf-8')
            if 'section0002.xhtml' in name: # Table of contents
                bcontent = update_toc(bcontent.decode('utf-8')).encode('utf-8')
            if 'stylesheet.css' in name:
                bcontent = update_styleshee(bcontent.decode('utf-8')).encode('utf-8')

            # Keep each entry's original compression method; with only a
            # name, writestr falls back to the output archive's default
            # (stored), which would write every entry uncompressed.
            output_zip.writestr(name, bcontent, compress_type=item.compress_type)

        add_index_page(output_zip, references)
        return output_zip.filename

# Run the conversion on the source EPUB and print the path of the generated
# archive (returned by add_scripture_index_to_file).
file_name = '/Users/davidmorton/Downloads/Awake, Oh Sleeper! - David Morton.epub'
print(add_scripture_index_to_file(file_name))

/Users/davidmorton/Downloads/output_20230506053247.epub


In [39]:
# Scratch cell: show the path of the archive currently bound to `output_zip`.
# NOTE(review): `output_zip` is not defined in this cell — presumably the
# module-level one created by another scratch cell; confirm.
output_zip.filename

'/Users/davidmorton/Downloads/output_20230505170001.epub'

In [71]:
# Scratch cell: print the ZipInfo of every entry in the archive `z`, and
# additionally dump the raw bytes of the `mimetype` entry.
# NOTE(review): `z` is not defined in this cell — it is expected to be a
# ZipFile opened by another cell.
for info in z.filelist:
    if info.filename == 'mimetype':
        print(z.read(info))
    print(info)

b'application/epub+zip'
<ZipInfo filename='mimetype' filemode='-rw-r--r--' file_size=20>
<ZipInfo filename='inside-title.xhtml' compress_type=deflate filemode='-rw-r--r--' file_size=488 compress_size=293>
<ZipInfo filename='META-INF/container.xml' compress_type=deflate filemode='-rw-r--r--' file_size=239 compress_size=168>
<ZipInfo filename='META-INF/calibre_bookmarks.txt' compress_type=deflate filemode='-rw-r--r--' file_size=200 compress_size=169>
<ZipInfo filename='OEBPS/CrimsonText-Bold.otf' compress_type=deflate filemode='-rw-r--r--' file_size=201204 compress_size=78315>
<ZipInfo filename='OEBPS/toc.xhtml' compress_type=deflate filemode='-rw-r--r--' file_size=1340 compress_size=434>
<ZipInfo filename='OEBPS/content.opf' compress_type=deflate filemode='-rw-r--r--' file_size=4748 compress_size=1054>
<ZipInfo filename='OEBPS/CrimsonText-BoldItalic.otf' compress_type=deflate filemode='-rw-r--r--' file_size=208596 compress_size=83448>
<ZipInfo filename='OEBPS/CrimsonText-Italic.otf' com

In [73]:
# Scratch cell: copy every entry of the source EPUB, unmodified, into a fresh
# timestamped output archive.
# NOTE(review): `z` is left open here (only `output_zip` is closed); other
# cells read from `z` afterwards.
z = ZipFile(file_name)
output_zip = get_output_zip()

for f in z.filelist:
    # Entries are written by name only, so the source entry's other ZipInfo
    # metadata is not carried over.
    output_zip.writestr(f.filename, z.read(f))
output_zip.close()