In [1]:
# %pip install beautifulsoup4 lxml html5lib
# %pip install EbookLib

In [2]:
from zipfile import ZipFile
import shutil
import glob



In [3]:
import warnings
from bs4 import MarkupResemblesLocatorWarning, XMLParsedAsHTMLWarning, BeautifulSoup
import bs4
#warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

In [4]:
def trimEnd(s: str, value: str):
  """Return ``s`` with a single trailing occurrence of ``value`` removed.

  Args:
    s: The string to trim.
    value: The suffix to strip. An empty suffix leaves ``s`` untouched.

  Returns:
    ``s`` without the suffix when ``s`` ends with ``value``; otherwise ``s``
    unchanged.
  """
  # A manual ``s[:-len(value)]`` slice degenerates to ``s[:0]`` (an empty
  # string) when ``value`` is empty; ``str.removesuffix`` has no such trap.
  return s.removesuffix(value)

In [5]:


def extractEpub(fromFile: str, toDir: str):
    """Unpack an EPUB from ``input/`` into a working folder under ``data/``.

    Args:
        fromFile: File name of the EPUB inside the ``input`` directory.
        toDir: Name of the folder to create/fill inside the ``data`` directory.
    """
    archivePath = f'input/{fromFile}'
    destination = f'data/{toDir}'
    # The format is passed explicitly because the ".epub" extension is not one
    # that shutil maps to the zip unpacker on its own.
    shutil.unpack_archive(archivePath, extract_dir=destination, format='zip')


def saveEpub(fromDir: str, toFile: str):
    """Pack ``data/<fromDir>`` into ``output/<toFile>.zip`` as an EPUB container.

    The EPUB container format expects the ``mimetype`` entry to be the first
    entry of the archive and to be stored uncompressed. A generic
    "zip the folder" call gives no such guarantee, so the archive is assembled
    by hand: ``mimetype`` first (stored), then everything else (deflated).

    Args:
        fromDir: Name of the unpacked book folder inside ``data``.
        toFile: Archive base name (without extension) to create inside ``output``.

    Raises:
        FileNotFoundError: If ``data/<fromDir>`` is not an existing directory.
    """
    # Local imports keep this definition usable regardless of cell run order.
    import os
    from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile

    sourceRoot = f'data/{fromDir}'
    archivePath = f'output/{toFile}.zip'

    if not os.path.isdir(sourceRoot):
        raise FileNotFoundError(sourceRoot)
    os.makedirs(os.path.dirname(archivePath), exist_ok=True)

    with ZipFile(archivePath, 'w', compression=ZIP_DEFLATED) as archive:
        mimetypePath = os.path.join(sourceRoot, 'mimetype')
        if os.path.isfile(mimetypePath):
            # Must come first and must not be compressed.
            archive.write(mimetypePath, 'mimetype', compress_type=ZIP_STORED)

        for folder, subfolders, files in os.walk(sourceRoot):
            subfolders.sort()  # deterministic traversal order
            if folder != sourceRoot:
                # Keep explicit directory entries (this also preserves empty folders).
                archive.write(folder, os.path.relpath(folder, sourceRoot))
            for name in sorted(files):
                fullPath = os.path.join(folder, name)
                arcName = os.path.relpath(fullPath, sourceRoot)
                if arcName == 'mimetype':
                    continue  # already written as the first entry
                archive.write(fullPath, arcName)

In [6]:

# Names of every *.epub found directly inside ./input; because root_dir is
# used, the entries are relative to that folder rather than full paths.
inputEpubs = glob.glob('*.epub', root_dir='./input')

In [7]:
def readAll(filename: str):
  """Return the whole content of a UTF-8 text file.

  Args:
    filename: Path of the file to read.

  Returns:
    The decoded file content as a single string.
  """
  # The context manager closes the handle even when reading or decoding fails.
  with open(filename, encoding='utf-8') as file:
    return file.read()

def writeAll(filename: str, text: str):
  """Overwrite a file with ``text`` encoded as UTF-8.

  Args:
    filename: Path of the file to create or truncate.
    text: Content to write.
  """
  # The context manager flushes and closes the handle even when writing fails.
  with open(filename, encoding='utf-8', mode='w') as file:
    file.write(text)


In [8]:
from typing import List

from bs4 import NavigableString, Tag

def parseText(soup: BeautifulSoup, text: str):
    """Split ``text`` into nodes in which the start of every word is bold.

    A "word" is a maximal run of characters for which ``str.isalpha()`` is
    true. Everything else (spaces, digits, punctuation, quotes) is passed
    through unchanged; consecutive non-letter characters are kept together as
    one string node.

    Args:
        soup: Factory used to create the ``<b>`` tags and the string nodes
            (``new_tag`` / ``new_string``).
        text: The plain text to convert.

    Returns:
        A list of nodes whose concatenated text equals ``text``. The list is
        empty when ``text`` is empty.
    """
    def embolden(t: str):
        # Bold roughly the first quarter of the word, but always at least one letter.
        dividerIndex = len(t) // 4 + 1
        boldPart = soup.new_tag('b')
        boldPart.string = t[0: dividerIndex]

        remainder = t[dividerIndex:]
        if not remainder:
            # One-letter words are bold in full; do not emit an empty string node.
            return [boldPart]
        return [boldPart, soup.new_string(remainder)]

    result: List[Tag | NavigableString] = []

    startI = 0
    while startI < len(text):
        # Grow the chunk while the characters stay on the same side of the
        # letter / non-letter divide.
        isWord = text[startI].isalpha()
        endI = startI + 1
        while endI < len(text) and text[endI].isalpha() == isWord:
            endI += 1

        chunk = text[startI: endI]
        if isWord:
            result.extend(embolden(chunk))
        else:
            # Wrapped via the soup so every element matches the declared node type.
            result.append(soup.new_string(chunk))
        startI = endI
    return result


In [9]:
import os
# Start from a clean slate: drop whatever an earlier run left in ./data and ./output.
for epubFullFileName in inputEpubs:
  epubShortFileName = trimEnd(epubFullFileName, '.epub')
  # Each removal has its own guard so that a missing working folder does not
  # prevent the stale output file from being deleted (and vice versa).
  try:
    shutil.rmtree(f'./data/{epubShortFileName}')
  except FileNotFoundError:
    pass
  try:
    os.remove(f'./output/{epubFullFileName}')
  except FileNotFoundError:
    pass

# Sweep everything that is still left in the two folders. Entries are removed
# according to what they actually are, so a stray file in ./data or a folder
# in ./output no longer aborts the cell.
for leftoverPath in [*glob.glob('./data/*'), *glob.glob('./output/*')]:
  if os.path.isdir(leftoverPath) and not os.path.islink(leftoverPath):
    shutil.rmtree(leftoverPath)
  else:
    os.remove(leftoverPath)

In [10]:
# Boilerplate lines (running headers, distributor notes) that should be
# dropped from the book instead of being emboldened.
removedText = [
    "THE SEVEN HABITS OF HIGHLY EFFECTIVE PEOPLE",
    "Brought to you by FlyHeart"
]

def splitText(value: str):
    """Split ``value`` into its whitespace-separated words.

    Any run of whitespace (spaces, tabs, newlines, non-breaking spaces) counts
    as one separator, and leading/trailing whitespace yields no empty items.

    Args:
        value: Text to split.

    Returns:
        The list of words; empty for an empty or whitespace-only string.
    """
    # Argument-less split() already discards empty pieces, so no filter is needed.
    return value.split()

# Normalise the phrases once (lower-case word lists) so lookups ignore both
# letter case and the exact spacing between words.
removedText = [splitText(phrase.lower()) for phrase in removedText]

def to_be_removed(value: str):
    """Tell whether ``value`` is one of the boilerplate phrases in ``removedText``.

    The comparison ignores letter case and the amount/kind of whitespace
    between and around the words.

    Args:
        value: Text of a single text node.

    Returns:
        True when the normalised words of ``value`` equal one of the phrases.
    """
    return splitText(value.lower()) in removedText

print(removedText)
#print(['THE', 'SEVEN', 'HABITS', 'OF', 'HIGHLY', 'EFFECTIVE', 'PEOPLE'] in removedText)

[['the', 'seven', 'habits', 'of', 'highly', 'effective', 'people'], ['brought', 'to', 'you', 'by', 'flyheart']]


In [11]:
from typing import Iterable, Iterator

from bs4 import PageElement

In [None]:

from bs4 import ResultSet


def skip(iter: Iterator, count: int):
  """Advance ``iter`` by up to ``count`` items, discarding them.

  Stops quietly when the iterator runs out first. A ``count`` of zero or less
  consumes nothing.

  Args:
    iter: The iterator to advance in place.
    count: Maximum number of items to consume.
  """
  remaining = count
  while remaining > 0:
    try:
      iter.__next__()
    except StopIteration:
      # Nothing left to discard.
      break
    remaining -= 1

def parseTree(soup: BeautifulSoup, start: BeautifulSoup, elements: Iterable[PageElement]):
  """Walk ``elements`` recursively and rewrite every text node in place.

  Text nodes for which ``to_be_removed`` is true are taken out of the tree;
  every other text node is replaced by the nodes returned by ``parseText``.
  Tags are not modified themselves, only descended into.

  Args:
    soup: Document object handed to ``parseText`` for creating new nodes.
    start: Forwarded unchanged to the recursive calls; not otherwise used here.
    elements: The sibling nodes to process (typically ``tag.children``).
  """
  # Work on a snapshot of the siblings. The loop body adds and removes nodes
  # of the very container being walked; with a live iterator a removal makes
  # the following sibling slide into the freed slot and get skipped. The
  # snapshot also means freshly inserted <b> nodes are never revisited.
  for node in list(elements):
    if isinstance(node, Tag):
      parseTree(soup, start, node.children)
      continue

    if not isinstance(node, NavigableString):
      continue

    # NOTE(review): this check presumably also matches NavigableString
    # subclasses such as comments, which would then be turned into visible
    # text - confirm whether those should be skipped.
    text: str = node.string

    if to_be_removed(text):
      node.extract()
    else:
      node.replace_with(*parseText(soup, text))





In [20]:
import os

# Convert every input EPUB: unpack it, embolden the text of each (X)HTML
# document inside, then repack the result as output/bold-<name>.epub.
for epubFullFileName in inputEpubs:
    print(epubFullFileName)
    epubShortFileName = trimEnd(epubFullFileName, '.epub')

    extractEpub(epubFullFileName, epubShortFileName)
    extractedRoot = f'data/{epubShortFileName}'
    htmlFiles = [*glob.glob('**/*.html', root_dir=extractedRoot, recursive=True),
                 *glob.glob('**/*.xhtml', root_dir=extractedRoot, recursive=True)]
    for htmlFile in htmlFiles:
        path = f'{extractedRoot}/{htmlFile}'
        parsedDocument = BeautifulSoup(readAll(path), 'xml')
        body = parsedDocument.find('body')
        if body is None:
            # Nothing to embolden in a document without a <body>; leave the
            # file exactly as it was instead of failing on ``None.children``.
            continue
        parseTree(parsedDocument, body, body.children)
        writeAll(path, str(parsedDocument))
    newFileName = 'bold-' + epubShortFileName
    saveEpub(epubShortFileName, newFileName)
    # saveEpub produces "<name>.zip"; give it the extension readers expect.
    shutil.move(f'./output/{newFileName}.zip', f'./output/{newFileName}.epub')

CoveyStephen-TheSevenHabitsOfHighlyEffectivePeople.epub
______
______
___THE  SEVEN  HABITS  OF  HIGHLY  EFFECTIVE  PEOPLE___
Clearing []
______
___Brought  to  you  by  FlyHeart___
Clearing []
______
___THE SEVEN HABITS OF HIGHLY EFFECTIVE PEOPLE___
Clearing []
______
______
___Stephen R. Covey___
______
______
___THE  SEVEN  HABITS  OF  HIGHLY  EFFECTIVE  PEOPLE___
Clearing []
______
___Brought  to  you  by  FlyHeart___
Clearing []
______
___Stephen Covey has written a remarkable book about the human condition, so elegantly written, so understanding of our embedded concerns, so useful for our organization and personal lives, that it's going to be my gift to everyone I know.___
______
___-- Warren Bennis, author of On Becoming a Leader___
______
___I've never known any teacher or mentor on improving personal effectiveness to generate such an overwhelmingly positive reaction....  This book captures beautifully Stephen's philosophy of principles.___
______
___I think anyone reading it w