In [1]:
import os
import pandas as pd
import re

In [2]:
# Root folder of the exported HTML files, relative to the notebook (Windows-style path).
source = r"..\html"
# os.walk yields (dirpath, dirnames, filenames); only the directory paths are kept,
# which includes `source` itself and every folder nested below it.
directories = [dirpath for dirpath, _subdirs, _filenames in os.walk(source)]

In [3]:
# Collect the names of the HTML files to process: the third character must be "_"
# and the sixth character must not be "-".
files = [
    entry
    for folder_path in directories
    for entry in os.listdir(folder_path)
    if entry.endswith(".html") and entry[2] == "_" and entry[5] != "-"
]

In [4]:
# Derive folder / volume / item from fixed character positions of each filename and
# rebuild the file path as <source>\<folder>\<filename>.
filename_col = pd.DataFrame(data=files, columns=["filename"])["filename"]
folder_col = filename_col.str[:2]
df = pd.DataFrame(
    {
        "path": source + "\\" + folder_col + "\\" + filename_col,
        "filename": filename_col,
        "folder": folder_col,
        "volume": filename_col.str[1],
        "item": filename_col.str[3:5],
    }
)

In [5]:
# Paths of every file whose volume character is "1".
D1 = df.loc[df["volume"] == "1", "path"]

In [6]:
# Read every selected file into a list of its lines (one inner list per file).
content = []
for html_path in D1:
    with open(html_path, mode="r", encoding="UTF-8") as handle:
        content.append(list(handle))

In [7]:
# Paragraph CSS classes grouped by the text-part label they map to.
_classes_by_part = [
    ("text_content", ["indent", "space-break1"]),
    ("opening_caption", ["right", "space-break"]),
    ("opening_text", ["blockquote", "noindent"]),
    ("text_poem", ["linegroup", "line", "line1", "linex"]),
    ("chapter_title", ["chapter-title"]),
    ("", [""]),
]

# Lookup table: CSS class of a <p> tag -> text-part label.
class_dictionary = {
    css_class: part
    for part, css_classes in _classes_by_part
    for css_class in css_classes
}

In [8]:
def PartOfText(text):
    """Classify a line of markup by the class of its ``<p class="...">`` tag.

    Parameters
    ----------
    text : str
        One line of markup, e.g. ``'<p class="indent">Some text'``.

    Returns
    -------
    str
        The label stored in ``class_dictionary`` for the paragraph's class, or
        ``""`` when the line contains no ``<p class=`` tag.

    Raises
    ------
    KeyError
        If the extracted class name is not a key of ``class_dictionary``.
    """
    marker = "<p class="
    tag_start = text.find(marker)
    if tag_start == -1:
        return ""

    # Skip the marker and the opening quote of the attribute value.
    class_start = tag_start + len(marker) + 1
    # Search for the ">" that closes *this* tag. Searching from the start of the
    # line would pick up a ">" belonging to an earlier tag and yield an empty
    # slice, silently classifying the paragraph as "".
    # The "- 1" drops the closing quote, so the value is expected to be
    # immediately followed by `">`.
    class_end = text.find(">", tag_start) - 1

    return class_dictionary[text[class_start:class_end]]

In [9]:
def ExtractText(text):
    """Strip structural markup from one line of book HTML.

    Emphasis tags become single quotation marks, line breaks, ``</p>`` and
    anchor tags are removed, and document-level tags (head, body, title, link,
    meta, xml, blockquote) are deleted. An opening ``<p class="...">`` tag is
    deliberately left in place so the caller can still classify the line.

    Parameters
    ----------
    text : str
        A raw line of HTML.

    Returns
    -------
    str
        The cleaned line; an empty string when the line held only markup.
    """
    text = text.strip()
    # Render emphasised "thoughts" as quoted dialogue.
    text = text.replace("<em>", "‘").replace("</em>", "’")
    # Remove any line breaks left inside the string.
    text = text.replace("\r\n", "").replace("\n", "")
    # Remove paragraph end tags.
    text = text.replace("</p>", "")
    # Remove id="page..." anchors. The match stops at the anchor's own ">" so a
    # later tag on the same line cannot cause the text in between to be deleted.
    text = re.sub(r"<a id=[^>]*>(?:</a>)?", "", text)
    # Remove <blockquote> tags only. The tag name must follow "<" or "</" so
    # that <p class="blockquote"> keeps its tag for classification.
    text = re.sub(r"</?blockquote\b[^>]*>", "", text)
    text = re.sub("<.*head>", "", text)
    text = re.sub("<.*body>", "", text)
    # Remove the <title> element including its text, but leave
    # <p class="chapter-title"> alone (the old pattern matched "title" anywhere
    # inside a tag and stripped it).
    text = re.sub(r"</?title\b.*>", "", text)
    text = re.sub("<.*link.*>", "", text)
    text = re.sub("<.*meta.*>", "", text)
    text = re.sub("<.*xml.*>", "", text)

    return text

In [10]:
# Flatten the per-file line lists into one list of lines, keeping file and line order.
book = [line for file_lines in content for line in file_lines]

In [11]:
# Pair every line with its text-part label: [label, text without the <p class=...> tag].
# NOTE(review): the very first line of `book` is skipped, as in the original loop
# (range started at 1) - confirm this is intentional.
book_classified = []
for raw_line in book[1:]:
    cleaned = ExtractText(raw_line)
    label = PartOfText(cleaned)
    book_classified.append([label, re.sub("<p class=.*>", "", cleaned)])

In [12]:
# Keep only the [label, text] pairs whose text is not empty.
book_condensed = [
    [label, text]
    for label, text in book_classified
    if text != ""
]

In [13]:
# One row per non-empty line: its text-part label and its text.
df_book_long = pd.DataFrame(
    data=book_condensed,
    columns=["part", "content"],
)

In [14]:
# Merge consecutive lines into paragraphs. A row with a non-empty `part` starts a
# new paragraph; the rows with an empty `part` that follow it are continuations.
df_book_long['content'] = df_book_long['content'] + ' '

# Running paragraph number, computed once and reused for every grouping below.
paragraph_id = df_book_long['part'].ne('').cumsum()

# Position of each row inside its paragraph (1-based).
df_book_long['count'] = df_book_long.groupby(paragraph_id).cumcount() + 1
# Running concatenation of the paragraph's text. group_keys=False keeps the
# original row index on the result; without it pandas >= 2.0 prepends the group
# key to the index and the column assignment fails.
df_book_long['text_content'] = (
    df_book_long['content']
    .groupby(paragraph_id, group_keys=False)
    .apply(lambda x: x.cumsum())
)
# The last row of each paragraph carries the complete text.
df_book_long['take'] = (
    df_book_long['count']
    == df_book_long['count'].groupby(paragraph_id).transform('max')
)
# Carry the paragraph's label down to all of its rows (continuations add "").
df_book_long['text_part'] = (
    df_book_long['part']
    .groupby(paragraph_id, group_keys=False)
    .apply(lambda x: x.cumsum())
)

In [15]:
# Keep the final row of every paragraph, with a fresh 0..n-1 index and only the
# label and full-text columns.
df_book = (
    df_book_long
    .loc[df_book_long['take'] == True, ['text_part', 'text_content']]
    .reset_index(drop=True)
)

In [None]:
# No target path is given, so pandas returns the CSV text (shown as the cell
# output); nothing is written to disk.
df_book.to_csv(path_or_buf=None, index=False)