In [1]:
import os
import pandas as pd
import re
import numpy as np

In [2]:
# Root folder that holds the exported HTML files (Windows-style relative path).
source = r"..\html"
# os.walk yields (dirpath, dirnames, filenames) for the root and every folder
# below it; only the folder paths are needed here.
directories = [dirpath for dirpath, _subdirs, _filenames in os.walk(source)]

In [3]:
# Collect the names of the HTML files to process.
# A file is kept when its name ends with ".html", its third character is "_"
# and its sixth character is not "-".
files = []
for directory in directories:
    # os.listdir returns entries in arbitrary order. The files are later read and
    # concatenated in list order, so sort each listing to keep that order reproducible.
    for file in sorted(os.listdir(directory)):
        if file.endswith(".html") and file[2] == "_" and file[5] != "-":
            files.append(file)

In [4]:
# Index of the selected files. Every attribute is sliced out of the file name:
#   folder -> first two characters
#   volume -> second character
#   item   -> fourth and fifth characters
# The path is rebuilt as <source>\<folder>\<filename>.
filenames = pd.DataFrame(data=files, columns=["filename"])["filename"]
folder_names = filenames.str.slice(stop=2)
df = pd.DataFrame({
    "path": source + "\\" + folder_names + "\\" + filenames,
    "filename": filenames,
    "folder": folder_names,
    "volume": filenames.str.get(1),
    "item": filenames.str.slice(3, 5),
})

In [5]:
# Paths of every file whose volume character is "1".
D1 = df.loc[df["volume"] == "1", "path"]

In [6]:
# Read every selected file; each entry of `content` is the list of raw lines
# (line endings included) of one file.
content = []
for html_path in D1:
    with open(html_path, mode="r", encoding="UTF-8") as handle:
        content.append(list(handle))

In [7]:
# Lookup table: CSS class found on a <p> tag -> part of the text it represents.
# The classes are listed per target part; the empty class maps to the empty part.
class_dictionary = {
    css_class: part
    for part, css_classes in (
        ("text_content", (
            "indent", "space-break1", "normal", "normal-1", "normal1",
            "bodytext", "bodytextb", "bodytextt", "bodytext1", "extract2",
        )),
        ("opening_caption", ("right", "space-break")),
        ("opening_text", (
            "blockquote", "noindent", "bodytext-leftz", "bodytext-leftza",
            "hanging1", "hanging", "extract-indent1",
        )),
        ("text_poem", (
            "linegroup", "line", "line1", "linex", "extract", "extractba",
            "extract-indent", "extract-indent-a1", "extract-indent-b1", "extract-3",
        )),
        ("chapter_title", ("chapter-title",)),
        ("text_caption", ("bodytext-left",)),
        ("", ("",)),
    )
    for css_class in css_classes
}

In [8]:
def PartOfText(text, class_map=None):
    """Recognize which part of the text a line of markup belongs to.

    The class attribute of the first ``<p class="...">`` tag in ``text`` is
    looked up in ``class_map``. The attribute is located with a regular
    expression, so tags that precede the paragraph tag and further attributes
    after ``class`` do not disturb the lookup.

    Parameters
    ----------
    text : str
        One line of (partially cleaned) HTML.
    class_map : dict, optional
        Mapping of CSS class name -> part name. Defaults to the notebook-level
        ``class_dictionary``.

    Returns
    -------
    str
        The mapped part name, or "" when the line has no ``<p class=`` tag.

    Raises
    ------
    KeyError
        If the class is not present in the mapping, so that unknown classes
        are noticed instead of being silently dropped.
    """
    if class_map is None:
        class_map = class_dictionary

    # Quoted value right after `<p class=`; the backreference makes sure the
    # closing quote is the same character as the opening one.
    found = re.search(r"""<p class=(["'])(.*?)\1""", text)
    if found is None:
        return ""

    return class_map[found.group(2)]

In [9]:
def ExtractText(text):
    """Strip one line of HTML down to its text.

    Surrounding whitespace is removed first, then the substitutions below are
    applied one after another, in the listed order. Emphasis tags become
    single typographic quotes; everything else is deleted. The tag patterns
    use greedy ``.*``, so a match runs up to the last ``>`` on the line.
    Paragraph opening tags (``<p class=...>``) are not removed here unless one
    of the patterns below happens to match them; heading (``<h5>``) and
    ``<strong>`` tags are left untouched.

    Returns the cleaned string.
    """
    substitutions = (
        ("<em>", "‘"),              # emphasised "thoughts" -> opening quote
        ("</em>", "’"),             # emphasised "thoughts" -> closing quote
        ("\r\n", ""),               # line feeds
        ("\n", ""),                 # line feeds
        ("</p>", ""),               # paragraph end tags
        ("<a id=.*>", ""),          # id="page..." anchors
        ("<.*blockquote.*>", ""),
        ("<.*head>", ""),
        ("<.*body>", ""),
        ("<.*title.*>", ""),
        ("<.*link.*>", ""),
        ("<.*meta.*>", ""),
        ("<.*xml.*>", ""),
    )

    cleaned = str(text.strip())
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [10]:
# Flatten the per-file line lists into one list of lines, keeping file order.
book = [line for file_lines in content for line in file_lines]

In [11]:
# Turn every line into a [part, text] pair: the part comes from the paragraph
# class, the text is the cleaned line with its `<p class=...>` tag removed.
# NOTE(review): iteration starts at the second line, so book[0] is never
# classified - confirm that skipping the very first line is intended.
book_classified = []
for raw_line in book[1:]:
    cleaned = ExtractText(raw_line)
    book_classified.append([PartOfText(cleaned), re.sub("<p class=.*>", "", cleaned)])

In [12]:
# Keep only the [part, text] pairs whose text is not empty.
book_condensed = [[part, text] for part, text in book_classified if text != ""]

In [13]:
df_book_long = pd.DataFrame(book_condensed, columns=["part", "content"])

# Trailing space so that lines merged below stay separated.
df_book_long['content'] = df_book_long['content'] + ' '

# 1-based position of each row counted from the most recent 'opening_caption' row.
df_book_long['chapter_count'] = df_book_long.groupby(df_book_long.part.eq('opening_caption').cumsum()).cumcount() + 1

# A row with a non-empty part starts a new paragraph; the rows with an empty
# part that follow it are continuation lines of that paragraph.
paragraph_id = df_book_long.part.ne('').cumsum()

# 1-based line number inside the paragraph.
df_book_long['line_count'] = df_book_long.groupby(paragraph_id).cumcount() + 1

# Running concatenation of the paragraph's lines. `transform` is used instead of
# `apply` because it always returns a result indexed like the frame; `apply`
# prepends the group keys to the index on pandas >= 2.0, which breaks the
# column assignment.
df_book_long['content'] = df_book_long.groupby(paragraph_id)['content'].transform(lambda x: x.cumsum())

# The last line of a paragraph holds the complete concatenated text.
df_book_long['take'] = df_book_long['line_count'] == df_book_long.groupby(paragraph_id)['line_count'].transform('max')

# Continuation lines have an empty part, so the running concatenation carries
# the paragraph's first part down to its last line.
df_book_long['text_part'] = df_book_long.groupby(paragraph_id)['part'].transform(lambda x: x.cumsum())

In [37]:
# Keep one row per paragraph: the row flagged `take` carries the full text.
df_book = df_book_long[df_book_long['take']].reset_index()
# Explicit copy so that the column assignments below write to an independent frame.
df_book = df_book[['text_part', 'content', 'chapter_count']].copy()

# A row with chapter_count == 1 marks the start of a chapter.
df_book['chapter_start'] = df_book['chapter_count'] == 1
# 1 for a chapter start, 0 otherwise. Assigned directly rather than through a
# chained `replace(..., inplace=True)`, which is not guaranteed to reach the frame.
df_book['chapter_num'] = df_book['chapter_start'].astype(int)
# Running number of chapter starts, minus one ...
df_book['chapter'] = df_book['chapter_num'].cumsum() - 1
# ... moved up by one row so that the row preceding a chapter start already
# carries the new number; the gap this leaves in the last row is filled forward.
df_book['chapter'] = df_book['chapter'].shift(-1).ffill()

# Drop the helper columns and the first row; copy because later cells add
# columns to this frame.
df_book = df_book[['chapter', 'text_part', 'content']][1:].copy()

In [38]:
# A caption row is one whose content starts with an em dash.
caption_mask = df_book['content'].str[0] == "—"
df_book['opening_caption'] = caption_mask

# True on the row right after a caption. The shift leaves a missing value in the
# first row that ffill cannot fill, and `NaN != False` is True, so the first row
# is flagged as well.
df_book['text_content'] = caption_mask.shift(1).ffill() != False

# True on the row right before a caption; the missing value the shift leaves in
# the last row is filled from the row above it.
df_book['opening_text'] = caption_mask.shift(-1).ffill() != False

In [39]:
# Relabel the rows flagged as text_content. One .loc call writes to df_book
# itself; a chained `df_book['text_part'].iloc[...] = ...` assigns through an
# intermediate Series and is not guaranteed to reach the frame.
df_book.loc[df_book['text_content'] == True, 'text_part'] = 'text_content'

In [40]:
# Relabel the rows flagged as opening_caption. One .loc call writes to df_book
# itself; a chained `df_book['text_part'].iloc[...] = ...` assigns through an
# intermediate Series and is not guaranteed to reach the frame.
df_book.loc[df_book['opening_caption'] == True, 'text_part'] = 'opening_caption'

In [41]:
# Relabel the rows flagged as opening_text. One .loc call writes to df_book
# itself; a chained `df_book['text_part'].iloc[...] = ...` assigns through an
# intermediate Series and is not guaranteed to reach the frame.
df_book.loc[df_book['opening_text'] == True, 'text_part'] = 'opening_text'

In [42]:
# Preview the first rows of the relabelled frame.
df_book.head()

Unnamed: 0,chapter,text_part,content,opening_caption,text_content,opening_text
1,1.0,opening_text,A beginning is the time for taking the most de...,False,True,True
2,1.0,opening_caption,—from ‘Manual of Muad’Dib’ by the Princess Iru...,True,False,False
3,1.0,text_content,"In the week before their departure to Arrakis,...",False,True,False
4,1.0,text_content,"It was a warm night at Castle Caladan, and the...",False,False,False
5,1.0,text_content,The old woman was let in by the side door down...,False,False,False


In [43]:
# Spot check by position: in the output shown below, this window covers the
# rows where `chapter` switches from 1.0 to 2.0.
df_book.iloc[165:175]

Unnamed: 0,chapter,text_part,content,opening_caption,text_content,opening_text
166,1.0,text_content,"‘Yes, the one who can be many places at once: ...",False,False,False
167,1.0,text_content,"‘They tried and failed, all of them?’",False,False,False
168,1.0,text_content,"‘Oh, no.’ She shook her head. ‘They tried and ...",False,False,False
169,2.0,opening_text,To attempt an understanding of Muad’Dib withou...,False,False,True
170,2.0,opening_caption,—from ‘Manual of Muad’Dib’ by the Princess Iru...,True,False,False
171,2.0,text_content,"It was a relief globe of a world, partly in sh...",False,True,False
172,2.0,text_content,An ellipsoid desk with a top of jade-pink petr...,False,False,False
173,2.0,text_content,Both youth and man stared at the globe and the...,False,False,False
174,2.0,text_content,A chuckle sounded beside the globe. A basso vo...,False,False,False
175,2.0,text_content,"Assuredly, Baron,’ said the man. His voice cam...",False,False,False


In [44]:
# Export chapter number, text part and content of every paragraph.
# to_csv does not create missing folders, so make sure the target folder exists
# before writing.
os.makedirs('output', exist_ok=True)
df_book.to_csv(r'output/D1.csv',
               index=False,
               columns=['chapter', 'text_part', 'content'])