In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup

All files are stored as HTML. Each book has its own folder. Each file follows the same naming convention: ```D<book_volume>_<two_digit_chapter_number>.html```

In [2]:
source = r"..\html"

# Every directory under `source`, including `source` itself
directories = [dirpath for dirpath, _subdirs, _names in os.walk(source)]

# Collect the names of the chapter files found in those directories.
# os.listdir() returns entries in arbitrary order, so each listing is sorted:
# the chapter counter further down relies on files being read in chapter order.
files = [
    name
    for directory in directories
    for name in sorted(os.listdir(directory))
    if name.endswith(".html") and name[2] == "_" and name[5] != "-"
]

In [3]:
# Build a frame of file paths and volume numbers from the collected file names.
# path   = source \ <first two characters of the file name> \ <file name>
# volume = second character of the file name (kept as a string)
df = (
    pd.DataFrame(files, columns=["filename"])
    .assign(
        path=lambda frame: source + "\\" + frame["filename"].str[:2] + "\\" + frame["filename"],
        volume=lambda frame: frame["filename"].str[1],
    )
    [["path", "volume"]]
)

In [4]:
# Accumulator for the [Book, Chapter, Class, Text] rows appended by the main loop
DuneCronicles = list()

Each book has a CSS class (some books share the same one) that marks the start of a chapter and is used for nothing else.

In [5]:
# Book number -> class name that marks the start of a chapter in that book
ChapterStarters = {
    1: 'blockquote',
    2: 'blockquote1a',
    3: 'extract',
    4: 'extract',
    5: 'epigraph',
    6: 'extracts',
    7: 'blockquote',
    8: 'blockquote',
}

Main loop to create one big list with all paragraphs from every book.

In [6]:
# Start from an empty list so that re-running this cell does not append every row twice
DuneCronicles.clear()

for Book in range(1, 9):  # One pass per book volume
    Dune = df.loc[df["volume"] == str(Book), "path"]
    Chapter = 0  # Running chapter counter, restarted for every book

    for html_file in Dune:  # One pass per HTML file of this book
        with open(html_file, encoding="utf8") as markup:
            soup = BeautifulSoup(markup, 'html.parser')

        # <p> and <blockquote> tags of the body, in document order
        for tag in soup.body.find_all(['p', 'blockquote']):
            # First class name of the tag; tags without a class attribute get ''
            # instead of raising KeyError
            Class = (tag.get('class') or [''])[0]
            # Strip line breaks (and the 8-space indent that follows them), then
            # collapse runs of spaces with two passes of double-space replacement
            Text = (
                tag.get_text()
                .replace('\n        ', '')
                .replace('\n', '')
                .replace('  ', ' ')
                .replace('  ', ' ')
            )
            # A chapter-start class opens a new chapter; the row itself belongs to it
            if Class == ChapterStarters[Book]:
                Chapter += 1
            DuneCronicles.append([Book, Chapter, Class, Text])

In [7]:
# One row per collected paragraph
dfBook = pd.DataFrame(
    DuneCronicles,
    columns=['Book', 'Chapter', 'Class', 'Text'],
)

**Duplicates**: flags used to handle duplicates in Book 1 (the code applies the same rule to Book 8), where 'blockquote' is the parent of 'noindent', which causes a duplicate record for chapter-opening quotes.

**EmptyLines**: flags for empty lines at the beginning of each chapter in Books 7 & 8.

In [8]:
# Per-row boolean flags, one entry per row of DuneCronicles once filled
Duplicates = []
EmptyLines = []

In [9]:
for record in DuneCronicles:
    book_no, css_class = record[0], record[2]

    # 'blockquote' rows of Book 1 and Book 8
    Duplicates.append(book_no in (1, 8) and css_class == 'blockquote')

    # Rows whose class starts with 'image' or is one of the listed classes
    EmptyLines.append(
        css_class.startswith('image')
        or css_class in ('linespace', 'right-para', 'center-para')
    )

In [10]:
# Shift the flags down by one row (same length as before), so each flag lands on
# the row that follows a flagged 'blockquote' row
Duplicates = ([False] + Duplicates)[:-1]

dfBook = dfBook.assign(Duplicates=Duplicates, EmptyLines=EmptyLines)

# Keep only rows flagged neither as duplicates (Book 1) nor as empty lines (Book 7 & 8)
keep = dfBook['Duplicates'].eq(False) & dfBook['EmptyLines'].eq(False)
dfBook = dfBook[keep]

In [11]:
# Renumber rows 0..n-1 after filtering, discarding the old index
dfBook = dfBook.reset_index(drop=True)

In [12]:
# Preview the first five rows
dfBook.head(5)

Unnamed: 0,Book,Chapter,Class,Text,Duplicates,EmptyLines
0,1,1,blockquote,A beginning is the time for taking the most de...,False,False
1,1,1,right,—from ‘Manual of Muad’Dib’ by the Princess Irulan,False,False
2,1,1,noindent,"In the week before their departure to Arrakis,...",False,False
3,1,1,indent,"It was a warm night at Castle Caladan, and the...",False,False
4,1,1,indent,The old woman was let in by the side door down...,False,False


In [13]:
# Write the four data columns (without the helper flag columns) to CSV.
# os.path.join avoids a '\D' sequence inside the string literal (an invalid escape
# sequence) and uses the separator of the current platform.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
dfBook[['Book', 'Chapter', 'Class', 'Text']].to_csv(
    os.path.join(output_dir, 'DuneCronicles.csv'),
    index=False,
    encoding='utf-8',
)