In [23]:
import pandas as pd

import ebooklib
from ebooklib import epub

from bs4 import BeautifulSoup

Read every book's chapters into a two-level dictionary: `books_dict[book_no][chapter_no]` holds that chapter's text.

In [68]:
books_dict = {}

# The main series is stored as books/01.epub ... books/54.epub.
for book_no in range(1, 55):

    no_string = str(book_no).zfill(2)  # two-digit, zero-padded file stem
    book = epub.read_epub(f'books/{no_string}.epub')

    # Every document item is treated as a "chapter". Chapter-like extras are
    # included too, so the positional index may not match the printed
    # chapter number.
    documents = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

    chapter_dict = {}
    for chapter_no, document in enumerate(documents):
        soup = BeautifulSoup(document.content, 'xml')

        # Skip the first <p> (noted as always blank), then keep the text of
        # each remaining paragraph.
        paragraph_texts = [tag.text for tag in soup.find_all('p')[1:]]

        # Store the chapter as a single string, one paragraph per line.
        chapter_dict[chapter_no] = '\n'.join(paragraph_texts)

    books_dict[book_no] = chapter_dict

Convert the dictionary into a DataFrame with one row per chapter.

In [96]:
# Flatten the {book: {chapter: text}} mapping into one record per chapter.
# Collecting plain dicts and constructing the frame once avoids building and
# concatenating a separate one-row DataFrame for every chapter.
chapter_records = [
    {'book': book_no, 'chapter': chapter_no, 'text': text}
    for book_no, chapters_by_no in books_dict.items()
    for chapter_no, text in chapters_by_no.items()
]

# Explicit columns keep the frame well-formed even when no chapters were read.
books_df = pd.DataFrame(chapter_records, columns=['book', 'chapter', 'text'])

# Drop documents that yielded no text. Row labels are deliberately not reset,
# so they still reflect each chapter's position before filtering. `.copy()`
# makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
books_df = books_df[books_df['text'] != ''].copy()

In [98]:
# Display the chapter-level frame (long frames are abbreviated when rendered).
books_df

Unnamed: 0,book,chapter,text
1,1,1,"My name is Jake. That's my first name, obvious..."
2,1,2,"""A flying saucer?"" Marco said. He did laugh. T..."
3,1,3,<They have come to destroy you.>\nIt was stran...
4,1,4,<Yeerks!>\nThe twin red lights slowed. They tu...
5,1,5,"The Hork-Bajir pointed his gun, or whatever it..."
...,...,...,...
1520,54,18,The world's smartest mouse wiggled his nose at...
1521,54,19,I had nine million four hundred and thirty-two...
1522,54,20,"We drove through the desert night, silent most..."
1523,54,21,The Rachel was fast.\nWe blew through normal s...


Add an authenticity column: 1 denotes a book written by Applegate, 0 denotes a ghostwritten book.

In [102]:
# Book numbers written by Applegate; every other book is flagged as
# ghostwritten.
auths = list(range(1, 25)) + [26, 32, 53, 54]

# `isin` does the membership test in one vectorized pass instead of calling a
# Python lambda per row. `assign` returns a new frame rather than writing a
# column into the existing one, which avoids SettingWithCopyWarning when
# `books_df` is the result of an earlier boolean filter.
books_df = books_df.assign(
    authenticity=books_df['book'].isin(auths).astype('int64')  # 1 = Applegate, 0 = ghostwritten
)

books_df.head()

Save df to csv.

In [105]:
# Persist the chapter table. `index=True` is the pandas default, spelled out
# here to make it visible that the row labels are written as the first,
# unnamed CSV column.
books_df.to_csv('../data/animorphs.csv', index=True)