In [1]:
# Record the platform this notebook was executed on (shown as the cell output).
import platform
platform.platform()

'Linux-5.10.16.3-microsoft-standard-WSL2-x86_64-with-glibc2.35'

In [2]:
# Show the current working directory; the relative paths used in later cells
# (e.g. './content/') are resolved against it.
import os
os.getcwd()

'/home/gibson/wildbow-lookup'

# What is this?
A formatter that converts the default content files produced by the scraper into a series of text files linked to their metadata. It will be converted to a standalone script in the near future.

## Input
A folder of HTML files downloaded from the scraper.

## Processing Steps
1. Iterate over all HTML files generated from the scraper.
2. Parse the HTML, pulling out the necessary metadata.
3. Remove unnecessary mess like character PoV, chapter title, Previous/Next chapter, etc
4. Write out a different version of the file as text

## Output
A directory (parsed into a df for simple manipulation) containing:
- A list of text files numbered by their chapter appearance
- A metadata JSON containing the following information keyed on chapter Index
    - ~~Chapter Number~~
    - ~~Arc Number~~
    - ~~Arc Title~~
    - ~~Character Viewpoint~~
    - Publish date (not available)
    - ~~Interlude flag~~
    - Series (to expand to other series as necessary)
    - The plaintext
        - No XML version
        - No head
        - No h1 title
        - No floating previous / next chapter
        - No interlude / viewpoint

In [95]:
from bs4 import BeautifulSoup
import re
import pandas as pd

In [96]:
# Folder containing the HTML files downloaded by the scraper.
dir_prefix = './content/'

# Names of the regular files in that folder ending in '.html', sorted by name.
all_files = sorted(
    name
    for name in os.listdir(dir_prefix)
    if name.endswith('.html') and os.path.isfile(os.path.join(dir_prefix, name))
)

In [206]:
def _parse_title(full_title):
    """Split a chapter page title into ``(arc_name, arc_number, chapter_number)``.

    Regular titles have the shape ``"<Arc Name> – <arc>.<chapter>"`` (en dash).
    A handful of irregular titles are special-cased below.

    NOTE(review): the special cases return integers for the arc (and, for
    'In Absentia 21.12', the chapter) while regular titles return strings.
    This mixed typing is kept as-is because later cells may depend on it;
    the derived 'chapter' join key is built with ``astype(str)`` so it is
    unaffected.

    Raises:
        ValueError: if the title matches none of the known shapes.
    """
    break_match = re.search(r'Break (\d)', full_title)
    if break_match:
        # Dealing with Break, which is a bit different: it takes place within
        # Summer Break (arc 13), so these are named Arc 13.B<n>.
        # The digit is taken from the "Break <n>" match itself rather than the
        # first digit found anywhere in the title.
        return 'Break', 13, f"B{int(break_match.group(1))}"
    if full_title == 'Summer Break':
        # The last chapter here has no numbers.
        return 'Summer Break', 13, 'end'
    if full_title == 'In Absentia 21.12':
        return 'In Absentia', 21, 12

    split_title = full_title.split('–')
    if len(split_title) < 2:
        raise ValueError(f"title {full_title!r} has no '–' separator")
    arc_name, number = split_title[0].strip(), split_title[1].strip()
    if '.' in number:
        number_to_split = number.split('.')
    elif number == '12a':
        # That Cherrypop interlude where things are misspelled
        number_to_split = ('12', 'aa')
    else:
        # Previously this case silently reused the numbers of the file parsed
        # just before it (or raised NameError on the very first file).
        raise ValueError(f"cannot read arc/chapter numbers from title {full_title!r}")
    return arc_name, number_to_split[0].strip(), number_to_split[1].strip()


# Parse every scraped HTML file into one record (metadata + cleaned text).
parsed_files = []
for fname in all_files:
    file_number = int(re.search(r'\d+', fname).group())
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(dir_prefix + fname, encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, features='xml')

    full_title = soup.title.string  # Parse into metadata, etc
    try:
        arc_name, arc_number, chapter_number = _parse_title(full_title)
    except ValueError as err:
        # Report and skip instead of recording the file under another chapter's numbers.
        print(f"Skipping {fname}: {err}")
        continue

    # The first <p> normally holds the viewpoint character or an interlude marker.
    first_paragraph = soup.find('p')
    perspective_or_interlude = first_paragraph.getText().lower() if first_paragraph is not None else ''
    decompose_flag = True
    if 'interlude' in perspective_or_interlude:
        chapter_type = 'Interlude'  # Gotta check for that cherrypop interlude
        viewpoint = perspective_or_interlude
    else:
        # NOTE(review): this is a substring test, so a first paragraph that is
        # ordinary prose mentioning one of these names is treated as a label
        # (and removed from the text below) — confirm that is acceptable.
        for main_character in ('Lucy', 'Avery', 'Verona'):
            if main_character.lower() in perspective_or_interlude:
                chapter_type = viewpoint = main_character
                break
        else:
            # No recognisable label: keep the paragraph, it is probably story text.
            chapter_type = 'Interlude'
            viewpoint = perspective_or_interlude
            print(f"Not caught: {perspective_or_interlude}")
            decompose_flag = False

    text = soup.body
    text.h1.decompose()
    if decompose_flag:
        text.p.decompose()  # First p is the interlude / viewpoint label
    clean_text = (
        text.getText()
        .replace(u'\xa0', ' ')
        .replace(u'\n', '  ')
        .replace("Next Chapter", "")
        .replace("Previous Chapter", "")
    )
    # TODO Format this so it matches the text / metadata format Haystack expects
    parsed_files.append({
        # Some of this information is going to removed in favor of the spreadsheet versions
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'starting_viewpoint': viewpoint,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number': file_number,
    })

df = pd.DataFrame(parsed_files)
# Keep only the first record seen for each (arc, chapter) pair.
df = df.drop_duplicates(subset=['arc_number', 'chapter_number'])

Not caught: interude
Not caught: mccauleigh sat in the chair in raymond’s office.  diffuse images moved around them, projected.  he had programs he could lean on that referenced marketing to certain age groups, calming colors, interests, and styles, then pull those together enough that there was something vaguely soothing and yet not distracting in regular motion, on monitors and projected surfaces.  he’d read the notes, checked with witnesses.
Not caught:  
Not caught: the alabaster doe endured an existence of paradoxes.  few things drove that home as much as her role here at the arena.  an architect of mercy paying witness to a fierce battle of strength.  john stiles leading his group of soldiers against the wolves of the ephemeral alpha, two very different sorts of hunter and fighter fighting viciously.  the first irony was that she was very well equipped to study what was happening.
Not caught: cagerattler
Not caught: the alabaster gave the signal.
Not caught: the beorgmann used on

In [207]:
# Order the parsed chapters by their file number, then attach a gap-free
# 1-based position and the "<arc>.<chapter>" key used for the later merge.
df = df.sort_values('file_number', ascending=True)
df = df.assign(
    absolute_file_number=range(1, len(df) + 1),  # 1-indexed
    chapter=df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str),
)
df = df.sort_values(by='absolute_file_number', ascending=True)

In [208]:
# Inspect arc-13 rows. Only rows whose arc_number is the integer 13 match this
# comparison, i.e. the special-cased "Break <n>" / "Summer Break" titles; titles
# parsed from the "<arc>.<chapter>" form store arc_number as a string and are
# therefore not selected here.
df[df['arc_number']==13]

Unnamed: 0,arc_title,arc_number,chapter_number,starting_viewpoint,chapter_type,text,file_number,absolute_file_number,chapter
50,Break,13,B1,mccauleigh sat in the chair in raymond’s offic...,Interlude,McCauleigh sat in the chair in Raymond’s off...,145,140,13.B1
52,Break,13,B2,,Interlude,Hurry up and wait. John walked into the ...,147,142,13.B2
54,Break,13,B3,the alabaster doe endured an existence of para...,Interlude,The Alabaster Doe endured an existence of pa...,149,144,13.B3
57,Break,13,B4,cagerattler,Interlude,Cagerattler Men with guns entered the Arena...,151,146,13.B4
59,Break,13,B5,the alabaster gave the signal.,Interlude,The Alabaster gave the signal. “We could co...,153,148,13.B5
60,Summer Break,13,end,Lucy,Lucy,Lucy’s phone rang. Her shaking hands near...,154,149,13.end


Join in information from the [Pale Amalgram Spreadsheet](https://docs.google.com/spreadsheets/d/1VS0HRcbHChh4gmL8LcL8xiIvo-nPhSgs2OGOVV3fVbo/edit#gid=0) to get more accurate views, wordcounts, etc

In [196]:
# CSV export of the spreadsheet through the gviz endpoint.
gsheet_url = "https://docs.google.com/spreadsheets/d/1VS0HRcbHChh4gmL8LcL8xiIvo-nPhSgs2OGOVV3fVbo/gviz/tq?tqx=out:csv"

# Labels for all sixteen exported columns; the single-letter ones are placeholders
# for columns that are never loaded.
sheet_columns = [
    'a', 'b', 'c',
    'Arc Number', 'Arc Title', 'Chapter', 'PoV', 'Wordcount',
    'Synopses', 'Reddit Discussion', 'Audiobook Link', 'Pale Reflections Discussion',
    'n', 'o', 'p', 'q',
]
# Only these columns are read (all as strings), so nothing has to be dropped afterwards.
wanted_columns = ['Arc Number', 'Arc Title', 'Chapter', 'PoV', 'Wordcount']

stats_dirty = pd.read_csv(
    gsheet_url,
    header=None,
    index_col=False,
    names=sheet_columns,
    dtype={column: str for column in wanted_columns},
    usecols=wanted_columns,
    skiprows=1,
    encoding='UTF-8',
)
stats_dirty.head(5)

Unnamed: 0,Arc Number,Arc Title,Chapter,PoV,Wordcount
0,prologue,,,Louise,7174
1,arc 1,Lost for Words,1.1,Verona,9191
2,,,1.2,Lucy,6799
3,,,1.3,Avery,9492
4,,,1.4,Avery,8616


In [197]:
# Manual repairs to the spreadsheet export, addressed by row label.
# Row 0 is the prologue: give it an arc title (its chapter label is set below).
stats_dirty.at[0, 'Arc Title'] = 'Blood Runs Cold'

# Row label -> corrected 'Chapter' value.
chapter_fixes = {
    0: '0.0',
    9: '1.z',
    21: '2.z',
    32: '3.z',
    41: '4.x', 44: '4.10',
    47: '5.a', 48: '5.b', 53: '5.c', 55: '5.d',
    66: '6.z',
    74: '7.a', 78: '7.x',
    86: '8.a',
    100: '9.10', 103: '9.z',
    106: '10.a', 107: '10.b', 108: '10.c', 110: '10.d', 111: '10.e', 114: '10.z',
    125: '11.10', 129: '11.z',
    132: '12.aa', 139: '12.a', 140: '12.8', 141: '12.9', 142: '12.10', 143: '12.z',
    154: '13.B1', 155: '13.10', 156: '13.B2', 158: '13.B3', 160: '13.B4',
    162: '13.B5', 163: '13.end',
    165: '14.1', 171: '14.z',
    182: '15.10', 184: '15.z',
    195: '16.10', 196: '16.y', 197: '16.z',
    206: '17.a', 207: '17.b', 210: '17.10', 216: '17.x', 217: '17.y', 218: '17.z',
    222: '18.a', 226: '18.b', 229: '18.c', 232: '18.10', 233: '18.y', 234: '18.z',
    245: '19.10', 246: '19.11', 247: '19.12', 248: '19.13', 249: '19.14',
    250: '19.15', 251: '19.16', 252: '19.17', 253: '19.z',
    258: '20.a', 262: '20.b', 263: '20.c', 264: '20.7', 265: '20.8',
    266: '20.d', 267: '20.e', 268: '20.9', 269: '20.f', 270: '20.z',
    281: '21.10', 282: '21.11', 283: '21.12', 284: '21.13', 285: '21.14',
}
for row_label, chapter_label in chapter_fixes.items():
    stats_dirty.at[row_label, 'Chapter'] = chapter_label

stats_dirty.head()

Unnamed: 0,Arc Number,Arc Title,Chapter,PoV,Wordcount
0,prologue,Blood Runs Cold,0.0,Louise,7174
1,arc 1,Lost for Words,1.1,Verona,9191
2,,,1.2,Lucy,6799
3,,,1.3,Avery,9492
4,,,1.4,Avery,8616


In [198]:
# gsheet_url = f"https://docs.google.com/spreadsheets/d/1tVpjBsylcCae-rW7f4ZzjLgXqJ0i_K5foQyRTTn-unk/gviz/tq?tqx=out:csv&sheet=Parseable"
# stats_dirty = pd.read_csv(gsheet_url, index_col=False,
# # names=['a','b','c','Arc Number', 'Arc Title', 'Chapter','PoV','Wordcount','Synopses','Reddit Discussion', 'Audiobook Link', 'Pale Reflections Discussion', 'n','o','p','q'], 
# dtype = {'Arc Number':str, 'Arc Title':str, 'Chapter':object,'PoV':str,'Wordcount':str},
# usecols = ['Arc Number', 'Arc Title', 'Chapter','PoV','Wordcount'],
# # skiprows=1,
# # encoding='UTF-8'
# )
# # stats_dirty = stats_dirty.drop(columns=['a','b','c','Synopses','Reddit Discussion', 'Audiobook Link', 'Pale Reflections Discussion', 'n','o','p','q'])
# # stats_dirty = stats_dirty.dropna()
# stats_dirty.head(5)

Some notes about manual fixes that need to happen for this to be compatible:
- The `prologue` row (in Arc Number) needs to be adjusted so that the chapter is 0.0 and the Arc Title is Blood Runs Cold. Something in the formatting breaks things
- Interludes generally break things like chapter, so need to correct those manually
- There are too many files / chapters in the HTML parsed df - fix those before the merge

In [200]:
# Build the cleaned stats table from the patched spreadsheet export.
# Keep only rows whose Wordcount parses as a number; anything else is not a chapter row.
has_wordcount = pd.to_numeric(stats_dirty['Wordcount'], errors='coerce').notna()
# .copy() gives an independent frame, so the assignments below neither touch
# stats_dirty nor operate on a filtered view.
stats = stats_dirty.loc[has_wordcount].copy()
# Arc number/title are only filled in on the first row of each arc: carry them down.
# (.ffill() replaces the deprecated fillna(method='ffill').)
stats['Arc Number'] = stats['Arc Number'].ffill()
stats['Arc Title'] = stats['Arc Title'].ffill()
# 1-based running position of each remaining row.
stats['chapter_number'] = range(1, len(stats) + 1)

In [218]:
# Join the spreadsheet rows to the parsed chapters on the "<arc>.<chapter>" label.
all_df = pd.merge(
    stats,
    df,
    how='inner',
    left_on='Chapter',
    right_on='chapter',
    suffixes=('_stats', '_parse'),
)

# Sanity checks: every spreadsheet row found a match, and the running chapter
# numbers reach exactly the number of merged rows.
every_row_matched = len(all_df) == len(stats)
numbering_is_complete = max(all_df['chapter_number_stats']) == len(all_df)
print(every_row_matched, numbering_is_complete)

# Number of merged rows without text.
all_df['text'].isna().sum()

True True


0

In [219]:
# Now clean up the DF so that it's actually functional for something:
# drop the parse-side bookkeeping columns and the spreadsheet columns that
# duplicate information kept elsewhere.
redundant_columns = [
    'chapter_type', 'file_number', 'absolute_file_number', 'Chapter',
    'starting_viewpoint', 'arc_title', 'Arc Number', 'chapter_number_parse',
]
# errors='ignore' keeps this cell re-runnable: all_df is rebound to the trimmed
# frame, so on a second run the columns are already gone and a plain drop()
# would raise KeyError.
all_df = all_df.drop(columns=redundant_columns, errors='ignore')
all_df.head()

Unnamed: 0,Arc Title,PoV,Wordcount,chapter_number_stats,arc_number,text,chapter
0,Blood Runs Cold,Louise,7174,1,0,Prologue Louise’s eyes welled with moistu...,0.0
1,Lost for Words,Verona,9191,2,1,"Verona leaned over her kitchen sink, looki...",1.1
2,Lost for Words,Lucy,6799,3,1,"If their job was to solve a mystery, then ...",1.2
3,Lost for Words,Avery,9492,4,1,Avery and her friends sat in the back of a...,1.3
4,Lost for Words,Avery,8616,5,1,Their travel plans hadn’t allowed much lee...,1.4


In [262]:
def _chapter_document(row):
    """Turn one merged chapter row into a ``{'content', 'meta'}`` record."""
    return {
        'content': row['text'],
        'meta': {
            'arc_title': row['Arc Title'],
            'pov': row['PoV'],
            'wordcount': row['Wordcount'],
            'series_chapter_number': row['chapter_number_stats'],
            'arc_number': row['arc_number'],
            'extra_material': False,
            'title': row['Arc Title'] + ' - ' + row['chapter'],
            'chapter': row['chapter'].lower(),  # We'll use it later as a join key
        },
    }


raw_dict = all_df.to_dict(orient='index')
chapter_construct_list = [_chapter_document(row) for row in raw_dict.values()]

# Lower-cased chapter label -> that chapter's metadata.
# Use this for the extra materials -> chapter matchings
chapter_arc_lookup = {
    document['meta']['chapter']: document['meta']
    for document in chapter_construct_list
}

Append the Extra Materials (transcript)

In [228]:
# The extra-material transcripts were collected by hand and given the necessary
# information (see how the file names are parsed in the next cell).
dir_extra_prefix = './pale-extra-materials/'

# Names of the regular files in that folder ending in '.txt', sorted by name.
extra_files = sorted(
    name
    for name in os.listdir(dir_extra_prefix)
    if name.endswith('.txt') and os.path.isfile(os.path.join(dir_extra_prefix, name))
)

In [244]:
def _split_extra_filename(fname):
    """Split an extra-material file name into ``(chapter_label, title)``.

    The name is cut at the first occurrence of ``<digit>.<1-2 word chars> <word char>``:
    everything up to the space is the chapter label, everything after it
    (minus the final extension) is the title.

    Raises:
        ValueError: if the file name contains no such chapter marker.
    """
    # maxsplit=1: cut only at the first marker, so a title that happens to contain
    # a similar "<digit>.<xx> <char>" sequence is kept whole instead of losing
    # the part between the two matches.
    splits = re.split(r'(\d\.[\d\w]{1,2} \w)', fname, maxsplit=1)
    if len(splits) < 3:
        raise ValueError(f"cannot find a '<arc>.<chapter> <title>' marker in file name {fname!r}")
    leading, marker, remainder = splits
    # The marker contains exactly one space: "<chapter tail> <first title character>".
    chapter_tail, _, title_start = marker.partition(' ')
    chapter_attached = leading + chapter_tail
    title = (title_start + remainder).rsplit('.', 1)[0]
    return chapter_attached, title


# Read every extra-material transcript together with the chapter it belongs to.
extra_materials = []
for fname in extra_files:
    chapter_attached, title = _split_extra_filename(fname)

    # Explicit encoding so the result does not depend on the machine's locale.
    with open(dir_extra_prefix + fname, 'r', encoding='utf-8') as extra_file:
        data = extra_file.read()

    extra_materials.append({
        'chapter': chapter_attached.lower(),
        'title': title,
        'text': data,
    })
extra_df = pd.DataFrame(extra_materials)

In [253]:
def _inherit_chapter_meta(extra_row):
    """Metadata for one extra-material row, taken from the chapter it is attached to.

    Arc title, series chapter number and arc number come from the chapter's
    entry in ``chapter_arc_lookup``; pov and wordcount are fixed placeholders.
    """
    parent_meta = chapter_arc_lookup[extra_row['chapter']]
    return (
        parent_meta['arc_title'],
        'Extra Materials',
        '0',
        parent_meta['series_chapter_number'],
        parent_meta['arc_number'],
    )


extra_df[['arc_title', 'pov', 'wordcount', 'series_chapter_number', 'arc_number']] = extra_df.apply(
    _inherit_chapter_meta, axis=1, result_type='expand'
)

In [258]:
def _extra_document(row):
    """Turn one extra-material row into a ``{'content', 'meta'}`` record."""
    return {
        'content': row['text'],
        'meta': {
            'arc_title': row['arc_title'],
            'pov': row['pov'],
            'wordcount': row['wordcount'],
            'series_chapter_number': row['series_chapter_number'],
            'arc_number': row['arc_number'],
            'title': row['title'],
            'extra_material': True,
            'chapter': row['chapter'].lower(),  # We'll use it later as a join key
        },
    }


extra_dict = extra_df.to_dict(orient='index')
extra_construct_list = [_extra_document(row) for row in extra_dict.values()]

In [263]:
# Final document list: all chapter records followed by all extra-material records.
haystack_construct_list = chapter_construct_list + extra_construct_list

In [272]:
import pickle

# Persist the combined document list so it can be loaded without re-running the notebook.
with open('./chapter_fmt_list.pkl', 'wb') as pickle_file:
    pickle.dump(haystack_construct_list, pickle_file)