## What does this do?
This is a one-time processing of the other works by Wildbow in preparation of using them as a fine-tuning (and future) search mechanism for the model / lookup system.

Note: While there will be some basic metadata here, depending on the difficulty / manual effort required to get it into the same shape as Pale it may not be done in the current form. Pale had a spreadsheet with additional information (PoV, wordcount, etc) that isn't available here.

In [79]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import pickle

### Worm

In [59]:
dir_prefix = '../data/worm/'
all_files = sorted([f for f in os.listdir(dir_prefix) if os.path.isfile(os.path.join(dir_prefix,f)) and f.endswith('.html')])

In [60]:
tmp = all_files[92]
with open(dir_prefix + tmp) as fp:
        soup = BeautifulSoup(fp,features='xml')
print(soup.title.string)
full_title = soup.title.string
if 'interlude' in full_title.lower(): 
        # Different structure
        arc_name = None
        arc_number = None
        chapter_number = 'i' + re.search(r'\d+',full_title).group()
        is_interlude = True
else:
        arc_name = re.split(r' \d',full_title)[0]
        arc_num = re.search(r'\d+\.\d+', full_title).group()
        arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
        is_interlude = False

print(arc_name, arc_number, chapter_number)

text = soup.body
text.h1.decompose()
clean_text = text.getText().replace(u'\xa0', ' ')
        # splits[0] + re.split(r' ',splits[1])[0]

Interlude 16 (Donation Bonus #3)
None None i16


In [61]:
parsed_files = []
for fname in all_files:
    file_number = int(re.search(r'\d+',fname).group())
    # print(file_number)
    with open(dir_prefix + fname) as fp:
        soup = BeautifulSoup(fp,features='xml')

    # print(soup.title.string)
    full_title = soup.title.string
    if 'interlude' in full_title.lower(): 
            # Different structure
            arc_name = None
            arc_number = None
            if ' End' in full_title:
                chapter_number = 'end'
            else:
                chapter_number = 'i' + re.search(r'\d+',full_title).group()
            chapter_type = 'Interlude'
    else:
            arc_name = re.split(r' \d',full_title)[0]
            if 'Teneral' in arc_name:
                arc_num = re.search(r'\w+\.\d+', full_title).group()
            else:
                arc_num = re.search(r'\d+\.\d+', full_title).group()
            arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
            chapter_type = 'Taylor'

    # print(arc_name, arc_number, chapter_number)

    text = soup.body
    text.h1.decompose()
    clean_text = text.getText().replace(u'\xa0', ' ')
    # TODO Format this so it matches the text / metadata format Haystack expects
    d = {
        # Some of this information is going to removed in favor of the spreadsheet versions
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number':file_number
    }
    parsed_files.append(d)
df = pd.DataFrame(parsed_files)
df = df.drop_duplicates(subset=['arc_number','chapter_number'])

In [62]:
df = df.sort_values('file_number',ascending=True)
df['absolute_file_number'] = range(1,len(df)+1) # 1-indexed
df[['arc_title','arc_number']] = df[['arc_title','arc_number']].fillna(method='ffill')
df['chapter'] = df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str)
df = df.sort_values(by='absolute_file_number', ascending=True)
df = df.fillna('')
df.sample(n=5)

Unnamed: 0,arc_title,arc_number,chapter_number,chapter_type,text,file_number,absolute_file_number,chapter
63,Colony,15,1,Taylor,"\nBentley lunged in my direction, and I could ...",157,140,15.1
219,Speck,30,4,Taylor,\nI didn’t break eye contact with Dragon. My ...,299,263,30.4
110,Insinuation,2,9,Taylor,"\nAs Brian and I returned to the loft, I felt ...",20,16,2.9
45,Snare,13,8,Taylor,\n“I was a lot more comfortable with the risky...,140,125,13.8
247,Hive,5,1,Taylor,\nThe place was nondescript. A hole in the wa...,48,43,5.1


In [63]:
raw_dict = df.to_dict(orient='index')
worm_construct_list = [
    {'content':i['text'], 'meta': {
        'arc_title': i['arc_title'],
        'series_chapter_number': i['absolute_file_number'],
        'arc_number': i['arc_number'],
        'pov': i['chapter_type'],
        'title': i['arc_title'] + ' - ' + i['chapter'],
        'chapter': i['chapter'].lower() # We'll use it later as a join key
    }}
    for idx, i in raw_dict.items()
    ]


In [80]:
with open('../data/worm_fmt_list.pkl','wb') as f:
    pickle.dump(worm_construct_list,f)

## Ward

In [65]:
dir_prefix = '../data/ward/OEBPS/chapters/'
all_files = sorted([f for f in os.listdir(dir_prefix) if os.path.isfile(os.path.join(dir_prefix,f)) and f.endswith('.xhtml')])

In [66]:
tmp = all_files[99]
file_number = int(re.search(r'\d+',tmp).group())
# print(file_number)
with open(dir_prefix + tmp) as fp:
        soup = BeautifulSoup(fp,features='xml')
print(soup.title.string)
full_title = soup.title.string
if 'interlude' in full_title.lower(): 
        # Different structure
        arc_name = re.split(r' Interlude',full_title)[0]
        arc_num = re.search(r'\d+\.\w+', full_title).group()
        arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
        is_interlude = True
else:
        arc_name = re.split(r' \d',full_title)[0]
        arc_num = re.search(r'\d+\.\d+', full_title).group()
        arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
        is_interlude = False

print(arc_name, arc_number, chapter_number)

text = soup.body
text.h1.decompose()
clean_text = text.getText().replace(u'\xa0', ' ')
        # splits[0] + re.split(r' ',splits[1])[0]

Gleaming Interlude 9.x
Gleaming 9 x


In [76]:
parsed_files = []
for fname in all_files:
    file_number = int(re.search(r'\d+',fname).group())
    # print(file_number)
    with open(dir_prefix + fname) as fp:
        soup = BeautifulSoup(fp,features='xml')

    # print(soup.title.string)
    # print(soup.title.string)
    full_title = soup.title.string
    if 'interlude' in full_title.lower() or re.search(r'\d+\.\D+', full_title.lower()): 
            # Different structure
            arc_name = re.split(r' Interlude',full_title)[0]
            if '.' in full_title:
                arc_num = re.search(r'\d+\.\w+', full_title).group()
                arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
            else:
                arc_number = None
                chapter_number = 'i' + re.search(r'\d+',full_title).group()
            chapter_type = 'Interlude'
    else:
            arc_name = re.split(r' \d',full_title)[0]
            if 'Eclipse' in arc_name:
                arc_num = re.search(r'\w+\.\d+', full_title).group()
            
            else:
                arc_num = re.search(r'\d+\.\d+', full_title).group()
            arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
            chapter_type = 'Victoria'

    # print(arc_name, arc_number, chapter_number)

    text = soup.body
    text.h1.decompose()
    clean_text = text.getText().replace(u'\xa0', ' ')
    # TODO Format this so it matches the text / metadata format Haystack expects
    d = {
        # Some of this information is going to removed in favor of the spreadsheet versions
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number':file_number
    }
    parsed_files.append(d)
df = pd.DataFrame(parsed_files)
df = df.drop_duplicates(subset=['arc_number','chapter_number'])

In [77]:
df = df.sort_values('file_number',ascending=True)
df['absolute_file_number'] = range(1,len(df)+1) # 1-indexed
df[['arc_title','arc_number']] = df[['arc_title','arc_number']].fillna(method='ffill')
df['chapter'] = df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str)
df = df.sort_values(by='absolute_file_number', ascending=True)
df = df.fillna('')
df.sample(n=5)

Unnamed: 0,arc_title,arc_number,chapter_number,chapter_type,text,file_number,absolute_file_number,chapter
41,Shadow,5,7,Victoria,\n\n\nIt was an overcast day. Our nights were...,41,40,5.7
132,Blinding,11,5,Victoria,\n\n\nThe ‘joints’ of the spider’s limbs were ...,132,131,11.5
172,Breaking,14,3,Victoria,\n\n\nThere were no big television cameras to ...,172,171,14.3
143,Heavens,12,1,Victoria,\n\n\nThe red and blue lights from the police ...,143,142,12.1
228,Radiation,18,7,Victoria,\n\n\nThe five-pound phone I carried blared. ...,228,227,18.7


In [78]:
raw_dict = df.to_dict(orient='index')
ward_construct_list = [
    {'content':i['text'], 'meta': {
        'arc_title': i['arc_title'],
        'series_chapter_number': i['absolute_file_number'],
        'arc_number': i['arc_number'],
        'pov': i['chapter_type'],
        'title': i['arc_title'] + ' - ' + i['chapter'],
        'chapter': i['chapter'].lower() # We'll use it later as a join key
    }}
    for idx, i in raw_dict.items()
    ]


In [81]:
with open('../data/ward_fmt_list.pkl','wb') as f:
    pickle.dump(ward_construct_list,f)

### Pact

In [82]:
dir_prefix = '../data/pact/'
all_files = sorted([f for f in os.listdir(dir_prefix) if os.path.isfile(os.path.join(dir_prefix,f)) and f.endswith('.html')])

In [105]:
parsed_files = []
for fname in all_files:
        file_number = int(re.search(r'\d+',fname).group())
        # print(file_number)
        with open(dir_prefix + fname) as fp:
                soup = BeautifulSoup(fp,features='xml')
        # print(soup.title.string)
        full_title = soup.title.string
        if 'gathered' in full_title.lower() or 'pages' in full_title.lower(): 
                # Different structure
                print("Gathered Pages")
                arc_name = None
                arc_number = None
                chapter_number = 'i' + str(int(re.search(r'\d+',full_title).group()))
                chapter_type = 'Gathered Pages'
        elif 'histories' in full_title.lower():
                arc_name = None
                if 'Arc' in full_title:
                        arc_number = int(re.search(r'Arc \d+',full_title).group().split(' ')[1])
                else:
                        arc_number = int(re.search(r'\d+',full_title).group())
                chapter_number = 'i' + str(int(re.search(r'\d+',full_title).group()))
                chapter_type = 'Histories'
        elif 'epilogue' in full_title.lower():
                arc_name = 'Epilogue'
                arc_number = 'End'
                chapter = 'End'
                chapteR_type = 'Epilogue'
        else:
                # print(full_title)
                arc_name = re.split(r' \d',full_title)[0]
                arc_num = re.search(r'\d+\.\d+', full_title).group()
                arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
                chapter_type = 'Chapter'

        # print(arc_name, arc_number, chapter_number)

        text = soup.body
        text.h1.decompose()
        clean_text = text.getText().replace(u'\xa0', ' ')
        d = {
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number':file_number
    }
        parsed_files.append(d)
df = pd.DataFrame(parsed_files)
df = df.drop_duplicates(subset=['arc_number','chapter_number'])

Gathered Pages
Gathered Pages
Gathered Pages
Gathered Pages
Gathered Pages


In [106]:
df = df.sort_values('file_number',ascending=True)
df['absolute_file_number'] = range(1,len(df)+1) # 1-indexed
df[['arc_title','arc_number']] = df[['arc_title','arc_number']].fillna(method='ffill')
df['chapter'] = df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str)
df = df.sort_values(by='absolute_file_number', ascending=True)
df = df.fillna('')
df.sample(n=5)

Unnamed: 0,arc_title,arc_number,chapter_number,chapter_type,text,file_number,absolute_file_number,chapter
131,Bonds,1,4,Chapter,\nI finished toweling myself dry and wrapped t...,8,4,1.4
39,Sine Die,14,i14,Histories,"\nThe wind blew, pulling dry bits of vegetatio...",135,131,14.i14
42,Possession,15,3,Chapter,\nThe top of the pillar was bright compared to...,138,134,15.3
119,Void,7,10,Chapter,\nThe sudden shifts in weather had made for so...,69,65,7.10
133,Null,9,3,Chapter,"\nI clenched my fists, but neither was in part...",81,77,9.3


In [107]:
raw_dict = df.to_dict(orient='index')
pact_construct_list = [
    {'content':i['text'], 'meta': {
        'arc_title': i['arc_title'],
        'series_chapter_number': i['absolute_file_number'],
        'arc_number': i['arc_number'],
        'pov': i['chapter_type'],
        'title': i['arc_title'] + ' - ' + i['chapter'],
        'chapter': i['chapter'].lower() # We'll use it later as a join key
    }}
    for idx, i in raw_dict.items()
    ]


In [108]:
with open('../data/pact_fmt_list.pkl','wb') as f:
    pickle.dump(pact_construct_list,f)

## Glow-worm

In [109]:
dir_prefix = '../data/glow-worm/'
all_files = sorted([f for f in os.listdir(dir_prefix) if os.path.isfile(os.path.join(dir_prefix,f)) and f.endswith('.html')])

In [112]:
parsed_files = []
for fname in all_files:
        file_number = int(re.search(r'\d+',fname).group())
        # print(file_number)
        with open(dir_prefix + fname) as fp:
                soup = BeautifulSoup(fp,features='xml')
        print(soup.title.string)
        full_title = soup.title.string
        arc_name = full_title.split('.')[0]
        arc_number = full_title.split('.')[0]
        chapter_number = full_title.split('.')[1]
        chapter_type = 'Chapter'

        print(arc_name, arc_number, chapter_number)

        text = soup.body
        text.h1.decompose()
        clean_text = text.getText().replace(u'\xa0', ' ')
        d = {
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number':file_number
        }
        parsed_files.append(d)
df = pd.DataFrame(parsed_files)
df = df.drop_duplicates(subset=['arc_number','chapter_number'])

P.6
P P 6
P.7
P P 7
P.8
P P 8
P.9
P P 9
P.1
P P 1
P.2
P P 2
P.3
P P 3
P.4
P P 4
P.5
P P 5


In [113]:
df = df.sort_values('file_number',ascending=True)
df['absolute_file_number'] = range(1,len(df)+1) # 1-indexed
df[['arc_title','arc_number']] = df[['arc_title','arc_number']].fillna(method='ffill')
df['chapter'] = df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str)
df = df.sort_values(by='absolute_file_number', ascending=True)
df = df.fillna('')
df.sample(n=5)

Unnamed: 0,arc_title,arc_number,chapter_number,chapter_type,text,file_number,absolute_file_number,chapter
8,P,P,5,Chapter,\nSubject: Your Nilles University Application\...,9,5,P.5
7,P,P,4,Chapter,\nWelcome to the Parahumans Online message boa...,8,4,P.4
3,P,P,9,Chapter,\n♦ You have four unread private messages fro...,13,9,P.9
5,P,P,2,Chapter,\nWelcome to the Parahumans Online message boa...,6,2,P.2
2,P,P,8,Chapter,\nSubject: PHO Technical Assistance\nAugust 24...,12,8,P.8


In [114]:
raw_dict = df.to_dict(orient='index')
glowworm_construct_list = [
    {'content':i['text'], 'meta': {
        'arc_title': i['arc_title'],
        'series_chapter_number': i['absolute_file_number'],
        'arc_number': i['arc_number'],
        'pov': i['chapter_type'],
        'title': i['arc_title'] + ' - ' + i['chapter'],
        'chapter': i['chapter'].lower() # We'll use it later as a join key
    }}
    for idx, i in raw_dict.items()
    ]


In [115]:
with open('../data/glowworm_fmt_list.pkl','wb') as f:
    pickle.dump(glowworm_construct_list,f)

## Twig

In [116]:
dir_prefix = '../data/twig/'
all_files = sorted([f for f in os.listdir(dir_prefix) if os.path.isfile(os.path.join(dir_prefix,f)) and f.endswith('.html')])

In [144]:
parsed_files = []
for fname in all_files:
    file_number = int(re.search(r'\d+',fname).group())
    # print(file_number)
    with open(dir_prefix + fname) as fp:
        soup = BeautifulSoup(fp,features='xml')

    print(soup.title.string)
    full_title = soup.title.string
    if 'enemy' in full_title.lower():
        arc_name = None
        arc_number = int(re.search(r'Arc \d+', full_title).group().split(' ')[1])
        chapter_number = 'i' + str(arc_number)
        if '–' in full_title:
            chapter_type = re.sub('[\W_]', '',full_title.split('–')[1])
        else:
            chapter_type = 'Enemy'
    elif 'lamb' in full_title.lower() and 'slaughter' not in full_title.lower():
        if 'black sheep' in full_title.lower():
            arc_name = re.split(r' \d',full_title)[0]
            arc_name = re.sub('–', '',arc_name).strip()
            arc_num = re.search(r'\d+\.\d+', full_title).group()
            arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
            chapter_type = 'Lamb'
        else:
            arc_name = None
            arc_number = int(re.search(r'Arc \d+', full_title).group().split(' ')[1])
            chapter_number = 'i' + str(arc_number)
            chapter_type = 'Lamb'
    elif 'e.' in full_title.lower():
        arc_name = full_title.split('–')[0]
        arc_number = 'e'
        chapter_number = int(re.search(r'\.\d+',full_title).group().split('.')[1])
        chapter_type = 'Epilogue'
    else:
        arc_name = re.split(r' \d',full_title)[0]
        arc_name = re.sub('–', '',arc_name).strip()
        arc_num = re.search(r'\d+\.\d+', full_title).group()
        arc_number, chapter_number = arc_num.split('.')[0], arc_num.split('.')[1]
        chapter_type = 'Chapter'

    d = {
        'arc_title': arc_name,
        'arc_number': arc_number,
        'chapter_number': chapter_number,
        'chapter_type': chapter_type,
        'text': clean_text,
        'file_number':file_number
        }
    parsed_files.append(d)
df = pd.DataFrame(parsed_files)
df = df.drop_duplicates(subset=['arc_number','chapter_number'])


Taking Root 1.6
Enemy (Arc 7 – Boys)
Enemy (Arc 7 – Girls)
Bleeding Edge – 8.1
Bleeding Edge – 8.2
Bleeding Edge – 8.3
Bleeding Edge – 8.4
Bleeding Edge – 8.5
Bleeding Edge – 8.6
Bleeding Edge – 8.7
Bleeding Edge – 8.8
Taking Root 1.7
Bleeding Edge – 8.9
Bleeding Edge – 8.10
Bleeding Edge – 8.11
Bleeding Edge – 8.12
Bleeding Edge – 8.13
Bleeding Edge – 8.14
Bleeding Edge – 8.15
Bleeding Edge – 8.16
Enemy (Arc 8)
Counting Sheep – 9.1
Taking Root 1.8
Counting Sheep – 9.2
Counting Sheep – 9.3
Counting Sheep – 9.4
Counting Sheep – 9.5
Counting Sheep – 9.6
Counting Sheep – 9.7
Counting Sheep – 9.8
Counting Sheep – 9.9
Counting Sheep – 9.10
Counting Sheep – 9.11
Taking Root 1.9
Counting Sheep – 9.12
Counting Sheep – 9.13
Counting Sheep – 9.14
Counting Sheep – 9.15
Counting Sheep – 9.16
Counting Sheep – 9.17
Counting Sheep – 9.18
Enemy (Arc 9)
In Sheep’s Clothing – 10.1
In Sheep’s Clothing – 10.2
Taking Root 1.10
In Sheep’s Clothing – 10.3
In Sheep’s Clothing – 10.4
In Sheep’s Clothing – 10.5

In [145]:
df = df.sort_values('file_number',ascending=True)
df['absolute_file_number'] = range(1,len(df)+1) # 1-indexed
df[['arc_title','arc_number']] = df[['arc_title','arc_number']].fillna(method='ffill')
df['chapter'] = df['arc_number'].astype(str) + "." + df['chapter_number'].astype(str)
df = df.sort_values(by='absolute_file_number', ascending=True)
df = df.fillna('')
df.sample(n=5)

Unnamed: 0,arc_title,arc_number,chapter_number,chapter_type,text,file_number,absolute_file_number,chapter
13,Bleeding Edge,8,10,Chapter,\nSubject: Your Nilles University Application\...,111,106,8.1
276,Esprit de Corpse,5,6,Chapter,\nSubject: Your Nilles University Application\...,59,55,5.6
220,Lips Sealed,3,1,Chapter,\nSubject: Your Nilles University Application\...,30,26,3.1
57,In Sheep’s Clothing,10,14,Chapter,\nSubject: Your Nilles University Application\...,151,146,10.14
138,Bitter Pill,15,9,Chapter,\nSubject: Your Nilles University Application\...,225,220,15.9


In [146]:
raw_dict = df.to_dict(orient='index')
twig_construct_list = [
    {'content':i['text'], 'meta': {
        'arc_title': i['arc_title'],
        'series_chapter_number': i['absolute_file_number'],
        'arc_number': i['arc_number'],
        'pov': i['chapter_type'],
        'title': i['arc_title'] + ' - ' + i['chapter'],
        'chapter': i['chapter'].lower() # We'll use it later as a join key
    }}
    for idx, i in raw_dict.items()
    ]


In [147]:
with open('../data/twig_fmt_list.pkl','wb') as f:
    pickle.dump(twig_construct_list,f)