In [1]:
from bs4 import BeautifulSoup as beau
import pandas as pd
import itertools
import pickle
import nltk
import os
import re

In [2]:
# Folder containing the saved HTML works that the cells below parse.
directory = 'natfinder/natfinder/manual-works'

# Empty frame that fixes the column order; parsed works are concatenated onto it.
fics_df = pd.DataFrame(
    columns=[
        # basic info
        'L1', 'author', 'title', 'tags', 'summary', 'notes', 'endnotes', 'work',
        # tag info
        'rating', 'warnings', 'category', 'fandom', 'ships', 'charas', 'freeform',
        # stats info
        'published', 'status', 'word_count', 'chaps',
        'comments', 'kudos', 'hits', 'bookmarks',
    ]
)

In [3]:
def _first_text(soup, default, *find_args, **find_kwargs):
    """Return ``.text`` of the first ``soup.find(...)`` match, or ``default`` if there is none."""
    try:
        return soup.find(*find_args, **find_kwargs).text
    except AttributeError:
        # No usable match (``find`` yields None when nothing is found).
        return default


def _all_texts(soup, default, **find_kwargs):
    """Return the ``.text`` of every ``soup.find_all(...)`` match as a list."""
    try:
        return [node.text for node in soup.find_all(**find_kwargs)]
    except AttributeError:
        return default


def _link_text_groups(soup, css_class, default):
    """Return one list of ``<a>`` texts per element found with ``class_=css_class``."""
    try:
        return [
            [anchor.text for anchor in group.find_all('a')]
            for group in soup.find_all(class_=css_class)
        ]
    except AttributeError:
        return default


def _stat_count(soup, css_class, default):
    """Return the integer shown in the ``<dd>`` carrying ``css_class``, or ``default``."""
    try:
        raw = soup.find('dd', class_=css_class).text
        # Drop thousands separators ("1,234") and keep only what precedes a
        # slash ("3/10") so such renderings parse instead of hitting the fallback.
        # NOTE(review): using the number before the slash assumes a
        # "posted/total" rendering - confirm against the source HTML.
        return int(raw.split('/', 1)[0].replace(',', ''))
    except (AttributeError, ValueError):
        # Element missing, or its text is not a number.
        return default


def extract_info(soup):
    """Collect one work's text, tag groups and statistics into a one-row DataFrame.

    Every field is looked up independently and best-effort: when a lookup
    fails, the field receives its fallback value instead of raising.

    Parameters
    ----------
    soup
        Parsed document exposing ``find`` / ``find_all`` (a BeautifulSoup
        object in this notebook).

    Returns
    -------
    pandas.DataFrame
        A single row with the columns L1, author, title, tags, summary, notes,
        endnotes, work, rating, warnings, category, fandom, ships, charas,
        freeform, published, status, word_count, chaps, comments, kudos, hits
        and bookmarks, in that order.

    Notes
    -----
    Fallbacks are the strings 'ERROR' / 'NaN' / 'Anonymous', or 0 for the
    comments, kudos, hits and bookmarks counters, so a column may mix types.
    Statistics are searched for as ``<dd>`` elements anywhere in the document
    rather than only inside the document's first ``<dd>``.
    Only lookup and number-parsing failures are absorbed; interrupts such as
    Ctrl-C propagate.
    """
    record = {
        # basic info
        'L1': _first_text(soup, 'ERROR', class_='L1'),
        'author': _first_text(soup, 'Anonymous', rel='author'),
        'title': _first_text(soup, 'NaN', class_='title heading'),
        'tags': _all_texts(soup, 'NaN', class_='tag'),
        'summary': _first_text(soup, 'NaN', 'div', class_='summary module'),
        'notes': _first_text(soup, 'NaN', 'div', class_='notes module'),
        'endnotes': _first_text(soup, 'NaN', 'div', class_='end notes module'),
        'work': _all_texts(soup, 'NaN', id='chapters'),
        # tag info
        'rating': _link_text_groups(soup, 'rating tags', 'ERROR'),
        'warnings': _link_text_groups(soup, 'warning tags', 'ERROR'),
        'category': _link_text_groups(soup, 'category tags', 'ERROR'),
        'fandom': _link_text_groups(soup, 'fandom tags', 'ERROR'),
        'ships': _link_text_groups(soup, 'relationship tags', 'NaN'),
        'charas': _link_text_groups(soup, 'character tags', 'NaN'),
        'freeform': _link_text_groups(soup, 'freeform tags', 'NaN'),
        # stats info
        'published': _first_text(soup, 'ERROR', 'dd', class_='published'),
        'status': _first_text(soup, 'ERROR', 'dd', class_='status'),
        'word_count': _stat_count(soup, 'words', 'ERROR'),
        'chaps': _stat_count(soup, 'chapters', 'ERROR'),
        'comments': _stat_count(soup, 'comments', 0),
        'kudos': _stat_count(soup, 'kudos', 0),
        'hits': _stat_count(soup, 'hits', 0),
        'bookmarks': _stat_count(soup, 'bookmarks', 0),
    }
    # Wrap each value in a list so list-valued fields land in a single cell.
    return pd.DataFrame({column: [value] for column, value in record.items()})

In [4]:
# Names of the .html files found directly inside `directory`.
# The context manager closes the directory iterator once the listing is done.
with os.scandir(directory) as entries:
    files = [entry.name for entry in entries if entry.name.endswith('.html')]
i = 0  # never read in this cell; left defined in case another cell expects it

work_frames = []
for f in files:
    # Read raw bytes inside a context manager: the handle is closed promptly,
    # and BeautifulSoup works out the document encoding itself instead of the
    # result depending on the platform's default text encoding.
    with open(os.path.join(directory, f), 'rb') as html_file:
        soup = beau(html_file.read(), 'html.parser')
    temp_df = extract_info(soup)
    work_frames.append(temp_df)

# Concatenate once instead of re-copying the growing frame for every file.
# Rows are appended to whatever fics_df already holds, so re-running this cell
# without re-creating fics_df adds the same works a second time.
fics_df = pd.concat([fics_df, *work_frames], ignore_index=True)

In [7]:
# Persist the assembled frame to disk in binary pickle form.
pickle_path = 'fics_df_manual.pkl'
with open(pickle_path, mode='wb') as file:
    pickle.dump(fics_df, file)

In [6]:
# Show the assembled frame as the cell's output (long frames are rendered truncated).
fics_df

Unnamed: 0,L1,author,title,tags,summary,notes,endnotes,work,rating,warnings,...,charas,freeform,published,status,word_count,chaps,comments,kudos,hits,bookmarks
0,Portuguese,phoenixdellaverita,,[],,,,[\n\nChapter 1\n\n\n\nTHE COLD WATER of the la...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
1,Portuguese,phoenixdellaverita,,[],,,,[\nThe ghost boy king\n\nLancer was used to th...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
2,Portuguese,phoenixdellaverita,,[],,,,[\n\n1 - The Damnation\n\n\n\nDanny slipped hi...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
3,Portuguese,phoenixdellaverita,,[],,,,"[\n\n1 - Notes, Portals and Statues\n\n\n\nDip...",[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
4,Korean,Gimli_s_Pickaxe (orphan_account),,[],,,,[\nTrapped\n\nThe waves crash onto the grey hu...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Polish,coolerdazai,,[],,,,[\nFate had finally found me\n\nCold breeze of...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
93,Polish,coolerdazai,,[],,,,[\nI got what I wanted - High Life\n\n\nDanny ...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
94,Polish,coolerdazai,,[],,,,[\nI want to snarl and show you just how distu...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
95,Polish,coolerdazai,,[],,,,[\nAssigned Straight By Helmet\n\n\nThe atmosp...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0


In [12]:
# Read the pickled frame back from disk into `fcdf`.
saved_pickle = 'fics_df_manual.pkl'
with open(saved_pickle, mode='rb') as file:
    fcdf = pickle.load(file)

In [17]:
# Preview the first rows of the reloaded frame (head() defaults to five rows).
fcdf.head()

Unnamed: 0,L1,author,title,tags,summary,notes,endnotes,work,rating,warnings,...,charas,freeform,published,status,word_count,chaps,comments,kudos,hits,bookmarks
0,Portuguese,phoenixdellaverita,,[],,,,[\n\nChapter 1\n\n\n\nTHE COLD WATER of the la...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
1,Portuguese,phoenixdellaverita,,[],,,,[\nThe ghost boy king\n\nLancer was used to th...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
2,Portuguese,phoenixdellaverita,,[],,,,[\n\n1 - The Damnation\n\n\n\nDanny slipped hi...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
3,Portuguese,phoenixdellaverita,,[],,,,"[\n\n1 - Notes, Portals and Statues\n\n\n\nDip...",[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
4,Korean,Gimli_s_Pickaxe (orphan_account),,[],,,,[\nTrapped\n\nThe waves crash onto the grey hu...,[],[],...,[],[],ERROR,ERROR,ERROR,ERROR,0,0,0,0
