# file set-up
1. import necessary libraries  
2. title directory (where data files are stored)  
3. create empty dataframe  
4. create list of languages
    will return to this---currently it doesn't actually serve a purpose, as I was unable to get the languages through here.  
    will either update this file or change spider to sort by language.  

In [1]:
from bs4 import BeautifulSoup as beau
import pandas as pd
import itertools
import pickle
import nltk
import os
import re

In [2]:
directory = 'natfinder'
fics_df = pd.DataFrame(columns=['lang', 'author', 'title', 'file', 'tags', 'summary', 'notes', 'endnotes', 'work'])
langs = ['french', 'spanish', 'mexican', 'brazilian', 'portugeuse', 'russian', 'ukranian', 'bengali', 'italian', 'czech', 'japanese', 'korean', 'chinese', 'swedish', 'german', 'finnish', 'turkish', 'greek', 'hindi']

# def: extract_info(soup)
1. try/except in case data is missing from file (ie; fics written by anonymous users will not return an author value)  
2. gather basic info (author, title, tags, summary, notes, endnotes, work)  
    tag info (rating, warning, category, fandom, (relation)ships, characters, freeform)  
    stat info (publication date, status, word count, chapters, comments, kudos, hits, bookmarks)
3. add to `fics_df`

In [None]:
def extract_info(soup):
    # basic info
    try:
        author = soup.find(rel='author').text
    except:
        author = 'NaN'
    try:
        title = soup.find(class_='title heading').text
    except:
        title = 'NaN'
    try:
        tags = [t.text for t in soup.find_all(class_='tag')]
    except:
        tags = 'NaN'
    try:
        summary = soup.find('div', class_='summary module').text
    except:
        summary = 'NaN'
    try:
        notes = soup.find('div', class_='notes module').text
    except:
        notes = 'NaN'
    try:
        endnotes = soup.find('div', class_='end notes module').text
    except:
        endnotes = 'NaN'
    try:
        work = str([p.text for p in soup.div.find(role='article').find_all('p')])
        work = list(itertools.chain.from_iterable(work))
    except:
        work = 'NaN'
    # tag info
    try:
        rating = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='rating tags')]
    except:
        rating = 'ERROR'
    try:
        warnings = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='warning tags')]
    except:
        warnings = 'ERROR'
    try:
        categories = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='category tags')]
    except:
        categories = 'ERROR'
    try:
        fandoms = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='fandom tags')]
    except:
        fandoms = 'ERROR'
    try:
        ships = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='relationship tags')]
    except:
        ships = 'NaN'
    try:
        charas = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='character tags')]
    except:
        charas = 'NaN'
    try:
        freeform = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='freeform tags')]
    except:
        freeform = 'NaN'
    # stats info
    try:
        published = soup.dd.find(class_='published').text
    except:
        published = 'ERROR'
    try:
        status = soup.dd.find(class_='status').text
    except:
        status = 'ERROR'
    try:
        wc = int(soup.dd.find(class_='words').text)
    except:
        wc = 'ERROR'
    try:
        chapters = int(soup.dd.find(class_='chapters').text)
    except:
        chapters = 'ERROR'
    try:
        comments = int(soup.dd.find(class_='comments').text)
    except:
        comments = 0
    try:
        kudos = int(soup.dd.find(class_='kudos').text)
    except:
        kudos = 0
    try:
        hits = int(soup.dd.find(class_='hits').text)
    except:
        hits = 0
    try:
        bookmarks = int(soup.dd.find(class_='bookmarks').text)
    except:
        bookmarks = 0
    temp = pd.DataFrame({
        'author':[author],
        'title':[title],
        'tags':[tags],
        'summary':[summary],
        'notes':[notes],
        'endnotes':[endnotes],
        'work':[work],
        'rating':[rating],
        'warnings':[warnings],
        'category':[categories],
        'fandom':[fandoms],
        'ships':[ships],
        'charas':[charas],
        'freeform':[freeform],
        'published':[published],
        'status':[status],
        'word_count':[wc],
        'chaps':[chapters],
        'comments':[comments],
        'kudos':[kudos]
        'hits':[hits],
        'bookmarks':[bookmarks]})
    return temp

# parsing through files
1. gather files  
2. create bs4 soup for files
3. use `extract_info(soup)` to add to fics_df

In [4]:
files = []
for f in os.scandir(directory):
    if f.name.endswith('.html'):
        files.append(f.name)
for f in files:
    soup = beau(open('natfinder/' + f).read(), 'html.parser')
    fics_df = pd.concat([fics_df, extract_info(soup, f)], ignore_index=True)

4. create additional data columns  
    may move this over to data_organization.ipynb
5. pickle dataframe as `fics_df.pkl`

In [5]:
fics_df['sum_toks'] = fics_df.summary.map(lambda x: nltk.word_tokenize(x)[2:])
fics_df['notes_toks'] = fics_df.notes.map(lambda x: nltk.word_tokenize(x)[2:])
fics_df['endnotes_toks'] = fics_df.endnotes.map(lambda x: nltk.word_tokenize(x)[2:])
fics_df['toks'] = fics_df.work.map(lambda x: nltk.word_tokenize(x))

TypeError: expected string or bytes-like object, got 'list'

In [6]:
fics_df['rating'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='rating tags')]
fics_df['warnings'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='warning tags')]
fics_df['categories'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='category tags')]
fics_df['fandoms'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='fandom tags')]
fics_df['ships'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='relationship tags')]
fics_df['charas'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='character tags')]
fics_df['freeform'] = [[a.text for a in t.find_all('a')] for t in soup.find_all(class_='freeform tags')]

ValueError: Length of values (2) does not match length of index (251)

In [None]:
with open('fics_df.pkl', 'wb') as file:
    pickle.dump(fics_df, file)