# Install dependencies

```sh
sudo apt-get install libdb++-dev
export BERKELEYDB_DIR=/usr
pip install gutenberg patool nltk tqdm
```

In [None]:
import itertools
import json
import logging
import os
import re
from functools import partial
from functools import reduce
from multiprocessing import Pool
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import patoolib
import requests
from bs4 import BeautifulSoup
from gutenberg.acquire import get_metadata_cache
from gutenberg.acquire import load_etext
from gutenberg.query import get_metadata
from nltk.tokenize import sent_tokenize
from tqdm import tqdm_notebook as tqdm

# Default location of the local Gutenberg data; only applied when the caller
# has not already exported GUTENBERG_DATA, so an existing configuration is
# never clobbered by this machine-specific path.
os.environ.setdefault('GUTENBERG_DATA', "/media/brian/ColdStore/Datasets/nlp/gutenberg")

In [None]:
# Remote layout: every book lives under <base>/<idno>/...
_GUTENBERG_BASE_URL = "http://www.gutenberg.org/files/"

# HTML edition: <idno>/<idno>-h/<idno>-h.htm
_HTML_FILENAME_TEMPLATE = "{idno}-h.htm"
_HTML_URN_TEMPLATE = "{idno}/{idno}-h/" + _HTML_FILENAME_TEMPLATE

# Plain-text edition: <idno>/<idno>.txt
_TEXT_FILENAME_TEMPLATE = "{idno}.txt"
_TEXT_URN_TEMPLATE = "{idno}/" + _TEXT_FILENAME_TEMPLATE

# Alternative plain-text name, tried when the primary one is unavailable.
_TEXT_FILENAME_TEMPLATE_ALT = "{idno}-0.txt"
_TEXT_URN_TEMPLATE_ALT = "{idno}/" + _TEXT_FILENAME_TEMPLATE_ALT

# Local file names for the extracted sentences and the metadata JSON.
_SENT_FILENAME_TEMPLATE = "{idno}.txt"

_META_FILENAME_TEMPLATE = "{idno}.json"

# Highest book ID number processed by this notebook.
_MAX_IDNO = 58120


# Root of the local data tree. "~" is expanded explicitly because os.path
# and open() treat it as a literal directory name otherwise.
DEFAULT_GUTENBERG_DATA = os.path.expanduser(
    os.environ.get('GUTENBERG_DATA') or "~/gutenberg_data")
DEFAULT_GUTENBERG_DATA_HTML = os.path.join(DEFAULT_GUTENBERG_DATA, "html")
DEFAULT_GUTENBERG_DATA_TEXT = os.path.join(DEFAULT_GUTENBERG_DATA, "text")
DEFAULT_GUTENBERG_DATA_SENT = os.path.join(DEFAULT_GUTENBERG_DATA, "sent")
DEFAULT_GUTENBERG_DATA_META = os.path.join(DEFAULT_GUTENBERG_DATA, "meta")

# Metadata values used to select public-domain English books.
PUBLIC_RIGHTS = {'Public domain in the USA.'}
EN_LANG = {'en'}


def populate_gutenberg_html(idnos, dest=DEFAULT_GUTENBERG_DATA_HTML):
    """Download the HTML edition of every book in ``idnos`` into ``dest``.

    Downloads are spread over a pool of 100 worker processes and progress
    is reported with tqdm.

    Args:
        idnos: iterable of book ID numbers (any iterable, not only a list).
        dest: directory that receives the downloaded files.
    """
    # Materialise once so iterators/generators also have a usable length.
    idnos = list(idnos)
    download_fn = partial(download_gutenberg_book_html, dest=dest)
    pool = Pool(100)
    try:
        # list() drains the lazy imap iterator so every download completes.
        list(tqdm(pool.imap(download_fn, idnos), total=len(idnos)))
    finally:
        # Release the worker processes even if a download raised.
        pool.close()
        pool.join()


def populate_gutenberg_text(idnos, dest=DEFAULT_GUTENBERG_DATA_TEXT):
    """Download the plain-text edition of every book in ``idnos`` into ``dest``.

    Downloads are spread over a pool of 100 worker processes and progress
    is reported with tqdm.

    Args:
        idnos: iterable of book ID numbers (any iterable, not only a list).
        dest: directory that receives the downloaded files.
    """
    # Materialise once so iterators/generators also have a usable length.
    idnos = list(idnos)
    download_fn = partial(download_gutenberg_book_text, dest=dest)
    pool = Pool(100)
    try:
        # list() drains the lazy imap iterator so every download completes.
        list(tqdm(pool.imap(download_fn, idnos), total=len(idnos)))
    finally:
        # Release the worker processes even if a download raised.
        pool.close()
        pool.join()

    
def download_gutenberg_book_html(idno, dest=DEFAULT_GUTENBERG_DATA_HTML):
    """Fetch the HTML edition of book ``idno`` and store it under ``dest``."""
    download(get_gutenberg_book_html_url(idno), dest=dest)


def download_gutenberg_book_text(idno, dest=DEFAULT_GUTENBERG_DATA_TEXT):
    """Fetch the plain-text edition of book ``idno`` and store it under ``dest``.

    The primary URL is tried first; the alternative file name is only
    requested when the primary download reports failure.
    """
    primary_url = get_gutenberg_book_text_url(idno)
    if download(primary_url, dest=dest):
        return
    fallback_url = get_gutenberg_book_text_url(idno, use_alt=True)
    download(fallback_url, dest=dest)


def load_gutenberg_book_text(idno, root=DEFAULT_GUTENBERG_DATA_TEXT):
    """Return the full text of book ``idno`` from the local cache.

    Args:
        idno: book ID number.
        root: directory holding the downloaded text files.

    Returns:
        The file content as a string, or None when neither the primary nor
        the alternative text file exists under ``root``.
    """
    # Both lookups honour ``root`` (the primary one previously ignored it).
    path = get_gutenberg_book_text_local(idno, root=root)
    if not os.path.isfile(path):
        path = get_gutenberg_book_text_local(idno, root=root, use_alt=True)
    if not os.path.isfile(path):
        return None
    try:
        with open(path, encoding='utf-8') as fh:
            return fh.read()
    except UnicodeDecodeError:
        # Fall back to iso-8859-1, which can decode any byte sequence.
        logging.warning("Using iso-8859-1: {}".format(path))
        with open(path, encoding='iso-8859-1') as fh:
            return fh.read()


def load_gutenberg_book_sents(idno, root=DEFAULT_GUTENBERG_DATA_SENT):
    """Return the cached sentence lines of book ``idno``, or None if absent.

    Each returned element is one line of the cache file, including its
    trailing newline where present.
    """
    sents_path = get_gutenberg_book_sents_local(idno, root=root)
    if os.path.isfile(sents_path):
        with open(sents_path, encoding='utf-8') as fh:
            return fh.readlines()
    return None


def load_gutenberg_book_meta(idno, root=DEFAULT_GUTENBERG_DATA_META):
    """Return the cached metadata of book ``idno`` parsed from JSON.

    Args:
        idno: book ID number.
        root: directory holding the metadata JSON files.

    Returns:
        The decoded JSON content, or None when no metadata file exists for
        ``idno`` under ``root``.
    """
    # Honour ``root`` (it was previously ignored in favour of the default).
    path = get_gutenberg_book_meta_local(idno, root=root)
    if not os.path.isfile(path):
        return None
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def get_gutenberg_book_html_url(idno):
    """Return the remote URL of the HTML edition of book ``idno``."""
    urn = _HTML_URN_TEMPLATE.format(idno=idno)
    return _GUTENBERG_BASE_URL + urn


def get_gutenberg_book_text_url(idno, use_alt=False):
    """Return the remote URL of the plain-text edition of book ``idno``.

    With ``use_alt`` set, the alternative file name template is used.
    """
    if use_alt:
        template = _TEXT_URN_TEMPLATE_ALT
    else:
        template = _TEXT_URN_TEMPLATE
    return _GUTENBERG_BASE_URL + template.format(idno=idno)


def get_gutenberg_book_html_local(idno, root=DEFAULT_GUTENBERG_DATA_HTML):
    """Return the local path of the HTML file of book ``idno`` under ``root``."""
    filename = _HTML_FILENAME_TEMPLATE.format(idno=idno)
    return os.path.join(root, filename)


def get_gutenberg_book_text_local(idno, root=DEFAULT_GUTENBERG_DATA_TEXT, use_alt=False):
    """Return the local path of the text file of book ``idno`` under ``root``.

    With ``use_alt`` set, the alternative file name template is used.
    """
    if use_alt:
        template = _TEXT_FILENAME_TEMPLATE_ALT
    else:
        template = _TEXT_FILENAME_TEMPLATE
    return os.path.join(root, template.format(idno=idno))


def get_gutenberg_book_sents_local(idno, root=DEFAULT_GUTENBERG_DATA_SENT):
    """Return the local path of the sentence file of book ``idno`` under ``root``."""
    filename = _SENT_FILENAME_TEMPLATE.format(idno=idno)
    return os.path.join(root, filename)


def get_gutenberg_book_meta_local(idno, root=DEFAULT_GUTENBERG_DATA_META):
    """Return the local path of the metadata JSON of book ``idno`` under ``root``."""
    filename = _META_FILENAME_TEMPLATE.format(idno=idno)
    return os.path.join(root, filename)


def download(url, dest='/tmp/'):
    """Download ``url`` to ``dest`` unless the target file already exists.

    Args:
        url: address of the resource; its basename is used as the file name
            when ``dest`` is a directory.
        dest: target directory (an existing directory or any path ending in
            '/', which is created on demand) or the full target file path.

    Returns:
        True when the file already existed or was downloaded successfully,
        False when the server answered with an error status.
    """
    filename = os.path.basename(url)
    # endswith() is safe for an empty string, unlike dest[-1].
    if dest.endswith('/') or os.path.isdir(dest):
        # exist_ok avoids a race when many worker processes create the
        # same directory at the same time.
        os.makedirs(dest, exist_ok=True)
        dest = os.path.join(dest, filename)
    if os.path.isfile(dest):
        logging.info("{} already exist in {}.".format(url, dest))
        return True

    logging.info("Downloading {} to {}...".format(url, dest))
    resp = requests.get(url)
    if not resp.ok:
        logging.info("{}: {}".format(resp.reason, url))
        return False
    # Write to a temporary name and rename afterwards so an interrupted
    # write never leaves a truncated file that later looks like a cache hit.
    tmp_path = dest + ".part"
    with open(tmp_path, 'wb') as fh:
        fh.write(resp.content)
    os.replace(tmp_path, dest)
    return True


def unzip(filepath):
    """Extract the archive at ``filepath`` into the directory that contains it."""
    target_dir = os.path.dirname(filepath)
    logging.info("Unzipping {} to {}".format(filepath, target_dir))
    patoolib.extract_archive(filepath, outdir=target_dir)


# Populate metadata and book cache

In [None]:
# Build the gutenberg metadata cache only when it does not exist yet.
metacache = get_metadata_cache()
if not metacache.exists:
    metacache.populate()

# Download the HTML and plain-text editions for every ID in 1.._MAX_IDNO.
idnos = list(range(1, _MAX_IDNO+1))
populate_gutenberg_html(idnos)
populate_gutenberg_text(idnos)

# Filter valid English public domain book ID numbers

In [None]:
# Candidate IDs: every number from 1 up to and including _MAX_IDNO.
idnos = list(range(1, _MAX_IDNO+1))

# Keep IDs whose text file (primary or alternative name) exists locally.
valid_idnos = [
    idno for idno in idnos
    if os.path.isfile(get_gutenberg_book_text_local(idno))
    or os.path.isfile(get_gutenberg_book_text_local(idno, use_alt=True))
]

# Keep books whose only language is English.
valid_en_idnos = [
    idno for idno in valid_idnos
    if get_metadata('language', idno) == EN_LANG
]

# Keep books whose rights include the public-domain statement.
valid_en_public_idnos = [
    idno for idno in valid_en_idnos
    if get_metadata('rights', idno) & PUBLIC_RIGHTS
]

# Metadata exploration

In [None]:
def get_subject_words(idno):
    """Return the set of lower-cased words found in the subjects of ``idno``.

    Each subject string is lower-cased, every character that is neither a
    word character nor whitespace is replaced by a space, and the result is
    split on whitespace.

    Returns:
        The union of the words of all subjects, or ``{'N/A'}`` when the
        book has no subject metadata at all.
    """
    subjs = get_metadata('subject', idno)
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal.
    word_sets = [
        set(re.sub(r'[^\w\s]', ' ', subj.lower()).split())
        for subj in subjs
    ] or [{'N/A'}]
    return set().union(*word_sets)

In [None]:
# IDs of the selected books whose subject words include 'poetry'.
poetry_idnos = [
    idno for idno in valid_en_public_idnos
    if 'poetry' in get_subject_words(idno)
]

In [None]:
# Number of selected books in which each subject word occurs.
keyword_counts = {}
for idno in valid_en_public_idnos:
    keywords = get_subject_words(idno)
    for kw in keywords:
        keyword_counts.setdefault(kw, 0)
        keyword_counts[kw] += 1

# Most frequent subject words first.
sorted_keyword_counts = sorted(
    keyword_counts.items(),
    key=lambda kv: kv[1],
    reverse=True,
)

In [None]:
# Number of selected books credited to each author.
author_counts = {}
for idno in valid_en_public_idnos:
    authors = get_metadata('author', idno)
    for author in authors:
        author_counts.setdefault(author, 0)
        author_counts[author] += 1

# Most prolific authors first.
sorted_author_counts = sorted(
    author_counts.items(),
    key=lambda kv: kv[1],
    reverse=True,
)

In [None]:
# Hand-written list of 100 author names, compared against the catalogue
# counts in the loop that follows.
top100_authors = [
    "Austen, Jane", "Dickens, Charles", "Twain, Mark", "Doyle, Arthur Conan",
    "Wilde, Oscar", "Stevenson, Robert Louis", "Shakespeare, William",
    "Shelley, Mary Wollstonecraft", "Conrad, Joseph", "Carroll, Lewis",
    "Swift, Jonathan", "Ibsen, Henrik", "Plato", "Homer", "Poe, Edgar Allan",
    "Jowett, Benjamin", "Hawthorne, Nathaniel", "Dostoyevsky, Fyodor",
    "Melville, Herman", "Hall, J. Lesslie (John Lesslie)",
    "Wells, H. G. (Herbert George)", "Stoker, Bram", "Pope, Alexander",
    "Irving, Washington", "Doré, Gustave", "Tolstoy, Leo, graf",
    "Nietzsche, Friedrich Wilhelm", "Kafka, Franz", "Garnett, Constance",
    "Joyce, James", "Dante Alighieri", "Dumas, Alexandre", "James, Henry",
    "Grimm, Jacob", "Grimm, Wilhelm", "Buckley, Theodore Alois",
    "Wyllie, David", "Baum, L. Frank (Lyman Frank)", "Kipling, Rudyard",
    "Verne, Jules", "Gilman, Charlotte Perkins", "Robertson, James Alexander",
    "Chekhov, Anton Pavlovich", "Blair, Emma Helen", "Bourne, Edward Gaylord",
    "Montgomery, L. M. (Lucy Maud)",
    "Du Bois, W. E. B. (William Edward Burghardt)", "Hugo, Victor",
    "Morley, Henry", "Chesterton, G. K. (Gilbert Keith)", "Russell, Bertrand",
    "Brontë, Charlotte", "Franklin, Benjamin", "Hobbes, Thomas",
    "Goethe, Johann Wolfgang von", "London, Jack", "Shaw, Bernard",
    "Cervantes Saavedra, Miguel de", "Lang, Andrew",
    "Kemble, E. W. (Edward Windsor)", "Wharton, Edith", "Cary, Henry Francis",
    "Maude, Louise", "Bierce, Ambrose", "Defoe, Daniel",
    "Machiavelli, Niccolò", "Maude, Aylmer", "Leech, John",
    "Alcott, Louisa May", "Mill, John Stuart",
    "Townsend, F. H. (Frederick Henry)", "Barrie, J. M. (James Matthew)",
    "Widger, David", "Thoreau, Henry David", "Marriott, W. K. (William Kenaz)",
    "Hesse, Hermann", "Douglass, Frederick", "Marlowe, Christopher",
    "Wodehouse, P. G. (Pelham Grenville)", "Balzac, Honoré de", "Voltaire",
    "Scott, Walter", "Malory, Thomas, Sir", "Chaucer, Geoffrey",
    "Burroughs, Edgar Rice", "Burnett, Frances Hodgson",
    # This entry was previously split across two physical lines inside the
    # string literal, which is a syntax error.
    "Smith, E. Boyd (Elmer Boyd)",
    "Potter, Beatrix", "Burton, Richard Francis, Sir", "Emerson, Ralph Waldo",
    "Maupassant, Guy de", "Hardy, Thomas", "Madison, James", "Darwin, Charles",
    "Hamilton, Alexander", "Jay, John", "Pine, Frank Woodworth",
    "Hapgood, Isabel Florence", "Eliot, George", "Ormsby, John",
]

# Report listed authors that are missing from the counts or have fewer
# than 7 counted books.
for i, author in enumerate(top100_authors):
    if author in author_counts:
        if author_counts[author] < 7:
            print(i, author, author_counts[author])
    else:
        print(i, author, "N/A")

# Parse book HTML

In [None]:
# Pick one sample book and show its subject, title and language metadata.
idno = 42
path = get_gutenberg_book_html_local(idno)
for field in ('subject', 'title', 'language'):
    pprint(get_metadata(field, idno))

In [None]:
# Parse the sample book's HTML with the lxml parser.
# NOTE: the file is opened in text mode without an explicit encoding, so it
# is decoded with the platform default before BeautifulSoup sees it.
with open(path) as fh:
    soup = BeautifulSoup(fh, 'lxml')

In [None]:
# Print every <p> element on one line: each run of newlines, together with
# the whitespace around it, collapses to a single space.
for par in soup.find_all('p'):
    # Raw string: '\s' is an invalid escape in a normal string literal.
    print(re.sub(r'\s*\n+\s*', ' ', par.text))

This does not work well because iso-8859-1 encoded files are silently parsed incorrectly. Parse the book text file directly instead.

# Parse book text

In [None]:
# Regex fragments for normalising line endings and re-joining wrapped lines.
_CR = r'\r'  # carriage return
_CF = r'\n'  # line feed (newline)
_PARA_BREAK = r'\n\n+'  # two or more consecutive newlines
_LINE_BREAK = r'\s*\n\s*'  # one newline plus any surrounding whitespace
# Non-dash character, one whitespace character, 1-3 dashes, then a line break.
_END_WS_DASH = r'([^-]\s\-{1,3})' + _LINE_BREAK
# Line break followed by 2-3 dashes, one whitespace character, a non-dash.
_START_DASH_WS = _LINE_BREAK + r'(\-{2,3}\s[^-])'
# Non-dash character directly followed by 1-3 dashes, then a line break.
_END_DASH = r'([^-]\-{1,3})' + _LINE_BREAK
# Line break followed by 2-3 dashes directly followed by a non-dash.
_START_DASH = _LINE_BREAK + r'(\-{2,3}[^-])'

# Compiled once at module level and shared by the helpers that follow.
_CR_RE = re.compile(_CR)
_CF_RE = re.compile(_CF)
_PARA_BREAK_RE = re.compile(_PARA_BREAK)
_LINE_BREAK_RE = re.compile(_LINE_BREAK)
_END_WS_DASH_RE = re.compile(_END_WS_DASH)
_START_DASH_WS_RE = re.compile(_START_DASH_WS)
_END_DASH_RE = re.compile(_END_DASH)
_START_DASH_RE = re.compile(_START_DASH)


def remove_carriage_return(text):
    """Return ``text`` with every carriage-return character removed."""
    stripped = _CR_RE.sub('', text)
    return stripped


def paragraph_segment(text):
    """Split ``text`` into paragraphs at runs of two or more newlines."""
    paragraphs = _PARA_BREAK_RE.split(text)
    return paragraphs


def is_poetry(text, threshold=0.95):
    '''Guess whether ``text`` is verse from how its lines begin.

    The text is split at line breaks; a line counts against the guess when
    its first character is lower case. The text is treated as poetry when
    the share of such lines stays strictly below ``1 - threshold``. A text
    that yields fewer than two lines is never poetry.
    '''
    lines = _LINE_BREAK_RE.split(text)
    if len(lines) < 2:
        return False

    lowercase_starts = sum(1 for line in lines if line and line[0].islower())
    return lowercase_starts/len(lines) < (1 - threshold)


def stitch_paragraph(text):
    """Join the lines of a paragraph into one line.

    Line breaks next to dashes are handled first (the variants with
    whitespace must run before the plain ones); every remaining line break
    is then replaced by a single space.
    """
    # Applied strictly in this order.
    substitutions = (
        (_END_WS_DASH_RE, '\\1 '),
        (_START_DASH_WS_RE, ' \\1'),
        (_END_DASH_RE, '\\1'),
        (_START_DASH_RE, '\\1'),
        (_LINE_BREAK_RE, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text


def sentence_segment(text):
    """Return a lazy iterator over the sentences of ``text``.

    The text is split at newlines and every chunk is tokenised with
    ``sent_tokenize``; the per-chunk results are chained together.
    """
    return itertools.chain.from_iterable(
        sent_tokenize(chunk) for chunk in _CF_RE.split(text)
    )


def extract_sentences(text):
    """Return the sentences of a raw book text.

    Carriage returns are removed and the text is split into paragraphs.
    Paragraphs classified as poetry keep their line structure; all others
    are stitched into a single line before sentence segmentation.

    Returns:
        An empty list for empty/None input, otherwise a lazy iterator of
        sentence strings.
    """
    if not text:
        return []
    paras = paragraph_segment(remove_carriage_return(text))
    normalised = (p if is_poetry(p) else stitch_paragraph(p) for p in paras)
    return itertools.chain.from_iterable(
        sentence_segment(p) for p in normalised
    )

### Evaluate Poetry Detection
Basic analysis of poetry detection shows that simple capitalization detection is fairly consistent without any additional tuning.

In [None]:
# Display the current value of `paragraphs`.
# NOTE(review): `paragraphs` is only assigned in the cells further down, so
# this cell depends on those having been run first.
paragraphs

In [None]:
# Fraction of paragraphs classified as poetry, per poetry-tagged book.
poetry_percent_dict = {}
for idno in poetry_idnos:
    text = load_gutenberg_book_text(idno)
    if text:
        text = remove_carriage_return(text)
        paragraphs = paragraph_segment(text)
        p_count = sum(1 for par in paragraphs if is_poetry(par))
        np_count = len(paragraphs) - p_count
        poetry_percent_dict[idno] = p_count/(p_count+np_count)

# Summary statistics and histogram over all books.
poetry_percents = list(poetry_percent_dict.values())
poetry_percent_mean, poetry_percent_std = (
    np.mean(poetry_percents),
    np.std(poetry_percents),
)
plt.hist(poetry_percents, bins=100)
plt.xlabel("poetry paragraph percentage")
plt.ylabel("number of books")
pprint("mean: {}, std: {}".format(poetry_percent_mean, poetry_percent_std))

Human evaluation of poetry classification.

In [None]:
# Print each paragraph of one poetry book followed by a marker line:
# '#' for paragraphs classified as poetry, '. ' otherwise.
text = remove_carriage_return(load_gutenberg_book_text(poetry_idnos[42]))
paragraphs = paragraph_segment(text)
for par in paragraphs:
    print(par)
    print('#'*80 if is_poetry(par) else '. '*40)

### Evaluate Sentence Extraction

In [None]:
# Sample book used to inspect the sentence-length distribution.
idno = 42

text = load_gutenberg_book_text(idno)
for field in ('subject', 'title'):
    pprint(get_metadata(field, idno))

sents = list(extract_sentences(text))

# Sentence lengths in characters.
sent_lens = list(map(len, sents))
sent_len_mean, sent_len_std = np.mean(sent_lens), np.std(sent_lens)

plt.hist(sent_lens, bins=100)
plt.xlabel("sentence length (chars)")
plt.ylabel("number of sentences")
pprint("mean: {}, std: {}".format(sent_len_mean, sent_len_std))

# Books to sentences

In [None]:
def _init_extract_gutenberg_sentences(lang):
    """Attach a segmenter for ``lang`` to ``extract_gutenberg_sentences``.

    The segmenter is stored as a function attribute.
    """
    # NOTE(review): `load_segmenter` is neither defined nor imported in the
    # visible part of this file - confirm where it is expected to come from.
    extract_gutenberg_sentences.segmenter = load_segmenter(lang)

    
def extract_gutenberg_sentences(idno, lang='en'):
    """Return the list of sentences of book ``idno``.

    Args:
        idno: book ID number.
        lang: accepted for backward compatibility only. It is not used:
            ``extract_sentences`` takes the text as its single argument and
            has no language or segmenter parameter.

    Returns:
        A list of sentence strings; empty when the book text is not
        available locally.
    """
    text = load_gutenberg_book_text(idno)
    # extract_sentences() takes exactly one argument; passing a segmenter
    # as a second argument raised TypeError on every call.
    return list(extract_sentences(text))


def generate_gutenberg_sentences(idno, lang='en'):
    """Write the sentences of book ``idno`` to its sentence cache file.

    Nothing is done when the cache file already exists or when no
    sentences could be extracted.

    Args:
        idno: book ID number.
        lang: forwarded to ``extract_gutenberg_sentences``.
    """
    cache_path = get_gutenberg_book_sents_local(idno)
    if os.path.isfile(cache_path):
        return
    # exist_ok avoids a race between pool workers that create the cache
    # directory at the same time.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    sents = extract_gutenberg_sentences(idno, lang=lang)
    if not sents:
        return
    # One sentence per line.
    with open(cache_path, 'w', encoding='utf-8') as fh:
        fh.write('\n'.join(sents))

In [None]:
# Generate the sentence cache for every book with 4 worker processes.
# Book IDs run from 1 to _MAX_IDNO inclusive, matching the download step;
# range(_MAX_IDNO) would start at 0 and skip the last ID.
pool = Pool(processes=4)
try:
    _ = list(tqdm(pool.imap(generate_gutenberg_sentences,
                            range(1, _MAX_IDNO + 1), chunksize=10),
                  total=_MAX_IDNO))
finally:
    # Release the worker processes even if a task raised.
    pool.close()
    pool.join()

# Write metadata json

Store the preprocessed metadata rather than the raw metadata, because the raw metadata can easily be extracted using `get_metadata`.

**preprocessing:**
- `author` [categorical] : as is, use each unique author as a class (will probably need to take the top $n$ authors). Alternatively, lowercase and remove all punctuation except periods (periods are used for abbreviations), although this is probably not a good idea because sharing embeddings across names is likely useless.
- `language` [categorical] : as is.
- `subject` [language] : lowercase, replace punctuation with spaces, take the unique space-separated words.
- `title` [categorical] : as is, although this is probably not practical because there are too many unique titles to store.

In [None]:
def get_subject_words(idno):
    """Return the set of lower-cased words found in the subjects of ``idno``.

    Each subject string is lower-cased, every character that is neither a
    word character nor whitespace is replaced by a space, and the result is
    split on whitespace.

    Returns:
        The union of the words of all subjects, or ``{'N/A'}`` when the
        book has no subject metadata at all.
    """
    subjs = get_metadata('subject', idno)
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal.
    word_sets = [
        set(re.sub(r'[^\w\s]', ' ', subj.lower()).split())
        for subj in subjs
    ] or [{'N/A'}]
    return set().union(*word_sets)

def extract_gutenberg_meta(idno):
    """Collect the JSON-serialisable metadata of book ``idno``.

    Returns:
        A dict with list values under the keys 'author', 'language',
        'title' and 'subject' (the latter holding the subject words).
    """
    return {
        'author': list(get_metadata('author', idno)),
        'language': list(get_metadata('language', idno)),
        'title': list(get_metadata('title', idno)),
        'subject': list(get_subject_words(idno)),
    }

def generate_gutenberg_meta(idno):
    """Write the preprocessed metadata of book ``idno`` to its JSON cache file.

    Nothing is done when the cache file already exists.
    """
    cache_path = get_gutenberg_book_meta_local(idno)
    if os.path.isfile(cache_path):
        return
    # exist_ok avoids a race between pool workers that create the cache
    # directory at the same time.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    metadata = extract_gutenberg_meta(idno)
    with open(cache_path, 'w', encoding='utf-8') as fh:
        json.dump(metadata, fh)

In [None]:
# Generate the metadata cache for every book with 4 worker processes.
# Book IDs run from 1 to _MAX_IDNO inclusive, matching the download step;
# range(_MAX_IDNO) would start at 0 and skip the last ID.
pool = Pool(processes=4)
try:
    _ = list(tqdm(pool.imap(generate_gutenberg_meta,
                            range(1, _MAX_IDNO + 1), chunksize=10),
                  total=_MAX_IDNO))
finally:
    # Release the worker processes even if a task raised.
    pool.close()
    pool.join()

# Pre-Text and Post-Text removal

In [None]:
# Line prefixes treated as the start-of-book marker; the third entry is the
# second one without the space after the asterisks.
start_tags = [
    "*** START OF THIS PROJECT GUTENBERG EBOOK",
    "*** START OF THE PROJECT GUTENBERG EBOOK",
    "***START OF THE PROJECT GUTENBERG EBOOK",
]
    
# Line prefixes treated as the end-of-book marker.
end_tags = [
    "*** END OF THIS PROJECT GUTENBERG EBOOK",
    "*** END OF THE PROJECT GUTENBERG EBOOK",
]


In [None]:
# Count how many cached sentences of one book begin with a start tag or an
# end tag.
idno = 137
has_start = 0
has_end = 0
# str.startswith accepts a tuple of candidate prefixes.
start_prefixes = tuple(start_tags)
end_prefixes = tuple(end_tags)
# load_gutenberg_book_sents() returns None when no sentence cache exists
# for the book; treat that as "no sentences" instead of crashing.
for sent in load_gutenberg_book_sents(idno) or []:
    if sent.startswith(start_prefixes):
        has_start += 1
    elif sent.startswith(end_prefixes):
        has_end += 1

print('idno: {} starts: {} ends: {}'.format(idno, has_start, has_end))