In [16]:
import requests
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from gutenberg.acquire.text import UnknownDownloadUriException
import re
from gensim.utils import tokenize
import random
import nltk
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import os
import glob
import json

In [9]:
with open('data/gutenberg_index.json') as fin:
    authors = json.load(fin)
recent = [x for x in authors if 'birthdate' in x and x['birthdate'] > 1830]
[(x['name'], x['birthdate'], x['english_books']) for x in recent[:5]]

[('Twain, Mark', 1835, 210),
 ('Ebers, Georg', 1837, 164),
 ('Parker, Gilbert', 1862, 135),
 ('Fenn, George Manville', 1831, 128),
 ('Jacobs, W. W. (William Wymark)', 1863, 112)]

In [3]:
print(list_supported_metadatas())

('author', 'formaturi', 'language', 'rights', 'subject', 'title')


In [12]:
PARAGRAPH_SPLIT_RE = re.compile(r'\n *\n+')

def extract_conversations(text, quote='"'):
    paragraphs = PARAGRAPH_SPLIT_RE.split(text.strip())
    conversations = [['']]
    for paragraph in paragraphs:
        chunks = paragraph.replace('\n', ' ').split(quote)
        for i in range((len(chunks) + 1) // 2):
            if (len(chunks[i * 2]) > 100 or len(chunks) == 1) and conversations[-1] != ['']:
                if conversations[-1][-1] == '':
                    del conversations[-1][-1]
                conversations.append([''])
            if i * 2 + 1 < len(chunks):
                chunk = chunks[i * 2 + 1]
                if chunk:
                    if conversations[-1][-1]:
                        if chunk[0] >= 'A' and chunk[0] <= 'Z':
                            if conversations[-1][-1].endswith(','):
                                conversations[-1][-1] = conversations[-1][-1][:-1]
                            conversations[-1][-1] += '.'
                        conversations[-1][-1] += ' '
                    conversations[-1][-1] += chunk
        if conversations[-1][-1]:
            conversations[-1].append('')

    return [x for x in conversations if len(x) > 1]


conversations = extract_conversations(strip_headers(load_etext(10008).strip()))
sum(len(x) for x in conversations)

1126

In [17]:
LATIN_1_CHARS = (
    (u'\xe2\x80\x99', "'"),
    (u'\xc3\xa9', 'e'),
    (u'\xe2\x80\x90', '-'),
    (u'\xe2\x80\x91', '-'),
    (u'\xe2\x80\x92', '-'),
    (u'\xe2\x80\x93', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x94', '-'),
    (u'\xe2\x80\x98', "'"),
    (u'\xe2\x80\x9b', "'"),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9c', '"'),
    (u'\xe2\x80\x9d', '"'),
    (u'\xe2\x80\x9e', '"'),
    (u'\xe2\x80\x9f', '"'),
    (u'\xe2\x80\xa6', '...'),
    (u'\xe2\x80\xb2', "'"),
    (u'\xe2\x80\xb3', "'"),
    (u'\xe2\x80\xb4', "'"),
    (u'\xe2\x80\xb5', "'"),
    (u'\xe2\x80\xb6', "'"),
    (u'\xe2\x80\xb7', "'"),
    (u'\xe2\x81\xba', "+"),
    (u'\xe2\x81\xbb', "-"),
    (u'\xe2\x81\xbc', "="),
    (u'\xe2\x81\xbd', "("),
    (u'\xe2\x81\xbe', ")")
)

books = 0
for author in recent[:1000]:
    for book in author['books']:
        books += 1
        try:
            txt = strip_headers(load_etext(int(book[0]))).strip()
        except UnknownDownloadUriException:
            continue
        for ch1, ch2 in LATIN_1_CHARS:
            txt = txt.replace(ch1, ch2)
        conversations += extract_conversations(txt)

print(len(conversations), books)

1646779 15349


In [18]:
with open('gutenberg.txt', 'w') as fout:
    for conv in conversations:
        fout.write('\n'.join(conv) + '\n\n')

In [30]:
RE_TOKEN = re.compile('(\w+|\?)', re.UNICODE)
token_counter = Counter()
with open('gutenberg.txt') as fin:
    for line in fin:
        line = line.lower().replace('_', ' ')
        token_counter.update(RE_TOKEN.findall(line))
with open('gutenberg.tok', 'w') as fout:
    for token, count in token_counter.items():
        fout.write('%s\t%d\n' % (token, count))

In [29]:
token_counter['?']

2674921

In [21]:
PAT_ALPHABETIC.findall(conv[0])

[('And', 'd'),
 ('I', 'I'),
 ('I', 'I'),
 ('ve', 'e'),
 ('got', 't'),
 ('to', 'o'),
 ('arrest', 't'),
 ('him', 'm'),
 ('in', 'n'),
 ('my', 'y'),
 ('own', 'n'),
 ('house', 'e'),
 ('I', 'I'),
 ('doubt', 't'),
 ('if', 'f'),
 ('you', 'u'),
 ('will', 'l'),
 ('have', 'e'),
 ('the', 'e'),
 ('opportunity', 'y'),
 ('sir', 'r')]

In [None]:
RE