In [34]:
# imports
from urllib.request import urlopen, HTTPError
from bs4 import BeautifulSoup
import re

# sentence parsing
import spacy

from random import sample, choices, seed
seed(1)

from collections import Counter
from pprint import pprint

import os

import pandas as pd

# Exercise 1: getting `n` sentences per author for `k` authors

## Constants

In [33]:
# program default parameters
AUTHOR_NUMBER = 10  # k
SENTENCE_PER_AUTHOR = 50  # n

# book selection criterion
LANGUAGE_ROLE_REGEX = re.compile(r'\((\w+)\) \(as (\w+)\)')
# - indivdual role in the book
ROLES = {'Illustrator', 'Editor', 'Compiler', 'Translator', 'Contributer', 'Dubious author', 'Author'}
ROLE = 'Author'
assert ROLE in ROLES
# - language
SCIPY_LANGUAGES = {
    'German': 'de',
    'Greek': 'el',
    'English': 'en',
    'Spanish': 'es',
    'French': 'fr',
    'Italian': 'it',
    'Dutch': 'nl',
    'Portuguese': 'pt'}
LANGUAGE = 'English'
assert LANGUAGE in SCIPY_LANGUAGES.keys()

# author selection criterion
# - alphabetical: look up the authors in lexicographic order, untill the requiered number of author is reached
# - lexicographic balanced: try to find the same number of authors for each letter
AUTHOR_SELECTION_STRATEGIES = {'lexicographic', 'lexicographic balanced'}
AUTHOR_SELECTION_STRATEGY = 'lexicographic'
assert AUTHOR_SELECTION_STRATEGY in AUTHOR_SELECTION_STRATEGIES

# minimum number of books per author
BOOK_THRESHOLD = 4

# url for letter search
GUTEMBERG_LETTER_URL = 'http://www.gutenberg.org/browse/authors/{}'

# url for book download
GUTEMBERG_LETTER_UTF = "http://www.gutenberg.org{}.txt.utf-8"

VERBOSE = False

LETTERS = 'abcdefghijklmnopqrstuvwxyz'

# data save directory
SAVE_PATH = 'data'
if not os.path.isdir(SAVE_PATH):
    os.mkdir(SAVE_PATH)
    
# CSV book/bookfile name/author matchup file
CSV_FILE_PATH = os.path.join(SAVE_PATH, 'book_catalogue.csv')
CSV_HEADERS = ['author', 'book_url', 'book_title', 'book_file']

In [3]:
# scipy parser singleton
PARSERS = dict()
def get_parser(language=LANGUAGE):
    # load the sentence parser for the target language
    if language not in PARSERS.keys():
        PARSERS[language] = spacy.load(SCIPY_LANGUAGES[language])
    return PARSERS[language]


## Code

In [4]:
def get_book(book_link, base_url=GUTEMBERG_LETTER_UTF, verbose=VERBOSE):
    url = base_url.format(book_link)
    
    if verbose: print("Downloading book {} from {}".format(book_link, url))
    
    # fetching file content
    text = urlopen(url).read()
    return text

In [5]:
# TODO add check for duplicate book
def parse_page(soup: BeautifulSoup, language, role, base_book_url=GUTEMBERG_LETTER_UTF, verbose=VERBOSE):
    content_div = soup.find('div', class_='pgdbbyauthor')
    if not content_div: # if the main div is not found, return empy list of books
        yield None, []
    
    else:
        authors_lists = content_div.find_all(["ul", "h2"])
        
        for elem in authors_lists:
            if elem.name == 'h2':
                # get the first a tag with name attribute as the author name if there is at least one
                name_list = [a_tag.get_text() for a_tag in elem.find_all('a') if a_tag.has_attr('name')]
                author_name = None if len(name_list) < 1 else name_list[0]
                
            elif elem.name == 'ul' and author_name:
                book_li_list = elem.find_all('li', class_="pgdbetext")
                books = dict()
                for book_li in book_li_list:
                    # we try to extract the language and role for the book
                    match = LANGUAGE_ROLE_REGEX.search(book_li.get_text())
                    
                    if match:
                        book_language, book_role = match.group(1, 2)

                        if book_language.lower() == language.lower() and book_role.lower() == role.lower():
                            try: # try to open and decode the book, if it fails we ignore the book
                                book_title = book_li.find('a').get_text()
                                book_link = book_li.find('a')['href']
                                
                                get_book(book_link, base_url=base_book_url, verbose=verbose).decode('utf-8')
                                
                                # add the book to the list of books
                                books[book_title] = book_link
                                
                            except (UnicodeDecodeError, HTTPError):
                                if verbose: print("Book {} is either badly encoded or unavailable at this address, skipping it".format(book_link))
                                pass
                
                yield author_name, books
                
                # clean the memory to avoid getting multiple ul per author
                author_name = None

In [6]:
# TODO add check for duplicate book
def get_authors_books(letter,
                      book_anti_diplicate_set=set(),
                      language=LANGUAGE,
                      role=ROLE,
                      book_threshold=BOOK_THRESHOLD,
                      author_number=AUTHOR_NUMBER,
                      base_letter_url=GUTEMBERG_LETTER_URL,
                      base_book_url=GUTEMBERG_LETTER_UTF,
                      verbose=VERBOSE):
    url = base_letter_url.format(letter.lower())
    
    # get page
    html = urlopen(url)
    soup = BeautifulSoup(html)
    
    # extract autors and books from list
    author_dict = dict()
    for author_name, books in parse_page(soup, language, role, base_book_url=base_book_url, verbose=verbose):
        books = {book_link: book_title for book_link, book_title in books.items()
                 if book_link not in book_anti_diplicate_set} # remove books already selected
        if len(books) >= book_threshold:
            # add author and books to the list if there is enough books
            author_dict[author_name] = books
            book_anti_diplicate_set |= books.keys()
            
            if verbose: print(author_name, books)
            
        
        if len(author_dict) >= author_number: break
            
    return author_dict, book_anti_diplicate_set

In [16]:
# pick sentences from a book
def get_book_sents(book_link, sentence_number, language=LANGUAGE, base_url=GUTEMBERG_LETTER_UTF, verbose=VERBOSE):
    # get the content of the book
    book_text = get_book(book_link, base_url, verbose=verbose).decode('utf-8')
    book_text = book_text.split("***",2)[2].strip() # removing gutenberg header
    
    # sentence tokenize using spacy
    book_text = book_text[:500000]  # taking only a sample of the book as spacy puts a limit on the number of characters
    if verbose: print("Parsing book {} with spacy".format(book_link))
    parsed_text = get_parser(language)(book_text)
    sentences = [sent.text for sent in parsed_text.sents]
    
    # random sampling of sentences
    chosen_sentences = sample(sentences, k=sentence_number)
    
    # removing extra spacing characters from selected lines
    chosen_sentences = [re.sub(r'\s+', ' ', sent).strip() for sent in chosen_sentences]
    return chosen_sentences

In [12]:
def get_sentence_per_book(books, sentence_number=SENTENCE_PER_AUTHOR, verbose=VERBOSE):  #decide number of lines per book
    # select the number of sentences to pick from each book
    sentence_per_book_counter = Counter()
    while sum(sentence_per_book_counter.values()) < sentence_number:
        for book_name, book_url in books.items():
            sentence_per_book_counter[book_url] += 1
            if sum(sentence_per_book_counter.values()) >= sentence_number:
                break
                
    if verbose:
        print("Sentence repartition:")
        pprint(sentence_per_book_counter)
        
    return sentence_per_book_counter

In [19]:
def get_sentences(author_dict,
                  sentence_number=SENTENCE_PER_AUTHOR,
                  language=LANGUAGE,
                  base_book_url=GUTEMBERG_LETTER_UTF,
                  verbose=VERBOSE):
    sentence_dict = dict()
    for author, books in author_dict.items():
        # select the number of sentences to pick from each book of the author
        line_per_book_counter = get_sentence_per_book(books, sentence_number, verbose=verbose)
        
        # load the sentences from each book
        sentence_dict[author] = dict()
        for book_url, book_sentence_number in line_per_book_counter.items():
            sentences = get_book_sents(book_url,
                                       book_sentence_number,
                                       base_url=base_book_url,
                                       language=language,
                                       verbose=verbose)
            sentence_dict[author][book_url] = sentences
            if verbose: 
                print("Sentences from {}:".format(book_url))
                pprint(sentences)
    
    return sentence_dict

In [10]:
def author_per_letter(author_number=AUTHOR_NUMBER, letters=LETTERS):
    """Return a Counter object over randomly chosen letters among 'letters'.
    The total of the elements in the counter is equal to 'author_number'.
    
    :param author_number: number of elements to choose accross the letters (int)
    :param letters: sequence of elements to choose from (itterable)
    
    :return: a counter where keys are letters and values add up to author_number (Counter)"""
    return Counter(choices(letters, k=author_number))

In [31]:
def get_file_name(book_url, path=SAVE_PATH):
    return os.path.join(path, book_url.split('/')[-1] + ".txt")

def save_sentence_dict(sentence_dict, author_dict, path=SAVE_PATH, verbose=VERBOSE):
    book_to_path = dict()
    for author_sentence_dict in sentence_dict.values():
        for book_url, book_sentences in author_sentence_dict.items():
            file_name = get_file_name(book_url, path=path)

            with open(file_name, 'w') as f:
                f.write("\n".join(book_sentences))

                # stores the path to the file
                book_to_path[book_url] = file_name
                if verbose: print("{} line(s) from book '{}' written to '{}'".format(len(book_sentences), book_url, file_name))
    
    # save the information about books from which sentences were chosen (their URL, their title, and the file where the sentences were saved)
    csv_data = [[author_name, book_url, book_title, book_to_path[book_url]]
                for author_name, books in author_dict.items()
                for book_title, book_url in books.items()
                if book_url in book_to_path.keys()]
    
    # write the data to CSV
    pprint(csv_data)
    df = pd.DataFrame(csv_data, columns=CSV_HEADERS)
    df.to_csv(CSV_FILE_PATH)
    
save_sentence_dict({'Fabre, Jean-Henri, 1823-1915': {'/ebooks/3421':
['Most of these insects have been submitted to a learned expert, Professor Jean Perez, of Bordeaux.'],
                    '/ebooks/27868':
['The Glow-worm does not eat in the strict sense of the word: he drinks his fill; he feeds on a thin gruel into which he transforms his prey by a method recalling that of the maggot.']}},
                   {'Fabre, Jean-Henri, 1823-1915': {'Bramble-Bees and Others': '/ebooks/3421', 'The Glow-Worm and Other Beetles': '/ebooks/27868', 'The Life of the Fly; With Which are Interspersed Some Chapters of Autobiography': '/ebooks/3422', 'The Life of the Spider': '/ebooks/1887', 'The Mason-Bees': '/ebooks/2884', 'More Hunting Wasps': '/ebooks/3462', 'Social Life in the Insect World': '/ebooks/18350', 'The Wonders of Instinct: Chapters in the Psychology of Insects': '/ebooks/3754'}},
                  verbose=True) 

1 line(s) from book '/ebooks/3421' written to 'data\3421.txt'
1 line(s) from book '/ebooks/27868' written to 'data\27868.txt'
[['Fabre, Jean-Henri, 1823-1915',
  '/ebooks/3421',
  'Bramble-Bees and Others',
  'data\\3421.txt'],
 ['Fabre, Jean-Henri, 1823-1915',
  '/ebooks/27868',
  'The Glow-Worm and Other Beetles',
  'data\\27868.txt']]


In [20]:
def exercice_1(author_number=AUTHOR_NUMBER,
               sentence_number=SENTENCE_PER_AUTHOR,
               language=LANGUAGE,
               role=ROLE,
               book_threshold=BOOK_THRESHOLD,
               base_letter_url=GUTEMBERG_LETTER_URL,
               base_book_url=GUTEMBERG_LETTER_UTF,
               letters=LETTERS,
               save_path=SAVE_PATH,
               verbose=VERBOSE):
    book_anti_diplicate_set = set()
    author_dict = dict()
    
    # Step 1: selecting the k authors
    for letter, count in author_per_letter(author_number=author_number, letters=letters).items():
        if verbose: print("Picking {} author(s) from letter {}".format(count, letter))
            
        # generate a dictionnary of authors and their books
        author_dict_temp, book_anti_diplicate_set = get_authors_books(letter,
                                                                      book_anti_diplicate_set=book_anti_diplicate_set, 
                                                                      language=language,
                                                                      book_threshold=book_threshold,
                                                                      author_number=count,
                                                                      base_letter_url=base_letter_url,
                                                                      base_book_url=base_book_url,
                                                                      verbose=verbose)
        author_dict.update(author_dict_temp)
        
    # Step 2: getting n sentences per author
    sentence_dict = get_sentences(author_dict,
                                  sentence_number=sentence_number,
                                  language=language,
                                  base_book_url=base_book_url,
                                  verbose=verbose)
    
    # Step 3: storing the sentences, 1 file per book, adn 1 file to link the books to their main author
    save_to_files(sentence_dict, author_dict, path=save_path, verbose=verbose)
    return author_dict
        
print(exercice_1(author_number=3, sentence_number=2, verbose=True))

Picking 1 author(s) from letter s
Downloading book /ebooks/38446 from http://www.gutenberg.org/ebooks/38446.txt.utf-8
Downloading book /ebooks/18787 from http://www.gutenberg.org/ebooks/18787.txt.utf-8
Downloading book /ebooks/2389 from http://www.gutenberg.org/ebooks/2389.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/2389
Downloading book /ebooks/1965 from http://www.gutenberg.org/ebooks/1965.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/1965
Downloading book /ebooks/2636 from http://www.gutenberg.org/ebooks/2636.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/2636
Downloading book /ebooks/7949 from http://www.gutenberg.org/ebooks/7949.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/7949
Downloading book /ebooks/3467 from http://www.gutenberg.org/ebooks/3467.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/3467
Downloading book /

Downloading book /ebooks/27513 from http://www.gutenberg.org/ebooks/27513.txt.utf-8
Downloading book /ebooks/11398 from http://www.gutenberg.org/ebooks/11398.txt.utf-8
Book is either badly encoded or unavailable at this address: /ebooks/11398
Downloading book /ebooks/6462 from http://www.gutenberg.org/ebooks/6462.txt.utf-8
Downloading book /ebooks/27684 from http://www.gutenberg.org/ebooks/27684.txt.utf-8
Downloading book /ebooks/17469 from http://www.gutenberg.org/ebooks/17469.txt.utf-8
Downloading book /ebooks/748 from http://www.gutenberg.org/ebooks/748.txt.utf-8
Downloading book /ebooks/27079 from http://www.gutenberg.org/ebooks/27079.txt.utf-8
Yates, Dornford, 1885-1960 {'Anthony Lyveden': '/ebooks/27684', 'Berry and Co.': '/ebooks/17469', 'The Brother of Daphne': '/ebooks/748', 'Jonah and Co.': '/ebooks/27079'}
Sentence repartition:
Counter({'/ebooks/54887': 1, '/ebooks/31130': 1})
Downloading book /ebooks/54887 from http://www.gutenberg.org/ebooks/54887.txt.utf-8
Parsing book /e

NameError: name 'save_sentence_dict' is not defined

## Step 1: get a list of authors and their books (and choose the authors)

In [25]:
test_author_books = get_authors_books('a', author_number=15, book_threshold=2, verbose=True)

Downloading book /ebooks/29666 from http://www.gutenberg.org/ebooks/29666.txt.utf-8
Downloading book /ebooks/26348 from http://www.gutenberg.org/ebooks/26348.txt.utf-8
Downloading book /ebooks/11861 from http://www.gutenberg.org/ebooks/11861.txt.utf-8
Downloading book /ebooks/25753 from http://www.gutenberg.org/ebooks/25753.txt.utf-8
Aaron, S. F. (Samuel Francis), 1862- {"Radio Boys Cronies\rOr, Bill Brown's Radio": '/ebooks/11861', 'Radio Boys Loyalty; Or, Bill Brown Listens In': '/ebooks/25753'}
Downloading book /ebooks/10338 from http://www.gutenberg.org/ebooks/10338.txt.utf-8
Downloading book /ebooks/16791 from http://www.gutenberg.org/ebooks/16791.txt.utf-8
Downloading book /ebooks/23037 from http://www.gutenberg.org/ebooks/23037.txt.utf-8
Downloading book /ebooks/19768 from http://www.gutenberg.org/ebooks/19768.txt.utf-8
Downloading book /ebooks/58663 from http://www.gutenberg.org/ebooks/58663.txt.utf-8
Downloading book /ebooks/4955 from http://www.gutenberg.org/ebooks/4955.txt.u

Downloading book /ebooks/27692 from http://www.gutenberg.org/ebooks/27692.txt.utf-8
Downloading book /ebooks/28776 from http://www.gutenberg.org/ebooks/28776.txt.utf-8
Downloading book /ebooks/12291 from http://www.gutenberg.org/ebooks/12291.txt.utf-8
Downloading book /ebooks/22251 from http://www.gutenberg.org/ebooks/22251.txt.utf-8
Downloading book /ebooks/25848 from http://www.gutenberg.org/ebooks/25848.txt.utf-8
Downloading book /ebooks/25351 from http://www.gutenberg.org/ebooks/25351.txt.utf-8
Abbott, Jacob, 1803-1879 {'Alexander the GreatMakers of History': '/ebooks/30624', 'Caleb in the Country': '/ebooks/23989', 'Charles IMakers of History': '/ebooks/26734', 'Cleopatra': '/ebooks/10992', 'Cyrus the GreatMakers of History': '/ebooks/30707', 'Darius the GreatMakers of History': '/ebooks/27802', "Forests of MaineMarco Paul's Adventures in Pursuit of Knowledge": '/ebooks/24831', 'Genghis Khan, Makers of History Series': '/ebooks/28667', 'Gentle Measures in the Management and Traini

Downloading book /ebooks/44980 from http://www.gutenberg.org/ebooks/44980.txt.utf-8
Downloading book /ebooks/30047 from http://www.gutenberg.org/ebooks/30047.txt.utf-8
Downloading book /ebooks/15648 from http://www.gutenberg.org/ebooks/15648.txt.utf-8
Downloading book /ebooks/22305 from http://www.gutenberg.org/ebooks/22305.txt.utf-8
Downloading book /ebooks/26416 from http://www.gutenberg.org/ebooks/26416.txt.utf-8
Abbot, Willis J. (Willis John), 1863-1934 {"Aircraft and Submarines\rThe Story of the Invention, Development, and Present-Day Uses of War's Newest Weapons": '/ebooks/30047', 'American Merchant Ships and Sailors': '/ebooks/15648', 'The Naval History of the United States. Volume 1': '/ebooks/22305', 'The Naval History of the United States. Volume 2': '/ebooks/26416'}
Downloading book /ebooks/32996 from http://www.gutenberg.org/ebooks/32996.txt.utf-8
Downloading book /ebooks/12541 from http://www.gutenberg.org/ebooks/12541.txt.utf-8
Downloading book /ebooks/38173 from http://w

## Step 2: get the sentences from a book

In [26]:
for books in test_author_books.values():
    for book_name, book_url in books.items():
        print(get_book_sents(book_url, 5, verbose=True))

Downloading book /ebooks/11861 from http://www.gutenberg.org/ebooks/11861.txt.utf-8
Parsing book /ebooks/11861 with spacy
["that's nothing but selfishness!", 'Then: "Bring \'em all back here, Gus."', "Divy-divy, half'n'half, fifty-fifty", '"I don\'t suppose it makes much difference what he says; he simply doesn\'t know what he\'s talk--', '"The road.']
Downloading book /ebooks/25753 from http://www.gutenberg.org/ebooks/25753.txt.utf-8
Parsing book /ebooks/25753 with spacy
['What if the blow had proved fatal?', '"It\'s an invitation to a banquet, or something," Gus said.', 'What if we work it this way?"', 'Questioned eagerly, Bill explained quite freely the purpose of the encounter and its result.', 'When Bill spoke again, some few minutes']
Downloading book /ebooks/97 from http://www.gutenberg.org/ebooks/97.txt.utf-8
Parsing book /ebooks/97 with spacy
['"Not at all silly," said I, losing my temper; "here for example, I take this Square," and, at the word, I grasped a moveable Square, w

Downloading book /ebooks/34673 from http://www.gutenberg.org/ebooks/34673.txt.utf-8
Parsing book /ebooks/34673 with spacy
["The Foundation's EIN or federal tax identification number is 64-6221541.", 'Many tales are told.', 'The second phase of this hydro-engineering feat was now begun.', 'A slap on the water and a shower of spray informed us that we were recognized.', 'Smaller trees were dug up and roots which crossed the path of the canal were cut off as clean as if chopped with an axe.']
Downloading book /ebooks/34672 from http://www.gutenberg.org/ebooks/34672.txt.utf-8
Parsing book /ebooks/34672 with spacy
['Dr. Gregory B. Newby Chief Executive and Director gbnewby@pglaf.org Section 4.', 'Copyright 1919', 'Copyright laws in most countries are in a constant state of change.', 'You may convert to and distribute this work in any binary, compressed, marked up, nonproprietary or proprietary form, including any word processing or hypertext form.', '1.F.3.']
Downloading book /ebooks/34669 

['There are four several methods by which the various communities into which the human race is divided obtain their subsistence from the productions of the earth, each of which leads to its own peculiar system of social organization, distinct in its leading characteristics from those of all the rest.', 'there to render an account of their administration, and to answer any charges which had been made against them.', 'The reason is, that towns are the seats of commerce and manufactures, and they derive their chief importance from those pursuits; whereas the Monguls and Tartars led almost exclusively a wandering and pastoral life, and all their ideas of wealth and grandeur were associated with great flocks and herds of cattle, and handsome tents, and long trains of wagons loaded with stores of clothing, arms, and other movables, and vast encampments in the neighborhood of rich and extended pasture-grounds.', 'One of them was to the King of France.', 'His birth took place, as nearly as can

['The roof was formed of sheets of hemlock bark, laid, like slates upon rafters made of the stems of slender trees.', 'The horse, hearing footsteps, and supposing from the sound that somebody might be coming to catch him, was at first disposed to set off and gallop away; but looking round and seeing that it was nobody but Phonny he went on eating as before.', '"She has found the ladder," said Mary Bell, and leaving the stairs she went to meet her.', 'She also had a pail of water ready, from the spring, and the tea-kettle by the side of it, ready to be filled.', 'In the morning, the first thought which came into her mind was, that Mary Bell was coming to see her.']
Downloading book /ebooks/28283 from http://www.gutenberg.org/ebooks/28283.txt.utf-8
Parsing book /ebooks/28283 with spacy
["Mary's reception.--", "The end of Mary's ambition realized.--", 'She said in this letter that she was glad to hear that they had pronounced sentence of death against her, for she was weary of life, and h

['_', 'Rollo also took great pains to guard against another fault which boys often fall into in writing their letters; that is, the fault of growing careless about the writing as they go on with the work, by which means a letter is produced which looks very neat and pretty at the beginning, but becomes an ill-looking and almost illegible scrawl at the end.', "* * * * * MY UNCLE TOBY'S LIBRARY, By FRANCIS FORRESTER, Esq., Consists of TWELVE VOLUMES, elegantly bound, and Illustrated with upwards of SIXTY BEAUTIFUL ENGRAVINGS.", '"I wish you would go on with us," said Mr. Parkman.', 'In the centre of this room there was a sort of low counter, enclosing a sort of oblong square.']
Downloading book /ebooks/24182 from http://www.gutenberg.org/ebooks/24182.txt.utf-8
Parsing book /ebooks/24182 with spacy
['"Because," replied Rollo, "unless there was a quay or a shore close by, they would not have any thing to fasten the line to."', 'Mr. George and Rollo, after this, walked about the deck of the

Downloading book /ebooks/28776 from http://www.gutenberg.org/ebooks/28776.txt.utf-8
Parsing book /ebooks/28776 with spacy
['"Then you have had no opportunity to spend it at all?"', '"Come here a minute," said Phonny.', 'The bench had been cleared off, so that there was a good space there to put the box upon.', '"Well," said Stuyvesant.', 'He concluded to have his ladder eight feet long, and to have six cross-bars, one foot apart, the upper and lower cross-bars to be one foot from the ends of the ladder.']
Downloading book /ebooks/12291 from http://www.gutenberg.org/ebooks/12291.txt.utf-8
Parsing book /ebooks/12291 with spacy
['It will be perceived that three methods of examining classes have now been named, and these will afford the teacher the means of introducing a very great variety in his mode of conducting his recitations, while he still carries his class forward steadily in their prescribed course.', 'Mode of illustrating it.--', 'But we are wandering a little from our subject, w

['The arm rapidly grew inflamed, became terribly painful, and must be amputated or the life lost.', 'The Indians probably retaliated upon the first band of white men which came within their power.', 'Let us now pass from these scenes to the spring of the year 1854.', '"Am I the person you are looking for?"', 'Shunan was thoroughly humbled, and became as docile as a child.']
Downloading book /ebooks/23798 from http://www.gutenberg.org/ebooks/23798.txt.utf-8
Parsing book /ebooks/23798 with spacy
['Selecting a small party of but nineteen men, about the first of August he emerged from Boonesborough, marched boldly to the Ohio, crossed the river, entered the valley of the Scioto, and was within four miles of an Indian town, Paint Creek, which he intended to destroy, when he chanced to encounter a band of thirty savages painted, thoroughly armed and on the war path, to join the band advancing from Old Chilicothe.', 'Exercise and excitement gave them health.', 'It is the intention of the publ

ValueError: [E088] Text of length 1052189 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

## Step 3: get `n` sentences per author

In [None]:
for books in test_author_books.values():
    print(get_line(books, 4))

## Step 4: get `k` authors and `n` sentences per author

In [None]:
print(extract_book(test_author_books, 2))

In [None]:
book_title_per_book_per_authors
book_lines_per_book_per_authors
wikidata_uri_per_author
wiki_abstract_per_lang_per_author
wiki_movement_per_author

In [15]:
list(x for x in range(10))[:
                           5]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]