In [1]:
import re
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [2]:
## Read in the three books ##
def read_book(num):
    """Load one of the three book text files as a list of non-empty lines.

    Parameters
    ----------
    num : int
        Index into the list of file names
        (0 -> '1_Inkheart.txt', 1 -> '2_Inkspell.txt', 2 -> '3_Inkdeath.txt').

    Returns
    -------
    list of str
        The lines of the file without line terminators, empty lines removed.

    Raises
    ------
    IndexError
        If ``num`` is outside the range of the file-name list.
    OSError
        If the file cannot be opened (e.g. it does not exist under ./books/).
    """
    names = ['1_Inkheart.txt', '2_Inkspell.txt', '3_Inkdeath.txt']
    # Explicit encoding: the texts contain typographic quotes, so relying on the
    # platform default encoding would make the result machine-dependent.
    # The `with` block closes the file on exit; no manual close() is needed.
    with open('./books/' + names[num], 'r', encoding='utf-8') as f:
        book = f.read().splitlines()
    return [line for line in book if line != '']


## For each book, separate it into chapters ##
def chop_chapters(book, reg=r'[0-9]+'):
    """Split a book (list of lines) into chapters at every heading line.

    A line is a heading when ``reg`` matches at its start (``re.match``).
    Each heading line opens a new chapter and is kept as that chapter's
    first line. Everything before the first heading is stored under key 0
    (an empty list when the very first line is a heading).

    Parameters
    ----------
    book : list of str
        The lines of the book.
    reg : str, optional
        Regular expression identifying a chapter-heading line.

    Returns
    -------
    dict of int -> list of str
        Chapters keyed by consecutive integers starting at 0. An empty
        ``book`` yields an empty dict.
    """
    heading = re.compile(reg)
    chapters = {}

    first_line = 0
    for count, line in enumerate(book):
        if heading.match(line) is not None:
            # Close the chapter that ends just before this heading.
            chapters[len(chapters)] = book[first_line:count]
            first_line = count

    # Close the last open chapter after the loop. Doing this inside the loop
    # (only when the last line is *not* a heading) would drop the final
    # chapter whenever the book ends on a heading line.
    if book:
        chapters[len(chapters)] = book[first_line:]
    return chapters


# The name of the book is a recurring line which we don't want to take into account
def remove_book_name(book, name):
    """Return the lines of ``book`` that do not contain ``name`` as a substring."""
    kept = []
    for text in book:
        if name in text:
            continue
        kept.append(text)
    return kept


# In the third book, each chapter begins with the name of the chapter
# something that we also don't want to include in our text processing
def remove_chapter_name(chap_dict):
    """Drop the first two lines of every chapter.

    The dict is modified in place (each value is replaced by a new, shorter
    list) and the same dict object is returned.
    """
    for key in list(chap_dict):
        chap_dict[key] = chap_dict[key][2:]
    return chap_dict


In [3]:
# Load the raw lines of the three books
book1, book2, book3 = (read_book(idx) for idx in range(3))

# Strip the recurring book-title lines (done for books 2 and 3 only)
book2 = remove_book_name(book2, 'Ink 2 - Inkspell')
book3 = remove_book_name(book3, 'Ink 3 - Inkdeath')

# Split each book into chapters
ch1 = chop_chapters(book1)
ch2 = chop_chapters(book2)
# Book 3 uses "CHAPTER <n>" headings; its chapters also get their first two
# lines (heading and chapter name) removed
ch3 = remove_chapter_name(chop_chapters(book3, reg=r'CHAPTER [0-9]+'))

In [4]:
# Every chapter of the first book opens with a poem or quote whose author is
# given on a line starting with '-'. Keep only the text after that line.
for ch_count, chapter in ch1.items():
    # Index of the first author line, or None when the chapter has none
    author_idx = next(
        (idx for idx, text in enumerate(chapter) if text[0] == '-'),
        None,
    )
    if author_idx is not None:
        ch1[ch_count] = chapter[author_idx + 1:]

In [5]:
# In the second book the chapters also open with a quote whose author line
# starts with '-'; drop everything up to and including that line.
for ch_count, chapter in ch2.items():
    # Index of the first author line, or None when the chapter has none
    author_idx = next(
        (idx for idx, text in enumerate(chapter) if text[0] == '-'),
        None,
    )
    if author_idx is not None:
        ch2[ch_count] = chapter[author_idx + 1:]



In [6]:
# Print the lines stored under key 1 of the third book for a visual check
# (prints nothing when that key is absent)
for text in ch3.get(1, []):
    print(text)


Moonlight fell on Elinor’s bathrobe, her nightdress, her bare feet, and the dog lying in front of them. Orpheus’s dog. Oh, the way he looked at her with his eternally sad eyes! As if asking himself why, in the name of all the exciting smells in the world, she was sitting in her library in the middle of the night, surrounded by silent books, just staring into space.
“Why?” said Elinor in the silence. “Because I can’t sleep, you stupid animal.” But she patted his head all the same. This is what you’ve come to, Elinor, she thought as she hauled herself out of her armchair. Spending your nights talking to a dog. You don’t even like dogs, least of all this one, with his heavy breathing that always reminds you of his appalling master!
Still, she had kept the dog in spite of the painful memories he brought back. She’d kept the chair, too, even though the Magpie had sat in it. Mortola . . . how often Elinor thought she heard the old woman’s voice when she went into the quiet library, how often

In [7]:
import random
## Spot-check the preprocessing: show the first two lines of a random chapter of each book ##
for ch in [ch1, ch2, ch3]:
    # Chapter keys run from 0 to len(ch) - 1. randrange() excludes its upper
    # bound, whereas randint(0, len(ch)) could return len(ch) and raise KeyError.
    random_chap = random.randrange(len(ch))
    print('Chapter {}'.format(random_chap), ch[random_chap][:2], '\n\n')


Chapter 16 ['You were just three years old, Meggie," Mo began. "I remember how we celebrated your birthday. We gave you a picture book - you know, the one about the sea serpent with a toothache winding itself around the lighthouse. ..."', 'Meggie nodded. It was still in her book box - Mo had twice given it a new dress. "We?" she asked.'] 


Chapter 19 ['Meggie woke with a start. She had been dreaming, and her dreams had been bad, but she didn’t remember what they were about, only the fear they left behind like a knife wound in the heart. Noise came to her ears, shouting and loud laughter, children’s voices, the barking of dogs, the grunting of pigs, hammering, sawing. She felt sunlight on her face, and the air she was breathing smelled of dung and freshly baked bread. Where was she? Only when she saw Fenoglio sitting at his writing desk did she remember. Ombra - she was in Ombra.', '“Good morning!” Fenoglio had obviously slept extremely well. He looked very pleased with himself and the

In [8]:
from Book import Book
import itertools

# Build a Book object from the chapter dict of the first book.
# NOTE(review): Book is a project-local class not shown in this notebook;
# presumably it produces the (token, POS tag) data queried below through
# get_proc_num / get_process -- confirm against Book.py.
b1 = Book(ch1)

In [10]:
from collections import Counter

# Frequency of the (token, tag) pairs tagged 'NNP' among those returned by
# b1.get_proc_num(1) (presumably chapter 1 -- confirm against Book)
proper_nouns = (pair for pair in b1.get_proc_num(1) if pair[1] == 'NNP')
Counter(proper_nouns).most_common()

[(('Meggie', 'NNP'), 53),
 (('Mo', 'NNP'), 40),
 (('Dustfinger', 'NNP'), 15),
 (('Come', 'NNP'), 5),
 (('Please', 'NNP'), 3),
 (('Twelve', 'NNP'), 3),
 (('Capricorn', 'NNP'), 3),
 (('Go', 'NNP'), 2),
 (('May', 'NNP'), 2),
 (('Oh', 'NNP'), 2),
 (('Send', 'NNP'), 2),
 (('Word', 'NNP'), 2),
 (('Which', 'NNP'), 1),
 (('Fire', 'NNP'), 1),
 (('Later', 'NNP'), 1),
 (('Dr.', 'NNP'), 1),
 (('Jekyll', 'NNP'), 1),
 (('Mr', 'NNP'), 1),
 (('Hyde', 'NNP'), 1),
 (('Books', 'NNP'), 1),
 (('Silvertongue', 'NNP'), 1),
 (('Ah', 'NNP'), 1),
 (('Ginger', 'NNP'), 1),
 (('Back', 'NNP'), 1),
 (('Believe', 'NNP'), 1),
 (('Yes', 'NNP'), 1),
 (('Try', 'NNP'), 1)]

In [18]:
# Flatten the lists held in the values of b1.get_process() into one list
all_pos = [pair for pairs in b1.get_process().values() for pair in pairs]

# Keep only the pairs whose second element (the tag) is 'NNP'
all_nnp = [pair for pair in all_pos if pair[1] == 'NNP']

Counter(all_nnp).most_common()

[(('Meggie', 'NNP'), 1528),
 (('Dustfinger', 'NNP'), 799),
 (('Mo', 'NNP'), 770),
 (('Capricorn', 'NNP'), 718),
 (('Elinor', 'NNP'), 658),
 (('Basta', 'NNP'), 631),
 (('Fenoglio', 'NNP'), 323),
 (('Farid', 'NNP'), 237),
 (('Silvertongue', 'NNP'), 154),
 (('Flatnose', 'NNP'), 128),
 (('Magpie', 'NNP'), 100),
 (('Gwin', 'NNP'), 88),
 (('Well', 'NNP'), 66),
 (('Resa', 'NNP'), 66),
 (('Cockerell', 'NNP'), 61),
 (('Darius', 'NNP'), 51),
 (('Inkheart', 'NNP'), 49),
 (('Shadow', 'NNP'), 49),
 (('Did', 'NNP'), 42),
 (('Mortola', 'NNP'), 40),
 (('Pippo', 'NNP'), 38),
 (('Come', 'NNP'), 34),
 (('Oh', 'NNP'), 32),
 (('Mortimer', 'NNP'), 27),
 (('Bell', 'NNP'), 26),
 (('Tinker', 'NNP'), 25),
 (('Please', 'NNP'), 22),
 (('Paula', 'NNP'), 22),
 (('Suppose', 'NNP'), 21),
 (('Ah', 'NNP'), 19),
 (('Heavens', 'NNP'), 18),
 (('Go', 'NNP'), 15),
 (('Black', 'NNP'), 15),
 (('Peter', 'NNP'), 15),
 (('No', 'NNP'), 14),
 (('Poor', 'NNP'), 14),
 (('Teresa', 'NNP'), 14),
 (('A', 'NNP'), 13),
 (('Tell', 'NNP'), 

In [None]:
import nltk
from nltk import word_tokenize
from nltk import pos_tag

def preprocess(sent):
    """Tokenize ``sent`` with ``word_tokenize`` and POS-tag the tokens.

    Returns the result of ``pos_tag`` applied to the token list.
    """
    tokens = word_tokenize(sent)
    return pos_tag(tokens)

In [None]:
# POS-tag the first line stored under key 1 of the first book and display it
sent = preprocess(ch1[1][0])
sent

In [None]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (<DT>?),
# any number of adjectives (<JJ>*), followed by a noun (<NN>)
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
# Apply the chunker to `sent`, the POS-tagged sentence
cs = cp.parse(sent)
print(cs)

In [None]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Convert the chunk tree `cs` into CoNLL-style tuples and pretty-print them
# (per NLTK, each tuple is (word, POS tag, IOB chunk tag))
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

In [None]:
from nltk.chunk import ne_chunk
# Download the NLTK resources requested for named-entity chunking
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Tokenize, POS-tag and named-entity chunk the first line under key 1 of book 1
first_line_tagged = pos_tag(word_tokenize(ch1[1][0]))
ne_tree = ne_chunk(first_line_tagged)
print(ne_tree)



In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

# Load the en_core_web_sm spaCy pipeline; `nlp` is used below to parse text
nlp = en_core_web_sm.load()

In [None]:
# Join the lines stored under key 1 of the first book into a single string
chapter1_text = " ".join(ch1[1])
# Run the spaCy pipeline on the whole text
chapter1 = nlp(chapter1_text)
# Number of named entities found
len(chapter1.ents)

In [None]:
# Count how often each entity label occurs among the entities of `chapter1`
labels = [ent.label_ for ent in chapter1.ents]
Counter(labels)

In [None]:
# Materialize the sentence spans of `chapter1` and show the second one
sentences = list(chapter1.sents)
print(sentences[1])

In [None]:
# Render the entities of the already-parsed `chapter1` Doc directly.
# str(sentences) is the repr of a Python list (surrounding brackets and
# separators between sentences), so parsing it again would feed those
# artifacts to the pipeline and repeat work that has already been done.
displacy.render(chapter1, jupyter=True, style='ent')
