# Summary of Subject Corpus

## Imports

In [1]:
import re
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib widget
import seaborn as sns
import spacy
# from collocater import collocater
import nltk

## Parameters

In [2]:
corpus_db_url = "http://corpus-db.org/api"
book = 'The Hound of The Baskervilles'
book_id = 3070.0  # have to know the id at the moment

## Get Data

### Get Metadata

In [3]:
metadata = pd.DataFrame(json.loads(requests.get(corpus_db_url + f"/id/{book_id}").text), index=[0])
metadata = metadata.replace('', np.nan).dropna(axis=1)

display(metadata)

Unnamed: 0,lcsh,creator,downloads,rights_url,authoryearofdeath,_repo,wp_subjects,gutenberg_issued,identifiers,formats,...,id,gutenberg_bookshelf,_version,title,type,wp_literary_genres,publisher,covers,description,filename
0,"{'Blessing and cursing -- Fiction', 'Holmes, S...","{'author': {'agent_name': 'Doyle, Arthur Conan...",348,http://creativecommons.org/about/pdm,1930,The-Hound-of-the-Baskervilles_3070,"['Gothic_novels', '1902_novels', 'Dartmoor', '...",2002-02-01,"{'gutenberg': '3070', 'wikidata': 'Q45192'}",{'text/plain; charset=us-ascii': 'http://www.g...,...,3070.0,"Bestsellers, American, 1895-1923",0.2.0,The Hound of the Baskervilles,Text,['Detective_fiction'],Project Gutenberg,"[{'cover_type': 'generated', 'image_path': 'co...",The Hound of the Baskervilles is the third of ...,/run/media/jon/SAMSUNG/gitenberg/The-Hound-of-...


### Get Full Text

In [4]:
corpus = json.loads(requests.get(corpus_db_url + f"/id/{book_id}/fulltext").text)

print('Book is {0} characters long.'.format(len(corpus[0]['text'])))

Book is 318546 characters long.


### Load into SpaCy

In [5]:
# !python -m spacy download en_core_web_lg  # note this doesn't really work properly with pipenvs

nlp = spacy.load("en_core_web_lg")

# collie = collocater.Collocater.loader()
# nlp.add_pipe(collie)

doc_string = re.sub(' +', ' ', corpus[0]['text'].replace('\r', ' ').replace('\n', ' ').replace("\'", "'"))  # remove weird characters and extra whitespace
doc = nlp(doc_string)  

display(doc[:10])

 The Hound of the Baskervilles by Sir Arthur Conan

### List of SpaCy properties

In [6]:
display([prop for prop in dir(doc) if not prop.startswith('_')])

['cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_bytes',
 'to_disk',
 'to_json',
 'to_utf8_array',
 'user_data',
 'user_hooks',
 'user_span_hooks',
 'user_token_hooks',
 'vector',
 'vector_norm',
 'vocab']

## High-level Analyses

### Total Number of Words

In [7]:
print('Total number of words is {0}.'.format(len(doc)))

Total number of words is 70781.


### Number of Words Per Chapter

In [8]:
def split_into_chapters(doc):
    """
    Parameters:
    doc (spacy.Doc): list of sentences representing a book

    Returns:
    A list of integers representing the positions of chapters as a word number
    """
    
    words = [t.text for t in doc]

    chapter_word_idx = [] #represents location of chapters in book
    chapter_word_idx.append(0)
    for i,w in enumerate(words):
        if w == "Chapter":
            chapter_word_idx.append(i)
    chapter_word_idx.append(len(words))
        
    return np.array(chapter_word_idx)

In [9]:
chapter_word_idx = split_into_chapters(doc)  # list of chapters locations in words
ignore = 16  # ignore the first 16 for tHotB as they're the prelim
chapter_word_idx = chapter_word_idx[ignore:]
# display(chapter_word_idx)

chapters = np.arange(1,len(chapter_word_idx))
chapter_word_count = chapter_word_idx[1:] - chapter_word_idx[:-1]
# display(chapters, chapter_word_count)

plt.figure()
sns.barplot(x=chapters, y=chapter_word_count)
plt.ylabel('Word Count')
plt.xlabel('Chapter')
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Vocabulary

In [10]:
words = [w.text.lower() for w in doc]

print('Total number of unique words (vocabulary) is {0}.'.format(len(set(words))))

chapter_vocab = np.array([len(set(words[chapter_word_idx[i]:chapter_word_idx[i+1]])) for i in np.arange(len(chapters))])

plt.figure()
sns.barplot(x=chapters, y=100*chapter_vocab/chapter_word_count)
plt.ylabel('Unique Word Count/%')
plt.ylim(0,100)
plt.xlabel('Chapter')
plt.tight_layout()
plt.show()

Total number of unique words (vocabulary) is 5627.


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Most Common Words of Different Types

In [11]:
# Common Nouns
nouns = [w.lemma_ for w in doc.noun_chunks if (not w.root.is_stop)]  # this gets rid of common pronouns like 'i', 'he', 'she', etc.
nouns = pd.DataFrame(nouns,columns=['word'])
nouns = nouns.groupby('word')['word']
nouns = nouns.describe()['count']
nouns = nouns.sort_values(ascending=False)
print("The 20 most common nouns are:")
display(nouns.head(20))

The 20 most common nouns are:


word
the moor         133
Holmes           112
Sir Henry        102
Dr. Mortimer      63
Stapleton         60
Sir Charles       53
-PRON- friend     49
the man           49
-PRON- hand       47
London            44
Barrymore         43
-PRON- eye        38
the baronet       38
the matter        35
-PRON- wife       35
-PRON- face       33
a man             31
-PRON- mind       31
the house         30
the hound         29
Name: count, dtype: object

In [12]:
# Common Adjectives
adjectives = [(w.lemma_) for w in doc if (not (w.is_punct or w.is_space or w.is_stop) and w.pos_=="ADJ")]
adjectives = pd.DataFrame(adjectives, columns=['word'])
adjectives = adjectives.groupby('word')['word']
adjectives = adjectives.describe()['count']
adjectives = adjectives.sort_values(ascending=False)
print("The 20 most common adjectives are:")
display(adjectives.head(20))

The 20 most common adjectives are:


word
good       71
old        69
great      60
little     55
long       49
black      45
dark       42
dear       31
small      29
able       27
clear      27
young      26
poor       26
strange    25
gray       22
ready      22
new        21
sure       21
low        21
deep       20
Name: count, dtype: object

In [13]:
verbs = [(w.lemma_) for w in doc if (not (w.is_punct or w.is_space or w.is_stop) and w.pos_=="VERB")]
verbs = pd.DataFrame(verbs, columns=['word'])
verbs = verbs.groupby('word')['word']
verbs = verbs.describe()['count']
verbs = verbs.sort_values(ascending=False)
print("The 20 most common verbs are:")
display(verbs.head(20))

The 20 most common verbs are:


word
say       246
come      189
know      182
think     124
tell      117
see       111
find       97
hear       88
look       87
go         85
take       69
ask        69
leave      68
run        57
turn       57
stand      57
give       54
lie        44
pass       44
follow     44
Name: count, dtype: object

In [14]:
# Times
# times = [(w.root.text) for w in doc.ents if w.label_ == 'TIME']  # good for showing that most action is at night
times = [(w.lemma_) for w in doc.ents if w.label_ == 'TIME']  # shows hours mentioned, e.g. ten o'clock
times = pd.DataFrame(times, columns=['word'])
times = times.groupby('word')['word']
times = times.describe()['count']
times = times.sort_values(ascending=False)
print("The times mentioned are:")
display(times.head(20))

The times mentioned are:


word
the night             11
this morning          10
last night             9
the morning            8
evening                7
night                  7
a few minute           4
that night             4
an hour                3
the evening            3
half an hour           3
morning                3
ten o'clock            3
five or ten minute     2
next morning           2
two hour               2
one night 's           2
every night            2
every evening          2
twenty - four hour     2
Name: count, dtype: object

In [15]:
# Persons
persons = [(w.lemma_) for w in doc.ents if w.label_ == 'PERSON']
persons = pd.DataFrame(persons, columns=['word'])
persons = persons.groupby('word')['word']
persons = persons.describe()['count']
persons = persons.sort_values(ascending=False)
print("The persons mentioned by name are:")
display(persons.head(20))

The persons mentioned by name are:


word
Holmes                 146
Henry                  130
Watson                 113
Stapleton               93
Mortimer                85
Charles                 79
Barrymore               70
Sherlock Holmes         29
Henry Baskerville       24
Coombe Tracey           18
Charles Baskerville     15
Lyons                   11
Selden                  11
Laura Lyons             11
Baskerville              9
Devonshire               9
Lestrade                 9
Cartwright               8
James Mortimer           5
Grimpen Mire             5
Name: count, dtype: object

In [16]:
# Verbs associated with Holmes
holmes_verbs = [(w.sent.root.lemma_) for w in doc if w.text.lower()=="holmes"]
holmes_verbs = pd.DataFrame(holmes_verbs, columns=['word'])
holmes_verbs = holmes_verbs.groupby('word')['word']
holmes_verbs = holmes_verbs.describe()['count']
holmes_verbs = holmes_verbs.sort_values(ascending=False)
print("The 20 most common verbs associated with 'Holmes' are:")
display(holmes_verbs.head(20))

The 20 most common verbs associated with 'Holmes' are:


word
say       52
be        23
look       6
ask        5
have       5
cry        4
think      3
shrug      3
Holmes     3
strike     3
lean       2
follow     2
see        2
do         2
come       2
save       2
lay        2
"          2
sit        2
tell       2
Name: count, dtype: object

## Collocations

Would like to use `collocater` here but there are some issues for me so have used nltk.

In [17]:
# nltk.download('punkt')
# nltk.download('stopwords')

nltk_doc = nltk.Text(nltk.word_tokenize(doc_string))

print(nltk_doc.collocations())

[nltk_data] Downloading package stopwords to /home/chas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sir Henry; Sir Charles; Dr. Mortimer; Sherlock Holmes; Coombe Tracey;
Baskerville Hall; Merripit House; said Holmes; Dr. Watson; Grimpen
Mire; Baker Street; Mr. Holmes; Henry Baskerville; Laura Lyons; Yew
Alley; Mr. Sherlock; Northumberland Hotel; Mrs. Laura; Miss Stapleton;
dear fellow
None


## Concordances and Dispersion Plots

Of words from the book title, key characters and locations.

In [18]:
nltk_doc.concordance('hound')

Displaying 25 of 69 matches:
The Hound of the Baskervilles by Sir Arthur Co
-- Fixing the Nets Chapter 14 -- The Hound of the Baskervilles Chapter 15 -- A 
rrative : -- `` Of the origin of the Hound of the Baskervilles there have been 
and there ran mute behind him such a hound of hell as God forbid should ever be
 great , black beast , shaped like a hound , yet larger than any hound that eve
d like a hound , yet larger than any hound that ever mortal eye has rested upon
ale , my sons , of the coming of the hound which is said to have plagued the fa
ge creature or heard the baying of a hound . The latter question he put to me s
ey were the footprints of a gigantic hound ! '' Chapter 3 The Problem I confess
ark is material . '' `` The original hound was material enough to tug a man 's 
 . `` Of course , I 've heard of the hound ever since I was in the nursery . It
t ? '' `` The peasants say it is the Hound of the Baskervilles calling for its 
er ? '' `` You know the story of the hound ?

In [19]:
nltk_doc.concordance('Stapleton')

Displaying 25 of 93 matches:
ankland , of Lafter Hall , and Mr. Stapleton , the naturalist , there are no ot
ould send him back a new man . Mr. Stapleton , a mutual friend who was much con
the residence of the naturalist -- Stapleton , if I remember right , was his na
thing . There is this naturalist , Stapleton , and there is his sister , who is
ur mutual friend , Mortimer . I am Stapleton , of Merripit House . '' `` Your n
'' said I , `` for I knew that Mr. Stapleton was a naturalist . But how did you
y kind . '' `` Excellent ! '' said Stapleton . `` You are perfectly right to be
ghbours upon the moor . I accepted Stapleton 's invitation , and we turned toge
em more fertile than the rest . '' Stapleton laughed . `` That is the great Gri
ly , throbbing murmur once again . Stapleton looked at me with a curious expres
cross our path , and in an instant Stapleton was rushing with extraordinary ene
d not doubt that this was the Miss Stapleton of whom I had been told , since la
t between b

In [20]:
plt.figure()
nltk_doc.dispersion_plot(['hound','Hound','Baskervilles','Holmes','Watson','Henry','Stapleton'])
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
plt.figure()
nltk_doc.dispersion_plot(['Hall', 'moor', 'Coombe', 'Devonshire', 'Grimpen', 'Merripit'])  # note that 'hall' and 'Hall' are different.
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Conditonal Frequency Distributions

Of key characters and locations over time

In [22]:
plt.figure()
cfd = nltk.ConditionalFreqDist(
           (target, c)
           for c in chapters
           for w in nltk_doc[chapter_word_idx[c-1]:chapter_word_idx[c]]
           for target in ['holmes', 'watson', 'henry', 'mortimer', 'stapleton', 'barrymore']
           if w.lower().startswith(target))
cfd.plot(cumulative=True)
plt.xlabel('Chapter')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
plt.figure()
cfd = nltk.ConditionalFreqDist(
           (target, c)
           for c in chapters
           for w in nltk_doc[chapter_word_idx[c-1]:chapter_word_idx[c]]
           for target in ['hall', 'moor', 'coombe', 'devonshire', 'grimpen', 'merripit']
           if w.lower().startswith(target))
cfd.plot(cumulative=True)
plt.xlabel('Chapter')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …