# Summary of Subject Corpus

## Imports

In [None]:
import re
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib widget
import seaborn as sns
import spacy
from collocater import collocater
from spacy_readability import Readability
import nltk

## Parameters

In [None]:
# Base URL of the corpus-db REST API; the request cells append endpoint paths to it.
corpus_db_url = "http://corpus-db.org/api"
# Human-readable title of the book being analysed.
book = 'The Hound of The Baskervilles'
# Identifier of the book in corpus-db. It is interpolated into the request URLs
# with an f-string, so as a float it is rendered as "3070.0".
# NOTE(review): presumably that is the form the API expects -- confirm.
book_id = 3070.0  # have to know the id at the moment

## Get Data

### Get Metadata

In [None]:
# Fetch the catalogue record for the book. The timeout stops the notebook from
# hanging on an unresponsive server, and raise_for_status() reports an HTTP
# error immediately instead of failing later while parsing an error page.
metadata_response = requests.get(f"{corpus_db_url}/id/{book_id}", timeout=60)
metadata_response.raise_for_status()

# The decoded JSON is loaded as a single row (index 0).
metadata = pd.DataFrame(metadata_response.json(), index=[0])
# Treat empty strings as missing and drop the columns that have no value.
metadata = metadata.replace('', np.nan).dropna(axis=1)

display(metadata)

### Get Full Text

In [None]:
# Download the full text. The timeout prevents an indefinite hang, and
# raise_for_status() reports an HTTP failure before any JSON parsing is attempted.
corpus_response = requests.get(f"{corpus_db_url}/id/{book_id}/fulltext", timeout=120)
corpus_response.raise_for_status()

# The text of the book is read from the 'text' field of the first list element.
corpus = corpus_response.json()

print('Book is {0} characters long.'.format(len(corpus[0]['text'])))

### Load into SpaCy

In [None]:
# !python -m spacy download en_core_web_lg  # note this doesn't really work properly with pipenvs

# Load the large English model used by every analysis below.
nlp = spacy.load("en_core_web_lg")

# Add collocater pipeline
# collie = collocater.Collocater.loader()  # this adds a lot of processing time for a big text <- can it be sped up?
# nlp.add_pipe(collie)

# Add readability pipeline (appended as the final component)
readability_component = Readability()
nlp.add_pipe(readability_component, last=True)

# Normalise the raw text before parsing: carriage returns and newlines become
# spaces, then runs of spaces are collapsed into a single space.
raw_text = corpus[0]['text']
flattened_text = raw_text.replace('\r', ' ').replace('\n', ' ')
# NOTE(review): "\'" and "'" are the same string in Python, so this replace
# changes nothing as written -- confirm what it was meant to clean up.
flattened_text = flattened_text.replace("\'", "'")
doc_string = re.sub(' +', ' ', flattened_text)

# Parse the whole book in one pass.
doc = nlp(doc_string)

# Peek at the first 11 tokens as a sanity check.
display(doc[:11])

### List of SpaCy properties

In [None]:
# Show the attribute names of the parsed document that do not start with an underscore.
public_doc_attributes = [name for name in dir(doc) if name[:1] != '_']
display(public_doc_attributes)

## High-level Analyses

### Total Number of Words

In [None]:
# The "word" total reported here is the number of tokens in the parsed document.
token_total = len(doc)
print('Total number of words is {0}.'.format(token_total))

### Number of Words Per Chapter

In [None]:
def split_into_chapters(doc, marker="Chapter"):
    """
    Locate chapter boundaries in a tokenised book.

    Parameters:
    doc (spacy.Doc or any iterable of objects with a `.text` attribute):
        the tokens of the book, in reading order
    marker (str): token text that opens a chapter; compared exactly
        (case-sensitive), so "CHAPTER" or "chapter" will not match "Chapter"

    Returns:
    A 1-D numpy array of token offsets: 0, then the offset of every token whose
    text equals `marker`, then the total number of tokens. Consecutive entries
    therefore bracket one section of the book.
    """
    boundaries = [0]  # the book always starts at token 0

    # Count tokens while scanning so that any iterable works and no
    # intermediate list of every token text has to be built.
    token_total = 0
    for token_total, token in enumerate(doc, start=1):
        if token.text == marker:
            boundaries.append(token_total - 1)

    boundaries.append(token_total)  # closing boundary: one past the last token

    return np.array(boundaries)

In [None]:
# Token offsets of every "Chapter" marker, bracketed by 0 and the token total.
all_boundaries = split_into_chapters(doc)
ignore = 16  # ignore the first 16 for tHotB as they're the prelim
chapter_word_idx = all_boundaries[ignore:]
# display(chapter_word_idx)

# Chapters are numbered from 1; each chapter's length is the gap between
# two consecutive boundaries.
chapters = np.arange(1, chapter_word_idx.size)
chapter_word_count = np.diff(chapter_word_idx)
# display(chapters, chapter_word_count)

plt.figure()
sns.barplot(x=chapters, y=chapter_word_count)
plt.xlabel('Chapter')
plt.ylabel('Word Count')
plt.tight_layout()
plt.show()

### Vocabulary

In [None]:
# Lower-cased text of every token, so that "The" and "the" count as one word.
words = [token.text.lower() for token in doc]

vocabulary = set(words)
print('Total number of unique words (vocabulary) is {0}.'.format(len(vocabulary)))

# Distinct-word count for each chapter, using consecutive boundary pairs.
chapter_vocab = np.array([
    len(set(words[start:stop]))
    for start, stop in zip(chapter_word_idx[:-1], chapter_word_idx[1:])
])

# Share of each chapter's tokens that are distinct, as a percentage.
unique_word_percentage = 100*chapter_vocab/chapter_word_count

plt.figure()
sns.barplot(x=chapters, y=unique_word_percentage)
plt.xlabel('Chapter')
plt.ylabel('Unique Word Count/%')
plt.ylim(0,100)
plt.tight_layout()
plt.show()

## Most Common Words of Different Types

In [None]:
# Common Nouns
# Lemma of every noun chunk whose root token is not a stop word; the stop-word
# filter gets rid of common pronouns like 'i', 'he', 'she', etc.
noun_lemmas = [chunk.lemma_ for chunk in doc.noun_chunks if not chunk.root.is_stop]

# Count how often each lemma occurs and order from most to least frequent.
nouns = (
    pd.DataFrame(noun_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The 20 most common nouns are:")
display(nouns.head(20))

In [None]:
# Common Adjectives
# Lemmas of tokens tagged ADJ, skipping punctuation, whitespace and stop words.
adjective_lemmas = [
    token.lemma_
    for token in doc
    if token.pos_ == "ADJ" and not token.is_punct and not token.is_space and not token.is_stop
]

# Count how often each lemma occurs and order from most to least frequent.
adjectives = (
    pd.DataFrame(adjective_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The 20 most common adjectives are:")
display(adjectives.head(20))

In [None]:
# Common Verbs
# Lemmas of tokens tagged VERB, skipping punctuation, whitespace and stop words.
verb_lemmas = [
    token.lemma_
    for token in doc
    if token.pos_ == "VERB" and not token.is_punct and not token.is_space and not token.is_stop
]

# Count how often each lemma occurs and order from most to least frequent.
verbs = (
    pd.DataFrame(verb_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The 20 most common verbs are:")
display(verbs.head(20))

In [None]:
# Times
# times = [(w.root.text) for w in doc.ents if w.label_ == 'TIME']  # good for showing that most action is at night
# Lemmatised text of every entity labelled TIME; shows hours mentioned, e.g. ten o'clock
time_lemmas = [entity.lemma_ for entity in doc.ents if entity.label_ == 'TIME']

# Count how often each expression occurs and order from most to least frequent.
times = (
    pd.DataFrame(time_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The times mentioned are:")
display(times.head(20))

In [None]:
# Persons
# Lemmatised text of every entity labelled PERSON.
person_lemmas = [entity.lemma_ for entity in doc.ents if entity.label_ == 'PERSON']

# Count how often each name occurs and order from most to least frequent.
persons = (
    pd.DataFrame(person_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The persons mentioned by name are:")
display(persons.head(20))

In [None]:
# Verbs associated with Holmes
# For every token spelled "holmes" (any casing), take the lemma of the root of
# the sentence that contains it.
# NOTE(review): the sentence root is not necessarily a verb, nor necessarily
# something Holmes himself does -- treat the result as a rough association.
holmes_root_lemmas = [
    token.sent.root.lemma_ for token in doc if token.text.lower() == "holmes"
]

# Count how often each lemma occurs and order from most to least frequent.
holmes_verbs = (
    pd.DataFrame(holmes_root_lemmas, columns=['word'])
    .groupby('word')['word']
    .describe()['count']
    .sort_values(ascending=False)
)
print("The 20 most common verbs associated with 'Holmes' are:")
display(holmes_verbs.head(20))

## Collocations

I would like to use `collocater` here, but I ran into some issues with it, so NLTK is used instead.

In [None]:
# nltk.download('punkt')
# nltk.download('stopwords')

# Re-tokenise the cleaned text with NLTK (independently of the spaCy parse)
# and wrap the tokens in an nltk.Text for the exploratory helpers used below.
nltk_tokens = nltk.word_tokenize(doc_string)
nltk_doc = nltk.Text(nltk_tokens)

# Report 50 collocations.
nltk_doc.collocations(50)

In [None]:
# collocations = pd.DataFrame(data=np.array(doc._.collocs), columns=['collocation'])
# collocations = collocations.groupby('collocation')['collocation']
# collocations = collocations.describe()['count']
# collocations = collocations.sort_values(ascending=False)

# display(collocations.head(50).index.values)

## Concordances and Dispersion Plots

Of words from the book title, key characters and locations.

In [None]:
# Show the occurrences of the title word 'hound' together with their surrounding context.
nltk_doc.concordance('hound')

In [None]:
# Show the occurrences of the character name 'Stapleton' together with their surrounding context.
nltk_doc.concordance('Stapleton')

In [None]:
# Title words and key characters; both casings of 'hound' are listed separately.
title_and_character_words = ['hound','Hound','Baskervilles','Holmes','Watson','Henry','Stapleton']

plt.figure()
nltk_doc.dispersion_plot(title_and_character_words)
plt.tight_layout()
plt.show()

In [None]:
# Key locations; note that 'hall' and 'Hall' are different.
location_words = ['Hall', 'moor', 'Coombe', 'Devonshire', 'Grimpen', 'Merripit']

plt.figure()
nltk_doc.dispersion_plot(location_words)
plt.tight_layout()
plt.show()

## Conditional Frequency Distributions

Of key characters and locations over time

In [None]:
# Lower-case prefixes of the character names to track.
character_prefixes = ['holmes', 'watson', 'henry', 'mortimer', 'stapleton', 'barrymore']

plt.figure()
# Count, per chapter, the tokens that start with each prefix.
# chapter_word_idx holds spaCy token offsets, so the chapters are sliced from
# the spaCy doc. Slicing nltk_doc with those offsets would misplace the chapter
# boundaries, because NLTK tokenises the text differently.
cfd = nltk.ConditionalFreqDist(
    (target, c)
    for c in chapters
    for token_text in (t.text.lower() for t in doc[chapter_word_idx[c-1]:chapter_word_idx[c]])
    for target in character_prefixes
    if token_text.startswith(target)
)
cfd.plot(cumulative=True)
plt.xlabel('Chapter')
plt.show()

In [None]:
# Lower-case prefixes of the location names to track. Because this is a prefix
# match, longer words that begin the same way (e.g. "moorland") are counted too.
location_prefixes = ['hall', 'moor', 'coombe', 'devonshire', 'grimpen', 'merripit']

plt.figure()
# Count, per chapter, the tokens that start with each prefix.
# chapter_word_idx holds spaCy token offsets, so the chapters are sliced from
# the spaCy doc. Slicing nltk_doc with those offsets would misplace the chapter
# boundaries, because NLTK tokenises the text differently.
cfd = nltk.ConditionalFreqDist(
    (target, c)
    for c in chapters
    for token_text in (t.text.lower() for t in doc[chapter_word_idx[c-1]:chapter_word_idx[c]])
    for target in location_prefixes
    if token_text.startswith(target)
)
cfd.plot(cumulative=True)
plt.xlabel('Chapter')
plt.show()

## Readability

### Average Sentence Lengths

In [None]:
# One frame for the whole book (labelled 'all') followed by one per chapter;
# lengths are measured in tokens per sentence.
sentence_frames = [
    pd.DataFrame(data=[len(s) for s in doc.sents], columns=['sentence_length']).assign(chapter='all')
]

for c in chapters:
    # Turn the chapter's token span into a standalone Doc before reading its sentences.
    chapter_doc = doc[chapter_word_idx[c-1]:chapter_word_idx[c]].as_doc()
    chapter_frame = pd.DataFrame(data=[len(s) for s in chapter_doc.sents], columns=['sentence_length'])
    # Zero-padded labels ('01', '02', ...) so the chapters sort in reading order.
    sentence_frames.append(chapter_frame.assign(chapter=f'{c:02}'))

sentence_lengths = pd.concat(sentence_frames, ignore_index=True)

display(sentence_lengths.groupby('chapter').describe().sort_values('chapter'))

sns.catplot(data=sentence_lengths, x='chapter', y='sentence_length', kind='violin', width=1, inner=None, cut=0, aspect=2)
plt.show()

### Average Word Length

In [None]:
# One frame for the whole book (labelled 'all') followed by one per chapter;
# the length recorded for each token is len(token).
word_frames = [
    pd.DataFrame(data=[len(w) for w in doc], columns=['word_length']).assign(chapter='all')
]

for c in chapters:
    chapter_span = doc[chapter_word_idx[c-1]:chapter_word_idx[c]]
    chapter_frame = pd.DataFrame(data=[len(w) for w in chapter_span], columns=['word_length'])
    # Zero-padded labels ('01', '02', ...) so the chapters sort in reading order.
    word_frames.append(chapter_frame.assign(chapter=f'{c:02}'))

word_lengths = pd.concat(word_frames, ignore_index=True)

display(word_lengths.groupby('chapter').describe().sort_values('chapter'))

sns.catplot(data=word_lengths, x='chapter', y='word_length', kind='violin', width=1, inner=None, cut=0, aspect=2)
plt.show()

### Readability Scores

In [None]:
# Whole-book readability scores, read from the custom `doc._` attributes.
print('Flesch-Kincaid Grade Level for the book is {0}'.format(doc._.flesch_kincaid_grade_level))
print('Flesch-Kincaid Reading Ease for the book  is {0}'.format(doc._.flesch_kincaid_reading_ease))
# The metric is called "Dale-Chall"; the label was previously misspelt in the output.
print('Dale-Chall for the book  is {0}'.format(doc._.dale_chall))
print('SMOG for the book  is {0}'.format(doc._.smog))
print('Coleman-Liau Index for the book  is {0}'.format(doc._.coleman_liau_index))
print('Automated Readability Index for the book  is {0}'.format(doc._.automated_readability_index))
print('FORCAST for the book  is {0}'.format(doc._.forcast))

In [None]:
# Names of the readability attributes read from each chapter's `._` namespace;
# they double as the column names of the score table.
readability_metrics = [
    'flesch_kincaid_grade_level',
    'flesch_kincaid_reading_ease',
    'dale_chall',
    'smog',
    'coleman_liau_index',
    'automated_readability_index',
    'forcast',
]

# `np.float` was only an alias of the builtin `float` and has been removed from
# NumPy, so the builtin is used directly.
scores = pd.DataFrame(index=chapters, columns=readability_metrics, dtype=float)

for c in chapters:
    # Turn the chapter's token span into a standalone Doc so the document-level
    # readability attributes are computed for that chapter only.
    chap = doc[chapter_word_idx[c-1]:chapter_word_idx[c]].as_doc()
    for metric in readability_metrics:
        scores.loc[c, metric] = getattr(chap._, metric)

# Reshape to long form (chapter, method, score) for faceted plotting.
scores = scores.reset_index()
scores = scores.rename(columns={"index": "chapter"})

scores = scores.melt(id_vars='chapter',var_name='method',value_name='score')

# One panel per metric; the y-axes are independent because the scales differ.
grid = sns.FacetGrid(scores, col="method", col_wrap=4, height=3, sharex=True, sharey=False)
grid.map(sns.lineplot, "chapter", "score")
grid.fig.tight_layout()
plt.show()

# Ratio of Different POS Across Chapters

In [None]:
# Coarse part-of-speech tags to compare. The tag for proper nouns is 'PROPN';
# the previous 'PNOUN' never matched any token, so that column stayed at zero.
pos_tags = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']

# Start from an integer table of zeros, which avoids relying on fillna() to
# convert an empty object-typed frame into numbers.
pos = pd.DataFrame(0, index=chapters, columns=pos_tags)

for c in chapters:
    # Tally every tag in the chapter in one go, then keep only the tracked tags
    # (tags that never occur in the chapter are filled with 0).
    chapter_tags = pd.Series([w.pos_ for w in doc[chapter_word_idx[c-1]:chapter_word_idx[c]]], dtype=object)
    pos.loc[c] = chapter_tags.value_counts().reindex(pos_tags, fill_value=0)

# Convert the counts to percentages of the tracked tags within each chapter,
# so every stacked bar sums to 100.
pos = 100*pos.div(pos.sum(axis=1), axis=0)
pos.plot(kind='bar', stacked=True)
plt.ylabel('% of Words')
plt.xlabel('Chapter')
plt.show()

## Title Word Occurrence Per Chapter

In [None]:
# Words from the book title to count. Matching is on the exact lower-cased
# token text, so inflected forms such as "hounds" or "baskervilles" are not counted.
title_words = ['hound', "baskerville"]

# Start from an integer table of zeros, which avoids relying on fillna() to
# convert an empty object-typed frame into numbers.
occurences = pd.DataFrame(0, index=chapters, columns=title_words)

for c in chapters:
    for w in doc[chapter_word_idx[c-1]:chapter_word_idx[c]]:
        lowered = w.text.lower()
        if lowered in title_words:
            occurences.loc[c, lowered] += 1

occurences.plot(kind='bar')
# The bars are raw counts, not percentages, so the axis is labelled accordingly.
plt.ylabel('Occurrences')
plt.xlabel('Chapter')
plt.show()


# Use of Time Words Throughout Chapters

In [None]:
# Time of Day
times_of_day = ['dawn','morning','day','noon','afternoon','evening','dusk','night']

# Collect the rows in a plain list and build the frame once: DataFrame.append
# has been removed from pandas.
time_rows = []

for w in doc.ents:
    if w.label_ != "TIME":
        continue
    # Map the entity's start offset to a chapter number via the chapter boundaries;
    # 0 means the entity lies before the first boundary (the prelims) and is skipped.
    c = np.digitize(w.start, chapter_word_idx)
    if c > 0 and w.root.lemma_ in times_of_day:
        time_rows.append({'chapter': c, 'time': w.root.lemma_})

time_of_day = pd.DataFrame(time_rows, columns=['chapter','time'])

# Count mentions per (chapter, time word), then melt back to long form for plotting.
time_of_day = time_of_day.pivot_table(index='chapter',columns='time', aggfunc=len).reset_index().melt(id_vars='chapter',var_name='word',value_name='count')

g = sns.catplot(data=time_of_day, x='chapter', y='count', kind='bar', col='word', col_order=times_of_day, col_wrap=3, height=3)
plt.show()

In [None]:
# Hour of Day
numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve"]
hours = [w + " o'clock" for w in numbers]
numeric_hours = [f"{w} o'clock" for w in np.arange(1,13)]

# One pattern per hour, matching either the spelled-out or the numeric form.
# The leading word boundary stops "11 o'clock" from also counting as
# "1 o'clock" (and "12 o'clock" as "2 o'clock"), which a plain substring test
# allows. Matching ignores case so that "Ten o'clock" is counted as well.
hour_patterns = [
    re.compile(rf"\b(?:{re.escape(h)}|{re.escape(n)})", re.IGNORECASE)
    for h, n in zip(hours, numeric_hours)
]

# Collect the rows in a plain list and build the frame once: DataFrame.append
# has been removed from pandas.
hour_rows = []

for w in doc.ents:
    if w.label_ != "TIME":
        continue
    # Map the entity's start offset to a chapter number via the chapter boundaries;
    # 0 means the entity lies before the first boundary (the prelims) and is skipped.
    c = np.digitize(w.start, chapter_word_idx)
    if c > 0:
        for h, pattern in zip(hours, hour_patterns):
            if pattern.search(w.text):
                # Both spellings are recorded under the spelled-out label.
                hour_rows.append({'chapter': c, 'time': h})

hour_of_day = pd.DataFrame(hour_rows, columns=['chapter','time'])

# Count mentions per (chapter, hour), then melt back to long form for plotting.
hour_of_day = hour_of_day.pivot_table(index='chapter',columns='time', aggfunc=len).reset_index().melt(id_vars='chapter',var_name='word',value_name='count')

g = sns.catplot(data=hour_of_day, x='chapter', y='count', kind='bar', col='word', col_order=hours, col_wrap=3, height=3)
plt.show()

# Use SpaCy's Similarity To Investigate Chapter Difference

In [None]:
# Build each chapter's standalone Doc once. Previously the second chapter of
# every pair was converted again inside the inner loop, i.e. n*n conversions
# for n chapters.
chapter_docs = [doc[chapter_word_idx[c-1]:chapter_word_idx[c]].as_doc() for c in chapters]

# Entry [i, j] is the similarity between chapter i+1 and chapter j+1.
similarity_matrix = np.zeros((len(chapters),len(chapters)))

for i, chap1 in enumerate(chapter_docs):
    for j, chap2 in enumerate(chapter_docs):
        similarity_matrix[i, j] = chap1.similarity(chap2)

plt.figure()
sns.heatmap(similarity_matrix)
plt.show()