# Summary of Subject Corpus

## Imports

In [None]:
import requests
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import trange, tqdm
import spacy

## Parameters

In [None]:
# Base URL of the Corpus-DB API; the cells below append endpoint paths
# such as "/subjects" and "/subject/<name>" to it.
corpus_db_url = "http://corpus-db.org/api"

## Get Data

### List Subjects

In [None]:
# Download the subject listing and load it into a two-column table.
subjects_response = requests.get(corpus_db_url + "/subjects", timeout=5)
subject_rows = json.loads(subjects_response.text)
subjects = pd.DataFrame(subject_rows, columns=['subject', 'count'])

# Preview the first 15 rows of the table.
display(subjects.head(15))

### Filter subjects

Builds the list of subject headings to combine: every 'detective' and 'crime' subject that is in English (or whose language is not specified, and is therefore assumed to be English).

In [None]:
# Case-insensitive regex alternations: what to keep, and what to throw out.
desired_subjects = 'detective|crime|private investigators'
undesired_subjects = 'crimean|commercial|case studies|history|biography|against|organized|political'
undesired_languages = 'france|french|chinese|german|argentina|ukraine|dutch|portuguese|philippines'

# One boolean mask per pattern, evaluated against the subject heading text.
subject_names = subjects['subject']
is_desired = subject_names.str.contains(desired_subjects, case=False, regex=True)
is_off_topic = subject_names.str.contains(undesired_subjects, case=False, regex=True)
is_other_language = subject_names.str.contains(undesired_languages, case=False, regex=True)

# `.where` blanks out (NaN) every row failing a mask; `dropna` then removes those rows.
filtered_subjects = (
    subjects
    .where(is_desired)
    .where(~is_off_topic)
    .where(~is_other_language)
    .dropna()['subject']
    .to_list()
)

display(filtered_subjects)

### Get metadata for records of all remaining subjects

In [None]:
# Download the metadata records of every selected subject and pool them in one list.
metadata = []
for subject in tqdm(filtered_subjects):
    data = None
    # Each request uses a 1-second timeout, so allow up to 10 attempts per subject.
    # A bounded `for` (rather than `while data is None`) also guarantees termination
    # when the server answers with a JSON `null` body.
    for count in range(10):
        try:
            response = requests.get(corpus_db_url + f"/subject/{subject}", timeout=1)
            data = json.loads(response.text)
        except (requests.exceptions.RequestException, ValueError):
            # Network error/timeout, or a body that is not valid JSON
            # (json.JSONDecodeError is a ValueError subclass). Catching only these
            # keeps Ctrl-C (KeyboardInterrupt) working, unlike a bare `except:`.
            print('Failed, reattempting.')
        else:
            break

    if not isinstance(data, list):
        # Every attempt failed, or the payload was not a list of records:
        # skip this subject instead of crashing on `metadata + None`.
        print(f'No records retrieved for subject {subject!r}, skipping.')
        continue

    metadata.extend(data)

print('Got {0} records.'.format(len(metadata)))

### Narrow down by language meta tag

In [None]:
# Keep only the records whose 'languages' field contains 'en'.
# NOTE(review): if 'languages' is a string rather than a list, this is a
# substring test and may match more than the exact code 'en' - confirm.
filtered_metadata = list(
    filter(lambda record: 'en' in record['languages'], metadata)
)

print('Narrowed to {0} records.'.format(len(filtered_metadata)))

### Narrow down by Library of Congress meta tag

PE - English language
PR - British literature
PS - American literature

In [None]:
# Keep only the records whose 'LCC' field contains at least one of the
# wanted Library of Congress classes (PE, PR or PS).
filtered_metadata = [
    record
    for record in filtered_metadata
    if any(lcc_class in record['LCC'] for lcc_class in ('PE', 'PR', 'PS'))
]

print('Narrowed to {0} records.'.format(len(filtered_metadata)))

### Get Full Text Corpus

In [None]:
# Restore the corpus texts and their parallel label/author/title lists from the
# local cache file instead of downloading them again.
# NOTE: unpickling can execute arbitrary code - only load a "save.p" that you
# produced yourself.
with open("save.p", "rb") as saved_corpus:  # `with` closes the file handle afterwards
    (corpus, corpus_labels, corpus_authors, corpus_titles) = pickle.load(saved_corpus)

In [None]:
# def getFulltext(bookID):
#     data = None
#     count = 0
#     while count<100 and data is None:
#         try:
#             data = json.loads(requests.get(corpus_db_url + '/id/' + bookID + '/fulltext', timeout=1).text)
#         except Exception as e:
#             print(f'Failed due to timeout, reattempting ({count}).')
#             count = count + 1
#             pass
#         if isinstance(data,list) and len(data)>0 and isinstance(data[0],dict):
#             return data[0]['text']

# corpus = []
# corpus_labels = []
# corpus_authors = []
# corpus_years = []
# corpus_titles = []
# for book in tqdm(filtered_metadata):
# #     print('Collecting book: {0}'.format(book['title']))
#     data = getFulltext(book['id'])
#     if data is not None:
#         for subject in eval(book['lcsh'].lower()):
#             if 'detective' in subject or 'title' in subject:
#                 corpus_titles.append(book['author'])
#                 corpus_authors.append(book['author'])
#                 corpus.append(data)
#                 corpus_labels.append('detective')
#                 break
#             elif 'crime' in subject:
#                 corpus_titles.append(book['author'])
#                 corpus_authors.append(book['author'])
#                 corpus.append(data)
#                 corpus_labels.append('crime')
#                 break
# #             print('Ignored: {}'.format(subject))
# #     else:
# #         print('No data')

# print('Corpus of {0} texts.'.format(len(corpus)))

In [None]:
# pickle.dump((corpus, corpus_labels, corpus_authors, corpus_titles), open( "save.p", "wb" ))
# print(len(corpus),len(corpus_labels),len(corpus_authors), len(corpus_titles))

## Stylometry - Splitting Genres into Authors Based on Style

### Using SciKit Learn's PCA and Term Frequency

In [None]:
# Term-frequency features only (IDF weighting switched off), capped at 1000 features.
tfidf = TfidfVectorizer(use_idf=False, max_features=1000)
# toarray() gives a plain ndarray; todense() returns an np.matrix, which recent
# scikit-learn releases refuse to accept as estimator input.
tf = tfidf.fit_transform(corpus).toarray()
print(tf.shape)

# Project every book onto the first two principal components.
pca = PCA(n_components=2)
pcaOut = pca.fit_transform(tf)

plt.figure(figsize=[15,15])
xs, ys = pcaOut[:,0], pcaOut[:,1]
# One scatter call per book, each point annotated with its author.
for x, y, author in zip(xs, ys, corpus_authors):
    plt.scatter(x, y)
#     (to annotate with the genre label instead, zip over corpus_labels)
    plt.annotate(author, (x, y))
plt.show()

### Using SciKit Learn's PCA and Term Frequency - Inverse Document Frequency

In [None]:
# TF-IDF features (IDF weighting switched on), capped at 1000 features.
tfidf = TfidfVectorizer(use_idf=True, max_features=1000)
# toarray() gives a plain ndarray; todense() returns an np.matrix, which recent
# scikit-learn releases refuse to accept as estimator input.
tf = tfidf.fit_transform(corpus).toarray()
# print(tfidf.get_feature_names()[:10])

# Project every book onto the first two principal components.
pca = PCA(n_components=2)
pcaOut = pca.fit_transform(tf)

plt.figure(figsize=[15,15])
xs, ys = pcaOut[:,0], pcaOut[:,1]
# One scatter call per book, each point annotated with its author.
for x, y, author in zip(xs, ys, corpus_authors):
    plt.scatter(x, y)
#     (to annotate with the genre label instead, zip over corpus_labels)
    plt.annotate(author, (x, y))
plt.show()

### Using SpaCy's Similarity

Takes a very long time to run and is not very informative.
The full corpus cannot be converted to spaCy docs because of memory limits.

In [None]:
# (nlp, corpus_docs) = pickle.load(open( "savedocs.p", "rb" ))

In [None]:
# Load spaCy's large English pipeline.
nlp = spacy.load("en_core_web_lg")

# Raise the maximum text length the pipeline accepts to 1.5 million
# (presumably because some books exceed spaCy's default limit - confirm).
nlp.max_length = 1_500_000

# Prune the vocabulary's vector table down to 10,000 vectors
# (presumably to save memory - confirm).
nlp.vocab.prune_vectors(10_000)

In [None]:
# # premake docs
# corpus_docs = []
# for book in trange(len(corpus)):
#     corpus_docs.append(nlp(corpus[book]))
#     corpus[book] = None  # for memory reasons
    
# del corpus  # for memory reasons

In [None]:
# pickle.dump((nlp,corpus_docs), open( "savedocs.p", "wb" ))

In [None]:
# book_similarities = np.zeros((len(corpus_docs),len(corpus_docs)))

# for idx1,doc1 in enumerate(corpus_docs):
#     for idx2,doc2 in enumerate(corpus_docs):
#         if idx1==idx2:
#             book_similarities[idx1, idx2] = 1.0
#         elif idx2>idx1:
#             book_similarities[idx1, idx2] = doc1.similarity(doc2)
#         else:
#             book_similarities[idx1, idx2] = book_similarities[idx2, idx1]

# ranked_order = book_similarities[0,:].argsort()
# book_similarities = book_similarities[ranked_order,:]

# plt.figure(figsize=(30,30))
# sns.heatmap(book_similarities)
# ranked_authors = [corpus_authors[idx] for idx in ranked_order]
# plt.yticks(np.arange(len(corpus_docs)),ranked_authors, rotation='horizontal')
# plt.xticks(np.arange(len(corpus_docs)),corpus_authors[:100], rotation='vertical')
# plt.show()

## Sentence Tree Averaging

From https://github.com/JonathanReeve/james-sentence

In [None]:
class sentenceStats():
    """Shape statistics of the syntactic trees of every sentence in a document.

    Attributes set by the constructor:
        doc: the document that was passed in.
        sents: list of the document's sentences (``doc.sents``).
        vectors: one level-count vector per sentence (see ``countLevels``);
            they keep their natural, unpadded lengths.
        maxes: the largest entry of each sentence's vector.
        averageSent: element-wise mean of all vectors (see ``averageVectors``).
    """

    def __init__(self, doc):
        """ Takes a SpaCy document as input. """
        self.doc = doc
        self.sents = list(doc.sents)
        self.vectors = [self.countLevels(sent) for sent in self.sents]
        self.maxes = [max(vec) for vec in self.vectors]
        self.averageSent = self.averageVectors(self.vectors)

    def countLevels(self, sent):
        """ Create a numeric representation vector of a syntactic tree by counting the
        numbers of child nodes at each level of the tree.

        The first entry is always 1 (the root), the second is the number of
        children of the root, and so on level by level. Whenever the root has
        children the vector ends with a 0, recorded for the level below the
        deepest nodes; a childless root yields ``[1, 0]``.
        """
        level = list(sent.root.children)
        counts = [1, len(level)]
        while level:
            # Step down one level: gather the children of every node on this level.
            level = [child for node in level for child in node.children]
            counts.append(len(level))
        return counts

    def averageVectors(self, vectorList):
        """ Does an element-wise average for all the vectors in a list.

        Shorter vectors are treated as if zero-padded up to the longest one.
        The input vectors themselves are left untouched. Returns a 1-D float
        array; an empty ``vectorList`` yields an empty array.
        """
        if len(vectorList) == 0:
            return np.zeros(0)
        maxLength = max(len(vector) for vector in vectorList)
        # Copy into a zero-filled matrix rather than appending zeros to the
        # caller's lists, so that self.vectors is not modified as a side effect.
        padded = np.zeros((len(vectorList), maxLength))
        for row, vector in enumerate(vectorList):
            padded[row, :len(vector)] = vector
        return padded.mean(axis=0)  # average each column

    def plotAverageSent(self):
        """ Draw the average sentence vector as a bar chart. """
        pd.Series(self.averageSent).plot(kind="bar")


def getVector(doc):
    """Summarise a parsed document's sentence trees.

    Returns a 2-tuple: the document's average sentence vector, and the
    largest value found among the per-sentence maxima.
    """
    sentence_stats = sentenceStats(doc)
    overall_max = np.max(sentence_stats.maxes)
    return sentence_stats.averageSent, overall_max

In [None]:
# Parse every book and collect its average sentence vector and overall maximum.
sentence_tree_averages = []
sentence_tree_max = []

for text in tqdm(corpus):
    average_vector, overall_max = getVector(nlp(text))
    sentence_tree_averages.append(average_vector)
    sentence_tree_max.append(overall_max)

# One row per book, then transposed; cells that end up missing are set to 0.
df = (
    pd.DataFrame(sentence_tree_averages, index=corpus_titles)
    .T
    .fillna(0)
)
# NOTE(review): after the transpose the rows are tree levels, not books, so
# assigning one author per book here looks like it will raise a length
# mismatch unless the two counts happen to coincide - confirm the intended
# orientation.
df['author'] = corpus_authors