# Summary of Subject Corpus

## Imports

In [1]:
import requests
import json
import pandas as pd
import numpy as np

## Parameters

In [2]:
# Base URL of the Corpus-DB REST API; endpoint paths are appended to it.
corpus_db_url = 'http://corpus-db.org/api'

# Author whose books are requested, written as "Surname, Forenames".
author = "Doyle, Arthur Conan"

## Get Data

### List Books

In [3]:
# Columns to discard from the metadata table.
# NOTE(review): the empty-string label is presumably an unnamed index column
# in the API response -- confirm against the raw JSON.
unwanted_columns = ['downloads', 'authoryearofbirth', 'authoryearofdeath', 'formats', '']

# Request the metadata records of every book by the configured author.
response = requests.get(corpus_db_url + f"/author/{author}", timeout=60)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
metadata = pd.DataFrame(response.json())

metadata = (
    metadata
    .replace('', np.nan)   # treat empty strings as missing values
    .dropna(axis=1)        # drop every column that has at least one missing value
    # An unwanted column may already have been removed by dropna above, so
    # labels that are no longer present are ignored rather than raising KeyError.
    .drop(labels=unwanted_columns, axis=1, errors='ignore')
)

# sample() raises if asked for more rows than exist, so cap the preview size.
display(metadata.sample(min(5, len(metadata))))  # show up to 5 random books

print('Got metadata for {0} books.'.format(len(metadata)))

Unnamed: 0,lcsh,languages,LCC,author,id,title,type
25,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",2350.0,His Last Bow: An Epilogue of Sherlock Holmes,Text
68,"{'Blessing and cursing -- Fiction', 'Holmes, S...",['pl'],{'PR'},"Doyle, Arthur Conan",34079.0,Tajemnica Baskerville'ów: dziwne przygody Sher...,Text
57,"{'East and West -- Fiction', 'Egypt -- Social ...",['en'],{'PR'},"Doyle, Arthur Conan",21768.0,"A Desert Drama: Being The Tragedy Of The ""Koro...",Text
8,{'Suburbs -- Great Britain -- Fiction'},['en'],{'PR'},"Doyle, Arthur Conan",356.0,Beyond the City,Text
82,"{'Adventure stories, English', 'France -- Hist...",['fi'],{'PR'},"Doyle, Arthur Conan",49584.0,Napoleonin sotilaan seikkailut,Text


Got metadata for 91 books.


### Narrow down by language meta tag

In [4]:
def _has_language(languages, code='en'):
    """Return True when the language code ``code`` is listed in ``languages``.

    ``languages`` may be a real collection of codes or its text form such as
    "['en', 'fr']" -- which of the two the API delivers is not visible in this
    notebook, so both are handled. Codes are matched whole: 'en' does not match
    a longer code that merely contains it (e.g. 'enm'), and it cannot be formed
    across two neighbouring codes (e.g. 'de' followed by 'nl').
    """
    if isinstance(languages, str):
        # Text form: look for the code as a complete quoted item (or the bare code).
        return languages == code or f"'{code}'" in languages or f'"{code}"' in languages
    return code in languages


# Select rows with a boolean mask; this keeps the original index labels and
# column dtypes. astype(bool) keeps the mask boolean even for an empty table.
is_english = metadata['languages'].map(_has_language).astype(bool)
filtered_metadata = metadata.loc[is_english]

# sample() raises if asked for more rows than exist, so cap the preview size.
display(filtered_metadata.sample(min(5, len(filtered_metadata))))  # show up to 5 random books

print('Narrowed to {0} books.'.format(len(filtered_metadata)))

Unnamed: 0,lcsh,languages,LCC,author,id,title,type
77,"{'Fiction', 'Short stories', 'Detective and my...",['en'],{'PR'},"Doyle, Arthur Conan",40848.0,"The Gully of Bluemansdyke, and Other stories",Text
12,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",834.0,The Memoirs of Sherlock Holmes,Text
53,"{'Short stories', 'Mystery fiction'}",['en'],{'PR'},"Doyle, Arthur Conan",17398.0,The Cabman's Story: The Mysteries of a London ...,Text
81,set(),['en'],set(),"Doyle, Arthur Conan",48320.0,Adventures of Sherlock Holmes: Illustrated,Text
43,"{'Adventure stories, English', 'France -- Hist...",['en'],{'PR'},"Doyle, Arthur Conan",11247.0,The Exploits of Brigadier Gerard,Text


Narrowed to 68 books.


### Narrow down by Library of Congress Classification (LCC) meta tag

- PE - English language
- PR - British literature
- PS - American literature

In [5]:
# Library of Congress Classification classes to keep.
english_lcc_classes = ('PE', 'PR', 'PS')

# Keep a book when its LCC value contains at least one of the wanted classes.
# `in` is used exactly as before: a substring test if the value is text,
# a membership test if it is a collection.
is_english_class = filtered_metadata['LCC'].map(
    lambda lcc: any(lcc_class in lcc for lcc_class in english_lcc_classes)
).astype(bool)  # keeps the mask boolean even for an empty table

# Boolean-mask selection keeps the original index labels and column dtypes.
filtered_metadata = filtered_metadata.loc[is_english_class]

# sample() raises if asked for more rows than exist, so cap the preview size.
display(filtered_metadata.sample(min(5, len(filtered_metadata))))  # show up to 5 random books

print('Narrowed to {0} books.'.format(len(filtered_metadata)))

Unnamed: 0,lcsh,languages,LCC,author,id,title,type
6,"{'Short stories, English', 'Great Britain -- S...",['en'],{'PR'},"Doyle, Arthur Conan",294.0,"The Captain of the Polestar, and Other Tales",Text
41,"{'Sports stories, English', 'Great Britain -- ...",['en'],{'PR'},"Doyle, Arthur Conan",10446.0,"The Green Flag, and Other Stories of War and S...",Text
29,"{'Blessing and cursing -- Fiction', 'Holmes, S...",['en'],{'PR'},"Doyle, Arthur Conan",3070.0,The Hound of the Baskervilles,Text
17,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",2097.0,The Sign of the Four,Text
57,"{'East and West -- Fiction', 'Egypt -- Social ...",['en'],{'PR'},"Doyle, Arthur Conan",21768.0,"A Desert Drama: Being The Tragedy Of The ""Koro...",Text


Narrowed to 57 books.


### Narrow down by Library of Congress Subject Heading

In [6]:
# Keyword that must occur (case-insensitively) in a book's subject headings.
subject_keyword = 'detective'

# Vectorised, case-insensitive literal search in the subject-heading text;
# regex=False makes the keyword a plain substring rather than a pattern.
mentions_keyword = (
    filtered_metadata['lcsh']
    .str.lower()
    .str.contains(subject_keyword, regex=False)
)

# Boolean-mask selection keeps the original index labels and column dtypes.
filtered_metadata = filtered_metadata.loc[mentions_keyword]

# sample() raises if asked for more rows than exist, so cap the preview size.
display(filtered_metadata.sample(min(5, len(filtered_metadata))))  # show up to 5 random books

print('Narrowed to {0} books.'.format(len(filtered_metadata)))

Unnamed: 0,lcsh,languages,LCC,author,id,title,type
25,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",2350.0,His Last Bow: An Epilogue of Sherlock Holmes,Text
4,"{'England -- Fiction', 'Private investigators ...",['en'],{'PR'},"Doyle, Arthur Conan",244.0,A Study in Scarlet,Text
24,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",2349.0,The Adventure of the Devil's Foot,Text
77,"{'Fiction', 'Short stories', 'Detective and my...",['en'],{'PR'},"Doyle, Arthur Conan",40848.0,"The Gully of Bluemansdyke, and Other stories",Text
23,{'Private investigators -- England -- Fiction'...,['en'],{'PR'},"Doyle, Arthur Conan",2348.0,The Disappearance of Lady Frances Carfax,Text


Narrowed to 21 books.


### Get Full Text Corpus

In [7]:
def getFulltext(bookID):
    """Fetch the full-text record of one book from the Corpus-DB API.

    Parameters
    ----------
    bookID : str or number
        Identifier of the book. It is converted with ``str()`` and placed in
        the ``/id/<bookID>/fulltext`` path, so both text ids and numeric ids
        (e.g. ``2350.0``) are accepted.

    Returns
    -------
    object
        The decoded JSON body of the response; its structure is defined by
        the API and is not visible in this notebook.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # str() avoids a TypeError when the id column holds numbers instead of text.
    response = requests.get(corpus_db_url + '/id/' + str(bookID) + '/fulltext', timeout=60)
    response.raise_for_status()  # do not try to parse an error page as a book
    return response.json()

# Download one full-text record per remaining book, in table order.
corpus = list(map(getFulltext, filtered_metadata['id']))

# One record is fetched per metadata row, so this equals the number of rows.
print('Corpus of {0} texts.'.format(len(corpus)))

Corpus of 21 texts.
