# Summary of Subject Corpus

## Imports

In [1]:
import json
from urllib.parse import quote

import pandas as pd
import requests

## Parameters

In [2]:
# Base URL of the Corpus-DB API; the request paths used in this notebook are appended to it.
corpus_db_url = "http://corpus-db.org/api"

## Get Data

### List Subjects

In [3]:
# Fetch the subject listing and load it into a two-column frame.
subjects_response = requests.get(corpus_db_url + "/subjects")
# Stop on an HTTP error instead of trying to parse an error page as subject data.
subjects_response.raise_for_status()
subjects = pd.DataFrame(subjects_response.json(), columns=['subject', 'count'])
display(subjects.head(15))  # show the top 15 subjects

Unnamed: 0,subject,count
0,Fiction,1920
1,Short stories,1581
2,Science fiction,1283
3,Adventure stories,766
4,Historical fiction,634
5,Poetry,633
6,Love stories,619
7,",",557
8,English wit and humor -- Periodicals,555
9,Conduct of life -- Juvenile fiction,554


### Filter subjects

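Builds a list of the subject headings that mention 'detective', 'crime' or 'private investigators', excluding off-topic headings and headings that name a non-English language or country. Headings that do not specify a language are assumed to be English.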

In [4]:
# Regex alternations, matched case-insensitively against each subject heading.
desired_subjects = 'detective|crime|private investigators'
undesired_subjects = 'crimean|commercial|case studies|history|biography|against|organized|political'
undesired_languages = 'france|french|chinese|german|argentina|ukraine|dutch|portuguese|philippines'

# One boolean mask per pattern, all computed on the same 'subject' column.
subject_names = subjects['subject']
is_wanted = subject_names.str.contains(desired_subjects, case=False, regex=True)
is_off_topic = subject_names.str.contains(undesired_subjects, case=False, regex=True)
is_other_language = subject_names.str.contains(undesired_languages, case=False, regex=True)

# Keep rows that match the desired pattern and neither exclusion pattern,
# drop rows with missing values, and return the headings as a plain list.
filtered_subjects = (
    subjects[is_wanted & ~is_off_topic & ~is_other_language]
    .dropna()['subject']
    .to_list()
)

display(filtered_subjects)

['Detective and mystery stories',
 'Mystery and detective stories',
 'Detective and mystery stories, English',
 'Private investigators -- England -- Fiction',
 'Women detectives -- Juvenile fiction',
 'Crime',
 'Crime -- Fiction',
 'Detective and mystery stories, American',
 'Crime -- Great Britain',
 'Private investigators -- Fiction',
 'Crime -- England -- London',
 'Crime -- United States',
 'Detectives',
 'Detectives -- Fiction',
 'Detectives -- Juvenile fiction',
 'Detectives -- United States',
 'Crime -- Drama',
 'Crime -- Juvenile fiction',
 'Private investigators -- England -- London -- Fiction',
 'Computer crimes -- Australia',
 'Computer crimes -- United States',
 'Crime -- England -- Fiction',
 'Crime -- England -- London -- Early works to 1800',
 'Crime -- Illinois -- Chicago',
 'Crime -- New York (State) -- New York',
 'Crime -- United States -- Fiction',
 'Crime -- West (U.S.)',
 'Detective and mystery plays',
 'Detective and mystery stories -- Periodicals',
 'Detectives 

### Get metadata for records of all remaining subjects

In [5]:
# Collect the metadata records returned for every retained subject heading.
metadata = []
for subject in filtered_subjects:
    # Percent-encode the heading before placing it in the URL path, so that
    # characters such as '?', '#' or '%' cannot change the meaning of the URL.
    response = requests.get(corpus_db_url + "/subject/" + quote(subject))
    # Stop on an HTTP error instead of silently adding an error payload.
    response.raise_for_status()
    # Extend in place rather than rebuilding the whole list on every iteration.
    metadata.extend(response.json())

print('Got {0} records.'.format(len(metadata)))

Got 1104 records.


### Narrow down by language meta tag

In [6]:
# Retain only the records whose 'languages' field contains 'en'.
filtered_metadata = []
for record in metadata:
    if 'en' in record['languages']:
        filtered_metadata.append(record)

print('Narrowed to {0} records.'.format(len(filtered_metadata)))

Narrowed to 1038 records.


### Narrow down by Library of Congress meta tag

PE - English language
PR - British literature
PS - American literature

In [7]:
# Library of Congress class codes to keep; a record passes when its 'LCC'
# field contains at least one of them.
lcc_codes = ('PE', 'PR', 'PS')
filtered_metadata = [
    record
    for record in filtered_metadata
    if any(code in record['LCC'] for code in lcc_codes)
]

print('Narrowed to {0} records.'.format(len(filtered_metadata)))

Narrowed to 664 records.


### Get Full Text Corpus

In [8]:
def getFulltext(bookID):
    """Fetch the full-text payload for one book from the API.

    Args:
        bookID: Identifier inserted into the ``/id/<bookID>/fulltext`` path.
            It is formatted into the URL, so non-string identifiers are
            accepted as well as strings.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{corpus_db_url}/id/{bookID}/fulltext")
    # Fail loudly on an HTTP error instead of parsing an error page as a text.
    response.raise_for_status()
    return response.json()

# Download the full text for every retained record. Iterate over the records
# themselves: looping over filtered_metadata[0] would walk the field names of
# the first record instead of the identifiers of all records.
# NOTE(review): assumes each record stores its identifier under the 'id' key
# (inferred from the '/id/' request path) -- confirm against an API response.
corpus = [getFulltext(str(record['id'])) for record in filtered_metadata]

# Report the number of texts actually downloaded, not the number of metadata records.
print('Corpus of {0} texts.'.format(len(corpus)))

Corpus of 664 texts.
