In [1]:
import requests
import json
import pandas as pd
import numpy as np
import time
import helper_functions as hf
from helper_functions import *
import datetime

In [30]:
payload = {'userId':'1'}
a = requests.get('https://api.readrr.app/api/datasciencetogetasecretandpineapplepizzaandbrocolli/2')

In [32]:
b = json.loads(a.text)

In [33]:
aggregate(b,'title')

['Hooked',
 'Tiny Habits',
 'Gödel, Escher, Bach',
 'WTF?',
 'Whiplash',
 'Whiplash',
 'The Goal',
 'Measure What Matters',
 "Man's Search For Meaning"]

In [4]:
from importlib import reload
from helper_functions import *

## Obtain NYT list information
I'm trying to get bestseller information from the nyt API. They have a variety of lists, and metadata on each list that they have. The metadata is stored in the '/lists/names.json' endpoint. Lets view this metadata first (it will help with looping functionality later when we need details from each sub-list) 

### Getting the raw information

In [5]:
from secrets import NYT_API_KEY

In [6]:
# Hit the API endpoint to get the list of all the lists avalable through the api
book_lists = requests.get('https://api.nytimes.com/svc/books/v3/lists/names.json?api-key=' + NYT_API_KEY)

In [62]:
# storing list names from API response
listoflists = json.loads(book_lists.text)['results']
listoflists

[{'list_name': 'Combined Print and E-Book Fiction',
  'display_name': 'Combined Print & E-Book Fiction',
  'list_name_encoded': 'combined-print-and-e-book-fiction',
  'oldest_published_date': '2011-02-13',
  'newest_published_date': '2020-03-29',
  'updated': 'WEEKLY'},
 {'list_name': 'Combined Print and E-Book Nonfiction',
  'display_name': 'Combined Print & E-Book Nonfiction',
  'list_name_encoded': 'combined-print-and-e-book-nonfiction',
  'oldest_published_date': '2011-02-13',
  'newest_published_date': '2020-03-29',
  'updated': 'WEEKLY'},
 {'list_name': 'Hardcover Fiction',
  'display_name': 'Hardcover Fiction',
  'list_name_encoded': 'hardcover-fiction',
  'oldest_published_date': '2008-06-08',
  'newest_published_date': '2020-03-29',
  'updated': 'WEEKLY'},
 {'list_name': 'Hardcover Nonfiction',
  'display_name': 'Hardcover Nonfiction',
  'list_name_encoded': 'hardcover-nonfiction',
  'oldest_published_date': '2008-06-08',
  'newest_published_date': '2020-03-29',
  'updated': '

### Extracting the date and list name information
One way to extract data and convert dates to datetime format

In [32]:
# aggregate all the published date values
str_dates = aggregate(listoflists,'oldest_published_date')

In [33]:
# Function to turn the string formatted dates to naive datetime.date representations
def str_to_date(val):
    a = val.split('-')
    b = []
    for i in a:
        b.append(int(i))
    return datetime.date(b[0],b[1],b[2])

In [34]:
# Store datetimes in a new list
datetimes = []
for i in str_dates:
    datetimes.append(str_to_date(i))

In [35]:
# The oldest entry
min(datetimes)

datetime.date(2008, 6, 8)

In [36]:
# Get list names 
list_names = aggregate(listoflists,'list_name')

In [17]:
# zip list name and oldest published dates for display purposes
list(zip(a,b))

[('2011-02-13', 'Combined Print and E-Book Fiction'),
 ('2011-02-13', 'Combined Print and E-Book Nonfiction'),
 ('2008-06-08', 'Hardcover Fiction'),
 ('2008-06-08', 'Hardcover Nonfiction'),
 ('2008-06-08', 'Trade Fiction Paperback'),
 ('2008-06-08', 'Mass Market Paperback'),
 ('2008-06-08', 'Paperback Nonfiction'),
 ('2011-02-13', 'E-Book Fiction'),
 ('2011-02-13', 'E-Book Nonfiction'),
 ('2008-06-08', 'Hardcover Advice'),
 ('2008-06-08', 'Paperback Advice'),
 ('2013-04-28', 'Advice How-To and Miscellaneous'),
 ('2009-03-15', 'Hardcover Graphic Books'),
 ('2009-03-15', 'Paperback Graphic Books'),
 ('2009-03-15', 'Manga'),
 ('2011-02-13', 'Combined Print Fiction'),
 ('2011-02-13', 'Combined Print Nonfiction'),
 ('2008-06-08', 'Chapter Books'),
 ('2012-12-16', 'Childrens Middle Grade'),
 ('2015-08-30', 'Childrens Middle Grade E-Book'),
 ('2015-08-30', 'Childrens Middle Grade Hardcover'),
 ('2015-08-30', 'Childrens Middle Grade Paperback'),
 ('2008-06-08', 'Paperback Books'),
 ('2008-06-0

### Alternativley we can just read the dictionary into a pandas dataframe

In [41]:
lists_df = pd.DataFrame(listoflists)
lists = lists_df['list_name_encoded'].to_list()

In [59]:
# function to convert the str formatted values to datetimes
def cols_to_datetime(df, list_of_cols):
    for i in list_of_cols:
        try:
            df[i]=pd.to_datetime(df[i])
        except KeyError:
            print(f'{i} is not a key in the df')
        except ValueError:
            print(f"{i} isn't in datetime format")
    pass

In [60]:
cols_to_datetime(lists_df,['list_name','oldest_published_date'])

list_name isn't in datetime format


In [38]:
lists[:5]

['combined-print-and-e-book-fiction',
 'combined-print-and-e-book-nonfiction',
 'hardcover-fiction',
 'hardcover-nonfiction',
 'trade-fiction-paperback']

## Get information from NYT API
Here we retreive information from a specific list. the current hardcover bestsellers.

In [64]:
a = requests.get('https://api.nytimes.com/svc/books/v3/lists/current/hardcover-fiction.json?api-key='+NYT_API_KEY)

In [67]:
b = json.loads(a.text)

In [77]:
# Overview of heirarchy
print('Top Level:\n',b.keys(),'\n')
print('Results:\n',b['results'].keys(),'\n')
print('Books sample:\n',b['results']['books'][0].keys())

Top Level:
 dict_keys(['status', 'copyright', 'num_results', 'last_modified', 'results']) 

Results:
 dict_keys(['list_name', 'list_name_encoded', 'bestsellers_date', 'published_date', 'published_date_description', 'next_published_date', 'previous_published_date', 'display_name', 'normal_list_ends_at', 'updated', 'books', 'corrections']) 

Books sample:
 dict_keys(['rank', 'rank_last_week', 'weeks_on_list', 'asterisk', 'dagger', 'primary_isbn10', 'primary_isbn13', 'publisher', 'description', 'price', 'title', 'author', 'contributor', 'contributor_note', 'book_image', 'book_image_width', 'book_image_height', 'amazon_product_url', 'age_group', 'book_review_link', 'first_chapter_link', 'sunday_review_link', 'article_chapter_link', 'isbns', 'buy_links', 'book_uri'])


In [78]:
# Most of the relevant information is in books:
b['results']['books'][0]

{'rank': 1,
 'rank_last_week': 0,
 'weeks_on_list': 1,
 'asterisk': 0,
 'dagger': 0,
 'primary_isbn10': '0805096604',
 'primary_isbn13': '9780805096606',
 'publisher': 'Holt',
 'description': 'The third book in the Wolf Hall trilogy. After Anne Boleyn’s execution, Thomas Cromwell’s enemies assemble.',
 'price': 0,
 'title': 'THE MIRROR & THE LIGHT',
 'author': 'Hilary Mantel',
 'contributor': 'by Hilary Mantel',
 'contributor_note': '',
 'book_image': 'https://s1.nyt.com/du/books/images/9780805096606.jpg',
 'book_image_width': 329,
 'book_image_height': 500,
 'amazon_product_url': 'https://www.amazon.com/dp/0805096604?tag=NYTBSREV-20&tag=NYTBS-20',
 'age_group': '',
 'book_review_link': '',
 'first_chapter_link': '',
 'sunday_review_link': '',
 'article_chapter_link': '',
 'isbns': [{'isbn10': '0805096604', 'isbn13': '9780805096606'},
  {'isbn10': '1250773261', 'isbn13': '9781250773265'}],
 'buy_links': [{'name': 'Amazon',
   'url': 'https://www.amazon.com/dp/0805096604?tag=NYTBSREV-20

In [43]:
books = []
for i in b['results']['books']:
    books.append(i['title'])

In [44]:
books

['WHERE THE CRAWDADS SING',
 'AMERICAN DIRT',
 'THE SILENT PATIENT',
 'ONE MINUTE OUT',
 'GOLDEN IN DEATH',
 'THE DUTCH HOUSE',
 'SUCH A FUN AGE',
 'A LONG PETAL OF THE SEA',
 'THE GUARDIANS',
 'THE GIVER OF STARS',
 'THE SUN DOWN MOTEL',
 'CROOKED RIVER',
 'THE OTHER MRS.',
 'THE HOLDOUT',
 'DEAR EDWARD']

##  Getting all lists from a time period

In [30]:
lists_df.head()

Unnamed: 0,display_name,list_name,list_name_encoded,newest_published_date,oldest_published_date,updated
0,Combined Print & E-Book Fiction,Combined Print and E-Book Fiction,combined-print-and-e-book-fiction,2020-03-01,2011-02-13,WEEKLY
1,Combined Print & E-Book Nonfiction,Combined Print and E-Book Nonfiction,combined-print-and-e-book-nonfiction,2020-03-01,2011-02-13,WEEKLY
2,Hardcover Fiction,Hardcover Fiction,hardcover-fiction,2020-03-01,2008-06-08,WEEKLY
3,Hardcover Nonfiction,Hardcover Nonfiction,hardcover-nonfiction,2020-03-01,2008-06-08,WEEKLY
4,Paperback Trade Fiction,Trade Fiction Paperback,trade-fiction-paperback,2020-03-01,2008-06-08,WEEKLY


In [31]:
lists_df['book_list']=np.nan

In [32]:
lists_df.head()

Unnamed: 0,display_name,list_name,list_name_encoded,newest_published_date,oldest_published_date,updated,book_list
0,Combined Print & E-Book Fiction,Combined Print and E-Book Fiction,combined-print-and-e-book-fiction,2020-03-01,2011-02-13,WEEKLY,
1,Combined Print & E-Book Nonfiction,Combined Print and E-Book Nonfiction,combined-print-and-e-book-nonfiction,2020-03-01,2011-02-13,WEEKLY,
2,Hardcover Fiction,Hardcover Fiction,hardcover-fiction,2020-03-01,2008-06-08,WEEKLY,
3,Hardcover Nonfiction,Hardcover Nonfiction,hardcover-nonfiction,2020-03-01,2008-06-08,WEEKLY,
4,Paperback Trade Fiction,Trade Fiction Paperback,trade-fiction-paperback,2020-03-01,2008-06-08,WEEKLY,


In [33]:
# This doesn't work, I have to figure out why.
lists_df.loc[lists_df.list_name_encoded=='hardcover-fiction','book_list'] = ['test',"this"]

ValueError: Must have equal len keys and value when setting with an iterable

In [34]:
lists_df.head()

Unnamed: 0,display_name,list_name,list_name_encoded,newest_published_date,oldest_published_date,updated,book_list
0,Combined Print & E-Book Fiction,Combined Print and E-Book Fiction,combined-print-and-e-book-fiction,2020-03-01,2011-02-13,WEEKLY,
1,Combined Print & E-Book Nonfiction,Combined Print and E-Book Nonfiction,combined-print-and-e-book-nonfiction,2020-03-01,2011-02-13,WEEKLY,
2,Hardcover Fiction,Hardcover Fiction,hardcover-fiction,2020-03-01,2008-06-08,WEEKLY,
3,Hardcover Nonfiction,Hardcover Nonfiction,hardcover-nonfiction,2020-03-01,2008-06-08,WEEKLY,
4,Paperback Trade Fiction,Trade Fiction Paperback,trade-fiction-paperback,2020-03-01,2008-06-08,WEEKLY,


In [35]:
lists1 = lists[:2]
lists1

['combined-print-and-e-book-fiction', 'combined-print-and-e-book-nonfiction']

## Rate limitations 

NYT imposes a rate limitation of 10 calls per minute ( 6 second delay for continuous calling ) and 4000 calls a day. This is a reasonable time span, and the rate limits need to be accounted for once the program is ready to index all the data from NYT. 

In [52]:
# Retrive information 
books_dict2 = {}
for item in lists:
    request_url="https://api.nytimes.com/svc/books/v3/lists/current/"+item+".json?api-key="+NYT_API_KEY
    results = requests.get(request_url)
    results = json.loads(results.text)
    print("status:", results['status'])
    try:
        book_list = aggregate(results['results']['books'],'title')
        books_dict2[item]=book_list
        print(item, "added")
    except KeyError:
        print(item, "NOT added")
    time.sleep(6)

status: OK
combined-print-and-e-book-fiction added
status: OK
combined-print-and-e-book-nonfiction added
status: OK
hardcover-fiction added
status: OK
hardcover-nonfiction added
status: OK
trade-fiction-paperback added
status: OK
mass-market-paperback added
status: OK
paperback-nonfiction added
status: OK
e-book-fiction added
status: OK
e-book-nonfiction added
status: OK
hardcover-advice added
status: OK
paperback-advice added
status: OK
advice-how-to-and-miscellaneous added
status: OK
hardcover-graphic-books added
status: OK
paperback-graphic-books added
status: OK
manga added
status: OK
combined-print-fiction added
status: OK
combined-print-nonfiction added
status: OK
chapter-books added
status: OK
childrens-middle-grade added
status: OK
childrens-middle-grade-e-book added
status: OK
childrens-middle-grade-hardcover added
status: OK
childrens-middle-grade-paperback added
status: OK
paperback-books added
status: OK
picture-books added
status: OK
series-books added
status: OK
young-adu

In [46]:
lists[:10]

['combined-print-and-e-book-fiction',
 'combined-print-and-e-book-nonfiction',
 'hardcover-fiction',
 'hardcover-nonfiction',
 'trade-fiction-paperback',
 'mass-market-paperback',
 'paperback-nonfiction',
 'e-book-fiction',
 'e-book-nonfiction',
 'hardcover-advice']

In [54]:
books_dict2

{'combined-print-and-e-book-fiction': ['ONE MINUTE OUT',
  'AMERICAN DIRT',
  'WHERE THE CRAWDADS SING',
  'LITTLE FIRES EVERYWHERE',
  'CHASING CASSANDRA',
  'THE SILENT PATIENT',
  'GOLDEN IN DEATH',
  'THE DUTCH HOUSE',
  'THE GIVER OF STARS',
  'THE TATTOOIST OF AUSCHWITZ',
  'THE GUARDIANS',
  'THE OTHER MRS.',
  'THE OUTSIDER',
  'SUCH A FUN AGE',
  'A LONG PETAL OF THE SEA'],
 'combined-print-and-e-book-nonfiction': ['THE MAMBA MENTALITY',
  'DARK TOWERS',
  'OPEN BOOK',
  'A VERY STABLE GENIUS',
  'UN-TRUMPING AMERICA',
  'UNTIL THE END OF TIME',
  'EDUCATED',
  'PROFILES IN CORRUPTION',
  'ON TYRANNY',
  'JUST MERCY',
  'TALKING TO STRANGERS',
  'SAPIENS',
  'BECOMING',
  'THE BODY KEEPS THE SCORE',
  'YOU NEVER FORGET YOUR FIRST'],
 'hardcover-fiction': ['WHERE THE CRAWDADS SING',
  'AMERICAN DIRT',
  'THE SILENT PATIENT',
  'ONE MINUTE OUT',
  'GOLDEN IN DEATH',
  'THE DUTCH HOUSE',
  'SUCH A FUN AGE',
  'A LONG PETAL OF THE SEA',
  'THE GUARDIANS',
  'THE GIVER OF STARS',
 

In [None]:

request_url="https://api.nytimes.com/svc/books/v3/lists/current/hardcover-political-books.json?api-key=76quqESu9qYqfKpEfnJou4kpY1uJ9pMV"
results = requests.get(request_url)
results = json.loads(results.text)

In [None]:
aggregate(results['results']['books'],'title')

In [None]:
results['status']

In [None]:
df = pd.DataFrame(columns=['test'])
df['test']=lists

In [None]:
df

Basic Features:

- Title
- Author
    - Gender
    - Nationality
    - Languages Spoken By author (at time of writing)
- Genre
- Publisher
- Published year
- Published in 
    - (country, state city?)
- Pages
- Sales Data
- Original Language

Possible Engineered Features:

- popularity
- Combined Rating Weighed from different sources
- NLP
    - publisher site information
    - book author information
    - Reviews (from one or several sources)
    - Book wiki page
    - Author wiki page
    - Reddit information
    - books reddit search results + sentiment analysis on results
- does reddit exist
- does wiki exist
- does fandom exist
- twitter mentions
- google mentions 
- NN on book cover
