# DS-SF-34 | 03 | Databases, Scraping, and APIs | Codealong | Answer Key

# Part C | Scraping and Amazon Product Reviews (cont.)

> ## We are now ready to extract the reviews offline and no longer need to query the Amazon website.

In [1]:
import gzip
import json
import os

import dateutil
import dateutil.parser
import lxml.html
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

> ## Input

In [2]:
reviews_path = os.path.join('..', 'datasets', 'dataset-03-reviews.json.gz')

# json.load reads the decompressed stream and parses it in a single step.
with gzip.open(reviews_path, 'rb') as f:
    pages = json.load(f)

In [3]:
len(pages)

587

## First page

In [4]:
page = pages['1']['content']

In [5]:
page



In [6]:
document = lxml.html.fromstring(page)

In [7]:
type(document)

lxml.html.HtmlElement

(http://lxml.de/api/lxml.html-module.html#fromstring and http://lxml.de/api/lxml.html.HtmlElement-class.html)

> ## All reviews of a page

(http://lxml.de/api/lxml.etree._Element-class.html#xpath)

In [8]:
reviews = document.xpath('//*[@data-hook="review"]')

## First review

In [9]:
review = reviews[0]

In [10]:
type(review)

lxml.html.HtmlElement

> ## id

(http://lxml.de/api/lxml.etree._Element-class.html#get)

In [11]:
review.get('id')

'R3TUANQ2EB3ECB'

> ## star rating

In [12]:
review.find('.//*[@data-hook="review-star-rating"]').get('class')

'a-icon a-icon-star a-star-1 review-rating'

(http://lxml.de/api/lxml.etree._Element-class.html#find)

(https://en.wikipedia.org/wiki/XPath)

> ## title

In [13]:
review.findtext('.//*[@data-hook="review-title"]')

'Skip it. Life is too short.'

(http://lxml.de/api/lxml.etree._Element-class.html#findtext)

> ## author

In [14]:
review.findtext('.//*[@data-hook="review-author"]/*[@data-hook="review-author"]')

'MichaelMichaels'

> ## date

In [15]:
review.findtext('.//*[@data-hook="review-date"]')

'on April 21, 2017'

> ## body

In [16]:
review.findtext('.//*[@data-hook="review-body"]')

"I've never read any of the Harry Potter books so I cannot compare. This book is relentlessly grim. There's not one reason I can think of to recommend this book. Each character is miserable and unhappy and horrible. And there are a million of them and good luck keeping them straight. Ultimately, you do not care what happens to any of them. Skip it. Life is too short."

> ## Output

In [17]:
df = pd.DataFrame(columns = ['date', 'id', 'author', 'title', 'body', 'star_rating'])

In [18]:
df

Unnamed: 0,date,id,author,title,body,star_rating


## Putting all of it together

(https://docs.python.org/2/howto/unicode.html and https://docs.python.org/2/library/stdtypes.html)

In [19]:
def date(node):
    """Return the posting date of a review.

    Parameters
    ----------
    node : element
        A review element; its descendant marked ``data-hook="review-date"``
        holds text such as ``'on April 21, 2017'``.

    Returns
    -------
    The value produced by ``dateutil.parser.parse`` for that text once the
    ``'on '`` marker has been removed.
    """
    # Query the element that was passed in, so the result never depends on
    # whatever a notebook-level variable happens to hold at call time.
    text = node.findtext('.//*[@data-hook="review-date"]').replace('on ', '')
    return dateutil.parser.parse(text)

def id(node):
    """Return the review's identifier, read from the element's ``id`` attribute.

    Note that this definition shadows the built-in ``id`` for the rest of
    the notebook.
    """
    review_id = node.get('id')
    return review_id

def author(node):
    """Return the reviewer's name, ASCII-encoded with other characters dropped."""
    xpath = './/*[@data-hook="review-author"]/*[@data-hook="review-author"]'
    name = node.findtext(xpath)
    # 'ignore' silently discards every character that has no ASCII encoding.
    return name.encode('ascii', 'ignore')

def title(node):
    """Return the review's headline, ASCII-encoded with other characters dropped."""
    headline = node.findtext('.//*[@data-hook="review-title"]')
    # 'ignore' silently discards every character that has no ASCII encoding.
    return headline.encode('ascii', 'ignore')

def body(node):
    """Return the review's main text, ASCII-encoded with other characters dropped."""
    text = node.findtext('.//*[@data-hook="review-body"]')
    # 'ignore' silently discards every character that has no ASCII encoding.
    return text.encode('ascii', 'ignore')

def star_rating(node):
    """Return the review's star rating, or NaN when it cannot be determined.

    Parameters
    ----------
    node : element
        A review element. Its descendant marked
        ``data-hook="review-star-rating"`` carries a CSS class of the form
        ``a-star-N`` (e.g. ``'a-icon a-icon-star a-star-1 review-rating'``).

    Returns
    -------
    int or float
        The first ``N`` in 1..5 for which ``a-star-N`` is found, otherwise
        ``np.nan`` (no rating element, or no matching class).
    """
    icon = node.find('.//*[@data-hook="review-star-rating"]')

    # `find` signals "no match" with None; test identity rather than
    # equality or truthiness so a real element is never mistaken for a miss.
    if icon is None:
        return np.nan

    for stars in range(1, 6):
        if icon.find_class('a-star-{:d}'.format(stars)):
            return stars

    return np.nan

(http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html)

In [20]:
# Collect one plain dict per review and build the frame once at the end.
# Growing a DataFrame row by row copies the whole frame on every iteration,
# and DataFrame.append no longer exists in pandas 2.x. Starting from an empty
# list also makes this cell safe to re-run without duplicating rows.
records = []

for i in sorted(pages, key = int):  # page keys are numeric strings; sort numerically
    page = pages[i]['content']
    document = lxml.html.fromstring(page)
    reviews = document.xpath('//*[@data-hook="review"]')

    for review in reviews:
        records.append({'date': date(review),
                        'id': id(review),
                        'author': author(review),
                        'title': title(review),
                        'body': body(review),
                        'star_rating': star_rating(review)})

df = pd.DataFrame(records,
                  columns = ['date', 'id', 'author', 'title', 'body', 'star_rating'])

# Missing ratings are stored as NaN, so keep the column float even when
# every review happens to have a rating.
df['star_rating'] = df['star_rating'].astype(float)

In [21]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2017-04-21,R3TUANQ2EB3ECB,MichaelMichaels,Skip it. Life is too short.,I've never read any of the Harry Potter books ...,1.0
1,2017-04-20,R2DD03ZZ4218VW,Frans van Wyk,Four Stars,Excellent Read with a lot of real life values ...,4.0
2,2017-04-20,R296NVKLH5QS4W,Sabina Duke,Characters,Hard to keep the characters straight,4.0
3,2017-04-05,R3MP7W8LH6VHU8,Jen Blau,GIVE IT A CHANCE!,I almost put this book down. I'm new to Rowlin...,5.0
4,2017-04-04,RZWP48RKJCXT1,Lilith Eleanor,Frighteningly good,Amazing. Rowling combines fantastic writing wi...,5.0
...,...,...,...,...,...,...
5856,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5857,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5858,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5859,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [22]:
df.shape

(5861, 6)

In [23]:
df.to_csv(os.path.join('..', 'datasets', 'dataset-03-reviews.csv'), index = False)