# Scrape Amazon Reviews

In [67]:
from selenium import webdriver

# BeautifulSoup parses a page's HTML into a tree that can be searched by tag and attribute
from bs4 import BeautifulSoup

import csv

In [32]:
# First page of the product's review listing (the URL ends with pageNumber=1).
url = 'https://www.amazon.com/Flexispot-Electric-Height-Adjustable-Standing/product-reviews/B0813MN98G/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=1'

# Start a Chrome session through Selenium and load the review page in it.
driver = webdriver.Chrome()
driver.get(url)

In [33]:
# Parse the HTML currently loaded in the browser using the 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Collect every <div> whose data-hook attribute is "review".
result = soup.find_all('div',{'data-hook':'review'})

In [35]:
# Take the first matched review <div> as a sample to inspect in the cells below
# (raises IndexError if the search above matched nothing).
item = result[0]
item

<div class="a-section review aok-relative" data-hook="review" id="ROM3I9WFCN84X"><div class="a-row a-spacing-none" id="ROM3I9WFCN84X-review-card"><div class="a-section celwidget" data-cel-widget="customer_review-ROM3I9WFCN84X" data-csa-c-id="i11qe2-7m8dqw-r93v66-v9d33h" id="customer_review-ROM3I9WFCN84X"><div class="a-row a-spacing-mini" data-hook="genome-widget"><a class="a-profile" data-a-size="small" href="/gp/profile/amzn1.account.AF3ICHIOM7QGFBRLMKFG3M7YBKPA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"><div aria-hidden="true" class="a-profile-avatar-wrapper"><div class="a-profile-avatar"><img class="a-lazy-loaded" data-src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/5b337b2f-3586-4fcb-b6ff-1a7be5f57c0e._CR0,0,368,368_SX48_.jpg" src="https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/grey-pixel.gif"/><noscript><img src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/5b337b2f-3586-4fcb-b6ff-1a7be5f57c0e._CR0,0,368,368_SX48_.jpg

In [36]:
# Star-rating text: first <span> with class 'a-icon-alt' inside the sample review.
item.find('span',{'class':'a-icon-alt'}).text

'5.0 out of 5 stars'

In [56]:
# Review title: text of the <a> with data-hook 'review-title', whitespace-trimmed.
item.find('a',{'data-hook':'review-title'}).text.strip()

'Some tradeoffs, but overall a great desk'

In [38]:
# Review date line: text of the <span> with data-hook 'review-date' (not trimmed).
item.find('span',{'data-hook':'review-date'}).text

'Reviewed in the United States on July 16, 2020'

In [40]:
# Review body: text of the <span> with data-hook 'review-body', whitespace-trimmed.
item.find('span',{'data-hook':'review-body'}).text.strip()

"Built pretty well and tall enough for a 6'0'' person.This desk has dramatically improved my computer intensive lifestyle.It's great value but there are trade offs at this price point.Firstly, the legs must start at the same height to avoid a crooked desk. If you happened to offset the heights of the legs during assembly, the instructions won't help you fix it at all. This is how to do it. Assemble the entire desk with the exception of the transmission rod. Plug the desk into your wall socket. Adjust the height of the motorized leg to its lowest setting by simply pressing the down button (use the motor). Next, insert the transmission rod only into the non-motorized leg. Clamp an adjustable wrench onto the transmission rod (hexagonal part) and manually crank the leg to its lowest setting. Next connect the transmission rod to both legs and test it out.Secondly, the four threaded holes that allow the center beam to bolt to the legs are tapped at an angle slightly off from 90 degrees. This

In [61]:
def _find_text(item, tag, attrs, strip=False):
    """Return the text of the first ``tag`` under ``item`` matching ``attrs``, or '' if none matches."""
    node = item.find(tag, attrs)
    if node is None:
        # find() yields None when nothing matches; report the field as empty.
        return ''
    text = node.text
    return text.strip() if strip else text


def extract_record(item):
    """Extract one review's fields from its container element.

    Args:
        item: An element exposing ``find(tag, attrs)`` (e.g. a BeautifulSoup
            tag for a ``<div data-hook="review">``).

    Returns:
        A 4-tuple ``(rating, review_title, review_date, review_content)`` of
        strings. Any field whose element is not found is returned as ``''``.
        The title and content are whitespace-trimmed; the rating and date are
        returned exactly as found.
    """
    # rating
    rating = _find_text(item, 'span', {'class': 'a-icon-alt'})

    # review_title
    review_title = _find_text(item, 'a', {'data-hook': 'review-title'}, strip=True)

    # review_date -- guarded like every other field, so a review without a
    # date element yields '' instead of raising AttributeError.
    review_date = _find_text(item, 'span', {'data-hook': 'review-date'})

    # review_content
    review_content = _find_text(item, 'span', {'data-hook': 'review-body'}, strip=True)

    return (rating, review_title, review_date, review_content)

In [68]:
def main(base_url, max_pages=50, output_path='review_result.csv'):
    """Scrape paginated review pages and write the extracted reviews to a CSV file.

    Args:
        base_url: Review-listing URL to which the page number is appended
            directly (so it should end with something like ``pageNumber=``).
        max_pages: Number of pages to visit; pages 1 through ``max_pages``
            inclusive are requested. Defaults to 50.
        output_path: Path of the CSV file to (over)write. Defaults to
            'review_result.csv'.

    The CSV has a header row ``Rating, Review_title, Review_date,
    Review_content`` followed by one row per review. If scraping raises, the
    browser is still shut down, no CSV is written, and the exception propagates.
    """
    driver = webdriver.Chrome()
    records = []

    try:
        for page in range(1, max_pages + 1):
            driver.get(base_url + str(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-hook': 'review'})

            for item in results:
                record = extract_record(item)
                # extract_record returns a 4-tuple, which is always truthy, so
                # this guard only matters if that function starts returning
                # an empty value for unusable reviews.
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process), and runs even when a page load or parse fails.
        driver.quit()

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Rating', 'Review_title', 'Review_date', 'Review_content'])
        writer.writerows(records)


In [69]:
# Run the scraper; main() appends each page number to the trailing 'pageNumber=' and writes review_result.csv.
main('https://www.amazon.com/Flexispot-Electric-Height-Adjustable-Standing/product-reviews/B0813MN98G/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=')

In [70]:
# Load the CSV written by main() back into a DataFrame and preview the first rows.
# NOTE(review): this import would normally live in the imports cell at the top of the notebook.
import pandas as pd
df = pd.read_csv('review_result.csv')
df.head()

Unnamed: 0,Rating,Review_title,Review_date,Review_content
0,5.0 out of 5 stars,"Some tradeoffs, but overall a great desk","Reviewed in the United States on July 16, 2020",Built pretty well and tall enough for a 6'0'' ...
1,5.0 out of 5 stars,Perfect size and easy setup,"Reviewed in the United States on August 13, 2019",I've been looking for a standing desk to add t...
2,5.0 out of 5 stars,Great investment for virtual school and work,"Reviewed in the United States on September 14,...",The media could not be loaded.\n ...
3,4.0 out of 5 stars,Best value for Motorized Height Adjustable Tab...,"Reviewed in the United States on May 25, 2020","Pros:* For under $300, this was a good buy.* F..."
4,5.0 out of 5 stars,DON'T HESITATE - BUY THIS DESK!!!,"Reviewed in the United States on May 18, 2020","I just purchased this desk on Thursday, May 14..."
