Requirements:
1. Selenium (Python package `selenium`)
2. BeautifulSoup (Python package `beautifulsoup4`, imported as `bs4`)

## Install and Import Requirements

In [87]:
import csv
from bs4 import BeautifulSoup

In [88]:
pip install selenium 

Note: you may need to restart the kernel to use updated packages.


In [89]:
from selenium import webdriver

## Start up the webdriver

In [90]:
# Launch a Chrome browser session driven through Selenium's WebDriver API.
driver = webdriver.Chrome()

In [91]:
# Open the Amazon home page in the controlled browser.
url = 'https://www.amazon.com'
driver.get(url)

In [92]:
def get_url(search_term):
    """Build the Amazon search-results URL for a search term.

    The term is percent-encoded with ``urllib.parse.quote_plus``: spaces
    still become ``+`` (so plain multi-word terms give the same URL as
    before), while characters that are special inside a query string, such
    as ``&``, ``+`` or ``#``, can no longer corrupt the URL.

    Args:
        search_term: Free-text query, e.g. ``'gaming accessories'``.

    Returns:
        The complete search URL as a string.
    """
    # Function-scope import so this cell stays runnable on its own.
    from urllib.parse import quote_plus

    # NOTE(review): the crid/sprefix/ref parameters look like leftovers
    # copied from a browser search; they are kept unchanged.
    template = 'https://www.amazon.com/s?k={}&crid=3AAARZP6Y3RZ0&sprefix=gamming+acc%2Caps%2C371&ref=nb_sb_ss_sc_5_10'
    return template.format(quote_plus(search_term))

In [93]:
# Build the search URL for a sample query and print it for inspection.
url = get_url('gaming accessories')
print(url)

https://www.amazon.com/s?k=gaming+accessories&crid=3AAARZP6Y3RZ0&sprefix=gamming+acc%2Caps%2C371&ref=nb_sb_ss_sc_5_10


In [94]:
# Navigate the browser to the search URL currently held in `url`.
driver.get(url)

## Extract the collection

In [95]:
# Parse the HTML of the page currently loaded in the browser, using
# Python's built-in 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [96]:
# Collect every <div> whose data-component-type attribute equals
# 's-search-result' (presumably one container per product listing).
results = soup.find_all('div', {'data-component-type': 's-search-result'})

In [97]:
# Number of result containers matched on this page.
len(results)

22

## Prototype the record

In [100]:
# Take the first result as a sample for prototyping the field extraction.
item = results[0]

In [101]:
# Simple attribute-style tree navigation: the first <h2> inside the result,
# then the first <a> inside that heading.
atag = item.h2.a

In [104]:
# Link text with surrounding whitespace removed, used as the description.
description = atag.text.strip()

In [105]:
# The href is presumably site-relative, hence the domain prefix.
# NOTE: this rebinds the notebook-level name `url`, which held the search URL.
url = 'https://www.amazon.com' + atag.get('href')

In [106]:
# First <span> in the result that carries the CSS class 'a-price'.
price_parent = item.find('span', 'a-price')

In [107]:
# Text of the nested 'a-offscreen' span, taken as the price string.
price = price_parent.find('span', 'a-offscreen').text

In [108]:
# Text of the first <i> tag inside the result; for this sample item it is
# the star-rating string.
item.i.text

'4.6 out of 5 stars'

In [113]:
# Text of the first <span> with class 'a-size-base'; presumably the review
# count (verify against the page markup).
review_count = item.find('span', {'class' : 'a-size-base'}).text

## Generalize the pattern

In [117]:
def extract_record(item):
    """Pull the fields of interest out of one search-result element.

    This first version does no missing-field handling: if any looked-up tag
    is absent, the following attribute access on ``None`` raises
    ``AttributeError``.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``.
    """
    # The title link supplies both the description text and the product href.
    link = item.h2.a
    title = link.text.strip()
    product_url = 'https://www.amazon.com' + link.get('href')

    # Price: the 'a-offscreen' span nested in the first 'a-price' span.
    price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Rating text, then the first 'a-size-base' span as the review count.
    stars = item.i.text
    reviews = item.find('span', {'class': 'a-size-base'}).text

    return (title, price_text, stars, reviews, product_url)

In [118]:
# Extract a record from every search result on the page.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
# extract_record does no missing-field handling at this stage, so a single
# incomplete result aborts the loop with an AttributeError.
for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'text'

## Error handling

In [123]:
def extract_record(item):
    """Extract and return data from a single search-result element.

    A missing price, rating or review count is reported as an empty string
    instead of raising.

    Args:
        item: Parsed result element (a BeautifulSoup tag).

    Returns:
        A tuple ``(description, price, rating, review_count, url)``.

    Raises:
        AttributeError: If the result has no ``<h2>``/``<a>`` title link.
    """
    # description and url of each product both come from the title link
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.com' + atag.get('href')

    # price: when either span is absent, find() returns None and the next
    # attribute access raises AttributeError, so fall back to ''
    try:
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        price = ''

    # reviews: rating and review count are blanked together when either
    # lookup fails
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        rating = ''
        review_count = ''

    return (description, price, rating, review_count, url)

In [124]:
# Rebuild the record list from every search result on the parsed page,
# one extract_record() call per result container.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
records.extend(extract_record(entry) for entry in results)

In [125]:
# Spot-check the scrape: print the rating field (index 2) of every record.
ratings = (record[2] for record in records)
for rating in ratings:
    print(rating)

4.6 out of 5 stars
5.0 out of 5 stars
4.8 out of 5 stars
4.4 out of 5 stars
4.3 out of 5 stars
4.4 out of 5 stars
4.3 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.8 out of 5 stars
4.6 out of 5 stars
4.8 out of 5 stars
4.4 out of 5 stars

4.3 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.8 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars


## Getting the next page

In [126]:
#Modify the url function
def get_url(search_term):
    """Build a paged Amazon search URL template for a search term.

    The term is percent-encoded with ``urllib.parse.quote_plus``: spaces
    still become ``+``, and characters that are special in a query string
    (``&``, ``+``, ``#`` and so on) no longer corrupt the URL.

    Args:
        search_term: Free-text query, e.g. ``'gaming accessories'``.

    Returns:
        A URL string ending in ``&page={}``; fill in the page number with
        ``url.format(page)``.
    """
    # Function-scope import so this cell stays runnable on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=3AAARZP6Y3RZ0&sprefix=gamming+acc%2Caps%2C371&ref=nb_sb_ss_sc_5_10'

    # add term query to the url; quote_plus also encodes "{" and "}", so the
    # page placeholder appended below is the only str.format field left
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    return url + '&page={}'

## Putting it all together

In [127]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Build a paged Amazon search URL template for a search term.

    The term is percent-encoded with ``urllib.parse.quote_plus``: spaces
    still become ``+``, and characters that are special in a query string
    (``&``, ``+``, ``#`` and so on) no longer corrupt the URL.

    Args:
        search_term: Free-text query, e.g. ``'gaming accessories'``.

    Returns:
        A URL string ending in ``&page={}``; fill in the page number with
        ``url.format(page)``.
    """
    # Function-scope import keeps the function self-contained.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=3AAARZP6Y3RZ0&sprefix=gamming+acc%2Caps%2C371&ref=nb_sb_ss_sc_5_10'

    # Encoding the term also escapes "{" and "}", which would otherwise be
    # read as extra fields when the caller later runs url.format(page).
    encoded_term = quote_plus(search_term)

    # add term query to the url, then the page query placeholder
    return template.format(encoded_term) + '&page={}'

def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: Parsed result element (a BeautifulSoup tag).

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, with
        ``''`` for a missing price, rating or review count. Returns ``None``
        when the result has no title link or the link has no href, so the
        caller can skip it instead of the whole scrape aborting.
    """
    # description and url both come from the <a> inside the result's <h2>
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        # No href means no usable product URL; treat the result as unusable.
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # price: when either span is absent, find() returns None and the next
    # attribute access raises AttributeError
    try:
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        price = ''

    # reviews: rating and review count are blanked together when either
    # lookup fails
    try:
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        rating = ''
        review_count = ''

    return (description, price, rating, review_count, url)

def main(search_term, pages=20, output_file='gaming_accessories1.csv'):
    """Scrape Amazon search results for a term and save them to a CSV file.

    Args:
        search_term: Free-text query passed to ``get_url``.
        pages: Number of result pages to visit, starting at page 1.
        output_file: Path of the CSV file to write (overwritten if present).

    The CSV gets a header row followed by one row per extracted record.
    """
    # Accumulates one tuple per usable search result across all pages.
    records = []
    url = get_url(search_term)

    # startup the webdriver
    driver = webdriver.Chrome()
    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window); running it in finally prevents a failed scrape
        # from leaving a browser behind.
        driver.quit()

    # save data to csv file
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'Review_Count', 'Url'])
        writer.writerows(records)

In [83]:
# Run the full scrape for the sample search term.
main('gaming accessories')