## Importing Necessary Libraries

In [1]:
import csv
from bs4 import BeautifulSoup

In [2]:
# Import Selenium and webdriver_manager to automate the scraping process

from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager

# GeckoDriverManager().install() supplies the geckodriver location that is
# passed to Firefox as `executable_path` (see the download/cache log below).
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())



Current firefox version is 98.0
Get LATEST geckodriver version for 98.0 firefox
Getting latest mozilla release info for v0.30.0
Trying to download new driver from https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-win64.zip
Driver has been saved in cache [C:\Users\moh20\.wdm\drivers\geckodriver\win64\v0.30.0]


In [3]:
# Point the driver at the starting address (the Amazon home page)

url = "https://www.amazon.com/"
driver.get(url)

In [4]:
# Build the search URL for a given search term

def get_url(search_term):
    """Build the Amazon search URL for ``search_term``.

    The term is percent-encoded with ``quote_plus`` (spaces become ``+``), so
    characters such as ``&``, ``#`` or ``+`` stay inside the ``k`` query
    parameter instead of corrupting the rest of the URL.

    Args:
        search_term: Free-text search phrase, e.g. ``'ultrawide monitor'``.

    Returns:
        The search results URL as a string.
    """
    # Imported locally so this cell runs on its own.
    from urllib.parse import quote_plus

    base_url = "https://www.amazon.com/s?k={}&sprefix=ultra%2Caps%2C287&ref=nb_sb_ss_ts-doa-p_4_5"
    return base_url.format(quote_plus(search_term))

In [5]:
# Build the search URL for an example term and show it
url = get_url('ultrawide monitor')
print (url)

https://www.amazon.com/s?k=ultrawide+monitor&sprefix=ultra%2Caps%2C287&ref=nb_sb_ss_ts-doa-p_4_5


In [6]:
# Load the search results page in the browser
driver.get(url)

## Extract the Content

In [7]:
# Parse the HTML of the page currently loaded in the browser
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [8]:
# Collect every div marked data-component-type="s-search-result" (one per search result)
items = soup.find_all('div' ,{'data-component-type':'s-search-result'})

In [9]:
# Number of search results found on this page
len(items)

22

# Prototyping the Record

In [10]:
## Getting the item name and the URL

# Work with the first search result while prototyping
item = items[0]
# The link inside the result's <h2>; its text and href are read in the next cells
atag = item.h2.a

In [11]:
# The link text, stripped of surrounding whitespace, is the product description
description = atag.text.strip()
print(description)

LG 34WK650-W 34" UltraWide 21:9 IPS Monitor with HDR10 and FreeSync (2018), Black/White


In [12]:
# The href is prefixed with the site root to form a full URL
item_url = 'https://www.amazon.com' + atag.get('href')

In [13]:
# Display the assembled URL
item_url

'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A00020462TABCF6RA1T4T&url=%2FLG-34WK650-W-34-UltraWide-21%2Fdp%2FB078GSH1LV%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dultrawide%2Bmonitor%26qid%3D1648216607%26sprefix%3Dultra%252Caps%252C287%26sr%3D8-1-spons%26psc%3D1&qualifier=1648216607&id=7118712389588287&widgetName=sp_atf'

In [14]:
## getting the item price

In [15]:
# The price text is read from the 'a-offscreen' span nested inside the 'a-price' span
item_price = item.find('span',{'class':'a-price'}).find('span',{'class':'a-offscreen'}).text

In [16]:
# Show the extracted price text
print(item_price)

$349.99


In [17]:
## getting the item reviews and review count

In [18]:
# Text of the <i> tag reached via item.i; for this result it is the star rating (see output)
item_review = item.i.text
print (item_review)

4.6 out of 5 stars


In [19]:
# The review count is the text of the span with class 'a-size-base s-underline-text'
item_review_count = item.find('span',{'class':'a-size-base s-underline-text'}).text
item_review_count

'1,397'

## Generalizing The Pattern

In [20]:
def extrac_records(item):
    """Turn one search-result element into a record tuple.

    Returns ``(description, price, rating, review_count, url)``.

    Missing data is not handled in this version: when a lookup finds nothing
    it yields ``None`` and the attribute access that follows raises
    ``AttributeError``.
    """
    ## Title link: its text is the description, its href the relative URL
    link = item.h2.a
    title = link.text.strip()
    full_url = 'https://www.amazon.com' + link.get('href')

    ## Price: the 'a-offscreen' span nested inside the 'a-price' span
    price_box = item.find('span', {'class': 'a-price'})
    price = price_box.find('span', {'class': 'a-offscreen'}).text

    ## Rating text and number of reviews
    rating = item.i.text
    review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text

    return (title, price, rating, review_count, full_url)

In [21]:
records = []
items = soup.find_all('div' ,{'data-component-type':'s-search-result'})

# Run the extractor over every result. This first version has no error
# handling, so the loop stops with the AttributeError shown below.
for item in items:
    records.append(extrac_records(item))

AttributeError: 'NoneType' object has no attribute 'find'

In [22]:
## As you can see, the loop fails with an AttributeError ('NoneType' object has no attribute 'find').
## This means some of the search results are missing information (for example, a price),
## so we need to handle these errors.


## Error Handling 

In [23]:
def extrac_records(item):
    """Extract one record from a search-result element.

    Args:
        item: A parsed search-result element (a BeautifulSoup tag).

    Returns:
        ``(description, price, rating, review_count, url)``, or ``None`` when
        the result has no title link, no href, or no price, so the caller can
        skip it. A missing rating or review count is replaced by ``' '``; each
        is looked up independently, so one missing value no longer blanks
        the other.
    """
    ## getting the item name and the url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # Without a title link the record is unusable; skip it instead of raising.
        return None
    description = atag.text.strip()
    item_url = 'https://www.amazon.com' + href

    ## Getting The Price
    price_box = item.find('span', {'class': 'a-price'})
    price_tag = price_box.find('span', {'class': 'a-offscreen'}) if price_box is not None else None
    if price_tag is None:
        # Results without a price are skipped.
        return None
    item_price = price_tag.text

    ## Getting The Reviews
    rating_tag = item.i
    item_review = rating_tag.text if rating_tag is not None else ' '
    count_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    item_review_count = count_tag.text if count_tag is not None else ' '

    result = (description, item_price, item_review, item_review_count, item_url)
    return result

In [24]:
records = []
items = soup.find_all('div' ,{'data-component-type':'s-search-result'})

# extrac_records returns None for results it cannot use (e.g. no price);
# keep only the results that produced a record.
for item in items:
    record = extrac_records(item)
    if record:
        records.append(record) 

In [25]:
# Inspect the first scraped record
records[0]

('LG 34WK650-W 34" UltraWide 21:9 IPS Monitor with HDR10 and FreeSync (2018), Black/White',
 '$349.99',
 '4.6 out of 5 stars',
 '1,397',
 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A00020462TABCF6RA1T4T&url=%2FLG-34WK650-W-34-UltraWide-21%2Fdp%2FB078GSH1LV%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dultrawide%2Bmonitor%26qid%3D1648216607%26sprefix%3Dultra%252Caps%252C287%26sr%3D8-1-spons%26psc%3D1&qualifier=1648216607&id=7118712389588287&widgetName=sp_atf')

## Getting The Next Page

In [26]:
#There are multiple ways of doing this but for the sake of simplicity and
#avoiding unnecessary automation we will just add the page query to the url 

In [27]:
# Build the search URL for a given search term, with a placeholder for the page number

def get_url(search_term):
    """Build a paginated Amazon search URL template for ``search_term``.

    The term is percent-encoded with ``quote_plus`` (spaces become ``+``), so
    special characters, including ``{`` and ``}``, cannot break the query
    string or the later ``str.format`` call.

    Args:
        search_term: Free-text search phrase, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)`` on
        it to get the URL of a specific results page.
    """
    # Imported locally so this cell runs on its own.
    from urllib.parse import quote_plus

    base_url = "https://www.amazon.com/s?k={}&sprefix=ultra%2Caps%2C287&ref=nb_sb_ss_ts-doa-p_4_5"
    url = base_url.format(quote_plus(search_term))
    # The '=' is required: '&page{}' would yield '&page3', which is not the
    # 'page' query parameter.
    url += '&page={}'
    return url

# Putting it all together

In [28]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager



# Build the search URL for a given search term, with a placeholder for the page number

def get_url(search_term):
    """Build a paginated Amazon search URL template for ``search_term``.

    The term is percent-encoded with ``quote_plus`` (spaces become ``+``), so
    special characters, including ``{`` and ``}``, cannot break the query
    string or the later ``str.format`` call.

    Args:
        search_term: Free-text search phrase, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string ending in ``&page={}``; call ``.format(page_number)`` on
        it to get the URL of a specific results page.
    """
    # Imported locally so the function does not depend on extra top-level imports.
    from urllib.parse import quote_plus

    base_url = "https://www.amazon.com/s?k={}&sprefix=ultra%2Caps%2C287&ref=nb_sb_ss_ts-doa-p_4_5"
    url = base_url.format(quote_plus(search_term))
    # The '=' is required: '&page{}' would yield '&page3', which is not the
    # 'page' query parameter.
    url += '&page={}'
    return url


def extrac_records(item):
    """Extract one record from a search-result element.

    Args:
        item: A parsed search-result element (a BeautifulSoup tag).

    Returns:
        ``(description, price, rating, review_count, url)``, or ``None`` when
        the result has no title link, no href, or no price, so the caller can
        skip it. A missing rating or review count is replaced by ``' '``; each
        is looked up independently, so one missing value no longer blanks
        the other.
    """
    ## getting the item name and the url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # Without a title link the record is unusable; skip it instead of
        # raising and aborting the whole scrape.
        return None
    description = atag.text.strip()
    item_url = 'https://www.amazon.com' + href

    ## Getting The Price
    price_box = item.find('span', {'class': 'a-price'})
    price_tag = price_box.find('span', {'class': 'a-offscreen'}) if price_box is not None else None
    if price_tag is None:
        # Results without a price are skipped.
        return None
    item_price = price_tag.text

    ## Getting The Reviews
    rating_tag = item.i
    item_review = rating_tag.text if rating_tag is not None else ' '
    count_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    item_review_count = count_tag.text if count_tag is not None else ' '

    result = (description, item_price, item_review, item_review_count, item_url)
    return result



def main(search_term, max_pages=20,
         output_path=r'D:\Data Analysis projects\Python\Amazon Web Scraping\DataExported\amazondata.csv'):
    """Scrape Amazon search results for ``search_term`` and save them to CSV.

    Args:
        search_term: Free-text search phrase passed to ``get_url``.
        max_pages: Number of result pages to visit, starting at page 1.
            Defaults to 20, the value previously hard-coded.
        output_path: Destination CSV file. Defaults to the path previously
            hard-coded in this notebook; pass another path to run elsewhere.

    The CSV gets a header row followed by one row per record returned by
    ``extrac_records``; results for which it returns nothing are skipped.
    """
    # NOTE(review): newer Selenium releases replace `executable_path` with a
    # Service object - confirm against the installed Selenium version.
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

    records = []
    try:
        url = get_url(search_term)

        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            items = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in items:
                record = extrac_records(item)
                if record:
                    records.append(record)
    finally:
        # Always end the browser session, even if a page load or parse fails;
        # quit() also shuts down the driver process, which close() may not.
        driver.quit()

    # Save the data to csv
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Review', 'Review Count', 'URL'])
        writer.writerows(records)

In [29]:
# Run the full scrape for the example search term
main('ultrawide monitor')



Current firefox version is 98.0
Get LATEST geckodriver version for 98.0 firefox
Driver [C:\Users\moh20\.wdm\drivers\geckodriver\win64\v0.30.0\geckodriver.exe] found in cache
