In [1]:
# %% Imports

import pandas as pd
import bs4
from bs4 import BeautifulSoup
import requests
import csv

In [2]:
# %% Setup version

print('pandas version: {}'.format(pd.__version__))
print('bs4 version: {}'.format(bs4.__version__))
print('requests version: {}'.format(requests.__version__))
print('csv version: {}'.format(csv.__version__))

pandas version: 1.4.2
bs4 version: 4.11.1
requests version: 2.27.1
csv version: 1.0


In [3]:
def get_url(search_term, pages):
    """
    
    The get_url function returns a list of urls for the searched phrase and the number of pages
    
    :search_term: The name or expression of the item you're looking for on Amazon 
    :pages: The number of pages you want to scrape (< maximum number of web pages shown)
    
    """
    
    template= 'https://www.amazon.com/s?k={}&page={}&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1'    
    urls= []
    
    for i in range(2,pages):
        search_term= search_term.replace(' ','+')
        url= template.format(search_term, i)
        urls.append(url)
    return urls

In [4]:
def scrape_record(item):
    """
    
    Scrape_record does scrape infos(Description, Rating, Reiew count ) of a particualar item an returned as a record
    
    :item: an html div where the infos are located
    
    """

    #Product's description
    atag= item.h2.a
    description= atag.text.strip()
    
    #Product's price
    try:
        price_parent= item.find('span','a-price')
        price= price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return
    
    #Product's rating
    try:
        rating= item.i.text.strip()[:3]
    except AttributeError:
        rating= 'None'
        
    #Product's reiew count    
    try:
        review_count= item.find('span',{'class': 'a-size-base s-underline-text'}).text
    except AttributeError:
        review_count= 'None'
    
    record= (description, price, rating, review_count)
    
    return record


In [5]:
def scrape_all_pages(urls):
    """
    The scrape_all_pages function loops over the urls and scrapes all items before saving the data in csv format.    
    
    :urls: list of urls generated by calling the get_url function
    """
    records= [] 
    
    for url in urls:
        headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
        page= requests.get(url, headers=headers) 
        soup= BeautifulSoup(page.content,'html.parser')

        results= soup.find_all('div',{'data-component-type':'s-search-result'})
        
        for item in results:
            record= scrape_record(item)
            if record:
                records.append(record)
                
    # Writing the rows into a csv file. If desired, this convertion set of code can be implemented as a function;
    
    with open("books_data.csv", 'w', newline='', encoding='utf-8') as f:
        writer= csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'Review_count'])
        writer.writerows(records)
    return records

In [6]:
urls= get_url('football',75)
records= scrape_all_pages(urls)

In [7]:
df= pd.read_csv('books_data.csv')

In [8]:
pd.set_option('display.max_rows', None)
df

Unnamed: 0,Description,Price,Rating,Review_count
0,WILSON NFL Super Grip Composite Football,$21.95,4.8,
1,Speed and agility training set- training ladde...,$28.99,4.1,
2,XINXIANG Agility Ladder 1 Agility Training Equ...,$40.99,4.4,
3,Repster Football Gloves - Tacky Grip Skin Tigh...,$32.99,4.4,
4,WILSON GST Composite Football - Junior Size,$76.00,4.7,
5,A Football Life: Season 1,$14.05,4.7,
6,Nike Men's Vapor Jet 6.0 Football Receiver Gloves,$69.95,4.6,
7,Nike All Field 3.0 Football Regular,$39.99,4.7,
8,"Sof Sole Sneaker Balls Shoe, Gym Bag, and Lock...",$7.99,4.6,
9,Oakley Legacy Adult Football Helmet Shield Single,$56.99,4.7,


In [9]:
headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
page= requests.get('https://www.amazon.com/s?k=cars+for+sell&crid=24MJ52SLWK8V3&sprefix=cars+for+%2Caps%2C3936&ref=nb_sb_noss_2', headers=headers) 
soup= BeautifulSoup(page.content,'html.parser')

results= soup.find_all('div',{'data-component-type':'s-search-result'})

In [10]:
urls

['https://www.amazon.com/s?k=football&page=2&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=3&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=4&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=5&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=6&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=7&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=8&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=9&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&ref=nb_sb_noss_1',
 'https://www.amazon.com/s?k=football&page=10&crid=2C1DDU2F76VAW&sprefix=computers%2Caps%2C270&r