# A web scraping bot to extract sellers and listings data from an ecommerce platform (Etsy)

This notebook scrapes data from Etsy and writes it to CSV files/spreadsheets. The bot is given a seller's URL; it first extracts the seller's data and writes it to the sellers sheet, then extracts the data for all of that seller's listings and writes it to the listings sheet. The sheets are updated every 24 hours.

Importing the necessary libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd
import csv

Setting the URLs and headers

In [2]:
# Shop pages of the three sellers the bot is pointed at.
seller_url_1, seller_url_2, seller_url_3 = (
    'https://www.etsy.com/shop/ferraportrait?ref=l2-about-shopname',
    'https://www.etsy.com/shop/OnemerceMarketingLtd?ref=l2-about-shopname',
    'https://www.etsy.com/shop/BellasLines?ref=l2-about-shopname',
)


In [3]:

# HTTP request headers sent with every page fetch (a desktop Chrome
# User-Agent plus the accompanying Accept/connection headers).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}


# Helper functions for the bot

In [4]:
def get_seller_data(seller_url):
    """Scrape the shop-level details shown on a seller's Etsy shop page.

    Args:
        seller_url: URL of the seller's shop page.

    Returns:
        A list ``[name, sales, number_of_items, creation_year,
        number_of_reviews, average_rating, date]``. Every entry is text taken
        from the page except ``date``, which is today's ``datetime.date``.
        ``creation_year`` is ``'Not found'`` when the page does not expose it.

    Raises:
        requests.HTTPError: If the shop page answers with an error status.
        AttributeError, IndexError: If one of the other expected elements is
            missing from the page markup.
    """
    # A timeout keeps the bot from hanging forever on a stalled connection.
    response = requests.get(seller_url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a confusing parse error later.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    name = soup.find(class_='wt-text-heading-01 wt-text-truncate').get_text()
    sales = soup.find(class_='wt-text-caption wt-no-wrap').get_text()
    number_of_reviews = soup.find(class_='wt-display-inline-block wt-vertical-align-middle').get_text()
    average_rating = soup.find_all(class_='screen-reader-only')[2].get_text()
    number_of_items = soup.find_all(class_='wt-mr-md-2')[0].get_text()

    # The creation year is read from the SECOND matching element, so a mere
    # non-empty check is not enough: exactly one match would raise IndexError.
    about_values = soup.find_all(class_='wt-text-title-03 wt-display-block')
    if len(about_values) > 1:
        creation_year = about_values[1].get_text()
    else:
        creation_year = 'Not found'

    # Clean up the review count by removing the surrounding brackets.
    number_of_reviews = number_of_reviews.replace('(', '').replace(')', '')

    # Clean up the sales figure by dropping the trailing label.
    sales = sales.replace(' Sales', '')

    # Timestamp of this scrape.
    date = datetime.date.today()

    return [name, sales, number_of_items, creation_year, number_of_reviews, average_rating, date]
    

In [5]:
# CSS class string identifying one listing card on a shop page.
_LISTING_CARD_CLASS = 'js-merch-stash-check-listing v2-listing-card wt-position-relative wt-grid__item-xs-6 wt-flex-shrink-xs-1 wt-grid__item-xl-3 wt-grid__item-lg-4 wt-grid__item-md-4 listing-card-experimental-style'

# Shops with more items than this are scraped page by page.
# NOTE(review): presumably the number of listings Etsy shows per page - confirm.
_ITEMS_PER_PAGE = 36


def _collect_listing_links(soup, excluded_links):
    """Return the hrefs of every listing card in *soup* not in *excluded_links*."""
    return [
        anchor['href']
        for card in soup.find_all('div', class_=_LISTING_CARD_CLASS)
        for anchor in card.find_all('a', href=True)
        if anchor['href'] not in excluded_links
    ]


def get_listing_links(seller_url):
    """Collect the listing URLs found on a seller's shop page(s).

    Links that also appear in the shop's featured-products area are left out
    so that they are not scraped twice later on.

    Args:
        seller_url: URL of the seller's shop page.

    Returns:
        A list of listing hrefs, in page order.

    Raises:
        requests.HTTPError: If a shop page answers with an error status.
        IndexError, AttributeError, ValueError: If the item count (or, for
            multi-page shops, the pagination text) cannot be found or parsed.
    """
    # A timeout keeps the bot from hanging forever on a stalled connection.
    response = requests.get(seller_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Check for featured products to remove repetition later on.
    featured_product_links = set()
    featured = soup.find_all('div', class_='featured-products-area wt-position-relative')
    if featured:
        featured_cards = featured[0].find_all(class_='listing-link wt-display-inline-block wt-transparent-card')
        featured_product_links.update(card['href'] for card in featured_cards)

    # Number of items in the shop; drop thousands separators ("1,234") that
    # int() would otherwise reject.
    number_of_items = soup.find_all(class_='wt-mr-md-2')[0].get_text()
    num_items = int(number_of_items.replace(',', ''))

    # Everything fits on the page already downloaded: no pagination needed,
    # so the pagination markup is not touched at all (it may be absent).
    if num_items <= _ITEMS_PER_PAGE:
        return _collect_listing_links(soup, featured_product_links)

    # Find the number of pages from the "Page 1 of N" text.
    pagination = soup.find_all(class_='wt-action-group__item-container')[1]
    max_page = pagination.find('p').get_text()
    page_count = int(max_page.replace('Page 1 of ', ''))

    # Append the page parameter correctly whether or not the URL already
    # carries a query string.
    separator = '&' if '?' in seller_url else '?'

    productlinks = []
    for page_number in range(1, page_count + 1):
        page_url = '{}{}page={}#items'.format(seller_url, separator, page_number)
        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()
        page_soup = BeautifulSoup(response.content, 'html.parser')
        productlinks.extend(_collect_listing_links(page_soup, featured_product_links))

    return productlinks
    

In [6]:
def get_listing_data(listing_url):
    """Scrape the title and price of a single listing page.

    Args:
        listing_url: URL of the listing page.

    Returns:
        A list ``[title, price, date]`` where ``title`` and ``price`` are the
        cleaned-up page text and ``date`` is today's ``datetime.date``.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
        AttributeError: If the title or price element is missing from the
            page markup.
    """
    # A timeout keeps the bot from hanging forever on a stalled connection.
    response = requests.get(listing_url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of a confusing parse error later.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find(class_='wt-text-body-01 wt-line-height-tight wt-break-word wt-mt-xs-1').get_text().strip()
    price_box = soup.find('div', class_='wt-display-flex-xs wt-align-items-center wt-flex-wrap')

    # Clean up the price: take the first paragraph's text and drop the
    # "Price:" label together with the indentation that follows it.
    price = price_box.find('p').get_text().strip()
    price = price.replace('Price:\n        ', '')

    # Timestamp of this scrape.
    date = datetime.date.today()

    return [title, price, date]
    

# Etsy scraping bot

Initializing the csv files for each sheet with their headings

In [7]:

# Column headings of the two output sheets.
seller_header = ['Seller name', 'Sales', 'Number of items', 'Creation Year', 'Number of Reviews', 'Average Rating', 'Date Updated']
listings_header = ['Seller name', 'Listing Title', 'Price','Date Updated']

# Create (or overwrite) each sheet so that it contains only its heading row.
for sheet_name, heading_row in (('sellers.csv', seller_header), ('listings.csv', listings_header)):
    with open(sheet_name, 'w', newline='', encoding='UTF8') as f:
        csv.writer(f).writerow(heading_row)

Scrapes a single seller, given that seller's URL as input

In [8]:
def Etsy_Scraping_Bot(seller_url):
    """Scrape one seller and append the results to the two CSV sheets.

    One row with the seller's details is appended to ``sellers.csv``; then
    one row per listing of that seller is appended to ``listings.csv``, each
    prefixed with the seller's name.

    Args:
        seller_url: URL of the seller's shop page.
    """
    seller_data = get_seller_data(seller_url)
    seller_name = seller_data[0]

    with open('sellers.csv', 'a', newline='', encoding='UTF8') as f:
        csv.writer(f).writerow(seller_data)

    listing_links = get_listing_links(seller_url)

    # Open the listings sheet once for the whole seller instead of once per
    # row. If a listing fails midway, leaving the ``with`` block still
    # flushes the rows written so far.
    with open('listings.csv', 'a', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        for link in listing_links:
            listing_data = get_listing_data(link)
            writer.writerow([seller_name] + listing_data)
        
    

In [9]:
# All shop URLs the bot scrapes on each run.
sellers = [
    seller_url_1,
    seller_url_2,
    seller_url_3,
]

The following function takes multiple sellers as input and scrapes the data for each

In [10]:
def scrape_multiple_sellers(sellers_urls):
    """Run the scraping bot once for each shop URL in *sellers_urls*, in order."""
    for shop_url in sellers_urls:
        Etsy_Scraping_Bot(shop_url)

# A loop where the bot updates the sheets every 24 hours

Run only this cell to update the sheets; all the other cells need to be run just once.

In [None]:
# Pause between two scraping runs: 24 hours, expressed in seconds.
UPDATE_INTERVAL_SECONDS = 24 * 60 * 60

# Scrape every seller, wait for a day, and repeat until interrupted.
while True:
    scrape_multiple_sellers(sellers)
    time.sleep(UPDATE_INTERVAL_SECONDS)

# Reading and displaying both sheets

In [5]:
# Load both sheets from the same relative paths the bot writes them to, so
# the cell works on any machine and always reads the files just produced
# (run it from the directory the bot was run from).
df = pd.read_csv('sellers.csv')
df2 = pd.read_csv('listings.csv')
print(df)

            Seller name   Sales  Number of items Creation Year  \
0         ferraportrait   1,697              169          2023   
1         ferraportrait   1,697              169          2023   
2  OnemerceMarketingLtd     393               32     Not found   
3           BellasLines  18,529               35          2022   

   Number of Reviews    Average Rating Date Updated  
0                493  5 out of 5 stars   2023-09-22  
1                493  5 out of 5 stars   2023-09-22  
2                 86  5 out of 5 stars   2023-09-22  
3               6606  5 out of 5 stars   2023-09-22  


In [6]:
# Display the listings sheet (df2 is the DataFrame read from listings.csv).
print(df2)

       Seller name                                      Listing Title  \
0    ferraportrait  Custom portrait gift, personalized photo, face...   
1    ferraportrait  Custom family portrait and friendship gift, cu...   
2    ferraportrait  Custom illustration gifts Faceless Portrait, p...   
3    ferraportrait  Best Friend Gift Birthday, couple Portrait, Fa...   
4    ferraportrait  Personalized Family Portrait From Photo, Carto...   
..             ...                                                ...   
231    BellasLines  Newborn Gift, Custom Baby Portrait, Personalis...   
232    BellasLines  Custom Chirstmas Gift, Custom Family Portrait,...   
233    BellasLines  Custom Pet Portrait, Dog Portrait, Cat Portrai...   
234    BellasLines  Custom Pet Portrait, Pet Portrait Drawing, Cus...   
235    BellasLines  Custom family gift, personalized portrait, Cus...   

         Price Date Updated  
0    USD 3.09+   2023-09-22  
1    USD 3.09+   2023-09-22  
2    USD 3.09+   2023-09-22  
3  