In [1]:
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [2]:
# --- notebook-wide settings ---

# site that gets opened in the browser
url = 'https://cosmomusic.ca'

# default CSV paths: the input is read with pandas, the output receives the results
input_file = 'missing_barcodes.csv'
output_file = 'cosmo_webscrape_v4.csv'

# timeout, in seconds, handed to every WebDriverWait
WAIT_TIME = 5

In [3]:
# create empty dataframe that collects one row per scraped product
df = pd.DataFrame(columns=['Product Name', 'Product Model', 'Product Price', 'UPC', 'Product Description', 'Product Image URL'])

# Read the SKU column as text. Without an explicit dtype pandas infers one, and
# a column holding only digits would be parsed as integers: leading zeros
# (e.g. 0378553506) would be lost and the value could never equal the model
# text read back from the product page.
upc_df = pd.read_csv(input_file, dtype={'SKU': str})

# skip blank cells and surrounding whitespace so no empty search is submitted
product_list = [sku for sku in upc_df['SKU'].dropna().str.strip() if sku]

In [4]:
print(f'Finding Barcodes for {len(product_list)} products:')
for product in product_list:
    print(f'\t{product}')

# tallies of products that could not be scraped; initialised before the browser
# starts so they exist even if launching Chrome fails
products_not_found = 0
products_not_found_list = []

print('\nLoading Chrome Driver...')
# Pass the chromedriver path through a Service object: the positional
# executable-path argument of webdriver.Chrome was removed in Selenium 4.10.
driver = webdriver.Chrome(service=Service('./chromedriver'))
print(f'Opening {url} in Chrome browser...')
driver.get(url)


Finding Barcodes for 9 products:
	PRDB100
	YAS26
	YCL255
	2311100000
	2311000000
	0378553506
	0234810000
	0990700100
	0239979002

Loading Chrome Driver...
Opening https://cosmomusic.ca in Chrome browser...


In [5]:
# CSS selectors shared by the helpers below
SEARCH_RESULT_SELECTOR = "div[class='grid__cell grid__cell--50 grid__cell--25-at-medium']"
PRODUCT_INFO_SELECTOR = "ul[class='product-info__list']"


def _wait_clickable(by, selector):
    """Wait up to WAIT_TIME seconds for an element to be clickable and return it.

    Raises TimeoutException when the element does not become clickable in time.
    """
    return WebDriverWait(driver, WAIT_TIME).until(EC.element_to_be_clickable((by, selector)))


def _submit_search(query):
    """Open the search input, type `query` into it and press RETURN."""
    # click magnifying glass icon to open search input
    _wait_clickable(By.CSS_SELECTOR, 'li[data-header-search-trigger-target]').click()

    # click search input and enter the query
    search_input = _wait_clickable(By.CSS_SELECTOR, 'input[name="q"]')
    search_input.clear()
    search_input.send_keys(query)
    search_input.send_keys(Keys.RETURN)


def _find_search_results():
    """Return the search-result cells of the current page, or [] if none show up.

    Uses an explicit wait rather than a bare find_elements so that a page that
    is still rendering is not mistaken for an empty result list.
    """
    try:
        return WebDriverWait(driver, WAIT_TIME).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, SEARCH_RESULT_SELECTOR)))
    except TimeoutException:
        return []


def _read_info_items():
    """Return the text of every li element inside the product info list.

    Plain strings are returned (not elements) so the values stay usable after
    further navigation. Raises TimeoutException if the list never appears.
    """
    info_list = _wait_clickable(By.CSS_SELECTOR, PRODUCT_INFO_SELECTOR)
    return [item.text for item in info_list.find_elements(By.TAG_NAME, 'li')]


def _info_field(info_items, index):
    """Return the text after ': ' in info_items[index], or None if it is missing."""
    if index >= len(info_items):
        return None
    parts = info_items[index].split(': ')
    return parts[1] if len(parts) > 1 else None


def _open_matching_product(model_text):
    """Navigate from the search results to the product page for `model_text`.

    Returns the info-list texts of the matching product page, or None when no
    search result leads to a page whose model field equals `model_text`.
    """
    result_count = len(_find_search_results())
    if result_count == 0:
        return None

    # Shortcut: with a single result click its link; with several, click the
    # link whose text contains the model surrounded by spaces.
    try:
        if result_count == 1:
            _wait_clickable(By.CSS_SELECTOR, "a[class='product-summary__media-link']").click()
        else:
            print('clicking product link...')
            _wait_clickable(By.XPATH, f"//a[contains(text(), ' {model_text} ')]").click()
        shortcut_clicked = True
    except WebDriverException:
        # no such link (or an invalid XPath, e.g. a quote in the model);
        # still on the results page, so fall through to the exhaustive scan
        shortcut_clicked = False
        print(f'Product {model_text} not found, redirecting...')

    if shortcut_clicked:
        info_items = _read_info_items()
        found_model = _info_field(info_items, 1)
        if found_model == model_text:
            return info_items
        print(f'Product {model_text} does not match {found_model}, redirecting...')
        driver.back()

    # Exhaustive scan: open every result (including the last one) in turn.
    for position in range(result_count):
        # element handles go stale after driver.back(), so look them up again
        candidates = _find_search_results()
        if position >= len(candidates):
            break
        candidates[position].click()
        info_items = _read_info_items()
        found_model = _info_field(info_items, 1)
        if found_model == model_text:
            print(f'Product {model_text} found')
            return info_items
        print(f'Product {model_text} does not match {found_model}, tried {position + 1} times')
        driver.back()

    return None


def _scrape_product(product_model):
    """Search for one product model and scrape its product page.

    Returns a dict keyed by the result-frame column names, or None when no
    matching product page is found. WebDriverException (which includes
    TimeoutException) propagates if a required page element cannot be read.
    """
    # compare and search as text so a numeric SKU still matches the page text
    model_text = str(product_model).strip()
    _submit_search(model_text)

    info_items = _open_matching_product(model_text)
    if info_items is None:
        return None

    # third info-list entry holds the UPC; None if the page does not show one
    upc = _info_field(info_items, 2)

    # Get product name, price, and image URL
    product_name = _wait_clickable(By.CSS_SELECTOR, "h1[class='product-info__name']").text.strip()
    product_price = _wait_clickable(By.CSS_SELECTOR, "*[class='product-prices__sell-price']").text
    product_image_url = driver.find_element(
        By.CSS_SELECTOR, "img[class='product-details__primary-image-link-image']").get_attribute("src")

    # the description is optional: fall back to a placeholder when it is absent
    try:
        product_description = _wait_clickable(
            By.CSS_SELECTOR, "div[class='product-detail-container__description-body']").text
    except WebDriverException:
        product_description = 'No description available'

    return {
        'Product Name': product_name,
        'Product Model': product_model,
        'Product Price': product_price,
        'UPC': upc,
        'Product Description': product_description,
        'Product Image URL': product_image_url,
    }


scraped_rows = []
try:
    for product_model in product_list:
        print(f'\nSearching for product {product_model}...')
        try:
            row = _scrape_product(product_model)
        except WebDriverException as err:
            # one failing product must not abort the whole batch; it is
            # reported with the not-found products so it can be retried
            print(f'Scraping {product_model} failed: {type(err).__name__}')
            row = None

        if row is None:
            print(f'Product {product_model} not found')
            products_not_found += 1
            products_not_found_list.append(product_model)
            continue

        # print results to console
        print('----------------------------------------')
        print(f"Product name: {row['Product Name']}")
        print(f"Product model: {row['Product Model']}")
        print(f"Product price: {row['Product Price']}")
        print(f"UPC: {row['UPC']}")
        print(f"Product description: {row['Product Description']}")
        print(f"Product image URL: {row['Product Image URL']}")
        print('----------------------------------------\n')

        scraped_rows.append(row)
finally:
    # Add all rows in one step (instead of one concat per product) and do it in
    # a finally block so rows gathered before an interruption are not lost.
    if scraped_rows:
        new_rows = pd.DataFrame(scraped_rows, columns=df.columns)
        df = new_rows if df.empty else pd.concat([df, new_rows], axis=0, ignore_index=True)


Searching for product PRDB100...
----------------------------------------
Product name: Profile 100 Series Dreadnought Guitar Gig Bag
Product model: PRDB100
Product price: $43.99
UPC: 694970191631
Product description: Profile 100 Series Dreadnought Guitar Gig Bag, Black with Silver Accents, Profile Model PRDB100.
Profile 100 Series bags offer dependable quality in a simple streamlined design.
• Interior: 108 x 42 x 14 cm
• 10mm foam padding
• Deluxe Profile zippers
• Stylish textured silver piping and accents with satin nylon lining
• Shoulder straps, cushioned leatherette carrying handle
• Rubber bottom treads
• Dimesional zippered accessory pouch at the headstock with silver accent
• Zippered front storage pouch for sheet music and books
Product image URL: https://cosmomusic.ca/product_images/profile-100-series-dreadnought-guitar-gig-bag/5f5d83b06d4dcf00173fc11b/detail.jpg?c=1604674406
----------------------------------------


Searching for product YAS26...
------------------------

In [6]:
try:
    # save dataframe to CSV file
    df.to_csv(output_file, index=False)
    print('Results saved to CSV file\n')

    print('Results:')
    print(df)

    print(f'Number of products found: {len(df)}\n')
    print(f'Number of products not found: {products_not_found}\n')
    if products_not_found > 0:
        print('Products not found:')
        for product in products_not_found_list:
            print(f'\t{product}')
finally:
    # Always shut the browser down, even if saving or printing fails.
    # quit() closes every window and ends the session, so a separate close()
    # call beforehand is unnecessary.
    driver.quit()

Results saved to CSV file

Results:
                                        Product Name Product Model  \
0      Profile 100 Series Dreadnought Guitar Gig Bag       PRDB100   
1              Yamaha YAS-26 Standard Alto Saxophone         YAS26   
2                         Yamaha YCL-255 Bb Clarinet        YCL255   
3               Fender Mustang LT25 Guitar Combo Amp    2311100000   
4               Fender Frontman 10G Guitar Combo Amp    2311000000   
5  Squier Affinity Series Precision Bass PJ - Map...    0378553506   
6                Fender MD20 Mini Deluxe Amp - Black    0234810000   
7                            Fender Harmonica Holder    0990700100   
8                        Fender Bullet Tuner - Black    0239979002   

  Product Price           UPC  \
0        $43.99  694970191631   
1     $1,649.99  086792961491   
2       $829.99  086792961064   
3       $209.99  885978992515   
4       $109.99  717669568771   
5       $389.99  885978722884   
6        $69.99  717669524852   