# Long & McQuade Webscrape

Script for scraping product information from the Long & McQuade website and saving it as a CSV file suitable for importing into Shopify.

In [None]:
# Notebook-wide settings.

# Timeout, in seconds, handed to every WebDriverWait in this notebook.
WAIT_TIME = 5

# Values used for the Chrome "profile.default_content_setting_values.*"
# experimental preferences.
ALLOW = 1
BLOCK = 2

# When True, "--headless" is added to the Chrome start-up arguments.
HEADLESS = False

In [None]:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Build the Chrome start-up configuration that is later passed to
# webdriver.Chrome().
chrome_options = webdriver.ChromeOptions()

# Command-line switches, added in this order; "--headless" is included only
# when HEADLESS is set.
startup_flags = ["--no-sandbox", "--disable-infobars"]
if HEADLESS:
    startup_flags.append("--headless")
startup_flags.append("--disable-gpu")
for startup_flag in startup_flags:
    chrome_options.add_argument(startup_flag)

# Content-setting preferences: microphone, camera, geolocation and
# notifications are all set to BLOCK.
blocked_content_prefs = {
    "profile.default_content_setting_values.media_stream_mic": BLOCK,
    "profile.default_content_setting_values.media_stream_camera": BLOCK,
    "profile.default_content_setting_values.geolocation": BLOCK,
    "profile.default_content_setting_values.notifications": BLOCK,
}
chrome_options.add_experimental_option("prefs", blocked_content_prefs)

import pandas as pd
from time import sleep

In [None]:
# Site that the browser opens before any searching starts.
url = 'https://www.long-mcquade.com/'

# Optional CSV with a 'Product Model' column listing the models to scrape.
# Leave empty to use the hard-coded product list instead.
input_file = ''

# CSV file the scraped rows are written to.
output_file = 'AG03MK2.csv'


In [None]:
# Empty results table; one row per scraped product is appended to it later.
df = pd.DataFrame(columns=['Variant SKU', 'Variant Price'])

# Decide which product models to scrape.
if not input_file:
    # No input file given: list the models to scrape here by hand.
    product_list = [
        'AG03MK2 B',
    ]
else:
    # Take the models from the 'Product Model' column of the input CSV.
    product_df = pd.read_csv(input_file)
    product_list = product_df['Product Model'].tolist()

In [None]:
# Announce how many products will be scraped, then list them one per line,
# each prefixed with a tab.
print(f'Scraping data for {len(product_list)} products:')
if product_list:
    print('\n'.join(f'\t{model_name}' for model_name in product_list))

In [None]:
# Start a Chrome session with the options configured earlier and load the
# site's landing page.
service = Service()
driver = webdriver.Chrome(options=chrome_options, service=service)
print(f'Opening {url} in Chrome browser...')
driver.get(url)

# Bookkeeping for products whose search result could not be opened.
products_not_found_list = []
products_not_found = 0

In [None]:
def _scrape_field(driver, css_selector, label, default, attribute=None):
    """Read one value from the currently loaded product page.

    Waits up to WAIT_TIME seconds for the element matching ``css_selector``
    to become clickable, then returns its text, or the value of
    ``attribute`` when one is given. Prints the value on success and a
    "not found" message on failure.

    Args:
        driver: The Selenium WebDriver showing the product page.
        css_selector: CSS selector of the element to read.
        label: Word used in the printed messages (e.g. ``'price'``).
        default: Value returned when the element cannot be read.
        attribute: Name of an element attribute to return instead of the
            element's text.

    Returns:
        The scraped value, or ``default`` if Selenium raised an error.
    """
    try:
        element = WebDriverWait(driver, WAIT_TIME).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)))
        if attribute is None:
            value = element.text
        else:
            value = element.get_attribute(attribute)
    except WebDriverException:
        # Only Selenium failures (timeout, stale element, ...) count as
        # "not found"; KeyboardInterrupt and real bugs still propagate.
        print(f'Product {label} not found')
        return default
    print(f'Product {label}: {value}\n')
    return value


# For every requested model: search the site, open the matching result, read
# the product details and append them to ``df`` as one row.
for product_model in product_list:
    actions = ActionChains(driver)

    # Type the model into the site's search box and submit it.
    search_input = WebDriverWait(driver, WAIT_TIME).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[id="SearchTxt"]')))
    search_input.clear()
    search_input.send_keys(product_model)
    search_input.send_keys(Keys.RETURN)

    # Open the search result whose "Model:" line matches exactly.
    try:
        product_link = WebDriverWait(driver, WAIT_TIME).until(
            EC.element_to_be_clickable(
                (By.XPATH, f"//p[text()='Model: {product_model}']")))
        # Scroll the window to the element before clicking it.
        actions.move_to_element(product_link).perform()
        product_link.click()
    except WebDriverException:
        # No clickable match: record the model and move on to the next one.
        print(f'Product {product_model} not found')
        products_not_found += 1
        products_not_found_list.append(product_model)
        continue
    print(f'Product model: {product_model}')

    # Read the individual fields; each one falls back to a default value.
    product_price = _scrape_field(
        driver, "span[id='product-regular-price']", 'price', 0)
    product_brand = _scrape_field(
        driver, "span[id='product-brand']", 'brand', '')
    product_title = _scrape_field(
        driver, "span[id='product-header-name']", 'title', '')
    product_description = _scrape_field(
        driver, "div[id='Description-tab']", 'description', '')
    product_image = _scrape_field(
        driver, "img[id='product-image']", 'image', '', attribute="src")

    # Append the product as a new row of the results table.
    row = {
        'Variant SKU': product_model,
        'Vendor': product_brand,
        'Title': f'{product_brand} {product_title}',
        'Body (HTML)': product_description,
        'Product Image': product_image,
        'Variant Price': product_price,
        'Variant Inventory Tracker': 'shopify',
        'Variant Inventory Policy': 'continue',
    }
    df_new_row = pd.DataFrame([row])
    df = pd.concat([df, df_new_row], axis=0, ignore_index=True)

In [None]:
# Write the scraped rows to the output CSV, print a summary, and always shut
# the browser down afterwards, even if saving or printing fails.
try:
    # save dataframe to CSV file
    df.to_csv(output_file, index=False)
    print('Results saved to CSV file\n')

    print('Results:')
    print(df)

    print(f'Number of products found: {len(df)}\n')
    print(f'Number of products not found: {products_not_found}\n')
    if products_not_found > 0:
        print('Products not found:')
        for product in products_not_found_list:
            print(f'\t{product}')
finally:
    # quit() closes every window and ends the WebDriver session, so a
    # separate close() is unnecessary. A close() that raised (for example
    # because the window was already closed by hand) would skip quit() and
    # leave the driver process running.
    driver.quit()