In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re
import numpy as np

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from datetime import datetime

In [2]:
def slow_scroll_and_back_up(driver, wait_time=3):
    #模仿用户下滚页面动作刷新商品信息Scroll down page like human for product info to be renewed
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    sections = total_height // 18  # Dividing the page into 18 sections

    for i in range(2, 19, 3):  # Scroll through every third section
        driver.execute_script(f"window.scrollTo(0, {sections * i});")
        time.sleep(wait_time)

    driver.execute_script("window.scrollTo(0, 0);")

def process_tag(tag, rf_dct, brand_name):
    data = {}
    try:
        sku = tag.find('span', class_='sku-value').text.strip()
    except:
        print('No model id found, may not be a tv product. Passing...')
        return None
    else:
        price = tag.find('span', class_='sr-only')
        clearance = tag.find('div', class_='pricing-price__sale-message')
        oos = tag.find('div', class_='fulfillment-fulfillment-summary')
        price_str = price.text.strip() if price else '00,000.00'
        oos_str = oos.text.strip() if oos else 'Scrape Failed'
        
        data['Product Division'] = 'TV'
        data['Channel'] = 'BBY/Amazon'
        data['Brand'] = f'{brand_name}'
        data['Model'] = sku
        data['Price'] = re.findall(r'([0-9]+.[0-9]+)', price_str.replace(',', ''))[0]
        data['Status'] = oos_str if (oos_str == 'Unavailable nearby') or (oos_str == 'Sold Out') else ''
        data['Clearance'] = 'Clearance' if clearance else ''
        
        return data if sku in rf_dct[brand_name] else None

def getBBYPrices(driver, brand_name, rf_dct, url_dct):
    
    driver.get(url_dct[brand_name])
    slow_scroll_and_back_up(driver)
    
    content_temp = driver.page_source
    temp = BeautifulSoup(content_temp, 'html.parser')
    items = temp.find('div', class_='left-side').text.strip().split(' ')
    num_iterations = np.ceil(int(items[2]) / 18)

    result = pd.DataFrame()
    num_page = 0

    while num_page <= num_iterations:
        slow_scroll_and_back_up(driver)
        content = driver.page_source
        soup = BeautifulSoup(content, 'html.parser')
        tag_lst = soup.find_all('li', class_='sku-item')
        
        for tag in tag_lst:
            data = process_tag(tag, rf_dct, brand_name)
            if data:
                result = pd.concat([result, pd.DataFrame([data])])
        
        num_page += 1
        print(f'Page: {num_page}/{int(num_iterations)}')
        
        #滚动刷新按钮refresh for button
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        if (int(items[2]) > 18) or (num_page != num_iterations):
            
            #等待按钮显示直到被按下wait for button to show and press
            driver.execute_script("arguments[0].click();",WebDriverWait(driver,20).until(
                EC.element_to_be_clickable((
                    By.CSS_SELECTOR, "a.sku-list-page-next"
                ))
            ))
        else: 
            break
        time.sleep(5)

    missing_numbers = [num for num in rf_dct[brand_name] if num not in result['Model'].values]
    missing_df = pd.DataFrame({
        'Product Division':'TV', 'Channel':'BBY/Amazon', 'Brand':f'{brand_name}',
        'Model': missing_numbers, 'Price': 'None', 'Clearance':'None', 'Status':'None'})
    
    result = pd.concat([result, missing_df], ignore_index=True)
    result = result.drop_duplicates('Model', keep='first')
    
    return result

In [3]:
start_time = time.time()
# page_url = 'https://www.bestbuy.com/site/searchpage.jsp?st=toshiba+tv+&_dyncharset=UTF-8&_dynSessConf=&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys'
service = Service(executable_path=r'C:\Users\jianhui.ding\Untitled Folder\Price Mining\drive\msedgedriver.exe')
options = webdriver.EdgeOptions()
driver = webdriver.Edge(service=service, options=options)

# Define the rf_dct here or pass it as an argument to the getBBYPrices function.
rf = pd.read_excel('ReferenceSheet.xlsx', sheet_name='BBY')
A = pd.DataFrame()
A['Brand'] = rf['Brand']
A['Model'] = rf['Model']

rf_dct = {k: list(v) for k, v in A.groupby('Brand')['Model']}

# rf_dct = {
#     'Hisense':['75U9DG', '75U800GR', '85UX', '75U8K', '65U8K', '55U8K', '75U7K', '65U7K', '55U7K', '75U6K', 
#                          '65U6K', '55U6K', '50U6H', '65U6GR', '55U6GR', '85A7H', '75A6H', '70A6H', '65A6H','55A6H', 
#                          '50A6H' ,'43A6H', '65R6G', '55R6G', '50R6G', '43R6G', '43A4H', '40A4H', '32A4H'],
#               'TCL':['98QM850G', '85QM850G', '75QM850G', '65QM850G', '98R754', '85Q750G', '75Q750G', '65Q750G', '55Q750G',
#                      '85Q650G', '75Q650G', '65Q650G', '55Q650G', '75S555', '65S555', '55S555', '50S555', '85S450G', 
#                      '75S450G', '65S450G', '55S450G', '50S450G', '43S450G', '85S455', '75S455', '65S455', '58S455', 
#                      '55S455', '50S455', '43S455', '43S350G', '40S350G', '32S359', '32S350G'], 
#               'Vizio':['P75QX-H1', 'P75Q9-J01', 'P65Q9-J01', 'M75QXM-K03', 'M65QXM-K03', 'M50QXM-K01', 'M75Q7-J03',
#                        'M65Q7-J09', 'M55Q7-J01', 'M50Q7-J01', 'M75Q6-J03', 'M65Q6-J09', 'M50Q6-J01','V755M-K03','V705M-K03',
#                        'V655M-K04', 'V555M-K01', 'V505M-K09', 'V435M-K04', 'D40F-J09', 'D32fm-K01'],
#               'Roku':['75R6A5R', '65R6A5R', '55R6A5R', '75R4A5R', '65R4A5R', '55R4A5R', '50R4A5R', '43R4A5R', '40R2A5R',
#                       '32R2A5R'],
#               'Samsung':['QN85QN85CAFXZA','QN75QN85CAFXZA','QN65QN85CAFXZA','QN55QN85CAFXZA','QN85Q80CAFXZA',
#                          'QN75Q80CAFXZA','QN65Q80CAFXZA', 'QN55Q80CAFXZA','QN85Q60CAFXZA','QN75Q60CAFXZA','QN65Q60CAFXZA',
#                          'QN55Q60CAFXZA','QN50Q60CAFXZA','QN43Q60CAFXZA','UN85CU8000FXZA','UN75CU8000FXZA','UN65CU8000FXZA',
#                          'UN55CU8000FXZA','UN50CU8000FXZA','UN43CU8000FXZA','UN85CU7000FXZA','UN75CU7000FXZA',
#                          'UN70CU7000FXZA','UN65CU7000FXZA','UN55CU7000FXZA','UN50CU7000FXZA','UN43CU7000FXZA','UN40N5200AFXZA',
#                          'UN32M4500BFXZA'],
#               'LG':['75QNED85UQA','65QNED85UQA','55QNED85UQA','75QNED80UQA','65QNED80UQA','55QNED80UQA','50QNED80UQA',
#                     '75NANO75UQA','65NANO75UQA','55NANO75UQA','50NANO75UQA','86UQ7590PUD','75UQ7590PUB','70UQ7590PUB',
#                     '65UQ7570PUJ','55UQ7570PUJ','50UQ7570PUJ','43UQ7590PUB'],
#               'Toshiba': ['75M550KU','65M550KU','55M550KU','75C350KU','65C350LU','55C350LU','50C350LU','43C350LU','43V35KU',
#                           '32V35KU']
# }


url_dct = {
        'Hisense':'https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&sp=-currentprice%20skuidsaas&st=hisense+tv',
        'TCL': "https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&sp=-currentprice%20skuidsaas&st=tcl+tv",
        'Vizio': 'https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&sp=-currentprice%20skuidsaas&st=vizio+tv',
        'Roku': 'https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=brand_facet%3DBrand~Roku&st=roku+tv',
        'Samsung': 'https://www.bestbuy.com/site/searchpage.jsp?cp=2&id=pcat17071&qp=currentprice_facet%3DPrice~0%20to%204000&sp=-currentprice%20skuidsaas&st=samsung+tv',
        'LG': 'https://www.bestbuy.com/site/searchpage.jsp?st=lg+tv+&_dyncharset=UTF-8&_dynSessConf=&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys',
        'Toshiba': 'https://www.bestbuy.com/site/searchpage.jsp?st=toshiba+tv+&_dyncharset=UTF-8&_dynSessConf=&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys'
    }
df_his = getBBYPrices(driver, 'Hisense', rf_dct, url_dct)
driver.quit()

# duration = time.time() - start_time
df_his['sort'] = df_his['Brand'].apply(lambda x: rf_dct['Hisense'].index(x) if x in rf_dct['Hisense'] else len(rf_dct['Hisense']))
df_his = df_his.sort_values('sort').drop('sort', axis=1)
df_his

No model id found, may not be a tv product. Passing...
Page: 1/3
Page: 2/3
Page: 3/3
Page: 4/3


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,Hisense,85UX,4499.99,Unavailable nearby,
26,TV,BBY/Amazon,Hisense,40A4H,159.99,,
25,TV,BBY/Amazon,Hisense,43A4H,189.99,,
24,TV,BBY/Amazon,Hisense,43R6G,239.99,,
23,TV,BBY/Amazon,Hisense,43A6H,239.99,,
22,TV,BBY/Amazon,Hisense,50A6H,259.99,,
21,TV,BBY/Amazon,Hisense,50R6G,269.99,,Clearance
20,TV,BBY/Amazon,Hisense,55A6H,299.99,,
19,TV,BBY/Amazon,Hisense,50U6H,299.99,,
18,TV,BBY/Amazon,Hisense,55R6G,309.99,,


In [4]:
driver = webdriver.Edge(service=service, options=options)
df_tcl = getBBYPrices(driver, 'TCL', rf_dct, url_dct)
driver.quit()

df_tcl['sort'] = df_tcl['Model'].apply(lambda x: rf_dct['TCL'].index(x) if x in rf_dct['TCL'] else len(rf_dct['TCL']))
df_tcl = df_tcl.sort_values('sort').drop('sort', axis=1)
df_tcl

Page: 1/4
No model id found, may not be a tv product. Passing...
Page: 2/4
Page: 3/4
Page: 4/4
Page: 5/4


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,TCL,98QM850G,6999.99,,
2,TV,BBY/Amazon,TCL,85QM850G,2199.99,,
3,TV,BBY/Amazon,TCL,75QM850G,1699.99,,
5,TV,BBY/Amazon,TCL,65QM850G,1199.99,,
1,TV,BBY/Amazon,TCL,98R754,3999.99,Unavailable nearby,
4,TV,BBY/Amazon,TCL,85Q750G,1599.99,,
7,TV,BBY/Amazon,TCL,75Q750G,999.99,,
10,TV,BBY/Amazon,TCL,65Q750G,699.99,,
15,TV,BBY/Amazon,TCL,55Q750G,549.99,,
6,TV,BBY/Amazon,TCL,85Q650G,1099.99,,


In [5]:
driver = webdriver.Edge(service=service, options=options)
df_viz = getBBYPrices(driver, 'Vizio', rf_dct, url_dct)
driver.quit()


df_viz['sort'] = df_viz['Model'].apply(lambda x: rf_dct['Vizio'].index(x) if x in rf_dct['Vizio'] else len(rf_dct['Vizio']))
df_viz = df_viz.sort_values('sort').drop('sort', axis=1)
df_viz

Page: 1/2
Page: 2/2
Page: 3/2


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
11,TV,BBY/Amazon,Vizio,P75QX-H1,,,
0,TV,BBY/Amazon,Vizio,P75Q9-J01,1199.99,,Clearance
12,TV,BBY/Amazon,Vizio,P65Q9-J01,,,
13,TV,BBY/Amazon,Vizio,M75QXM-K03,,,
2,TV,BBY/Amazon,Vizio,M65QXM-K03,807.99,,Clearance
6,TV,BBY/Amazon,Vizio,M50QXM-K01,499.99,,
1,TV,BBY/Amazon,Vizio,M75Q7-J03,899.99,Unavailable nearby,Clearance
14,TV,BBY/Amazon,Vizio,M65Q7-J09,,,
15,TV,BBY/Amazon,Vizio,M55Q7-J01,,,
16,TV,BBY/Amazon,Vizio,M50Q7-J01,,,


In [6]:
driver = webdriver.Edge(service=service, options=options)
df_rok = getBBYPrices(driver, 'Roku', rf_dct, url_dct)
driver.quit()
df_rok


Page: 1/1


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,Roku,50R4A5R,299.99,,
1,TV,BBY/Amazon,Roku,55R4A5R,329.99,,
2,TV,BBY/Amazon,Roku,65R4A5R,399.99,,
3,TV,BBY/Amazon,Roku,65R6A5R,599.99,,
4,TV,BBY/Amazon,Roku,75R4A5R,699.99,,
5,TV,BBY/Amazon,Roku,32R2A5R,149.99,,
6,TV,BBY/Amazon,Roku,40R2A5R,229.99,,
7,TV,BBY/Amazon,Roku,43R4A5R,229.99,,
8,TV,BBY/Amazon,Roku,75R6A5R,999.99,,
9,TV,BBY/Amazon,Roku,55R6A5R,499.99,,


In [7]:
driver = webdriver.Edge(service=service, options=options)
df_sam = getBBYPrices(driver, 'Samsung', rf_dct, url_dct)
driver.quit()

df_sam['sort'] = df_sam['Model'].apply(lambda x: rf_dct['Samsung'].index(x) if x in rf_dct['Samsung'] else len(rf_dct['Samsung']))
df_sam = df_sam.sort_values('sort').drop('sort', axis=1)
df_sam

No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
Page: 1/11
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...

Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,Samsung,QN85QN85CAFXZA,0.0,,
2,TV,BBY/Amazon,Samsung,QN75QN85CAFXZA,2199.99,,
5,TV,BBY/Amazon,Samsung,QN65QN85CAFXZA,1599.99,,
8,TV,BBY/Amazon,Samsung,QN55QN85CAFXZA,1199.99,,
1,TV,BBY/Amazon,Samsung,QN85Q80CAFXZA,2299.99,,
3,TV,BBY/Amazon,Samsung,QN75Q80CAFXZA,1799.99,,
7,TV,BBY/Amazon,Samsung,QN65Q80CAFXZA,1199.99,,
11,TV,BBY/Amazon,Samsung,QN55Q80CAFXZA,1099.99,,
4,TV,BBY/Amazon,Samsung,QN85Q60CAFXZA,1599.99,,
9,TV,BBY/Amazon,Samsung,QN75Q60CAFXZA,0.0,,


In [8]:
driver = webdriver.Edge(service=service, options=options)
df_lge = getBBYPrices(driver, 'LG', rf_dct, url_dct)
driver.quit()

df_lge['sort'] = df_lge['Model'].apply(lambda x: rf_dct['LG'].index(x) if x in rf_dct['LG'] else len(rf_dct['LG']))
df_lge = df_lge.sort_values('sort').drop('sort', axis=1)
df_lge

Page: 1/7
Page: 2/7
Page: 3/7
Page: 4/7
Page: 5/7
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
Page: 6/7
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No model id found, may not be a tv product. Passing...
No mo

Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
12,TV,BBY/Amazon,LG,75QNED85UQA,1899.99,,
9,TV,BBY/Amazon,LG,65QNED85UQA,1199.99,Unavailable nearby,
13,TV,BBY/Amazon,LG,55QNED85UQA,999.99,,
8,TV,BBY/Amazon,LG,75QNED80UQA,1199.99,,
16,TV,BBY/Amazon,LG,65QNED80UQA,764.99,,Clearance
11,TV,BBY/Amazon,LG,55QNED80UQA,649.99,Unavailable nearby,
17,TV,BBY/Amazon,LG,50QNED80UQA,599.99,,
15,TV,BBY/Amazon,LG,75NANO75UQA,899.99,,
7,TV,BBY/Amazon,LG,65NANO75UQA,649.99,,
14,TV,BBY/Amazon,LG,55NANO75UQA,0.0,,


In [9]:
driver = webdriver.Edge(service=service, options=options)
df_tos = getBBYPrices(driver, 'Toshiba', rf_dct, url_dct)
driver.quit()

df_tos['sort'] = df_lge['Model'].apply(lambda x: rf_dct['Toshiba'].index(x) if x in rf_dct['Toshiba'] else len(rf_dct['Toshiba']))
df_tos = df_tos.sort_values('sort').drop('sort', axis=1)
df_tos

Page: 1/1


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,Toshiba,55C350LU,279.99,,
1,TV,BBY/Amazon,Toshiba,43C350LU,219.99,,
2,TV,BBY/Amazon,Toshiba,65C350LU,369.99,,
3,TV,BBY/Amazon,Toshiba,75C350KU,569.99,,
4,TV,BBY/Amazon,Toshiba,32V35KU,89.99,,
5,TV,BBY/Amazon,Toshiba,43V35KU,127.99,,Clearance
6,TV,BBY/Amazon,Toshiba,50C350LU,249.99,,
7,TV,BBY/Amazon,Toshiba,55M550KU,312.99,Unavailable nearby,Clearance
8,TV,BBY/Amazon,Toshiba,75M550KU,,,
9,TV,BBY/Amazon,Toshiba,65M550KU,,,


In [10]:
result = pd.concat([df_his,df_tcl, df_viz, df_rok, df_sam, df_lge, df_tos], axis=0)
duration = time.time() - start_time
result

Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,BBY/Amazon,Hisense,85UX,4499.99,Unavailable nearby,
26,TV,BBY/Amazon,Hisense,40A4H,159.99,,
25,TV,BBY/Amazon,Hisense,43A4H,189.99,,
24,TV,BBY/Amazon,Hisense,43R6G,239.99,,
23,TV,BBY/Amazon,Hisense,43A6H,239.99,,
...,...,...,...,...,...,...,...
5,TV,BBY/Amazon,Toshiba,43V35KU,127.99,,Clearance
6,TV,BBY/Amazon,Toshiba,50C350LU,249.99,,
7,TV,BBY/Amazon,Toshiba,55M550KU,312.99,Unavailable nearby,Clearance
8,TV,BBY/Amazon,Toshiba,75M550KU,,,


In [30]:
rf = pd.read_excel('ReferenceSheet.xlsx', sheet_name='BBY')
rf = rf.drop('Brand', axis=1)
result_f = result.merge(rf, on = 'Model')
column_order = ['Product Division','Channel','Brand','Model','Week of Year','Category',
                'Year','Series','Size (Inch)','Price','Status','Clearance','Note']

dt = datetime.today()
date = dt.strftime('%Y-%m-%d')

result_f['Year'] = result_f['Year'].fillna(0).astype('int')
result_f['Week of Year'] = dt.isocalendar()[1]
result_f['Note'] = ''

order_list = rf['Model'].to_list()
result_sorted = result_f[column_order]
result_sorted['Model'] = pd.Categorical(result_sorted['Model'], categories=order_list, ordered=True)
result_sorted = result_sorted.sort_values('Model').reset_index(drop=True)
result_sorted

Unnamed: 0,Product Division,Channel,Brand,Model,Week of Year,Category,Year,Series,Size (Inch),Price,Status,Clearance,Note
0,TV,BBY/Amazon,Hisense,75U9DG,35,DUAL-CELL,2022,U9,75,975.99,Unavailable nearby,Clearance,
1,TV,BBY/Amazon,Hisense,75U800GR,35,8K,2022,U800G,75,1799.99,Unavailable nearby,,
2,TV,BBY/Amazon,Hisense,85UX,35,Premium ULED,2023,U8,85,4499.99,Unavailable nearby,,
3,TV,BBY/Amazon,Hisense,75U8K,35,Premium ULED,2023,U8,75,1499.99,,,
4,TV,BBY/Amazon,Hisense,65U8K,35,Premium ULED,2023,U8,65,999.99,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,TV,BBY/Amazon,Toshiba,55C350LU,35,,0,A6,55,279.99,,,
147,TV,BBY/Amazon,Toshiba,50C350LU,35,,0,A6,50,249.99,,,
148,TV,BBY/Amazon,Toshiba,43C350LU,35,,0,A6,43,219.99,,,
149,TV,BBY/Amazon,Toshiba,43V35KU,35,,0,A4,43,127.99,,Clearance,


In [14]:

def process_cstc_tag(tag, model_dict, brand_name):
    data = {}
    
    model = tag.find('span', class_="description").text.strip()
    matches = re.findall(r'(\d+)"|([A-Z]+\d+[A-Z]*)', model)
    price = tag.find('div', class_='price').text.strip()
    price = price.replace(',','')
    stock = tag.find('p', class_ = 'stock_status_msg').text.strip()
    s_text = ''.join(stock[9:])
    try:
        model_id = ''.join([matches[0][0],matches[1][1]])
    except:
        print('Product ID not found. Passing...')
        return None
    else:
        data['Product Division'] = 'TV'
        data['Channel'] = 'Costco'
        data['Brand'] = f'{brand_name}'
        data['Model'] = model_id
        data['Price'] = float(price.replace('$',''))
        data['Status'] = '' if s_text == 'Available' else 'Out of Stock'
        data['Clearance'] = ''
        
    return data if model_id in model_dict[brand_name] else None

def getCstcPrices(driver, brand_name,cstc_model_urldct,cstc_model_dict):
    url = cstc_model_urldct[brand_name][0]
    driver.get(url)
    content_temp = driver.page_source
    temp = BeautifulSoup(content_temp, 'html.parser')
    items = temp.find('div', class_='table-cell results hidden-xs hidden-sm hidden-md').text.strip().split(' ')
    
    num_iterations = np.ceil(int(items[-1]) / 24)
    
    c_model_lst = []
    c_price_lst = []
    result = pd.DataFrame()
    
    num_page = 0

    while num_page < num_iterations:
        print(f'Page: {num_page+1}/{int(num_iterations)}')
        url = cstc_model_urldct[brand_name][num_page]
        driver.get(url)
        
        WebDriverWait(driver, 20)
        content = driver.page_source
        soup = BeautifulSoup(content, 'html.parser')

        c_taglst = soup.find_all('div', class_='col-xs-6 col-lg-4 col-xl-3 product')

        for c_tag in c_taglst:
            data = process_cstc_tag(c_tag,cstc_model_dict, brand_name)

            if data:
                result = pd.concat([result, pd.DataFrame([data])])
                
        num_page += 1

    missing_numbers = [num for num in cstc_model_dict[brand_name] if num not in result['Model'].values]
    missing_df = pd.DataFrame({
        'Product Division':'TV', 'Channel':'Costco', 'Brand':f'{brand_name}',
        'Model': missing_numbers, 'Price': 'None', 'Clearance':'None', 'Status':'None'})
    
    result = pd.concat([result, missing_df], ignore_index=True)
    result = result.drop_duplicates('Model', keep='first')    
    return result

In [22]:
service = Service(executable_path=r'C:\Users\jianhui.ding\Untitled Folder\Price Mining\drive\msedgedriver.exe')
options = webdriver.EdgeOptions()

cstc_model_urldct = {
    'Hisense':['https://www.costco.com/CatalogSearch?keyword=hisense+tv&deliveryFacetFlag=true&sortBy=item_location_pricing_salePrice+desc'],
    'TCL':['https://www.costco.com/CatalogSearch?keyword=TCl+tv&deliveryFacetFlag=true'],
    'Samsung':['https://www.costco.com/televisions.html?refine=%7C%7Citem_program_eligibility-ShipIt%7C%7CBrand_attr-Samsung&deliveryFacetFlag=false&sortBy=item_location_pricing_salePrice+desc',
              'https://www.costco.com/televisions.html?currentPage=2&pageSize=24&sortBy=item_location_pricing_salePrice+desc&refine=%7c%7citem_program_eligibility-ShipIt%7c%7cBrand_attr-Samsung&deliveryFacetFlag=false'],
    'LG':['https://www.costco.com/televisions.html?sortBy=item_location_pricing_salePrice+asc&refine=%7c%7citem_program_eligibility-ShipIt%7c%7cBrand_attr-LG&deliveryFacetFlag=false',
         'https://www.costco.com/televisions.html?currentPage=2&pageSize=24&sortBy=item_location_pricing_salePrice+asc&refine=%7c%7citem_program_eligibility-ShipIt%7c%7cBrand_attr-LG&deliveryFacetFlag=false']

}

rf2 = pd.read_excel('ReferenceSheet.xlsx', sheet_name='Costco')
B = pd.DataFrame()
B['Brand'] = rf2['Brand']
B['Model'] = rf2['Model']

cstc_model_dict = {k: list(v) for k, v in B.groupby('Brand')['Model']}
# cstc_model_dict = {
#     'Hisense':['65U75K','55U75K','65A65K','55A65K','50A65K','43A65H','32A45KV'],
#     'TCL':['85S470G','70S470G','58S470G','50S470G','43S470G'],
#     'Samsung':['85Q60C','75Q60C','65Q60C','55Q60C','85CU7000D','75CU7000D','65CU7000D',
#                '58CU7000D','55CU7000D','50CU7000D','43CU7000D'],
#     'LG':['86UR8000','75UR8000','70UR8000','65UR8000','55UR8000','50UR8000','43UR8000']
# }

In [16]:
driver = webdriver.Edge(service=service, options=options)

HSS = getCstcPrices(driver, 'Hisense', cstc_model_urldct, cstc_model_dict)
driver.quit()
HSS

Page: 1/1


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,Costco,Hisense,65U75K,729.99,,
1,TV,Costco,Hisense,55U75K,549.99,,
2,TV,Costco,Hisense,65A65K,399.99,,
3,TV,Costco,Hisense,55A65K,299.99,,
4,TV,Costco,Hisense,50A65K,249.99,,
5,TV,Costco,Hisense,32A45KV,119.99,,
6,TV,Costco,Hisense,43A65H,,,


In [17]:
driver = webdriver.Edge(service=service, options=options)

TCL = getCstcPrices(driver, 'TCL', cstc_model_urldct, cstc_model_dict)
driver.quit()
TCL

Page: 1/1


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,Costco,TCL,58S470G,299.99,,
1,TV,Costco,TCL,50S470G,269.99,,
2,TV,Costco,TCL,43S470G,209.99,,
3,TV,Costco,TCL,70S470G,449.99,,
4,TV,Costco,TCL,85S470G,899.99,,


In [18]:
driver = webdriver.Edge(service=service, options=options)

SSG = getCstcPrices(driver, 'Samsung', cstc_model_urldct, cstc_model_dict)
driver.quit()

SSG

Page: 1/2
Product ID not found. Passing...
Product ID not found. Passing...
Product ID not found. Passing...
Product ID not found. Passing...
Product ID not found. Passing...
Page: 2/2
Product ID not found. Passing...


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,Costco,Samsung,85Q60C,1599.99,,
1,TV,Costco,Samsung,75Q60C,1149.99,,
2,TV,Costco,Samsung,85CU7000D,1099.99,,
3,TV,Costco,Samsung,65Q60C,799.99,,
4,TV,Costco,Samsung,75CU7000D,749.99,,
5,TV,Costco,Samsung,55Q60C,649.99,,
6,TV,Costco,Samsung,65CU7000D,479.99,,
7,TV,Costco,Samsung,58CU7000D,449.99,,
8,TV,Costco,Samsung,55CU7000D,379.99,,
9,TV,Costco,Samsung,50CU7000D,349.99,,


In [19]:
driver = webdriver.Edge(service=service, options=options)

LGE = getCstcPrices(driver, 'LG', cstc_model_urldct, cstc_model_dict)
driver.quit()

LGE

Page: 1/2
Product ID not found. Passing...
Page: 2/2


Unnamed: 0,Product Division,Channel,Brand,Model,Price,Status,Clearance
0,TV,Costco,LG,43UR8000,279.99,,
1,TV,Costco,LG,50UR8000,339.99,,
2,TV,Costco,LG,55UR8000,379.99,,
3,TV,Costco,LG,65UR8000,479.99,,
4,TV,Costco,LG,70UR8000,629.99,,
5,TV,Costco,LG,75UR8000,729.99,,
6,TV,Costco,LG,86UR8000,1199.99,,


In [31]:
result2 = pd.concat([HSS,TCL, SSG, LGE], axis=0)
rf2 = pd.read_excel('ReferenceSheet.xlsx', sheet_name='Costco')
rf2 = rf2.drop('Brand', axis=1)
result2_f = result2.merge(rf2, on = 'Model')

column_order = ['Product Division','Channel','Brand','Model','Week of Year','Category',
                'Year','Series','Size (Inch)','Price','Status','Clearance','Note']

result2_f['Year'] = result2_f['Year'].fillna(0).astype('int')
result2_f['Week of Year'] = dt.isocalendar()[1]
result2_f['Note'] = ''

result_sorted2 = result2_f[column_order]

order_list2 = rf2['Model'].to_list()
result_sorted2['Model'] = pd.Categorical(result_sorted2['Model'], categories=order_list2, ordered=True)
result_sorted2 = result_sorted2.sort_values('Model').reset_index(drop=True)

result_final = pd.concat([result_sorted, result_sorted2], axis=0)

dt = datetime.today()
date = dt.strftime('%Y-%m-%d')
result_final.to_excel(f'Price Monitoring {date}.xlsx')

duration = time.time() - start_time
duration

2402.3348319530487

In [None]:
result2 = pd.concat([HSS,TCL, SSG, LGE], axis=0)
rf2 = pd.read_excel('ReferenceSheet.xlsx', sheet_name='Costco')
A = pd.DataFrame()
A['Brand'] = rf2['Brand']
A['Model'] = rf2['Model']

dict_A = {k: list(v) for k, v in A.groupby('Brand')['Model']}
print(dict_A)