In [51]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
import sys
import urllib.parse

In [8]:
def get_existing_product_titles(filename):
    """Return the set of values found in the 'Product Title' column of a CSV.

    Args:
        filename: Path of a CSV file whose first row is a header row.

    Returns:
        A set with every non-missing 'Product Title' value. The set is empty
        when the file does not exist yet, is empty, or has no such column.
    """
    existing_titles = set()
    try:
        # newline='' is what the csv module asks for so that line breaks
        # embedded in quoted fields are read back unchanged.
        with open(filename, mode='r', newline='', encoding='utf-8-sig') as file:
            for row in csv.DictReader(file):
                # .get(): a file without the column (or a short row, which
                # DictReader pads with None) must not abort the whole read.
                title = row.get('Product Title')
                if title is not None:
                    existing_titles.add(title)
    except FileNotFoundError:
        # First run: nothing has been written yet, so nothing has been seen.
        return set()
    return existing_titles

def is_product_seen(product_title, existing_titles):
    """Tell whether ``product_title`` is already contained in ``existing_titles``.

    Args:
        product_title: Title to look up.
        existing_titles: Container of known titles (anything supporting ``in``).

    Returns:
        True if the title is present, False otherwise.
    """
    # The membership test already yields a bool; no if/else needed.
    return product_title in existing_titles

from unidecode import unidecode

def custom_text(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    The text is transliterated with ``unidecode``, lowercased, spaces become
    '-', '&' becomes 'va', and both parentheses are removed.

    Args:
        text: Input string.

    Returns:
        The transformed string.
    """
    text = unidecode(text).lower()
    text = text.replace(' ', '-')
    text = text.replace('&', 'va')
    # Remove both parentheses. The earlier code removed the digit '0' here
    # (')' is shift+0), which left ')' in place and silently deleted every
    # zero from numeric text such as "(120)".
    text = text.replace('(', '').replace(')', '')
    return text

# only for lazada


In [68]:
# ---------------------------------------------------------------------------
# Tiki listing scraper.
#
# Opens the listing page at `base_url`, visits the first MAX_PRODUCTS product
# pages, extracts a fixed set of fields plus whatever specification rows the
# page exposes, and stores everything in OUTPUT_CSV.
#
# Specification names differ per product, so the CSV columns can grow between
# runs. A CSV cannot gain columns by appending, therefore the file is read
# once up front and rewritten with the union of all columns at the end.
# ---------------------------------------------------------------------------

headers = ['Product Title', 'Brand', 'Price','Regular Price', 'Discount %','Image','Product Review Total', 'Detail','Số lượng đã bán','Highlight']

# Base URL for Tiki products
base_url = "https://tiki.vn/nha-sach-tiki/c8322"
OUTPUT_CSV = 'tiki.csv'   # results file, extended on every run
MAX_PRODUCTS = 5          # number of listing entries visited per run


def parse_price(price_text):
    """Convert a displayed price such as '75.500₫' into an integer (75500).

    Both '.' and ',' are treated as thousands separators, which assumes the
    price has no fractional part (true for the VND prices seen in the cell
    output, e.g. '75.500₫').

    Raises:
        ValueError: if the text does not start with a digit.
    """
    match = re.match(r'\s*(\d[\d.,]*)', price_text or '')
    if match is None:
        raise ValueError(f"unrecognised price text: {price_text!r}")
    return int(re.sub(r'[.,]', '', match.group(1)))


def parse_discount_percent(discount_text):
    """Return the discount as a non-negative float, e.g. '-25%' -> 25.0.

    Only the digits are read, so a leading minus sign does not flip the
    regular-price formula. Text without digits yields 0.0.
    """
    match = re.search(r'\d+(?:[.,]\d+)?', discount_text or '')
    if match is None:
        return 0.0
    return float(match.group().replace(',', '.'))


def compute_regular_price(price, discount_percent):
    """Reconstruct the pre-discount price, rounded to a whole number.

    Returns ``price`` unchanged when the discount is outside (0, 100); this
    also avoids the division by zero a 100% discount would cause.
    """
    if 0 < discount_percent < 100:
        return round(price / (1 - discount_percent / 100))
    return price


def parse_review_total(review_text):
    """Extract the first number from text like '(1.204)' as an int; 0 if none."""
    match = re.search(r'\d[\d.,]*', review_text or '')
    if match is None:
        return 0
    return int(re.sub(r'[.,]', '', match.group()))


def first_srcset_url(srcset):
    """Return the URL of the first candidate in a ``srcset`` value ('' if empty)."""
    first_candidate = (srcset or '').split(',')[0].strip()
    return first_candidate.split(' ')[0]


def resolve_product_url(href, page_url):
    """Make ``href`` absolute relative to ``page_url``.

    Handles protocol-relative ('//host/...'), root-relative ('/path') and
    already absolute links; plain "https:" + href only worked for the first.
    """
    return urllib.parse.urljoin(page_url, href)


def merge_fieldnames(*groups):
    """Ordered union of several column-name sequences (None entries skipped)."""
    merged = []
    for group in groups:
        for name in group:
            if name is not None and name not in merged:
                merged.append(name)
    return merged


def load_existing_rows(path):
    """Read a results CSV; return ``(fieldnames, rows)`` or ``([], [])`` if absent."""
    try:
        with open(path, mode='r', newline='', encoding='utf-8-sig') as file:
            reader = csv.DictReader(file)
            rows = list(reader)
            return list(reader.fieldnames or []), rows
    except FileNotFoundError:
        return [], []


def write_rows(path, fieldnames, rows):
    """Write header + rows to ``path``, replacing any previous content.

    The data goes to a temporary sibling file first and is then moved into
    place, so a failure while writing does not destroy the previous results.
    """
    import os  # local import: only needed for the replace below

    tmp_path = f"{path}.tmp"
    with open(tmp_path, mode='w', newline='', encoding='utf-8-sig') as file:
        # restval fills columns a row does not have; extrasaction='ignore'
        # drops the None key DictReader creates for over-long rows.
        writer = csv.DictWriter(file, fieldnames=fieldnames, restval='', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)
    os.replace(tmp_path, path)


def open_page(browser, url, attempts=2):
    """Load ``url``, retrying on failure; the last error is re-raised."""
    for attempt in range(1, attempts + 1):
        try:
            browser.get(url)
            return
        except Exception as exc:  # Exception (not bare except) keeps Ctrl+C working
            if attempt == attempts:
                raise
            print(f"Retrying {url} after error: {exc}")


def collect_product_links(browser, page_url, limit):
    """Return absolute links of the first ``limit`` product cards on the page."""
    links = []
    cards = browser.find_elements(By.CSS_SELECTOR, "[data-view-id='product_list_item']")
    for card in cards[:limit]:
        # Read href through the DOM instead of a regex over outerHTML: the
        # serialized HTML escapes '&' as '&amp;', which corrupts query strings.
        href = card.get_attribute("href")
        if not href:
            nested = card.find_elements(By.CSS_SELECTOR, "[href]")
            href = nested[0].get_attribute("href") if nested else None
        if not href:
            print("Skipping a listing entry without a link")
            continue
        links.append(resolve_product_url(href, page_url))
    return links


def extract_specifications(browser):
    """Return ``{spec name: spec value}`` for the current product page.

    Best effort: any failure is printed and an empty dict is returned so the
    rest of the product can still be saved. The class names and the container
    index 8 are taken over unchanged from the page layout this was written
    against.
    """
    specifications = {}
    try:
        specification_containers_main = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "fegGLL"))
        )
        specification_containers = WebDriverWait(specification_containers_main, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "WidgetTitle__WidgetContainerStyled-sc-12sadap-0"))
        )
        if len(specification_containers) > 8:
            container_html = specification_containers[8].get_attribute('outerHTML')
            soup = BeautifulSoup(container_html, 'html.parser')
            for spec_row in soup.find_all(class_="guWvLv"):
                span_elements = spec_row.find_all("span")
                if len(span_elements) >= 2:
                    specifications[span_elements[0].get_text()] = span_elements[1].get_text()
        else:
            print("The specification container you are looking for does not exist.")
    except Exception as e:
        print(f"An error occurred during the main extraction process: {e}")
    return specifications


def read_detail_paragraphs(browser):
    """Expand the description if a 'more' button exists and return its paragraphs."""
    # find_elements returns [] when the button is absent, so a description
    # without a 'more' button no longer discards the whole product.
    expand_buttons = browser.find_elements(By.CLASS_NAME, "btn-more")
    if expand_buttons:
        try:
            ActionChains(browser).move_to_element(expand_buttons[0]).perform()
            expand_buttons[0].click()
            time.sleep(4)
        except Exception as exc:
            print(f"Could not expand the description: {exc}")

    detail_container = browser.find_element(By.CLASS_NAME, "ToggleContent__View-sc-fbuwol-0")
    paragraph_texts = [p.text for p in detail_container.find_elements(By.TAG_NAME, "p")]
    return [text for text in paragraph_texts if text.strip()]


def scrape_product(browser, existing_titles):
    """Scrape the product page currently loaded in ``browser``.

    Returns:
        A row dict, or None when the title is already in ``existing_titles``.

    Raises:
        NoSuchElementException: a mandatory element (title, price, image,
            description container) is missing.
        ValueError: the price text could not be parsed.
    """
    # basic information
    product_title = browser.find_element(By.CLASS_NAME, "Title__TitledStyled-sc-c64ni5-0").text
    print(product_title)

    if is_product_seen(product_title, existing_titles):
        print('Product already in rows')
        return None
    time.sleep(2)

    try:
        product_brand = browser.find_element(By.CSS_SELECTOR, "[data-view-id='pdp_details_view_author']").text
    except NoSuchElementException:
        product_brand = ""

    # Always numeric, whether or not a discount is shown.
    product_price = parse_price(browser.find_element(By.CLASS_NAME, "product-price__current-price").text)

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight*0.4);")
    time.sleep(8)

    try:
        review_total = parse_review_total(
            browser.find_element(By.XPATH, "//a[@class='number'][@data-view-id='pdp_main_view_review']").text
        )
    except NoSuchElementException:
        review_total = 0

    try:
        product_sold = browser.find_element(By.CSS_SELECTOR, ".styles__StyledQuantitySold-sc-1onuk2l-3").text
        product_sold = product_sold.replace(')', '')
    except NoSuchElementException:
        product_sold = 0

    try:
        discount_percent = parse_discount_percent(
            browser.find_element(By.CLASS_NAME, "product-price__discount-rate").text
        )
    except NoSuchElementException:
        discount_percent = 0
    regular_price = compute_regular_price(product_price, discount_percent)

    image = browser.find_element(By.CLASS_NAME, "styles__StyledImg-sc-p9s3t3-0")
    # srcset can be absent (get_attribute returns None); fall back to src.
    image_url = first_srcset_url(image.get_attribute("srcset")) or (image.get_attribute("src") or "")

    # A fresh dict per product: values never leak from the previous product.
    specifications = extract_specifications(browser)

    time.sleep(5)
    detail_texts = read_detail_paragraphs(browser)

    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    # find_elements never raises NoSuchElementException; it returns [].
    highlight_elements = browser.find_elements(By.CLASS_NAME, "HighlightInfo__HighlightInfoContentStyled-sc-1pr13u3-0")
    highlight_texts = [element.text for element in highlight_elements]

    return {
        'Product Title': product_title,
        'Brand': product_brand,
        'Price': product_price,
        'Regular Price': regular_price,
        'Discount %': discount_percent,
        'Image': image_url,
        'Product Review Total': review_total,
        'Detail': detail_texts,
        'Số lượng đã bán': product_sold,
        'Highlight': highlight_texts,
        **specifications
    }


def scrape_tiki(start_url=base_url, output_path=OUTPUT_CSV, max_products=MAX_PRODUCTS):
    """Scrape up to ``max_products`` products from ``start_url`` into ``output_path``.

    Products whose title is already in the file (or was scraped earlier in
    this run) are skipped. The browser is always closed, and rows collected
    before an unexpected error are still written.
    """
    existing_fields, rows = load_existing_rows(output_path)
    existing_titles = {row.get('Product Title') for row in rows}
    new_rows = []

    browser = webdriver.Chrome()
    try:
        # Visit the base URL
        open_page(browser, start_url)
        time.sleep(2)

        for product_link in collect_product_links(browser, start_url, max_products):
            print("DEBUG: " + product_link)
            open_page(browser, product_link)
            time.sleep(8)

            try:
                row_data = scrape_product(browser, existing_titles)
            except NoSuchElementException:
                print("Error: Product information not found")
                continue
            except ValueError as exc:
                print(f"Error: could not parse product data: {exc}")
                continue

            if row_data is None:
                continue
            new_rows.append(row_data)
            existing_titles.add(row_data['Product Title'])
            time.sleep(3)
    finally:
        try:
            browser.quit()
        finally:
            # Write when there is something new, or to create the file with
            # its header on a first run (the append-mode version did that too).
            if new_rows or not existing_fields:
                fieldnames = merge_fieldnames(existing_fields, headers, *(row.keys() for row in new_rows))
                write_rows(output_path, fieldnames, rows + new_rows)


# In a notebook kernel __name__ is "__main__", so running this cell starts the
# scrape. The guard only keeps the helpers importable without opening a browser.
if __name__ == "__main__":
    scrape_tiki()

DEBUG: //tka.tiki.vn/pixel/pixel?data=UY0XxOIh2DwvOSyNNuZUB5db3NRxyo2IyNrXzyz-xVgPYVpUQt9t_t3mWGSW-eSITzlp419_3WB6QBCHtysQRxEwukZs6662Y2v8OwA2ltNwwgT1hR9KnzCHTdOLt0IXZiofnRh_kPdzZ0P5fLbUQigtwZVLRcWfQ6xqDlKQZKpk04ULYPwzCWX6E7AN8MI0qvjnd_smJ71w7f0XsWmtydeUmjPMXyE5Z4n9jT44Qfnanby0po9JLTmvIiK7mM6ITAT6M5y-BthNjP9-_-o2ZaAD58qUMUWHli4uosXInfrLqdXwTvmO6cruOtFJbBd3mwNHhyFjvLPf3Jd8xWOllGpTdwNiIKGGaOtpl85IIH8kOVof0Jz2Hx9fnCNNKjhGV47X-tJmf_op3nFlX_vnB_j4RFtsrJIBgcI6nCEz45DNK-jwsKROtTvyDQgXhIc_ikVK4RXS9DhYAlB6vOepZppBZxjXX531r0GJTVVkQoOaAIzQHIrWAU6RN8aDUaJu0ECo5gzB1Es3tDlmzFt_8yDnjcYi-jz_1-bTJycykNlAM-gThSPo3nqf8n28nauTOXew027egr6ItAWm_HhSIx70NhfacMX9e8hx5s_Uf3xNrkQB6q9pBcmZ2TfIu5G62-oUiOYwejTR4Sp2CHabYbSDgV6DG_GZpq0z1yHKok3Zy9HenAUCsOH7DPFCOfbZQXhYjGj0eeXn0VMML1irOBrtaUOG-pMcmefnkXODXx2_R4jJ5Nejaqs47V7jkQBUQfwnf4BZqGInLnRJL72VoFKtrZXm7oMtmGruiRGhDf7ft0l-yiPfU6Bm8wIvRCCAVug6gmP0VYKsgSrHyTry9pmnIFsTIs2NLrGYFYrYBp3l5F78DcHJ3eYx_H0yz11MtuO91NFS92H8S6ykRUmcviw7PpUSBRyiMm4xT8r2YjeWTYej1bN-kQK37ecH8yM0aX

In [54]:
# Scratch test (kept commented out): runs the product-page selectors against a single product URL.
# browser = webdriver.Chrome()

# # Base URL for Tiki products
# # base_url = "https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815"
# base_url = "https://tiki.vn/cay-cam-ngot-cua-toi-p74021317.html?spid=74021318"
# browser.get(base_url)

# productTitle = browser.find_element(By.CLASS_NAME, "Title__TitledStyled-sc-c64ni5-0").text
# print(productTitle)

#             # if(is_product_seen(productTitle, existing_titles)):
#             #     print('Product already in rows')
#             #     continue
#                 # Extract product information

# productBrand = browser.find_element(By.CSS_SELECTOR, "[data-view-id='pdp_details_view_author']").text
# productPrice = browser.find_element(By.CLASS_NAME, "product-price__current-price").text
# print(productPrice)
# print(productBrand)
# productPrice = re.match(r'^[\d|\.|\,]+', productPrice).group()
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight*0.35);")
# time.sleep(8)  
# Product_review_total = custom_text(browser.find_element(By.XPATH, "//a[@class='number'][@data-view-id='pdp_main_view_review']").text)
# ProductSold = browser.find_element(By.CSS_SELECTOR, ".styles__StyledQuantitySold-sc-1onuk2l-3").text
# ProductSold = ProductSold.replace(')', '')
# print(ProductSold)
            
# try:
#     DiscountPercent_element = browser.find_element(By.CLASS_NAME, "product-price__discount-rate")
#     DiscountPercent = DiscountPercent_element.text
#     productPrice = float(productPrice.replace(',', '')) 
#     DiscountPercent = float(DiscountPercent.replace('%', ''))
#     productRegularPrice = productPrice / (1 - (DiscountPercent / 100))
# except NoSuchElementException:
#     DiscountPercent = 0
#     productRegularPrice = productPrice

# productImage = browser.find_element(By.CLASS_NAME, "styles__StyledImg-sc-p9s3t3-0").get_attribute("srcset")
# urls = [url.strip() for url in productImage.split(",")]
# first_url = urls[0].split(" ")[0]


# try:
#     specification_containers_main = WebDriverWait(browser, 10).until(
#         EC.presence_of_element_located((By.CLASS_NAME, "fegGLL"))
#     )
#     specification_containers = WebDriverWait(specification_containers_main, 10).until(
#         EC.presence_of_all_elements_located((By.CLASS_NAME, "WidgetTitle__WidgetContainerStyled-sc-12sadap-0"))
#     )
#     if len(specification_containers) > 8:
#         specification_container = specification_containers[8]

#         specification_container_outer = specification_container.get_attribute('outerHTML')
#         soup = BeautifulSoup(specification_container_outer, 'html.parser')

#         # Find rows within the specification container
#         spec_rows = soup.find_all(class_="guWvLv")

#         # Extract specifications
#         specifications = {}
#         for rows in spec_rows:
#             span_elements = rows.find_all("span")
#             if len(span_elements) >= 2:
#                 spec_name = span_elements[0].get_text()
#                 spec_value = span_elements[1].get_text()
#                 if spec_name not in headers:
#                     headers.append(spec_name)
#                     specifications[spec_name] = spec_value
#                 else : 
#                     specifications[spec_name] = spec_value
#             else:
#                 print("Not enough span elements found in the row.")
#         # Print specifications
#         print(specifications)
#     else:
#         print("The specification container you are looking for does not exist.")
# except Exception as e:
#     print(f"An error occurred during the main extraction process: {e}")



# time.sleep(5)


# expand_button = browser.find_element(By.CLASS_NAME, "btn-more")
# ActionChains(browser).move_to_element(expand_button).perform()
# expand_button.click()
# time.sleep(4)

# detail_container = browser.find_element(By.CLASS_NAME, "ToggleContent__View-sc-fbuwol-0")
# span_tags = detail_container.find_elements(By.TAG_NAME, "p")
# span_texts = [span.text for span in span_tags if span.text.strip()]

# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(2)
# highlightContents = browser.find_elements(By.CLASS_NAME, "HighlightInfo__HighlightInfoContentStyled-sc-1pr13u3-0")
# highlightContentList = [highlightContent.text for highlightContent in highlightContents]


  

Cây Cam Ngọt Của Tôi
75.500₫
José Mauro de Vasconcelos
Đã bán 5000+
{'Phiên bản sách': 'Phiên bản thường', 'Công ty phát hành': 'Nhã Nam', 'Ngày xuất bản': '2020-01-01 00:00:00', 'Kích thước': '14 x 20.5 cm', 'Dịch Giả': 'Nguyễn Bích Lan\xa0,\xa0Tô Yến Ly', 'Loại bìa': 'Bìa mềm', 'Số trang': '244', 'Nhà xuất bản': 'Nhà Xuất Bản Hội Nhà Văn'}
