In [None]:
import pandas as pd
import math
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

In [None]:
# Desktop Chrome 113 user-agent reported by the automated browser.
# It is assembled from adjacent literals on purpose: a backslash continuation
# inside a single literal would pull the indentation of the continuation
# lines into the header value.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/113.0.0.0 Safari/537.36'
)


def start_browser():
    """
    Launch a Chrome browser controlled by Selenium WebDriver.

    The browser is configured to:
    - ignore certificate errors,
    - hide the usual signs of automation (the "enable-automation" switch
      is excluded, the automation extension is disabled and the
      ``AutomationControlled`` Blink feature is turned off),
    - report the fixed desktop user-agent stored in ``USER_AGENT``,
    - start with a maximized window.

    Returns
    -------
    selenium.webdriver.Chrome
        The started driver. It is returned still running; nothing in this
        function closes it.
    """
    chrome_options = Options()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation"]
    )
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument(
        '--disable-blink-features=AutomationControlled'
    )
    chrome_options.add_argument('user-agent=' + USER_AGENT)
    chrome_options.add_argument('--start-maximized')
    return webdriver.Chrome(options=chrome_options)
# Start the browser session that the scraping steps below receive as `driver`.
driver = start_browser()

In [None]:
def all_catalog_url(driver):
    """
    Collect the links to every product group shown in the catalog menu.

    The function opens the site's main page, clicks the "Catalog" button
    and then hovers over each top-level category in turn. For every
    category it reads the fly-out panel and gathers the ``href`` of each
    product-group link, with the ``?ref=mainmenu`` marker removed.

    Parameters
    ----------
    driver : selenium.webdriver.remote.webdriver.WebDriver
        Running browser used for navigation and element lookup.

    Returns
    -------
    list of str
        Unique product-group URLs in the order they were first seen.

    Notes
    -----
    Opening the main page and clicking the catalog button are not guarded:
    a failure there propagates to the caller. Failures while processing a
    single category are skipped (best effort); ``KeyboardInterrupt`` is not
    swallowed, so the run can be interrupted.
    """
    catalog_url_list = []
    # Set mirror of the list: O(1) duplicate checks while the list keeps
    # the first-seen order.
    seen_urls = set()

    # Go to the main page.
    driver.get('https://www.citilink.ru')
    time.sleep(5)

    # Click on the "Catalog" button.
    catalog_button = driver.find_element(
        By.CSS_SELECTOR, 'span.css-19y4hmw.e1fnp08x0'
    )
    ActionChains(driver).click(catalog_button).perform()
    time.sleep(3)

    # Top-level category entries of the opened menu.
    categories = driver.find_elements(By.CLASS_NAME, 'e19upju70')

    for category in categories:
        try:
            # Hover over the category so its panel of groups is rendered.
            ActionChains(driver).move_to_element(category).perform()
            time.sleep(3)

            # Container holding all product-group links of the category.
            panel = driver.find_element(
                By.CSS_SELECTOR, 'div.css-pvfgk.eqe4b5u0'
            )

            for group in panel.find_elements(By.CLASS_NAME, 'css-1xb55jt'):
                for entry in group.find_elements(
                    By.CLASS_NAME, 'css-v1w0m5'
                ):
                    # find_elements returns an empty list instead of
                    # raising, so an entry without a link only skips
                    # itself rather than the rest of the category.
                    anchors = entry.find_elements(
                        By.CLASS_NAME, 'css-vrsjnq'
                    )
                    if not anchors:
                        continue
                    href = anchors[0].get_attribute('href')
                    if not href:
                        continue

                    group_url = href.replace('?ref=mainmenu', '')
                    # Exclude duplicates right away.
                    if group_url in seen_urls:
                        continue
                    seen_urls.add(group_url)
                    catalog_url_list.append(group_url)
        except Exception:
            # Best effort: give up on this category only. Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit through.
            continue

    return catalog_url_list
# Collect the product-group URLs from the site's catalog menu.
catalog_url_list = all_catalog_url(driver)

In [None]:
def product_table(catalog_url_list, driver):
    """
    Scrape the products listed under the given category URLs.

    Every category page is opened and classified by a marker searched for
    in its HTML source:

    - ``Subcategory__count``: product data is read from the JSON stored in
      each product card's ``data-params`` attribute.
    - ``SubcategoryPageTitle__product-count``: product data is read from
      the card's DOM attributes and from the text of its title link.
    - neither marker: the page is treated as a list of subcategories and
      the ``href`` of every ``a.e1dzvan90`` link is collected for later
      parsing.

    Listings are walked page by page through the ``?p=<n>`` query
    parameter, with the page count derived from the product counter on the
    page and a page size of 48 products.

    Parameters
    ----------
    catalog_url_list : iterable of str
        Category URLs to visit.
    driver : selenium.webdriver.remote.webdriver.WebDriver
        Running browser used for navigation and element lookup.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The de-duplicated product rows with a fresh 0..n-1 index (an empty
        frame without columns when nothing was scraped), and the list of
        subcategory URLs that still have to be parsed.

    Notes
    -----
    The initial navigation to each category URL is not guarded. Failures
    while reading a single product, a single listing page or a single
    category are skipped (best effort). ``KeyboardInterrupt`` is not
    swallowed, so a long run can be interrupted.
    """
    products_per_page = 48

    # NOTE: the run of spaces in this column name is deliberate. The key was
    # originally one literal split with a backslash, which embedded 39
    # columns of indentation into the name. It is reproduced byte-for-byte
    # so that frames built with that same key still share a single column.
    title_column = (
        'Наименование  и краткие ' + ' ' * 39 + 'характеристики товара'
    )
    modern_link_selector = 'a.app-catalog-9gnskf.e1259i3g0'

    # Rows are accumulated here and turned into a DataFrame once at the
    # end, instead of concatenating and de-duplicating the whole table
    # after every product.
    rows = []
    subcategory_url_list = []

    def build_row(article, brand, title, price, link):
        """Assemble one output record keyed by the table's column names."""
        return {
            'Артикул': article,
            'Бренд': brand,
            title_column: title,
            'Цена (RU)': price,
            'Ссылка на товар': link,
        }

    def legacy_row(card):
        """Read a product from the JSON in the card's ``data-params``."""
        params = json.loads(card.get_attribute('data-params'))
        link = (
            card
            .find_element(By.CSS_SELECTOR, 'a.js--Link')
            .get_attribute('href')
        )
        return build_row(
            params['id'],
            params['brandName'],
            params['shortName'],
            params['price'],
            link,
        )

    def modern_row(card):
        """Read a product from the card's attributes and title link."""
        anchor = card.find_element(By.CSS_SELECTOR, modern_link_selector)
        title = anchor.text
        price = (
            card
            .find_element(By.CSS_SELECTOR, 'span.eb8dq160')
            .get_attribute('data-meta-price')
        )
        return build_row(
            card.get_attribute('data-meta-product-id'),
            # The brand is taken as the second space-separated word of the
            # title; a one-word title raises IndexError and the product is
            # skipped by the caller.
            title.strip().split(' ')[1],
            title,
            price,
            anchor.get_attribute('href'),
        )

    def scrape_listing(base_url, count_selector, card_selector, read_row):
        """Walk all pages of one listing and append its products to rows."""
        counter_text = (
            driver.find_element(By.CSS_SELECTOR, count_selector).text
        )
        # The counter text starts with the number of products.
        total = int(counter_text.strip().split(' ')[0])
        # Round up so that a partially filled last page is visited too.
        last_page = math.ceil(total / products_per_page)

        for page in range(1, last_page + 1):
            try:
                driver.get(base_url + '?p=' + str(page))
                time.sleep(7)
                cards = driver.find_elements(By.CSS_SELECTOR, card_selector)
            except Exception:
                # Skip just this page.
                continue

            for card in cards:
                try:
                    # Everything that can fail for a single product,
                    # including JSON decoding, happens inside read_row, so
                    # a bad card never costs the rest of the page.
                    rows.append(read_row(card))
                except Exception:
                    continue

    for url in catalog_url_list:
        # Go to the group page.
        driver.get(url)
        time.sleep(7)
        try:
            source = driver.page_source
            if 'Subcategory__count' in source:
                scrape_listing(
                    url,
                    'div.Subcategory__count',
                    'div.product_data__gtm-js',
                    legacy_row,
                )
            elif 'SubcategoryPageTitle__product-count' in source:
                scrape_listing(
                    url,
                    'span.app-catalog-0.e1h9m05c0',
                    'div.app-catalog-l9pqdy.e1btxpey0',
                    modern_row,
                )
            else:
                # No product counter: collect the subcategory links.
                driver.get(url)
                time.sleep(7)
                for link in driver.find_elements(
                    By.CSS_SELECTOR, 'a.e1dzvan90'
                ):
                    try:
                        subcategory_url_list.append(
                            link.get_attribute('href')
                        )
                    except Exception:
                        continue
        except Exception:
            # Best effort: give up on this category only. Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit through.
            continue

    if rows:
        table = (
            pd.DataFrame(rows)
            .drop_duplicates()
            .reset_index(drop=True)
        )
    else:
        table = pd.DataFrame()
    return table, subcategory_url_list
# Scrape the categories collected above.
# NOTE(review): this assignment rebinds the name `product_table` from the
# function to the returned DataFrame, so the statement cannot be executed a
# second time without first re-running the function definition.
product_table, subcategory_url_list = product_table(catalog_url_list, driver)

In [None]:
def full_product_table(product_table, subcategory_url_list, driver):
    """
    Finish the parsing: scrape the products of the subcategory pages.

    Every subcategory page is opened and classified by a marker searched
    for in its HTML source:

    - ``Subcategory__count``: product data is read from the JSON stored in
      each product card's ``data-params`` attribute.
    - ``SubcategoryPageTitle__product-count``: product data is read from
      the card's DOM attributes and from the text of its title link.
    - neither marker: the page is skipped.

    Listings are walked page by page through the ``?p=<n>`` query
    parameter, with the page count derived from the product counter on the
    page and a page size of 48 products. When all URLs have been
    processed the browser is closed with ``driver.quit()``.

    Parameters
    ----------
    product_table : pandas.DataFrame
        Products collected so far; the new rows are appended to it.
    subcategory_url_list : iterable of str
        Subcategory URLs to visit.
    driver : selenium.webdriver.remote.webdriver.WebDriver
        Running browser used for navigation and element lookup.

    Returns
    -------
    pandas.DataFrame
        ``product_table`` itself when no new product was scraped;
        otherwise the existing and new rows together, de-duplicated and
        with a fresh 0..n-1 index.

    Notes
    -----
    The initial navigation to each subcategory URL is not guarded, and the
    browser is only closed when the loop completes. Failures while reading
    a single product, a single listing page or a single subcategory are
    skipped (best effort). ``KeyboardInterrupt`` is not swallowed, so a
    long run can be interrupted.
    """
    products_per_page = 48

    # NOTE: the run of spaces in this column name is deliberate. The key was
    # originally one literal split with a backslash, which embedded 39
    # columns of indentation into the name. It is reproduced byte-for-byte
    # so that the new rows land in the same column as rows built with that
    # same key.
    title_column = (
        'Наименование  и краткие ' + ' ' * 39 + 'характеристики товара'
    )
    modern_link_selector = 'a.app-catalog-9gnskf.e1259i3g0'

    # New rows are accumulated here and merged into the table once at the
    # end, instead of concatenating and de-duplicating the whole table
    # after every product.
    new_rows = []

    def build_row(article, brand, title, price, link):
        """Assemble one output record keyed by the table's column names."""
        return {
            'Артикул': article,
            'Бренд': brand,
            title_column: title,
            'Цена (RU)': price,
            'Ссылка на товар': link,
        }

    def legacy_row(card):
        """Read a product from the JSON in the card's ``data-params``."""
        params = json.loads(card.get_attribute('data-params'))
        link = (
            card
            .find_element(By.CSS_SELECTOR, 'a.js--Link')
            .get_attribute('href')
        )
        return build_row(
            params['id'],
            params['brandName'],
            params['shortName'],
            params['price'],
            link,
        )

    def modern_row(card):
        """Read a product from the card's attributes and title link."""
        anchor = card.find_element(By.CSS_SELECTOR, modern_link_selector)
        title = anchor.text
        price = (
            card
            .find_element(By.CSS_SELECTOR, 'span.eb8dq160')
            .get_attribute('data-meta-price')
        )
        return build_row(
            card.get_attribute('data-meta-product-id'),
            # The brand is taken as the second space-separated word of the
            # title; a one-word title raises IndexError and the product is
            # skipped by the caller.
            title.strip().split(' ')[1],
            title,
            price,
            anchor.get_attribute('href'),
        )

    def scrape_listing(base_url, count_selector, card_selector, read_row):
        """Walk all pages of one listing and collect its products."""
        counter_text = (
            driver.find_element(By.CSS_SELECTOR, count_selector).text
        )
        # The counter text starts with the number of products.
        total = int(counter_text.strip().split(' ')[0])
        # Round up so that a partially filled last page is visited too.
        last_page = math.ceil(total / products_per_page)

        for page in range(1, last_page + 1):
            try:
                driver.get(base_url + '?p=' + str(page))
                time.sleep(7)
                cards = driver.find_elements(By.CSS_SELECTOR, card_selector)
            except Exception:
                # Skip just this page.
                continue

            for card in cards:
                try:
                    # Everything that can fail for a single product,
                    # including JSON decoding, happens inside read_row, so
                    # a bad card never costs the rest of the page.
                    new_rows.append(read_row(card))
                except Exception:
                    continue

    for url in subcategory_url_list:
        driver.get(url)
        time.sleep(7)
        try:
            source = driver.page_source
            if 'Subcategory__count' in source:
                scrape_listing(
                    url,
                    'div.Subcategory__count',
                    'div.product_data__gtm-js',
                    legacy_row,
                )
            elif 'SubcategoryPageTitle__product-count' in source:
                scrape_listing(
                    url,
                    'span.app-catalog-0.e1h9m05c0',
                    'div.e1lqnfu30.app-catalog-mev7lj.ejdpak00',
                    modern_row,
                )
            # Pages with neither marker carry nothing to parse here.
        except Exception:
            # Best effort: give up on this subcategory only. Exception (not
            # a bare except) lets KeyboardInterrupt/SystemExit through.
            continue

    driver.quit()

    if not new_rows:
        return product_table
    return (
        pd.concat([product_table, pd.DataFrame(new_rows)])
        .drop_duplicates()
        .reset_index(drop=True)
    )
# Scrape the subcategories and merge their products into the table; the
# function also quits the browser. The returned DataFrame is not assigned
# to a name here.
full_product_table(product_table, subcategory_url_list, driver)