In [374]:
%pip install beautifulsoup4 selenium pandas openpyxl lxml

83488.56s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Note: you may need to restart the kernel to use updated packages.


In [375]:

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from typing import List
import pandas as pd
import os
import requests

In [376]:
mango_url = lambda x: f"https://shop.mango.com{x}"

In [377]:
def GetWebDriver():
    return webdriver.Chrome()

In [378]:

def ImageDownloader(url: str, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    r = requests.get(url, stream=True)
    with open(path, "wb") as f:
        f.write(r.content)

In [379]:
class Ids:
    __image_lid = 0
    __product_lid = 0
    __variant_lid = 0
    
    @staticmethod
    def get_image_id():
        Ids.__image_lid += 1
        return Ids.__image_lid
    
    @staticmethod
    def reset_image():
        Ids.__image_lid = 0
        
    @staticmethod
    def get_product_id():
        Ids.__product_lid += 1
        return Ids.__product_lid
    
    @staticmethod
    def reset_product():
        Ids.__product_lid = 0
        
    @staticmethod
    def get_variant_id():
        Ids.__variant_lid += 1
        return Ids.__variant_lid
    
    @staticmethod
    def reset_variant():
        Ids.__variant_lid = 0

In [380]:

class ProductVariation:
    MAX_IMAGES = 1
    def __init__(self, page: BeautifulSoup):
        self.id = Ids.get_variant_id()
        self.name = page.select_one('span.colors-info-name').text.strip()
        images_elements = page.select('div.image-btn > img')
        print(f'\tProductVariation: [{self.id}] ' + self.name)
        if len(images_elements) > 0:
            self.image_urls = [x.attrs['src'] for x in images_elements]
        
        self.image_ids = []
        self.images = {}
        
    def download_images(self):
        self.image_ids = []
        self.images = {}
        for url in self.image_urls:
            if ProductVariation.MAX_IMAGES is not None and len(self.image_ids) >= ProductVariation.MAX_IMAGES:
                break
            image_id = Ids.get_image_id()
            self.image_ids.append(image_id)
            self.images[image_id] = url
            ImageDownloader('https:' + url, os.path.join(f'./ProductImages/{image_id}.png'))

class Product:
    def __init__(self, name: str, first_link: str, driver: webdriver.Chrome):
        self.id = Ids.get_product_id()
        self.name = name
        self.link = first_link
        self.links = []
        self.links.append(first_link)
        
        self.variationIds = []
        self.variations = {}
        
        self.price = None
        self.fetch(driver)
        
    def fetch(self, driver: webdriver.Chrome):
        print(f'Product: [{self.id}] ' + self.name)
        if driver is None:
            driver = GetWebDriver()
        driver.get(mango_url(self.link))
        # Wait for images to load
        while True:
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
                break
            except:    
                print('Waiting before reloading page')
                time.sleep(10)
                driver.refresh()
                
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
        
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Get price
        price_element = soup.select_one('meta[itemprop="price"]')
        self.price = int(price_element['content'])
        
        variant_pages = []
        variant_pages.append(soup)
        
        self.links = [self.link]
        color_elements = soup.select('a.color-container')
        
        if len(color_elements) > 0:
            # other_links = color_elements.map(lambda x: x.attrs['href'])
            other_links = [x.attrs['href'] for x in color_elements]
            self.links.extend(other_links)
        
        self.fetch_variants(driver)
        
    def fetch_variants(self, driver: webdriver.Chrome):
        self.variationIds = []
        self.variations = {}
        
        variant_pages = []
        for url in self.links:
            driver.get(mango_url(url))
            # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
            
            while True:
                try:
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
                    break
                except:    
                    print('Waiting before reloading page')
                    time.sleep(10)
                    driver.refresh()
            
            variant_pages.append(BeautifulSoup(driver.page_source, 'lxml'))
        
        for page in variant_pages:
            variant = ProductVariation(page)
            self.variationIds.append(variant.id)
            self.variations[variant.id] = variant
            
    def export_to_csv(self):
        directory = './Data/Product'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        product_variants_data = [{'id': variant.id, 'name': variant.name, 'image_id': variant.image_ids[0]} for variant in self.variations.values()]
        
        df = pd.DataFrame(product_variants_data, columns=['id', 'name', 'image_id'], index=[variant.id for variant in self.variations.values()])
        
        df.to_csv(os.path.join(directory, f'{self.id}.csv'), index=False)
            
    def download_images(self):
        for variant in self.variations.values():
            variant.download_images()
        
    def __repr__(self) -> str:
        return f'Product(name={self.name}, link={self.link})'

In [381]:

class Category:
    
    loading_wait = 2
    scroll_step = 50
    scroll_interval = 0.001
    
    def __init__(self, name: str, link: str):
        self.name = name
        self.link = link
        
        self.product_ids = [int]
        self.products: {int: Product} = {}
        
    def __repr__(self) -> str:
        return f'Category(name={self.name}, link={self.link})'
    
    def fetch_products(self, max = None):
        self.product_ids = [int]
        self.products: {int: Product} = {}
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 5)
        self.driver.get(self.link)
        
        # Wait for the page to load

        # WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#catalogProductsList > ul > li > div > div > div > a')))
        while True:
            try:
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#catalogProductsList > ul > li > div > div > div > a')))
                break
            except:    
                print('Waiting before reloading page')
                time.sleep(10)
                self.driver.refresh()
        
        # Scroll to the bottom of the page
        last_height = 0
        while last_height < self.driver.execute_script("return document.body.scrollHeight"):
            self.driver.execute_script(f"window.scrollTo(0, {last_height + self.scroll_step});")
            time.sleep(self.scroll_interval)
            last_height += self.scroll_step
        
        # Get the page source
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        
        product_elements = soup.select('#catalogProductsList > ul > li > div > div > div > a')
        
        for product_element in product_elements:
            if max is not None and len(self.product_ids) >= max:
                break
            name = product_element.attrs['aria-label']
            first_link = product_element.attrs['href']
            product = Product(name, first_link, self.driver)
            self.product_ids.append(product.id)
            self.products[product.id] = product
            
        # Close the browser
        self.driver.quit()
        self.download_images()
        
            
    def export_to_csv(self):
        # Create a folder with category name in Data folder
        directory = f'./Data/Category'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
            
        # Export into csv with product.id product.name product.price
        products_data = [{'id': product.id, 'name': product.name, 'price': product.price} for product in self.products.values()]

        df = pd.DataFrame(products_data, columns=['id', 'name', 'price'], index=[product.id for product in self.products.values()])
        # df = pd.DataFrame(products_data, columns=['id', 'name', 'price'], index=None)

        df.to_csv(os.path.join(directory, f'{self.name}.csv'), index=False)
        
        for product in self.products.values():
            product.export_to_csv()
        
    def download_images(self):
        print(f'Downloading images for {self.name} ...')
        for product in self.products.values():
            product.download_images()
        
        

In [382]:
# Men_Coats = Category('Men Coats', mango_url('/vn/men/coats_c32859776'))

In [383]:
# Men_Coats.fetch_products(5)

In [384]:
# Men_Coats.export_to_csv()

In [385]:
Categories = [
    Category('Men_Coats', mango_url('/vn/men/coats_c32859776')),
    Category('Men_Cardigans_and_sweaters', mango_url('/vn/men/cardigans-and-sweaters_c33749244')),
    Category('Men_Trousers', mango_url('/vn/men/trousers_c11949748')),
    Category('Men_Suits', mango_url('/vn/men/featured/suits_d14643682')),
    Category('Men_Accessories', mango_url('/vn/men/featured/accessories-edition_d25560162')),
    
    Category('Women_Coats', mango_url('/vn/women/coats_c67886633')),
    Category('Women_Sweaters_and_cardigans', mango_url('/vn/women/sweaters-and-cardigans_c87138853')),
    Category('Women_Shoes', mango_url('/vn/women/shoes_c10336952')),
    
    Category('Girl', mango_url('/redirect.faces?op=conta&seccion=rebajas_nina&tiendaid=kids')),
    Category('Boy', mango_url('/redirect.faces?op=conta&seccion=rebajas_nino&tiendaid=kids'))
]

In [386]:
for category in Categories:
    category.fetch_products(5)
    category.export_to_csv()

Product: [1] Handmade recycled wool double-breasted coat
	ProductVariation: [1] Black
Product: [2] Long recycled wool coat
	ProductVariation: [2] Medium Brown
	ProductVariation: [3] Dark Navy
	ProductVariation: [4] Black
	ProductVariation: [5] Grey
Product: [3] Long recycled wool coat
	ProductVariation: [6] Dark Navy
	ProductVariation: [7] Medium Brown
Product: [4] Long recycled wool coat
	ProductVariation: [8] Grey
	ProductVariation: [9] Medium Brown
Downloading images for Men_Coats ...
Product: [5] Turtleneck wool sweater
	ProductVariation: [10] Chocolate
Product: [6] Turtleneck wool sweater
	ProductVariation: [11] Medium Heather Grey
Product: [7] Striped fine-knit polo shirt
	ProductVariation: [12] Navy
Product: [8] Fine-knit polo shirt
	ProductVariation: [13] Off White
Downloading images for Men_Cardigans_and_sweaters ...
Product: [9] Cotton tapered crop pants
	ProductVariation: [14] Coffee
Product: [10] Super slim-fit Tailored check trousers
	ProductVariation: [15] Dark Navy
	Prod

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=120.0.6099.109)
Stacktrace:
0   chromedriver                        0x0000000100d4c4dc chromedriver + 4162780
1   chromedriver                        0x0000000100d44664 chromedriver + 4130404
2   chromedriver                        0x000000010099bbc0 chromedriver + 293824
3   chromedriver                        0x00000001009746c0 chromedriver + 132800
4   chromedriver                        0x0000000100a0d25c chromedriver + 758364
5   chromedriver                        0x0000000100a21294 chromedriver + 840340
6   chromedriver                        0x00000001009d56bc chromedriver + 530108
7   chromedriver                        0x00000001009d6930 chromedriver + 534832
8   chromedriver                        0x0000000100d11e08 chromedriver + 3923464
9   chromedriver                        0x0000000100d163dc chromedriver + 3941340
10  chromedriver                        0x0000000100cfa038 chromedriver + 3825720
11  chromedriver                        0x0000000100d16f3c chromedriver + 3944252
12  chromedriver                        0x0000000100cec6f4 chromedriver + 3770100
13  chromedriver                        0x0000000100d33980 chromedriver + 4061568
14  chromedriver                        0x0000000100d33af8 chromedriver + 4061944
15  chromedriver                        0x0000000100d442e4 chromedriver + 4129508
16  libsystem_pthread.dylib             0x00000001845a1034 _pthread_start + 136
17  libsystem_pthread.dylib             0x000000018459be3c thread_start + 8
