In [1]:
%pip install beautifulsoup4 selenium pandas openpyxl lxml webdriver_manager

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from typing import List
import pandas as pd
import os
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
mango_url = lambda x: f"https://shop.mango.com{x}"

In [4]:
def GetWebDriver():
    # return webdriver.Chrome()
    driver = webdriver.Chrome(ChromeDriverManager().install())
    return driver

In [5]:

def ImageDownloader(url: str, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    r = requests.get(url, stream=True)
    with open(path, "wb") as f:
        f.write(r.content)

In [6]:
class Ids:
    __image_lid = 0
    __product_lid = 0
    __variant_lid = 0
    
    @staticmethod
    def get_image_id():
        Ids.__image_lid += 1
        return Ids.__image_lid
    
    @staticmethod
    def reset_image():
        Ids.__image_lid = 0
        
    @staticmethod
    def get_product_id():
        Ids.__product_lid += 1
        return Ids.__product_lid
    
    @staticmethod
    def reset_product():
        Ids.__product_lid = 0
        
    @staticmethod
    def get_variant_id():
        Ids.__variant_lid += 1
        return Ids.__variant_lid
    
    @staticmethod
    def reset_variant():
        Ids.__variant_lid = 0

In [7]:

class ProductVariation:
    MAX_IMAGES = 1
    def __init__(self, page: BeautifulSoup):
        self.id = Ids.get_variant_id()
        self.name = page.select_one('span.colors-info-name').text.strip()
        images_elements = page.select('div.image-btn > img')
        print(f'\tProductVariation: [{self.id}] ' + self.name)
        if len(images_elements) > 0:
            self.image_urls = [x.attrs['src'] for x in images_elements]
        
        self.image_ids = []
        self.images = {}
        
    def download_images(self):
        self.image_ids = []
        self.images = {}
        for url in self.image_urls:
            if ProductVariation.MAX_IMAGES is not None and len(self.image_ids) >= ProductVariation.MAX_IMAGES:
                break
            image_id = Ids.get_image_id()
            self.image_ids.append(image_id)
            self.images[image_id] = url
            ImageDownloader('https:' + url, os.path.join(f'./ProductImages/{image_id}.png'))

class Product:
    def __init__(self, name: str, first_link: str, driver: webdriver.Chrome):
        self.id = Ids.get_product_id()
        self.name = name
        self.link = first_link
        self.links = []
        self.links.append(first_link)
        
        self.variationIds = []
        self.variations = {}
        
        self.price = None
        self.fetch(driver)
        
    def fetch(self, driver: webdriver.Chrome):
        print(f'Product: [{self.id}] ' + self.name)
        if driver is None:
            driver = GetWebDriver()
        driver.get(mango_url(self.link))
        # Wait for images to load
        while True:
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
                break
            except:    
                print('Waiting before reloading page')
                time.sleep(10)
                driver.get(mango_url(self.link))
                
        # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
        
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        # Get price
        price_element = soup.select_one('meta[itemprop="price"]')
        if price_element is not None:
            self.price = int(price_element['content'])
        
        variant_pages = []
        variant_pages.append(soup)
        
        self.links = [self.link]
        color_elements = soup.select('a.color-container')
        
        if len(color_elements) > 0:
            # other_links = color_elements.map(lambda x: x.attrs['href'])
            other_links = [x.attrs['href'] for x in color_elements]
            self.links.extend(other_links)
        
        self.fetch_variants(driver)
        
    def fetch_variants(self, driver: webdriver.Chrome):
        self.variationIds = []
        self.variations = {}
        
        variant_pages = []
        for url in self.links:
            driver.get(mango_url(url))
            # WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
            
            while True:
                try:
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.image-btn')))
                    break
                except:    
                    print('Waiting before reloading page')
                    time.sleep(10)
                    driver.get(mango_url(url))
            
            variant_pages.append(BeautifulSoup(driver.page_source, 'lxml'))
        
        for page in variant_pages:
            variant = ProductVariation(page)
            self.variationIds.append(variant.id)
            self.variations[variant.id] = variant
            
    def export_to_csv(self):
        directory = './Data/Product'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        product_variants_data = [{'id': variant.id, 'name': variant.name, 'image_id': variant.image_ids[0]} for variant in self.variations.values()]
        
        df = pd.DataFrame(product_variants_data, columns=['id', 'name', 'image_id'], index=[variant.id for variant in self.variations.values()])
        
        df.to_csv(os.path.join(directory, f'{self.id}.csv'), index=False)
            
    def download_images(self):
        for variant in self.variations.values():
            variant.download_images()
        
    def __repr__(self) -> str:
        return f'Product(name={self.name}, link={self.link})'

In [8]:

class Category:
    
    loading_wait = 2
    scroll_step = 50
    scroll_interval = 0.001
    
    def __init__(self, name: str, link: str):
        self.name = name
        self.link = link
        
        self.product_ids = [int]
        self.products: {int: Product} = {}
        
    def __repr__(self) -> str:
        return f'Category(name={self.name}, link={self.link})'
    
    def fetch_products(self, max = None):
        self.product_ids = [int]
        self.products: {int: Product} = {}
        self.driver = GetWebDriver()
        self.wait = WebDriverWait(self.driver, 5)
        self.driver.get(self.link)
        
        # Wait for the page to load

        # WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#catalogProductsList > ul > li > div > div > div > a')))
        while True:
            try:
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#catalogProductsList > ul > li > div > div > div > a')))
                break
            except:    
                print('Waiting before reloading page')
                time.sleep(10)
                self.driver.get(self.link)
        
        # Scroll to the bottom of the page
        last_height = 0
        while last_height < self.driver.execute_script("return document.body.scrollHeight"):
            self.driver.execute_script(f"window.scrollTo(0, {last_height + self.scroll_step});")
            time.sleep(self.scroll_interval)
            last_height += self.scroll_step
        
        # Get the page source
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        
        product_elements = soup.select('#catalogProductsList > ul > li > div > div > div > a')
        
        for product_element in product_elements:
            if max is not None and len(self.product_ids) >= max:
                break
            name = product_element.attrs['aria-label']
            first_link = product_element.attrs['href']
            product = Product(name, first_link, self.driver)
            self.product_ids.append(product.id)
            self.products[product.id] = product
            
        # Close the browser
        self.driver.quit()
        self.download_images()
        
            
    def export_to_csv(self):
        # Create a folder with category name in Data folder
        directory = f'./Data/Category'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
            
        # Export into csv with product.id product.name product.price
        products_data = [{'id': product.id, 'name': product.name, 'price': product.price} for product in self.products.values()]

        df = pd.DataFrame(products_data, columns=['id', 'name', 'price'], index=[product.id for product in self.products.values()])
        # df = pd.DataFrame(products_data, columns=['id', 'name', 'price'], index=None)

        df.to_csv(os.path.join(directory, f'{self.name}.csv'), index=False)
        
        for product in self.products.values():
            product.export_to_csv()
        
    def download_images(self):
        print(f'Downloading images for {self.name} ...')
        for product in self.products.values():
            product.download_images()
        
        

In [9]:
# Men_Coats = Category('Men Coats', mango_url('/vn/men/coats_c32859776'))

In [10]:
# Men_Coats.fetch_products(5)

In [11]:
# Men_Coats.export_to_csv()

In [12]:
Categories = [
    Category('Men_Coats', mango_url('/vn/men/coats_c32859776')),
    Category('Men_Cardigans_and_sweaters', mango_url('/vn/men/cardigans-and-sweaters_c33749244')),
    Category('Men_Trousers', mango_url('/vn/men/trousers_c11949748')),
    Category('Men_Suits', mango_url('/vn/men/featured/suits_d14643682')),
    Category('Men_Accessories', mango_url('/vn/men/featured/accessories-edition_d25560162')),
    
    Category('Women_Coats', mango_url('/vn/women/coats_c67886633')),
    Category('Women_Sweaters_and_cardigans', mango_url('/vn/women/sweaters-and-cardigans_c87138853')),
    Category('Women_Shoes', mango_url('/vn/women/shoes_c10336952')),
    
    Category('Girl', mango_url('/redirect.faces?op=conta&seccion=rebajas_nina&tiendaid=kids')),
    Category('Boy', mango_url('/redirect.faces?op=conta&seccion=rebajas_nino&tiendaid=kids'))
]

In [13]:
for category in Categories:
    category.fetch_products(20)
    category.export_to_csv()

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [1] Handmade recycled wool double-breasted coat
	ProductVariation: [1] Black
Product: [2] Long recycled wool coat
	ProductVariation: [2] Medium Brown
	ProductVariation: [3] Dark Navy
	ProductVariation: [4] Black
	ProductVariation: [5] Grey
Product: [3] Long recycled wool coat
	ProductVariation: [6] Dark Navy
	ProductVariation: [7] Medium Brown
Product: [4] Long recycled wool coat
	ProductVariation: [8] Grey
	ProductVariation: [9] Medium Brown
Product: [5] Reversible recycled wool trench coat
	ProductVariation: [10] Grey
Product: [6] Classic water-repellent trench coat
	ProductVariation: [11] Medium Brown
Product: [7] Detachable hood wool coat
	ProductVariation: [12] Grey
	ProductVariation: [13] Navy
Product: [8] Reversible water-repellent quilted parka
	ProductVariation: [14] Grey
Product: [9] Recycled woollen coat
	ProductVariation: [15] Grey
	ProductVariation: [16] Brown
	ProductVariation: [17] Navy
Product: [10] Wool funnel neck coat
	ProductVariation: [18] Brown
	ProductVa

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [20] Turtleneck wool sweater
	ProductVariation: [43] Medium Heather Grey
Product: [21] Turtleneck wool sweater
	ProductVariation: [44] Chocolate
Product: [22] Fine-knit polo shirt
	ProductVariation: [45] Light/Pastel Grey
Product: [23] Striped fine-knit polo shirt
	ProductVariation: [46] Navy
Product: [24] Fine-knit polo shirt
	ProductVariation: [47] Off White
Product: [25] Fine-knit polo shirt
	ProductVariation: [48] Navy
Product: [26] Fine-knit polo shirt
	ProductVariation: [49] Burnt Orange
Product: [27] Fine-knit polo shirt
	ProductVariation: [50] Medium Brown
Product: [28] Fine-knit polo shirt
	ProductVariation: [51] Off White
	ProductVariation: [52] Black
	ProductVariation: [53] Ice Grey
	ProductVariation: [54] Dark Green
	ProductVariation: [55] Curry
Product: [29] Turtleneck wool sweater
	ProductVariation: [56] Navy
Product: [30] Long-sleeved cotton jersey polo shirt
	ProductVariation: [57] Ice Grey
Product: [31] Fine-knit polo shirt
	ProductVariation: [58] Chocolate
Pr

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [39] Cotton tapered crop pants
	ProductVariation: [70] Coffee
Product: [40] Recycled fabric slim-fit trousers
	ProductVariation: [71] Beige
	ProductVariation: [72] Brown
	ProductVariation: [73] Dark Heather Grey
Product: [41] Regular-fit denim bermuda shorts
	ProductVariation: [74] Dark Blue
	ProductVariation: [75] Light Blue
	ProductVariation: [76] White
	ProductVariation: [77] Black denim
Product: [42] Recycled fabric slim-fit trousers
	ProductVariation: [78] Black
	ProductVariation: [79] Brown
	ProductVariation: [80] Dark Heather Grey
Product: [43] Super slim-fit Tailored check trousers
	ProductVariation: [81] Dark Navy
	ProductVariation: [82] Sand
	ProductVariation: [83] Beige
	ProductVariation: [84] Navy
	ProductVariation: [85] Black
	ProductVariation: [86] Medium Grey
	ProductVariation: [87] Brown
	ProductVariation: [88] Sky Blue
	ProductVariation: [89] Mint Green
	ProductVariation: [90] Grey
	ProductVariation: [91] Ink Blue
Product: [44] Chambray Bermuda shorts
	Product

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [58] Stretch fabric slim-fit suit trousers
	ProductVariation: [112] Green
	ProductVariation: [113] Dark Navy
Product: [59] Stretch fabric super slim-fit suit trousers
	ProductVariation: [114] Dark Navy
	ProductVariation: [115] Green
Product: [60] Stretch fabric slim-fit printed suit jacket 
	ProductVariation: [116] Dark Navy
	ProductVariation: [117] Green
	ProductVariation: [118] Sky Blue
	ProductVariation: [119] Blue
	ProductVariation: [120] Grey
	ProductVariation: [121] Beige
	ProductVariation: [122] Navy
	ProductVariation: [123] Pastel Green
	ProductVariation: [124] Light Heather Grey
	ProductVariation: [125] Brown
Product: [61] Stretch fabric slim-fit printed suit trousers
	ProductVariation: [126] Dark Navy
	ProductVariation: [127] Sky Blue
	ProductVariation: [128] Pastel Green
	ProductVariation: [129] Grey
	ProductVariation: [130] Beige
	ProductVariation: [131] Navy
	ProductVariation: [132] Light Heather Grey
	ProductVariation: [133] Brown
	ProductVariation: [134] Green
	

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [77] Leather Chelsea ankle boots
	ProductVariation: [241] Brown
Product: [78] Pack of 3 cotton socks
	ProductVariation: [242] Brown
	ProductVariation: [243] Sky Blue
	ProductVariation: [244] Grey
Product: [79] Braided leather belt
	ProductVariation: [245] Brown
Product: [80] Knitted wool-blend cap
	ProductVariation: [246] Khaki
Product: [81] Fringed check scarf
	ProductVariation: [247] Khaki
Product: [82] Suede leather gloves with wool lining
	ProductVariation: [248] Brown
Product: [83] Polarised sunglasses
	ProductVariation: [249] Black
Product: [84] Leather suit shoes
	ProductVariation: [250] Black
Product: [85] Leather suit shoes
	ProductVariation: [251] Black
Product: [86] Suede leather gloves with wool lining
	ProductVariation: [252] Brown
Product: [87] Ribbed wool-blend scarf
	ProductVariation: [253] Brown
Product: [88] Leather reversible belt
	ProductVariation: [254] Black
Product: [89] Polarised sunglasses
	ProductVariation: [255] Brown
Product: [90] Fringed check scar

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [96] Parka with detachable fur-effect collar
	ProductVariation: [262] Khaki
Product: [97] Double-sided zipper coat
	ProductVariation: [263] Ice Grey
Product: [98] Faux fur collar double-breasted coat
	ProductVariation: [264] Dark Navy
Product: [99] Handmade oversized wool coat
	ProductVariation: [265] Khaki
	ProductVariation: [266] Medium Brown
	ProductVariation: [267] Light/Pastel Grey
	ProductVariation: [268] Black
Product: [100] Double-sided coat with buttons
	ProductVariation: [269] Medium Brown
Product: [101] Belt handmade coat
	ProductVariation: [270] Grey
	ProductVariation: [271] Medium Brown
	ProductVariation: [272] Black
Product: [102] Wool coat with jewel buttons
	ProductVariation: [273] Black
Product: [103] Faux-fur midi coat
	ProductVariation: [274] White
Product: [104] Long marbled coat
	ProductVariation: [275] Off White
Product: [105] Waterproof quilted bomber jacket
	ProductVariation: [276] Black
Product: [106] Oversized bomber jacket
	ProductVariation: [277] Kh

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [115] Fine-knit turtleneck sweater
	ProductVariation: [291] Plum
Product: [116] Fine-knit turtleneck sweater
	ProductVariation: [292] Medium Heather Grey
Product: [117] Fine-knit turtleneck sweater
	ProductVariation: [293] Black
Product: [118] Oversized V-neck sweater
	ProductVariation: [294] Light/Pastel Grey
Product: [119] Ribbed long cardigan
	ProductVariation: [295] Ecru
Product: [120] Fine-knit cardigan
	ProductVariation: [296] Medium Heather Grey
Product: [121] Sweater with fur-effect trim
	ProductVariation: [297] Pastel Pink
Product: [122] Oversized cardigan with buttons
	ProductVariation: [298] Light/Pastel Grey
Product: [123] Oversized V-neck sweater
	ProductVariation: [299] Ecru
Product: [124] Sweater with decorative seam
	ProductVariation: [300] Light/Pastel Grey
Product: [125] Oversized V-neck sweater
	ProductVariation: [301] Navy
Product: [126] Ribbed sweater with low-cut back
	ProductVariation: [302] Ecru
Product: [127] V-neck lurex sweater
	ProductVariation: [30

  driver = webdriver.Chrome(ChromeDriverManager().install())


Product: [134] Patent leather-effect shoes with buckle
	ProductVariation: [310] Black
Product: [135] Track outsole boots
	ProductVariation: [311] Black
Product: [136] Metallic heel sandals
	ProductVariation: [312] Silver
Product: [137] Bow-heeled denim shoes
	ProductVariation: [313] Dark Blue
	ProductVariation: [314] Black
Product: [138] Block-heel sandals
	ProductVariation: [315] Ecru
Product: [139] Transparent vinyl wedge shoes
	ProductVariation: [316] White
Product: [140] Platform maxi sandals
	ProductVariation: [317] Black
Product: [141] Strappy heeled sandals
	ProductVariation: [318] Black
Product: [142] Block-heel sandals
	ProductVariation: [319] Black
Product: [143] Chain loafers
	ProductVariation: [320] Black
	ProductVariation: [321] Burgundy
Product: [144] Satin ballerinas with buckle
	ProductVariation: [322] Light Pink
Product: [145] Patent leather-effect heeled boots
	ProductVariation: [323] Black
Product: [146] High-heeled sandals with buckles
	ProductVariation: [324] Fuchs

  driver = webdriver.Chrome(ChromeDriverManager().install())


Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page
Waiting before reloading page


KeyboardInterrupt: 