In [1]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
import time

## Scraping Category Links


In [2]:
# Scrape the storefront's top navigation menu into `data`, a mapping of
# category label -> category URL that the following cells iterate over.
url = "https://www.lechocolat-alainducasse.com/uk/"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Defined before the status check so later cells still find `data`
# (as an empty dict) when the request fails.
data = {}

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    menu_items = soup.select('.siteMenuItem')

    for item in menu_items:
        anchor = item.select_one('a')
        label = item.select_one('a span')
        # Menu entries lacking a link or a text label are skipped explicitly
        # instead of hiding every possible error behind a bare `except`.
        if anchor is None or label is None or anchor.get('href') is None:
            continue
        data[label.text.strip()] = anchor['href']

    print(data)
else:
    print("Failed to retrieve the page.")


{'Easter chocolates': 'https://www.lechocolat-alainducasse.com/uk/easter-chocolate', 'Boxes': 'https://www.lechocolat-alainducasse.com/uk/chocolates', 'Bars': 'https://www.lechocolat-alainducasse.com/uk/chocolate-bar', 'Gifts': 'https://www.lechocolat-alainducasse.com/uk/chocolate-gift', 'Simple Pleasures': 'https://www.lechocolat-alainducasse.com/uk/simple-pleasures', 'Breakfast & Snacks': 'https://www.lechocolat-alainducasse.com/uk/breakfast-snacks'}


## Scraping PDP Links in Each Category


In [3]:

def extract_product_urls(url, timeout=30):
    """Return the product links found on a category listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        A list of unique href values, in page order, taken from the anchors
        matching ``#js-product-list .productMiniature a``. An empty list is
        returned (after printing a message) when the response status is
        not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    product_links = soup.select("#js-product-list .productMiniature a")

    # `.get` avoids a KeyError on anchors that carry no href attribute.
    hrefs = (link.get('href') for link in product_links)

    # dict.fromkeys drops repeated links while preserving first-seen order,
    # so the same product page is not scraped twice for one category.
    return list(dict.fromkeys(href for href in hrefs if href))


# url = "https://www.lechocolat-alainducasse.com/uk/easter-chocolate"
# product_urls = extract_product_urls(url)
# print(product_urls)

# Map every category label to the list of product-page links on its listing.
pdp_dict = {
    category_name: extract_product_urls(category_url)
    for category_name, category_url in data.items()
}


In [4]:
def refresh_product(token, id_product, id_customization, group_value, quantity, timeout=30):
    """POST the storefront's product "refresh" AJAX action and return the raw reply.

    Args:
        token: Value sent as the ``token`` query parameter.
        id_product: Value sent as the ``id_product`` query parameter.
        id_customization: Value sent as the ``id_customization`` query parameter.
        group_value: Value sent as ``group[6]``.
            NOTE(review): presumably the selected size/weight option -- confirm.
        quantity: Sent both as the ``qty`` query parameter and as the
            ``quantity_wanted`` form field.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        The response body as text. The HTTP status is not checked here.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }

    params = {
        'controller': 'product',
        'token': token,
        'id_product': id_product,
        'id_customization': id_customization,
        'group[6]': group_value,
        'qty': quantity,
    }

    # Named `form_data` so it does not shadow the notebook-level `data` dict.
    form_data = {
        'quickview': '0',
        'ajax': '1',
        'action': 'refresh',
        'quantity_wanted': quantity,
    }

    response = requests.post(
        'https://www.lechocolat-alainducasse.com/uk/index.php',
        params=params,
        headers=headers,
        data=form_data,
        timeout=timeout,
    )

    return response.text


In [11]:
def get_product_data(soup):
    """Build a variant record from the page's add-to-cart form.

    The named inputs of the ``add-to-cart-or-refresh`` form are collected and
    passed to `refresh_product`; the JSON reply's ``product_details`` HTML is
    parsed and the ``data-product`` attribute of ``#product-details`` is
    decoded as JSON to obtain the variant fields.

    Args:
        soup: Parsed product page (anything offering BeautifulSoup's
            ``find`` interface).

    Returns:
        A dict with the keys ``title``, ``selling_price``, ``unit``,
        ``availability``, ``description``, ``link``, ``features`` and
        ``images``; or ``None`` when the form, the ``#product-details``
        element or its ``data-product`` attribute is missing.
        ``selling_price`` is ``None`` when no number can be found in the
        price text.

    Raises:
        ValueError: (``json.JSONDecodeError``) when the refresh reply or the
            ``data-product`` attribute is not valid JSON.
    """
    form = soup.find('form', id='add-to-cart-or-refresh')
    if not form:
        return None

    # Only inputs that have both a name and a non-empty value are kept.
    params = {}
    for field in form.find_all('input'):
        name = field.get('name')
        value = field.get('value')
        if name and value:
            params[name] = value

    response_text = refresh_product(
        params.get('token', ''),
        params.get('id_product', ''),
        params.get('id_customization', ''),
        params.get('group[6]', ''),
        params.get('qty', ''),
    )

    json_data = json.loads(response_text)
    product_details = (json_data.get('product_details') or '').strip()
    details_soup = BeautifulSoup(product_details, 'html.parser')
    product_details_div = details_soup.select('#product-details')
    if not product_details_div:
        return None

    data_product_value = product_details_div[0].get('data-product')
    if not data_product_value:
        # Without the attribute there is nothing to decode.
        return None
    variant_data = json.loads(data_product_value)

    features_dict = {
        feature.get('name', ''): feature.get('value', '')
        for feature in (variant_data.get('features') or [])
    }

    # Only the first image entry is used; a missing or empty image list
    # yields an empty result instead of an IndexError.
    images = variant_data.get('images') or []
    by_size = (images[0].get('bySize') or {}) if images else {}
    # dict.fromkeys removes repeated URLs while keeping a stable order.
    image_list = list(dict.fromkeys(
        size_info.get('url', '') for size_info in by_size.values()
    ))

    attributes = variant_data.get('attributes') or {}
    unit = (attributes.get('6') or {}).get('name', '')

    # Accept "12.50", integer prices such as "12", and thousands separators
    # such as "1,250.00"; the first alternative must come first so that
    # "1,250" is not cut short to "1".
    price_match = re.search(
        r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?',
        str(variant_data.get('price', '')),
    )
    price_float = float(price_match.group().replace(',', '')) if price_match else None

    return {
        'title': variant_data.get('meta_title', ''),
        'selling_price': price_float,
        'unit': unit,
        'availability': variant_data.get('availability_message', ''),
        'description': variant_data.get('meta_description', ''),
        'link': variant_data.get('link', ''),
        'features': features_dict,
        'images': image_list,
    }

In [32]:
def scrape_variant_product_info(url, timeout=30):
    """Scrape a single linked (variant) product page into a dict.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        A dict with the keys ``title``, ``description``, ``breadcrumb``,
        ``selling_price``, ``unit``, ``availability``, ``image``, ``images``,
        ``features`` and ``url`` (in that order), or ``None`` when the
        response status is not 200 or the ``product:price:amount`` meta tag
        does not hold a number.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        print(url)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # A page without a parsable price is rejected, so check it first and
    # skip the remaining extraction work for such pages.
    selling_price_tag = soup.find("meta", property="product:price:amount")
    selling_price_value = selling_price_tag.get("content") if selling_price_tag else None
    try:
        selling_price = float(selling_price_value)
    except (TypeError, ValueError):
        # TypeError: tag or attribute missing (None); ValueError: not a number.
        return None

    # `soup.title` is None when the document has no <title> element.
    title = soup.title.get_text() if soup.title else None

    description_element = soup.select_one('#product_tab_informations p:nth-child(1)')
    description = description_element.get_text(strip=True) if description_element else None

    breadcrumb_list = [
        element.get_text(strip=True)
        for element in soup.select('.breadcrumb li span')
    ]

    # The unit is the last whitespace-separated token of the weight text;
    # an element with empty text yields None rather than an IndexError.
    weight_element = soup.find(class_="productCard__weight")
    weight_tokens = weight_element.get_text(strip=True).split() if weight_element else []
    unit = weight_tokens[-1] if weight_tokens else None

    message_tag = soup.find('p', class_='mailAlert__message')
    if message_tag and "This product is unavailable" in message_tag.get_text():
        availability = "Out of Stock"
    else:
        availability = "In Stock"

    og_image_tag = soup.find("meta", property="og:image")
    og_image_url = og_image_tag.get("content") if og_image_tag else None

    # Anchors without an href are skipped instead of raising KeyError.
    image_urls = [
        link.get('href')
        for link in soup.select('.productImages__list li a')
        if link.get('href')
    ]

    # Each title element is paired with the text of its next <p> sibling.
    key_value_pairs = {}
    for element in soup.select('.wysiwyg-title-default'):
        next_p = element.find_next_sibling('p')
        key_value_pairs[element.get_text(strip=True)] = (
            next_p.get_text(strip=True) if next_p else None
        )

    return {
        'title': title,
        'description': description,
        'breadcrumb': breadcrumb_list,
        'selling_price': selling_price,
        'unit': unit,
        'availability': availability,
        'image': og_image_url,
        'images': image_urls,
        'features': key_value_pairs,
        'url': url,
    }


In [33]:
def scrape_product_info(url, category, timeout=30):
    """Scrape one product page, including its variants, into a dict.

    Variants come from two places: every page linked from
    ``.linkedProducts__list`` is scraped with `scrape_variant_product_info`,
    and the page's own add-to-cart form is resolved with `get_product_data`.

    Args:
        url: Address of the product page.
        category: Label stored unchanged under the ``category`` key.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        A dict with the keys ``title``, ``description``, ``breadcrumb``,
        ``images``, ``features``, ``image``, ``selling_price``, ``unit``,
        ``url``, ``variants``, ``availability`` and ``category`` (in that
        order), or ``None`` when the response status is not 200 or the
        ``product:price:amount`` meta tag does not hold a number.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        print(url)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # A page without a parsable price is rejected, so check it before doing
    # any further extraction or variant requests.
    selling_price_tag = soup.find("meta", property="product:price:amount")
    selling_price_value = selling_price_tag.get("content") if selling_price_tag else None
    try:
        selling_price = float(selling_price_value)
    except (TypeError, ValueError):
        # TypeError: tag or attribute missing (None); ValueError: not a number.
        return None

    # `soup.title` is None when the document has no <title> element.
    title = soup.title.get_text() if soup.title else None

    description_element = soup.select_one('#product_tab_informations p:nth-child(1)')
    description = description_element.get_text(strip=True) if description_element else None

    breadcrumb_list = [
        element.get_text(strip=True)
        for element in soup.select('.breadcrumb li span')
    ]

    # Anchors without an href are skipped instead of raising KeyError.
    image_urls = [
        link.get('href')
        for link in soup.select('.productImages__list li a')
        if link.get('href')
    ]

    # Each title element is paired with the text of its next <p> sibling.
    key_value_pairs = {}
    for element in soup.select('.wysiwyg-title-default'):
        next_p = element.find_next_sibling('p')
        key_value_pairs[element.get_text(strip=True)] = (
            next_p.get_text(strip=True) if next_p else None
        )

    og_image_tag = soup.find("meta", property="og:image")
    og_image_url = og_image_tag.get("content") if og_image_tag else None

    # The unit is the last whitespace-separated token of the weight text;
    # an element with empty text yields None rather than an IndexError.
    weight_element = soup.find(class_="productCard__weight")
    weight_tokens = weight_element.get_text(strip=True).split() if weight_element else []
    unit = weight_tokens[-1] if weight_tokens else None

    variant_list = []
    for anchor in soup.select('.linkedProducts__list li a'):
        href = anchor.get('href')
        if not href:
            continue
        variant = scrape_variant_product_info(href)
        # Failed variant scrapes return None; keep those out of the output.
        if variant is not None:
            variant_list.append(variant)

    # Best effort: a failure while resolving the form-based variant must not
    # discard the product. `Exception` (unlike a bare except) still lets
    # KeyboardInterrupt stop a long crawl.
    try:
        form_variant = get_product_data(soup)
    except Exception as ex:
        print("Could not load variant data for", url, "-", ex)
    else:
        if form_variant is not None:
            variant_list.append(form_variant)

    message_tag = soup.find('p', class_='mailAlert__message')
    if message_tag and "This product is unavailable" in message_tag.get_text():
        availability = "Out of Stock"
    else:
        availability = "In Stock"

    return {
        'title': title,
        'description': description,
        'breadcrumb': breadcrumb_list,
        'images': image_urls,
        'features': key_value_pairs,
        'image': og_image_url,
        'selling_price': selling_price,
        'unit': unit,
        'url': url,
        'variants': variant_list,
        'availability': availability,
        'category': category,
    }


# Spot-check the scraper on a single product page and show what it returns.
url = "https://www.lechocolat-alainducasse.com/uk/easter-treats-milk#/77-size-150g"
category = "XYZ"
product_data = scrape_product_info(url=url, category=category)
print(product_data)

None


In [41]:
import time
import json
import random

# Scrape every product page of every category. A page that raises is
# reported (URL, then the error) and skipped so the crawl keeps going.
pdp_json_dict = []

for category, product_urls in pdp_dict.items():
    print(category)
    for url in product_urls:
        try:
            product_data = scrape_product_info(url, category)
            if product_data is not None:
                pdp_json_dict.append(product_data)
            # Pause 1-3 seconds (inclusive) after each scrape that did not raise.
            time.sleep(random.randint(1, 3))
        except Exception as ex:
            print(url)
            print(ex)

Easter chocolates
Boxes
Failed to retrieve the page. Status code: 404
https://www.lechocolat-alainducasse.com/uk/coffret-carres-degustation-50-pieces
Bars
Failed to retrieve the page. Status code: 404
https://www.lechocolat-alainducasse.com/uk/coffret-carres-degustation-50-pieces
Gifts
Simple Pleasures
Failed to retrieve the page. Status code: 404
https://www.lechocolat-alainducasse.com/uk/coffret-carres-degustation-50-pieces
Breakfast & Snacks
Failed to retrieve the page. Status code: 404
https://www.lechocolat-alainducasse.com/uk/coffret-carres-degustation-50-pieces


In [42]:
# Show the list of scraped product records as the cell output.
pdp_json_dict

[{'title': 'Hide-and-Seek Box Signature Blend 75% Dark Chocolate & Madagascar 45% Milk Chocolate | Le Chocolat Alain Ducasse',
  'description': "Inside this playful and elegant box, you'll find eight hexa-birds to slip each into their packet (using the supplied pair of gloves) before hiding them inside or outside for an unforgettable Easter Flight. Ducks, geese, woodpeckers and peacocks in dark, milk and mendiant chocolate fly away: it's up to you to find them ! And don’t forget, the early bird catches the worm, so hurry!",
  'breadcrumb': ['Home',
   'Easter chocolates',
   'HIDE-AND-SEEK BOX - 75% Dark & 45% Milk Chocolate'],
  'images': ['https://www.lechocolat-alainducasse.com/uk/4074-product_cover_xl2x/the-hide-and-seek-box.jpg',
   'https://www.lechocolat-alainducasse.com/uk/4192-product_cover_xl2x/the-hide-and-seek-box.jpg',
   'https://www.lechocolat-alainducasse.com/uk/4076-product_cover_xl2x/the-hide-and-seek-box.jpg',
   'https://www.lechocolat-alainducasse.com/uk/4112-produ

In [43]:
# Save the scraped records as a JSON array in Lechocolat.json.
with open('Lechocolat.json', 'w') as json_file:
    json_file.write(json.dumps(pdp_json_dict))