In [9]:
!pip install selenium
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


## Import necessary modules

In [2]:
import json
import os
from urllib.parse import urljoin

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## Configure Chrome options for headless browsing (without opening a browser window)

In [3]:
# Chrome launch flags, added in this order:
#   --headless=new  run Chrome without opening a browser window
#   --disable-gpu   turn off GPU hardware acceleration
#   --no-sandbox    turn off Chrome's sandbox
#                   NOTE(review): usually only needed when Chrome runs as root
#                   (e.g. inside a container) - confirm it is required here.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ("--headless=new", "--disable-gpu", "--no-sandbox"):
    chrome_options.add_argument(chrome_flag)

In [4]:
# Have webdriver-manager install ChromeDriver, then hand the value returned by
# install() to the Selenium Service that will launch it.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

In [5]:
# Start the Chrome session, combining the launch flags held in `chrome_options`
# with the ChromeDriver `service` object.
driver = webdriver.Chrome(options=chrome_options, service=service)

In [6]:
# Country label -> locale path segment inserted into the site URL
countries = {
    "USA": "us/en",
    "France": "fr/fr",
    "UK": "gb/en",
    "Japan": "jp/ja",
}
# Watch collections to scrape; each name is lowercased when the URL is built
collections = ["RADIOMIR", "LUMINOR", "SUBMERSIBLE", "LUMINOR-DUE"]

## Scrape each collection for every country and save the results as CSV

In [9]:
# Base URL format for constructing collection-specific product pages
base_url = "https://www.panerai.com/{}/collections/watch-collection/{}.html"
# Site root against which relative image paths are resolved
site_root = "https://www.panerai.com"
# Folder that receives one CSV per (country, collection)
output_dir = "extracted_data/bronze"
# DataFrame.to_csv does not create missing folders, so make sure it exists
os.makedirs(output_dir, exist_ok=True)


def _is_available(flag):
    """Return True when the tracking payload marks the product as available.

    Accepts the string form ('true') as well as a JSON boolean (True), so the
    result does not depend on how the site serialises the flag.
    """
    return str(flag).strip().lower() == "true"


def _absolute_image_url(raw_url):
    """Turn the image attribute value into an absolute URL, or "N/A" if empty.

    Everything from ".transform" onwards is dropped first. urljoin is used
    instead of string concatenation so a value that is already absolute is
    returned unchanged rather than being prefixed with the site root again.
    """
    if raw_url and "transform" in raw_url:
        raw_url = raw_url.split(".transform")[0]
    return urljoin(site_root, raw_url) if raw_url else "N/A"


# Loop through each country and its corresponding URL path
for country, country_url in countries.items():
    # Running list of every product scraped for the current country
    country_products = []

    # Loop through each collection to scrape its products
    for collection in collections:
        # Products of this collection only; this is what gets written to the
        # collection's CSV so files do not repeat rows of earlier collections
        collection_products = []

        try:
            # Collection names are lowercased in the URL
            url = base_url.format(country_url, collection.lower())
            driver.get(url)  # Open the collection page
            print(f"Scraping: {url}")

            # Wait for the product cards to load (up to 10 seconds)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pan-prod-ref-card-v2"))
            )

            # Find all product cards on the page
            product_cards = driver.find_elements(By.CLASS_NAME, "pan-prod-ref-card-v2")
            print(f"Found {len(product_cards)} products for collection: {collection}")

            # Only half of the matched cards (rounded up) are kept.
            # NOTE(review): presumably every product is rendered twice on the
            # page - confirm, and consider de-duplicating on 'reference' instead.
            product_limit = (len(product_cards) + 1) // 2

            for card in product_cards:
                if len(collection_products) >= product_limit:
                    break  # Stop once half of the products have been collected

                try:
                    # Product data is stored as JSON in the 'data-tracking-product' attribute
                    product_link_element = card.find_element(By.CLASS_NAME, "pan-prod-ref-link-v2")
                    data_tracking = product_link_element.get_attribute("data-tracking-product")
                    data_tracking = json.loads(data_tracking.replace("&quot;", '"'))  # Parse the JSON

                    # Extract relevant product details from the parsed JSON
                    product_info = {
                        'country': country,
                        'name': data_tracking.get('name', 'N/A'),
                        'reference': data_tracking.get('reference', 'N/A'),
                        'collection': data_tracking.get('collection', 'N/A'),
                        'brand': data_tracking.get('brand', 'N/A'),
                        'price': data_tracking.get('price', 'N/A'),
                        'currency': data_tracking.get('currency', 'N/A'),
                        'availability': "Available" if _is_available(data_tracking.get('isAvailable', 'false')) else "Out of Stock",
                        'product_url': product_link_element.get_attribute('href')
                    }
                except Exception as e:
                    # Best effort: a card that cannot be parsed is skipped and
                    # does not count towards the limit
                    print(f"Error processing product: {str(e)}")
                    continue

                # Extract the main product image URL
                try:
                    # Wait (up to 5 seconds) for the image element inside this card
                    img_element = WebDriverWait(card, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, ".pan-prod-ref-front-image-v2 img"))
                    )

                    # Prefer `data-src`, fall back to `src`; the `src` value may
                    # already be an absolute URL, which the helper handles
                    main_image = img_element.get_attribute("data-src") or img_element.get_attribute("src")
                    product_info['image_url'] = _absolute_image_url(main_image)
                except Exception as e:
                    print(f"Error extracting image url: {e}")
                    product_info['image_url'] = "N/A"

                collection_products.append(product_info)

        except Exception as e:
            print(f"Error processing {collection}: {str(e)}")
            continue  # Skip to the next collection; nothing is saved for this one

        # Keep the per-country running total for the progress output
        country_products.extend(collection_products)
        print(len(country_products))

        # Save only this collection's products under the collection's file name
        df = pd.DataFrame(collection_products)
        df.to_csv(f"{output_dir}/{country}_{collection}_2025.csv", index=False)

Scraping: https://www.panerai.com/us/en/collections/watch-collection/radiomir.html
Found 46 products for collection: RADIOMIR
23
Scraping: https://www.panerai.com/us/en/collections/watch-collection/luminor.html
Found 108 products for collection: LUMINOR
77
Scraping: https://www.panerai.com/us/en/collections/watch-collection/submersible.html
Found 78 products for collection: SUBMERSIBLE
116
Scraping: https://www.panerai.com/us/en/collections/watch-collection/luminor-due.html
Found 66 products for collection: LUMINOR-DUE
149
Scraping: https://www.panerai.com/fr/fr/collections/watch-collection/radiomir.html
Found 46 products for collection: RADIOMIR
23
Scraping: https://www.panerai.com/fr/fr/collections/watch-collection/luminor.html
Found 108 products for collection: LUMINOR
77
Scraping: https://www.panerai.com/fr/fr/collections/watch-collection/submersible.html
Found 77 products for collection: SUBMERSIBLE
116
Scraping: https://www.panerai.com/fr/fr/collections/watch-collection/luminor-d

## Close the browser once scraping is complete

In [None]:
# End the WebDriver session and close the browser it controls
driver.quit()