In [5]:
import requests
from bs4 import BeautifulSoup
import os

# Function to download images
def download_image(url, folder, filename):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(os.path.join(folder, filename), 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)

# Create a folder to save images
output_folder = "shape_images"
os.makedirs(output_folder, exist_ok=True)

# URL of the target website
url = "https://www.brilliantearth.com/engagement-rings/"

# Set headers to mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}

# Send a GET request
response = requests.get(url, headers=headers)
response.raise_for_status()  # Raise an error for unsuccessful requests

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the main container
container = soup.find('div', class_="content tw-overflow-hidden tw-flex tw-flex-row tw-justify-center tw-w-full xl:tw-mx-[auto] xl:tw-my-[0] xl:tw-w-[88%]")
if container:
    shapes_list = container.find('ul', class_="slideContainer stones-list tw-flex tw-flex-row tw-mt-0 tw-pl-0 tw-mb-0 tw-list-none tw-list-image-none tw-text-center tw-w-full tw-justify-between lg:tw-overflow-x-hidden lg:tw-gap-0")

    if shapes_list:
        shapes = []
        for item in shapes_list.find_all('li'):
            shape_name_tag = item.find('p')
            image_tag = item.find('img')
            link_tag = item.find('a')

            if shape_name_tag and image_tag and link_tag:
                shape_name = shape_name_tag.text.strip()
                image_url = image_tag['src']
                shape_link = link_tag['href']

                # Save image
                image_filename = f"{shape_name}.jpg"
                download_image(image_url, output_folder, image_filename)

                # Append data to the list
                shapes.append({
                    'shape_name': shape_name,
                    'image_url': image_url,
                    'shape_link': shape_link
                })

        # Print the scraped data
        for shape in shapes:
            print(f"Shape Name: {shape['shape_name']}")
            print(f"Image URL: {shape['image_url']}")
            print(f"Shape Link: {shape['shape_link']}")
            print("-" * 40)
    else:
        print("Shapes list not found!")
else:
    print("Main container not found!")


HTTPError: 403 Client Error: Forbidden for url: https://www.brilliantearth.com/engagement-rings/

In [6]:
#Code for web scraping according to requirement of dataset
import requests

url = "https://heritagejewels.com.pk/collections/diamond-rings"
response = requests.get(url)
print(response.headers)



{'Date': 'Wed, 22 Jan 2025 11:19:41 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'x-sorting-hat-podid': '306', 'x-sorting-hat-shopid': '53893497015', 'x-storefront-renderer-rendered': '1', 'x-shopify-nginx-no-cookies': '1', 'set-cookie': 'secure_customer_sig=; path=/; expires=Thu, 22 Jan 2026 11:19:41 GMT; secure; HttpOnly; SameSite=Lax, localization=PK; path=/; expires=Thu, 22 Jan 2026 11:19:41 GMT; SameSite=Lax, cart_currency=PKR; path=/; expires=Wed, 05 Feb 2025 11:19:41 GMT; SameSite=Lax, _shopify_y=84e66693-0bba-44c9-9ce8-68af8b98a96b; domain=heritagejewels.com.pk; path=/; expires=Thu, 22 Jan 2026 17:19:41 GMT; SameSite=Lax, _shopify_s=586246b2-4693-4981-a32c-13cce7011359; domain=heritagejewels.com.pk; path=/; expires=Thu, 23 Jan 2025 17:19:41 GMT; SameSite=Lax, _tracking_consent=%7B%22con%22%3A%7B%22CMP%22%3A%7B%22a%22%3A%22%22%2C%22m%22%3A%22%22%2C%22p%22%3A%22%22%2C%22s%22%3A%22%22%7D%7D%2C%22v%22%3A%222.1%22%2C%2

In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main container holding the products
    main_container = soup.find('div', class_="t4s_box_pr_grid t4s-products t4s-text-center t4s_rationt t4s_position_8 t4s_cover t4s-row t4s-justify-content-center t4s-row-cols-2 t4s-row-cols-md-2 t4s-row-cols-lg-4 t4s-gx-md-30 t4s-gy-md-30 t4s-gx-10 t4s-gy-10")
    if main_container:
        print("Main Container found!")

        # Find all individual product containers
        product_containers = main_container.find_all('div', class_="t4s-product-card")
        print(f"Found {len(product_containers)} products.")

        # Prepare CSV file for output
        with open("heritage_diamond_rings.csv", "w", newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Title', 'Price', 'Link', 'Image URL', 'Description']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            # Iterate over each product container
            for product in product_containers:
                # Extract product title
                product_title_tag = product.find('a', class_="t4s-product-title")
                product_title = product_title_tag.text.strip() if product_title_tag else "No title"

                # Extract product price
                product_price_tag = product.find('span', class_="money")
                product_price = product_price_tag.text.strip() if product_price_tag else "No price"

                # Extract product link
                product_link_tag = product.find('a', class_="t4s-product-title", href=True)
                product_link = f"https://heritagejewels.com.pk{product_link_tag['href']}" if product_link_tag else "No link"

                # Extract product image URL
                product_image_tag = product.find('img', class_="t4s-product-image")
                product_image_url = product_image_tag['src'] if product_image_tag else "No image URL"

                # Request the product's detailed page for the description
                product_description = "No description"
                if product_link != "No link":
                    product_response = requests.get(product_link)
                    if product_response.status_code == 200:
                        product_soup = BeautifulSoup(product_response.content, 'html.parser')
                        description_tag = product_soup.find('div', class_="product-single__description rte")
                        product_description = description_tag.get_text(separator=' ', strip=True) if description_tag else "No description"

                # Write product data to CSV
                writer.writerow({
                    'Title': product_title,
                    'Price': product_price,
                    'Link': product_link,
                    'Image URL': product_image_url,
                    'Description': product_description
                })

                # Print product data to console for debugging
                print(f"Title: {product_title}")
                print(f"Price: {product_price}")
                print(f"Link: {product_link}")
                print(f"Image URL: {product_image_url}")
                print(f"Description: {product_description}\n")

    else:
        print("Main container not found. Check the HTML structure or class names.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Main Container found!
Found 0 products.


In [9]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except:
    print("Timeout waiting for page to load.")
    driver.quit()

# Find the main container holding the products
product_containers = driver.find_elements(By.CSS_SELECTOR, ".t4s-product")

if product_containers:
    print(f"Found {len(product_containers)} products.")
    
    # Loop through each product container and extract data
    for product in product_containers:
        try:
            # Extract product title
            title = product.find_element(By.CSS_SELECTOR, "a").text.strip()

            # Extract product price using the provided selector
            try:
                price = product.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
            except:
                price = "No price available"

            # Extract product link
            product_link = product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()

            # Extract product image URL
            image_url = product.find_element(By.CSS_SELECTOR, "img").get_attribute("src").strip()

            # Print the data
            print(f"Title: {title}")
            print(f"Price: {price}")
            print(f"Link: {product_link}")
            print(f"Image URL: {image_url}")
            print()

        except Exception as e:
            print(f"Error extracting data for a product: {e}")

else:
    print("No products found. Check the CSS selectors or the page structure.")

# Close the WebDriver
driver.quit()


Found 13 products.
Title: 
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

Title: 
Price: Rs.605,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-5-carat-lab-grown-diamond-ring-lgb006
Image URL: data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

Title: 
Price: Rs.437,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-carat-lab-grown-diamond-ring-lgb006
Image URL: data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

Title: 
Price: Rs.195,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/0-50-carat-lab-grown-diamond-ring-lgb005
Image URL: data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==

Title: 
Price: Rs.422,000.00
Link: https://heritagejewels.com.pk/collections

In [1]:
#Scrap description of each product 
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except:
    print("Timeout waiting for page to load.")
    driver.quit()

# Find the main container holding the products
product_containers = driver.find_elements(By.CSS_SELECTOR, ".t4s-product")

if product_containers:
    print(f"Found {len(product_containers)} products.")
    
    # Loop through each product container and extract data
    for product in product_containers:
        try:
            # Extract product title
            title = product.find_element(By.CSS_SELECTOR, "a").text.strip()

            # Extract product price using the provided selector
            try:
                price = product.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
            except:
                price = "No price available"

            # Extract product link
            product_link = product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()

            # Extract product image URL
            image_url = product.find_element(By.CSS_SELECTOR, "img").get_attribute("src").strip()

            # Print the product details
            print(f"Title: {title}")
            print(f"Price: {price}")
            print(f"Link: {product_link}")
            print(f"Image URL: {image_url}")

            # Open the product link to scrape the description
            driver.get(product_link)

            # Wait for the product page to load and extract description
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#ProductInfo-template--template--23621180457267__main__main .t4s-pr__richtext")))
                description_list = driver.find_elements(By.CSS_SELECTOR, "#ProductInfo-template--template--23621180457267__main__main .t4s-pr__richtext ul li")
                
                description = ""
                for item in description_list:
                    description += item.text.strip() + " "
                
                print(f"Description: {description}")
            
            except Exception as e:
                print(f"Error extracting description: {e}")

            print("\n---\n")

        except Exception as e:
            print(f"Error extracting data for a product: {e}")

else:
    print("No products found. Check the CSS selectors or the page structure.")

# Close the WebDriver
driver.quit()


Found 13 products.
Title: 
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging 

---

Error extracting data for a product: Message: stale element reference: stale element not found
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x0032FD53+23747]
	(No symbol) [0x002B7D54]
	(No symbol) [0x0018BE53]
	(No symbol) [0x0019B889]
	(No symbol) [0x0019A945]
	(No symbol) [0x001922B3]
	(No symbol) [0x00190578]
	(No symbol) [0x0019387A]
	(No symbol) [0x001938F7]
	(No symbol) [0x001CF8B9]
	(No symbol) [0x001CFEEB]
	(

In [7]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except Exception as e:
    print(f"Timeout waiting for page to load: {e}")
    driver.quit()

# Find product links dynamically to avoid stale references
try:
    product_links = [
        product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()
        for product in driver.find_elements(By.CSS_SELECTOR, ".t4s-product")
    ]
except Exception as e:
    print(f"Error extracting product links: {e}")
    driver.quit()

print(f"Found {len(product_links)} products.")

# Loop through product links to scrape details
for product_link in product_links:
    try:
        # Navigate to the product page
        driver.get(product_link)

        # Wait for the product page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".t4s-pr__richtext"))
        )

        # Extract product title
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
        except:
            title = "No title available"

        # Extract product price
        try:
            price = driver.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
        except:
            price = "No price available"

        # Extract product image
        try:
            image_url = driver.find_element(By.CSS_SELECTOR, "img").get_attribute("src").strip()
        except:
            image_url = "No image URL available"

        # Extract product description
        try:
            description_list = driver.find_elements(By.CSS_SELECTOR, ".t4s-pr__richtext ul li")
            description = " ".join([item.text.strip() for item in description_list])
        except:
            description = "No description available"

        # Print the product details
        print(f"Title: {title}")
        print(f"Price: {price}")
        print(f"Link: {product_link}")
        print(f"Image URL: {image_url}")
        print(f"Description: {description}")
        print("\n---\n")

    except Exception as e:
        print(f"Error processing product link {product_link}: {e}")

# Close the WebDriver
driver.quit()


Found 13 products.
Title: 2 Carat Lab Grown Diamond Ring - LGB008
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging

---

Title: 1.5 Carat Lab Grown Diamond Ring - LGB007
Price: Rs.605,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-5-carat-lab-grown-diamond-ring-lgb006
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging

---

Title: 1 Carat Lab Grown Diamond Ring - LGB006
Price: Rs.437,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-

In [8]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except Exception as e:
    print(f"Timeout waiting for page to load: {e}")
    driver.quit()

# Find product links dynamically to avoid stale references
try:
    product_links = [
        product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()
        for product in driver.find_elements(By.CSS_SELECTOR, ".t4s-product")
    ]
except Exception as e:
    print(f"Error extracting product links: {e}")
    driver.quit()

print(f"Found {len(product_links)} products.")

# CSV file setup
csv_file = "products.csv"
fields = ["Title", "Price", "Link", "Image URL", "Description"]

# Write headers to the CSV file
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(fields)

# Loop through product links to scrape details
for product_link in product_links:
    try:
        # Navigate to the product page
        driver.get(product_link)

        # Wait for the product page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".t4s-pr__richtext"))
        )

        # Extract product title
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
        except:
            title = "No title available"

        # Extract product price
        try:
            price = driver.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
        except:
            price = "No price available"

        # Extract product image
        try:
            image_url = driver.find_element(By.CSS_SELECTOR, "img").get_attribute("src").strip()
        except:
            image_url = "No image URL available"

        # Extract product description
        try:
            description_list = driver.find_elements(By.CSS_SELECTOR, ".t4s-pr__richtext ul li")
            description = " ".join([item.text.strip() for item in description_list])
        except:
            description = "No description available"

        # Print the product details
        print(f"Title: {title}")
        print(f"Price: {price}")
        print(f"Link: {product_link}")
        print(f"Image URL: {image_url}")
        print(f"Description: {description}")
        print("\n---\n")

        # Write data to CSV file
        with open(csv_file, "a", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow([title, price, product_link, image_url, description])

    except Exception as e:
        print(f"Error processing product link {product_link}: {e}")

# Close the WebDriver
driver.quit()

print(f"Data has been saved to {csv_file}.")


Found 12 products.
Title: 2 Carat Lab Grown Diamond Ring - LGB008
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging

---

Title: 1.5 Carat Lab Grown Diamond Ring - LGB007
Price: Rs.605,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-5-carat-lab-grown-diamond-ring-lgb006
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging

---

Title: 1 Carat Lab Grown Diamond Ring - LGB006
Price: Rs.437,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-

In [9]:
#Csv saved with colous column
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except Exception as e:
    print(f"Timeout waiting for page to load: {e}")
    driver.quit()

# Find product links dynamically to avoid stale references
try:
    product_links = [
        product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()
        for product in driver.find_elements(By.CSS_SELECTOR, ".t4s-product")
    ]
except Exception as e:
    print(f"Error extracting product links: {e}")
    driver.quit()

print(f"Found {len(product_links)} products.")

# CSV file setup
csv_file = "products_with_color.csv"
fields = ["Title", "Price", "Link", "Image URL", "Description", "Colour"]

# Write headers to the CSV file
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(fields)

# Function to extract color from the description
def extract_color(description):
    match = re.search(r"(\b[A-Z]+\s*[\+\-]?\s*Colour\b)", description)
    return match.group(1) if match else "No color specified"

# Loop through product links to scrape details
for product_link in product_links:
    try:
        # Navigate to the product page
        driver.get(product_link)

        # Wait for the product page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".t4s-pr__richtext"))
        )

        # Extract product title
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
        except:
            title = "No title available"

        # Extract product price
        try:
            price = driver.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
        except:
            price = "No price available"

        # Extract product image
        try:
            image_url = driver.find_element(By.CSS_SELECTOR, "img").get_attribute("src").strip()
        except:
            image_url = "No image URL available"

        # Extract product description
        try:
            description_list = driver.find_elements(By.CSS_SELECTOR, ".t4s-pr__richtext ul li")
            description = " ".join([item.text.strip() for item in description_list])
        except:
            description = "No description available"

        # Extract color from the description
        color = extract_color(description)

        # Print the product details
        print(f"Title: {title}")
        print(f"Price: {price}")
        print(f"Link: {product_link}")
        print(f"Image URL: {image_url}")
        print(f"Description: {description}")
        print(f"Colour: {color}")
        print("\n---\n")

        # Write data to CSV file
        with open(csv_file, "a", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow([title, price, product_link, image_url, description, color])

    except Exception as e:
        print(f"Error processing product link {product_link}: {e}")

# Close the WebDriver
driver.quit()

print(f"Data has been saved to {csv_file}.")


Found 13 products.
Title: 2 Carat Lab Grown Diamond Ring - LGB008
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging
Colour: G+ Colour

---

Title: 1.5 Carat Lab Grown Diamond Ring - LGB007
Price: Rs.605,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-5-carat-lab-grown-diamond-ring-lgb006
Image URL: https://heritagejewels.com.pk/cdn/shop/files/png.png?v=1729942276&width=95
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging
Colour: G+ Colour

---

Title: 1 Carat Lab Grown Diamond Ring - LGB006
Price: Rs.437,000.00
Link: https://heritagejewels.com.pk/c

In [10]:
#csv saved with colour and correct image url  ---image url not working
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# URL of the webpage to scrape
url = "https://heritagejewels.com.pk/collections/diamond-rings"

# Setup Chrome options
options = Options()
options.headless = True  # Run in headless mode (without opening a browser window)

# Use webdriver-manager to automatically manage and install ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Open the URL
driver.get(url)

# Wait for the products to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.t4s-product')))
except Exception as e:
    print(f"Timeout waiting for page to load: {e}")
    driver.quit()

# Find product links dynamically to avoid stale references
try:
    product_links = [
        product.find_element(By.CSS_SELECTOR, "a").get_attribute("href").strip()
        for product in driver.find_elements(By.CSS_SELECTOR, ".t4s-product")
    ]
except Exception as e:
    print(f"Error extracting product links: {e}")
    driver.quit()

print(f"Found {len(product_links)} products.")

# CSV file setup
csv_file = "products_with_color_and_correct_images.csv"
fields = ["Title", "Price", "Link", "Image URL", "Description", "Colour"]

# Write headers to the CSV file
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(fields)

# Function to extract color from the description
def extract_color(description):
    match = re.search(r"(\b[A-Z]+\s*[\+\-]?\s*Colour\b)", description)
    return match.group(1) if match else "No color specified"

# Loop through product links to scrape details
for product_link in product_links:
    try:
        # Navigate to the product page
        driver.get(product_link)

        # Wait for the product page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".t4s-pr__richtext"))
        )

        # Extract product title
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1").text.strip()
        except:
            title = "No title available"

        # Extract product price
        try:
            price = driver.find_element(By.CSS_SELECTOR, ".t4s-product-price").text.strip()
        except:
            price = "No price available"

        # Extract product image
        try:
            image_element = driver.find_element(By.CSS_SELECTOR, 
                "#shopify-section-template--23621180457267__main .t4s-product__media-item.is-selected > div > img"
            )
            image_url = image_element.get_attribute("src").strip()
        except Exception as e:
            image_url = f"No image URL available ({e})"

        # Extract product description
        try:
            description_list = driver.find_elements(By.CSS_SELECTOR, ".t4s-pr__richtext ul li")
            description = " ".join([item.text.strip() for item in description_list])
        except:
            description = "No description available"

        # Extract color from the description
        color = extract_color(description)

        # Print the product details
        print(f"Title: {title}")
        print(f"Price: {price}")
        print(f"Link: {product_link}")
        print(f"Image URL: {image_url}")
        print(f"Description: {description}")
        print(f"Colour: {color}")
        print("\n---\n")

        # Write data to CSV file
        with open(csv_file, "a", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow([title, price, product_link, image_url, description, color])

    except Exception as e:
        print(f"Error processing product link {product_link}: {e}")

# Close the WebDriver
driver.quit()

print(f"Data has been saved to {csv_file}.")


Found 13 products.
Title: 2 Carat Lab Grown Diamond Ring - LGB008
Price: Rs.902,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/2-carat-lab-grown-diamond-ring-lgb001
Image URL: data:image/svg+xml,%3Csvg%20viewBox%3D%220%200%201667%201667%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3C%2Fsvg%3E
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging
Colour: G+ Colour

---

Title: 1.5 Carat Lab Grown Diamond Ring - LGB007
Price: Rs.605,000.00
Link: https://heritagejewels.com.pk/collections/diamond-rings/products/1-5-carat-lab-grown-diamond-ring-lgb006
Image URL: data:image/svg+xml,%3Csvg%20viewBox%3D%220%200%201667%201667%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3C%2Fsvg%3E
Description: VVS2+ Clarity G+ Colour GIA & IGI Certified Exchange/Upgrade: 70% Refund: 60% Yellow & White Gold Expert Craftmanship Premium Packaging
Colour: G+ Colo