In [5]:
import os
import re

import requests
from bs4 import BeautifulSoup


In [None]:

# --- Read the saved listing page from disk into a string ---
with open('smartprix.html', mode='r', encoding='utf-8') as html_file:
    html = html_file.read()

# --- Build the parse tree using Python's built-in HTML parser ---
soup = BeautifulSoup(html, 'html.parser')

# --- Collect every image wrapper <div>; change the tag/class here if the markup differs ---
containers = soup.find_all('div', class_='sm-img-wrap')

# --- Ensure the output folder exists (no error if it is already there) ---
os.makedirs('sm_images', exist_ok=True)



In [None]:
# === Step 5: Loop through containers and download every image ===
# NOTE: nothing is de-duplicated here. RAM/storage variants of the same
# smartphone model each get their own file, so the folder ends up with
# duplicate images of the same device.
for idx, container in enumerate(containers, start=1):
    try:
        img_tag = container.find('img', class_='sm-img')
        if not img_tag:
            continue

        img_url = img_tag.get('src')
        if not img_url:
            # .get() returns None when the attribute is absent; skip explicitly
            # instead of failing on None.startswith below.
            print(f"⚠️ Skipped container {idx}: <img> has no src")
            continue
        if img_url.startswith('//'):
            # Protocol-relative URL: requests needs an explicit scheme.
            img_url = 'https:' + img_url

        # Fall back to a positional name when alt is missing OR empty.
        alt_text = img_tag.get('alt') or f'image_{idx}'
        alt_clean = alt_text.replace(' ', '_').replace('/', '-')  # for safe filename
        file_name = f'{alt_clean}_{idx}.webp'

        # Download image; raise on 4xx/5xx so an error page is never
        # written to disk as if it were an image.
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        with open(os.path.join('sm_images', file_name), 'wb') as f:
            f.write(response.content)

        print(f"✅ Saved: {file_name}")

    except Exception as e:
        # Best-effort: report the failure and carry on with the next container.
        print(f"❌ Error for container {idx}: {e}")

In [None]:
# === Step 6: Track already-saved base models ===
# Duplicate images of the same device are avoided here: the base model name
# (the alt text before the first '(') is remembered in saved_models, and any
# later variant of an already-saved model is skipped.
saved_models = set()

# === Step 7: Loop and download unique images ===
for idx, container in enumerate(containers, start=1):
    try:
        img_tag = container.find('img', class_='sm-img')
        if not img_tag:
            continue

        img_url = img_tag.get('src')
        if not img_url:
            # .get() returns None when the attribute is absent; skip explicitly
            # instead of failing on None.startswith below.
            print(f"⚠️ Skipped image {idx}: <img> has no src")
            continue
        if img_url.startswith('//'):
            # Protocol-relative URL: requests needs an explicit scheme.
            img_url = 'https:' + img_url

        # Fall back to a positional name when alt is missing OR empty, so that
        # images without alt text are not all collapsed into one "model".
        alt_text = img_tag.get('alt') or f'image_{idx}'
        base_model = alt_text.split('(')[0].strip()  # Get model before '('

        # Check if model already saved
        if base_model in saved_models:
            print(f"🔁 Skipped duplicate model: {base_model}")
            continue

        safe_name = re.sub(r'[^\w\-_()]', '_', base_model)
        file_name = f'{safe_name}_{idx}.webp'

        # Download and save; raise on 4xx/5xx so an error page is never
        # written to disk as if it were an image.
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        with open(os.path.join('sm_images', file_name), 'wb') as f:
            f.write(response.content)

        # Mark the model as saved only after the file was written, so a failed
        # download does not cause every later variant of the model to be skipped.
        saved_models.add(base_model)

        print(f"✅ Saved: {file_name}")

    except Exception as e:
        # Best-effort: report the failure and carry on with the next container.
        print(f"❌ Error at image {idx}: {e}")