In [13]:
pip install requests openpyxl pillow


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import os
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Scrape product data from a locally saved Amazon search-results page:
# every "s-result-item" div yields one spreadsheet row (name, price, reviews,
# rating, last-download text) and, when an image URL is present, one image
# file named after that spreadsheet row number.

# --- Configuration ---
HTML_PATH = "Amazon.in_shoes.html"
IMAGE_DIR = "image_scraped"
EXCEL_PATH = "amazon_shoes_data_final.xlsx"
DOWNLOAD_TIMEOUT = 10  # seconds allowed for each image request


def _span_text(container, class_):
    """Return the stripped text of the first <span> under `container` whose
    class matches `class_` (a string or a predicate), or "" if none matches."""
    tag = container.find("span", class_=class_)
    return tag.get_text(strip=True) if tag is not None else ""


def _download_image(url, destination):
    """Download `url` and write the response body to `destination`.

    Raises:
        requests.RequestException: on connection problems, an invalid URL,
            or an HTTP error status (via raise_for_status).
        OSError: if the destination file cannot be written.
    """
    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    # Without this check an HTTP error page would be saved as if it were an image.
    response.raise_for_status()
    with open(destination, "wb") as img_file:
        img_file.write(response.content)


# Create output folder for scraped images
os.makedirs(IMAGE_DIR, exist_ok=True)

# Load HTML file
with open(HTML_PATH, "r", encoding="utf-8") as file:
    html_content = file.read()

# Parse HTML
soup = BeautifulSoup(html_content, "html.parser")
product_divs = soup.find_all("div", class_="s-result-item")
print("Total product divs found:", len(product_divs))

# Setup Excel workbook
wb = Workbook()
ws = wb.active
ws.append(["Name", "Price", "Reviews", "Rating", "Last Download"])  # Header row

row = 2  # Row 1 holds the header; data starts on the second row

for div in product_divs:
    # Each field falls back to "" when its tag is absent from this div.
    name = _span_text(div, lambda x: x and "a-text-normal" in x)
    price = _span_text(div, "a-price-whole")
    rating = _span_text(div, "a-icon-alt")
    reviews = _span_text(div, "a-size-base s-underline-text")
    lastdownloads = _span_text(div, "a-size-base a-color-secondary")

    # Image URL (.get avoids a KeyError when the <img> has no src attribute)
    img_tag = div.find("img", class_="s-image")
    img_url = img_tag.get("src", "") if img_tag is not None else ""

    # Skip if no useful info
    if not (name or price or rating or img_url):
        continue

    # Save image to the image folder; a failed download is reported but does
    # not stop the product's data from being written to the sheet.
    if img_url:
        image_name = f"{IMAGE_DIR}/product_{row}.jpg"
        try:
            _download_image(img_url, image_name)
            print(f"✅ Image saved: {image_name}")
        except (requests.RequestException, OSError) as e:
            print(f"❌ Failed to download image for row {row}: {e}")

    # Write product data into Excel, in the same order as the header row
    for column, value in enumerate((name, price, reviews, rating, lastdownloads), start=1):
        ws.cell(row=row, column=column, value=value)

    row += 1

# Save workbook and report the path that was actually written
wb.save(EXCEL_PATH)
print(f"✅ Excel file saved as {EXCEL_PATH}")


Total product divs found: 84
✅ Image saved: image_scraped/product_4.jpg
✅ Image saved: image_scraped/product_5.jpg
✅ Image saved: image_scraped/product_6.jpg
✅ Image saved: image_scraped/product_7.jpg
✅ Image saved: image_scraped/product_8.jpg
✅ Image saved: image_scraped/product_9.jpg
✅ Image saved: image_scraped/product_10.jpg
✅ Image saved: image_scraped/product_11.jpg
✅ Image saved: image_scraped/product_12.jpg
✅ Image saved: image_scraped/product_13.jpg
✅ Image saved: image_scraped/product_14.jpg
✅ Image saved: image_scraped/product_15.jpg
✅ Image saved: image_scraped/product_16.jpg
✅ Image saved: image_scraped/product_17.jpg
✅ Image saved: image_scraped/product_18.jpg
✅ Image saved: image_scraped/product_19.jpg
✅ Image saved: image_scraped/product_20.jpg
✅ Image saved: image_scraped/product_21.jpg
✅ Image saved: image_scraped/product_22.jpg
✅ Image saved: image_scraped/product_23.jpg
✅ Image saved: image_scraped/product_24.jpg
✅ Image saved: image_scraped/product_25.jpg
✅ Image s