In [3]:
import requests
import pandas as pd
import numpy as np
import os
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

IMAGE_FOLDER = "product_images"
os.makedirs(IMAGE_FOLDER, exist_ok=True)

CSV_FILE = "amazon_data.csv"
MAX_PRODUCTS_PER_CATEGORY = 60

def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": "productTitle"})
        return title.text.strip() if title else None
    except:
        return None

def get_price(soup):
    try:
        price = soup.find("span", attrs={"class": "a-price-whole"})
        return price.text.strip() if price else None
    except:
        return None

def get_rating(soup):
    try:
        rating = soup.find("span", attrs={"class": "a-icon-alt"})
        return rating.text.split()[0] if rating else None
    except:
        return None

def get_image(soup, product_id):
    try:
        img_tag = soup.find("img", attrs={"id": "landingImage"})
        img_url = img_tag["src"] if img_tag else None
        if img_url:
            img_extension = os.path.splitext(urlparse(img_url).path)[-1]
            img_filename = f"{product_id}{img_extension}"

            img_path = os.path.join(IMAGE_FOLDER, img_filename)

            img_data = requests.get(img_url, timeout=10).content
            with open(img_path, "wb") as img_file:
                img_file.write(img_data)

            return img_filename
        return None
    except:
        return None

def scrape_amazon_products(url, category, gender, style):
    try:
        webpage = requests.get(url, headers=HEADERS, timeout=10)
        webpage.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return {  
            "title": [], "price": [], "rating": [], "product_link": [], 
            "image_filename": [], "category": [], "gender": [], "style": []
        }  

    soup = BeautifulSoup(webpage.content, "html.parser")
    links = soup.find_all("a", attrs={"class": "a-link-normal s-no-outline"})

    links_list = [link.get("href") for link in links if link.get("href")][:MAX_PRODUCTS_PER_CATEGORY]

    data = {
        "title": [], "price": [], "rating": [], "product_link": [], 
        "image_filename": [], "category": [], "gender": [], "style": []
    }

    for index, link in enumerate(links_list):
        if index >= MAX_PRODUCTS_PER_CATEGORY:
            break

        full_link = urljoin("https://www.amazon.com", link)
        product_id = f"product_{int(time.time())}_{index}"  

        try:
            new_webpage = requests.get(full_link, headers=HEADERS, timeout=10)
            new_webpage.raise_for_status()
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            title = get_title(new_soup)
            price = get_price(new_soup)
            rating = get_rating(new_soup)
            image_filename = get_image(new_soup, product_id)

            if title:  
                data["title"].append(title)
                data["price"].append(price)
                data["rating"].append(rating)
                data["product_link"].append(full_link)
                data["image_filename"].append(image_filename)
                data["category"].append(category)
                data["gender"].append(gender)
                data["style"].append(style)

            time.sleep(2)  

        except requests.exceptions.RequestException as e:
            # print(f"Error fetching product page {full_link}: {e}")
            continue  

    return data 

if __name__ == "__main__":
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Accept-Language": "en-US, en;q=0.5",
    }

    URLS = [
        ("https://www.amazon.com/s?k=formal+shirts+for+men", "Shirt", "Men", "Formal"),
        ("https://www.amazon.com/s?k=half+sleeves+shirts+for+men", "Shirt", "Men", "Casual"),
        ("https://www.amazon.com/s?k=formal+pants+for+men","Pants","Men","Formal"),
        ("https://www.amazon.com/s?k=jeans+for+men+baggy","Pants","Men","Casual"),
    ]

    final_data = {
        "title": [], "price": [], "rating": [], "product_link": [], 
        "image_filename": [], "category": [], "gender": [], "style": []
    }

    for url, category, gender, style in URLS:
        scraped_data = scrape_amazon_products(url, category, gender, style)

        for key in final_data.keys():
            final_data[key].extend(scraped_data[key])

    amazon_df = pd.DataFrame.from_dict(final_data)
    amazon_df.replace("", np.nan, inplace=True)
    amazon_df.dropna(subset=["title"], inplace=True)

    try:
        if os.path.exists(CSV_FILE):
            amazon_df.to_csv(CSV_FILE, mode="a", header=False, index=False)
        else:
            amazon_df.to_csv(CSV_FILE, index=False)
    except Exception as e:
        print(f"Error saving CSV: {e}")

    print(amazon_df)
    print("Data saved successfully!")

                                                 title price rating  \
0    Amazon Essentials Henley Long Sleeve Shirts fo...   19.    4.6   
1    J.VER Men's Dress Shirts Solid Long Sleeve Str...   23.    4.4   
2    J.VER Men's Dress Shirts Stretch Stain Shield ...  None    4.5   
3    Men's Stretch Wrinkle Free Dress Shirts Formal...  None    4.4   
4    Van Heusen Men's Dress Shirt Regular Fit Popli...   38.    4.4   
..                                                 ...   ...    ...   
235  Ylingjun Mens Baggy Hip Hop Jeans Casual Loose...   63.    4.3   
236  Baggy Cargo Pants for Men Women Wide Leg Jeans...   44.    4.0   
237  Men's Baggy Vintage Jeans Loose Fit Denim Pant...   40.    4.4   
238  Men Grunge Jeans Vintage Baggy Wide Leg Hip Ho...   60.    4.7   
239                   mnml Men's Baggy Every Day Denim   37.    3.6   

                                          product_link  \
0    https://www.amazon.com/sspa/click?ie=UTF8&spc=...   
1    https://www.amazon.com/J-V