# Amazon Web Scrape

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import datetime
import os

# Product page whose title and price are tracked by check_price().
url = "https://www.amazon.in/XYXX-Regular-T-Shirt-XY_CR14_Polo-Tshirt_1_Misty/dp/B0CZL9H7J5/ref=sr_1_2_sspa?crid=2V13BWZBLIWF4&dib=eyJ2IjoiMSJ9.Wv-NxNGBS-LNPPftSOJQ68z7fd_WfWoOoBduaf1s3KtilvW6blMv3eov2nPWzdqnvEjlNSmzqd7JyUGYuCGC6uJh64lv3zo0kg9y_D0DiNltpDxz9uK891YW_8mhDON79crZoDAZPTqs7RGWThEvPuI0E2lgEZDed2jt2D-5i2qcFri-PoEZtmrgQM1rtxlrIKGJs7JH2WswiIx9521emxU-mgvEZyBkka8iuRJcFSV5Xf_5zHOHV16qKm-fSoIIoHH_tWV0PD6GsBgitLXZSJTDEEZ3_caEcvFMY_KK2BI.TjYKzcxWkbQ8EAfOh6saw5Ah1Oz_WwMvFsBwJ3nxY7Q&dib_tag=se&keywords=data%2Banalyst%2Btshirt&qid=1750093733&sprefix=data%2Banalyst%2Btshirt%2Caps%2C305&sr=8-2-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&th=1&psc=1"

# Request headers sent with every fetch; only a desktop-browser User-Agent
# string is supplied.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/137.0.0.0 Safari/537.36"
    ),
}

def _join_price(whole_text, fraction_text):
    """Combine the whole and fractional price strings into one price string."""
    # Guard against a trailing decimal separator in the whole-part text, which
    # would otherwise produce values such as "549..99".
    # NOTE(review): assumes the separator may be nested in the whole-part span.
    whole = whole_text.rstrip('.')
    if fraction_text and fraction_text != '00':
        return f"{whole}.{fraction_text}"
    return whole


def _parse_product(soup):
    """Return ``(title, price)`` strings extracted from a parsed product page.

    Either value falls back to its "... not found" placeholder when the
    corresponding element is missing from ``soup``.
    """
    title = "Title not found"
    price = "Price not found"

    title_tag = soup.find('span', {'id': 'productTitle'})
    if title_tag:
        title = title_tag.get_text(strip=True)
        print(f"Extracted Product Title: {title}")
    else:
        print("Product title not found. The page structure might have changed, or the request might have been blocked/redirected to a CAPTCHA.")

    whole_tag = soup.find('span', {'class': 'a-price-whole'})
    if whole_tag:
        fraction_tag = soup.find('span', {'class': 'a-price-fraction'})
        fraction = fraction_tag.get_text(strip=True) if fraction_tag else ""
        price = _join_price(whole_tag.get_text(strip=True), fraction)
        print(f"Extracted Price: {price}")
        return title, price

    # Fallback: take the text of the whole price container.
    container = soup.find('span', {'class': 'a-price', 'data-a-color': 'price'})
    if container:
        price = container.get_text(strip=True)
        print(f"Extracted Price (from container): {price}")
    else:
        print("Price element (a-price-whole, a-price, etc.) not found using common selectors.")
        print("Please inspect the '--- Start of Page Content ---' above to locate the correct HTML elements for the price.")
        print("Look for elements containing the price text and their associated IDs or classes.")
    return title, price


def _append_csv_row(file_name, header, row):
    """Append ``row`` to ``file_name``, writing ``header`` first if the file is new or empty."""
    try:
        # Directory creation is inside the try so that a filesystem error is
        # reported like any other write failure instead of aborting the caller.
        directory = os.path.dirname(file_name)
        if directory:
            os.makedirs(directory, exist_ok=True)

        needs_header = (
            not os.path.exists(file_name) or os.path.getsize(file_name) == 0
        )
        with open(file_name, 'a', newline='', encoding='UTF8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(header)
            writer.writerow(row)
        print(f"Data written to {file_name} successfully.")
    except Exception as e:
        print(f"Error writing to CSV file: {e}")


def check_price():
    """Fetch the product page once and append its title and price to the CSV log.

    Uses the module-level ``url`` and ``headers``. Request, parsing and file
    errors are printed rather than raised. A row is always appended: when the
    fetch or parse fails, the "... not found" placeholders are recorded
    together with today's date.
    """
    product_title = "Title not found"
    extracted_price = "Price not found"

    print(f"\n[{datetime.datetime.now()}] Attempting to fetch the Amazon page...")

    try:
        # No raise_for_status() here: it would raise on every 4xx/5xx response
        # and make the status-specific messages below unreachable.
        page = requests.get(url, headers=headers, timeout=15)
        print(f"HTTP Status Code: {page.status_code}")

        if page.status_code == 200:
            soup = BeautifulSoup(page.content, 'html.parser')

            print("\n--- Start of Page Content (first 500 characters for debug) ---")
            print(soup.prettify()[:500])
            print("\n--- End of Page Content (first 500 characters for debug) ---\n")

            product_title, extracted_price = _parse_product(soup)
        elif page.status_code == 503:
            print("Received a 503 Service Unavailable error. This often indicates Amazon is actively blocking the request or has detected automated access.")
        elif page.status_code == 404:
            print("Received a 404 Not Found error. The URL might be incorrect or the product no longer exists.")
        else:
            print(f"Received an unexpected HTTP status code: {page.status_code}")
            print("Content received (first 500 chars):", page.content.decode('utf-8', errors='ignore')[:500])

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        print("This could be due to network issues, DNS resolution failure, or Amazon blocking the request.")
    except Exception as e:
        # Boundary handler: a parsing surprise must not prevent the CSV write.
        print(f"An unexpected error occurred: {e}")

    today = datetime.date.today().strftime('%Y-%m-%d')
    file_name = r"C:\Users\Ankur\OneDrive\Documents\data analysis\python stuffs\Amazon_Web_Scrape_Dataset_Poject.csv"
    header = ['Title', 'Price', 'Date_Updated']
    data = [product_title, extracted_price, today]

    _append_csv_row(file_name, header, data)

# Run the scraper once per day until interrupted. The __main__ guard keeps the
# endless loop from starting if this file is ever imported as a module.
if __name__ == "__main__":
    SECONDS_PER_DAY = 24 * 60 * 60

    print("\nStarting daily price check. Press Ctrl+C to stop.")
    try:
        while True:
            check_price()
            time.sleep(SECONDS_PER_DAY)
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop, so exit without a traceback.
        print("\nPrice check stopped by user.")



Starting daily price check. Press Ctrl+C to stop.

[2025-06-17 00:55:28.135258] Attempting to fetch the Amazon page...
HTTP Status Code: 200

--- Start of Page Content (first 500 characters for debug) ---
<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in">
 <!-- sp:feature:head-start -->
 <head>
  <script>
   var aPageStart = (new Date()).getTime();
  </script>
  <meta charset="utf-8"/>
  <!-- sp:end-feature:head-start -->
  <!-- sp:feature:csm:head-open-part1 -->
  <script type="text/javascript">
   var ue_t0=ue_t0||+new Date();
  </script>
  <!-- sp:end-feature:csm:head-open-part1 -->
  <!-- sp:feature:cs-optimization -->
  <meta content="on" http-equiv="x-dns-prefetch-co

--- End of Page Content (first 500 characters for debug) ---

Extracted Product Title: XYXX Men's Nova 100% Combed Cotton Regular Fit Polo T-Shirt
Extracted Price: 549
Data written to C:\Users\Ankur\OneDrive\Documents\data analysis\python stuffs\Amazon_Web_Scrape_Dataset_Poject.csv successfu