In [146]:
# Import required libraries
import requests  # For sending HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd  # For data manipulation and creating DataFrames


In [148]:
# Define a function to scrape data from the provided URL
def scrape(url, timeout=10, max_images=20):
    """Scrape product names, prices, ratings and image URLs from a search page.

    The page is fetched with a browser-like User-Agent and parsed with
    BeautifulSoup. Each column is collected by an independent page-wide
    search, so values that share a row are paired by position only; they are
    not guaranteed to describe the same product.

    Parameters
    ----------
    url : str
        Address of the search-results page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the cell forever.
    max_images : int or None, optional
        Maximum number of <img> tags whose 'src' is collected (default 20).
        None collects every image on the page.

    Returns
    -------
    pandas.DataFrame
        Columns 'Product', 'Price', 'Rating' and 'Image_url'. Shorter columns
        are padded with the string 'N/A' so that all have equal length.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status (4xx/5xx).
    """
    # Define headers to simulate a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it into an
    # empty or meaningless table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each column is located by the CSS classes used on the results page.
    products = [
        tag.text
        for tag in soup.find_all('a', class_="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal")
    ]
    prices = [tag.text for tag in soup.find_all('span', class_="a-price-whole")]
    ratings = [tag.text for tag in soup.find_all('span', class_="a-size-base s-underline-text")]
    # An <img> without a 'src' attribute is recorded with the same 'N/A'
    # marker that the padding below uses, rather than None.
    images = [img.get('src') or 'N/A' for img in soup.find_all('img', limit=max_images)]

    columns = {
        'Product': products,
        'Price': prices,
        'Rating': ratings,
        'Image_url': images,
    }

    # Pad every column to the length of the longest one so the DataFrame
    # constructor receives equal-length lists.
    max_len = max(len(values) for values in columns.values())
    for values in columns.values():
        values.extend(['N/A'] * (max_len - len(values)))

    return pd.DataFrame(columns)

In [150]:
# Run the scraper on the Amazon India search results for "mobile phone under 30000"
df = scrape(
    url="https://www.amazon.in/s?k=mobile+phone+under+30000",
)

In [151]:
# Write the scraped table to 'filp_data.csv' in the working directory; the
# DataFrame's row index is written as the first column.
# NOTE(review): the file name looks like a typo/leftover ('filp') for a notebook
# that scrapes Amazon - confirm nothing depends on it before renaming.
df.to_csv('filp_data.csv', index=True)


In [152]:
# Make sure the 'flip' directory exists to hold the downloaded product images;
# exist_ok=True makes re-running this cell a no-op when it is already there.
import os
os.makedirs(name='flip', exist_ok=True)


In [153]:
# Download every scraped image URL into the 'flip' directory as '<row index>.jpg'.
# The download is best-effort: a failure for one image is reported and the loop
# moves on to the next one.
for i, url in enumerate(df['Image_url']):
    # Skip placeholder rows ('N/A' padding or a missing 'src') and anything
    # that is not an absolute http(s) URL; requests cannot fetch those.
    if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        continue
    try:
        # The timeout keeps one unresponsive host from stalling the whole cell.
        response = requests.get(url, timeout=10)
        # Do not save an HTTP error page under a .jpg name.
        response.raise_for_status()
        with open(f"flip/{i}.jpg", 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        # Only network and file-system errors are tolerated, so KeyboardInterrupt
        # and genuine programming errors still surface.
        print(f"Skipping image {i} ({url}): {exc}")