# Task 1. Python Script for Web Scraping and Automation

# Aim

The aim of this Python script is to automate web scraping of specific data from a website and perform data processing, cleaning, analysis  and performing automation to update. Here we are choosing Amazon website and analyzing data for the product of wireless keyword and mouse.

# Defining Libraries

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import schedule
import time

# Defining Scrape_amazon method to scrap

In [6]:
# Define a function to scrape data from Amazon
def amazon_scrape():

    print("Scraping Started...")    
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.5"
    }

    # Define the base URL for the Amazon search
    base_url = "https://www.amazon.in/s?k=wireless+keyboard+and+mouse&crid=2PHFKUECNYV2P&sprefix=wireless+key%2Caps%2C2170&ref=nb_sb_ss_ts-doa-p_1_11"

    # Initialize lists to store product information
    title = []
    price = []
    rating = []
    available = []

    # Loop through pages to fetch data
    for page in range(1, 6):
        URL = f"{base_url}&page={page}"  
        webpage = requests.get(URL, headers=HEADERS)

        if webpage.status_code == 200:
            soup = BeautifulSoup(webpage.content, "html.parser")
        else:
            print(f"Failed to retrieve page {page}")
            continue
        
        # Find product links on the page
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

        for link in links:
            product_link = "https://www.amazon.in" + link['href']

            try:
                new_webpage = requests.get(product_link, headers=HEADERS)
                new_webpage.raise_for_status()
                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                # Extract product title
                product_title = new_soup.find("span", attrs={"id": 'productTitle'})
                if product_title:
                    title.append(product_title.text.strip())
                    print(product_title.text.strip())
                else:
                    title.append("N/A")

                # Extract product price
                product_price = new_soup.find("span", attrs={"class": "a-price-whole"})
                if product_price:
                    price.append(product_price.text.strip())
                    print(product_price.text.strip())
                else:
                    price.append("N/A")

                # Extract product rating
                product_rating = new_soup.find("span",attrs = {"data-hook":"total-review-count" })
                if product_rating:
                    rating_text = re.sub(r'\D','',product_rating.text)
                    if rating_text:
                        rating.append(rating_text) 
                    else:
                        rating.append(0)
                    print(rating_text)
                else:
                    rating.append(0)

                # Extract product availability
                product_available = new_soup.find("span", attrs={"class": "a-size-medium a-color-success"})
                if product_available:
                    available.append(product_available.text.strip())
                    print(product_available.text.strip())
                else:
                    available.append("N/A")

            except requests.exceptions.RequestException as e:
                print("Failed to retrieve product page:", e)

    # Create a DataFrame from the scraped data and save it to a CSV file
    data = {'Title': title,
            'Price': price,
            'Rating': rating,
            'Availability': available,
            }

    amazon_df = pd.DataFrame(data)
    amazon_df.to_csv("Amazon_data.csv", index=False, header=True, sep=',')

    

    

# Function call to scrape 

In [7]:
# Defining Function call to amazon_scrape() method
amazon_scrape()


Scraping Started...
HP USB Wireless Spill Resistance Keyboard and Mouse Set with 10m Working Range 2.4G Wireless Technology / 3 Years Warranty (4SC12PA), Black
1,098.
8862
In stock
Amazon Basics Wireless Rechargeable Dual Connectivity Keyboard and Mouse Set - Multidevice Wireless Keyboard Full Size and Mouse, for Windows, Mac, Android, Smart TV, Tablet, PC, Laptop
1,999.
39
In stock
Zebronics Zeb-Companion 107 USB Wireless Keyboard and Mouse Set with Nano Receiver (Black)
549.
15067
In stock
Dell USB Wireless Keyboard and Mouse Set- KM3322W, Anti-Fade & Spill-Resistant Keys, up to 36 Month Battery Life, 3Y Advance Exchange Warranty, Black
1,248.
7540
In stock
HP USB Wireless Spill Resistance Keyboard and Mouse Set with 10m Working Range 2.4G Wireless Technology / 3 Years Warranty (4SC12PA), Black
1,098.
8862
In stock
Failed to retrieve product page: HTTPSConnectionPool(host='www.amazon.inhttps', port=443): Max retries exceeded with url: //aax-eu.amazon.in/x/c/RHsAFvEnEBU7RjMfKPpjqHQAAA

# Data Cleaning and Analysis

In [8]:
# Read the scraped data
amazon_df = pd.read_csv("Amazon_data.csv")

# Data Cleaning
amazon_df["Title"] = amazon_df['Title'].str.strip()
amazon_df['Price'] = amazon_df['Price'].str.replace('₹', '').str.replace(',', '').astype(float)
amazon_df['Rating'] = amazon_df['Rating'].astype(int)
amazon_df['Availability'] = amazon_df['Availability'].str.strip()

# Handling Missing Data
amazon_df = amazon_df.replace("N/A", pd.NA).dropna(how="any").replace(pd.NA, "N/A")

# Data Analysis
average_price = amazon_df['Price'].mean()
highest_rated_product = amazon_df[amazon_df['Rating'] == amazon_df['Rating'].max()]
number_of_available_items = len(amazon_df[amazon_df['Availability'] == 'In stock'])

print(f"Average Price: ₹{average_price:.2f}")
print("Highest Rated Product:")
print(highest_rated_product)
print(f"Number of Available Items: {number_of_available_items}")

# Save cleaned data to another CSV file
amazon_df.to_csv("Amazon_data_clean.csv", index=False, header=True, sep=',')

print("Scraping completed")





Average Price: ₹1621.85
Highest Rated Product:
                                               Title   Price  Rating  \
5  Logitech MK215 Wireless Keyboard and Mouse Com...  1199.0   27769   

  Availability  
5     In stock  
Number of Available Items: 96
Scraping completed


# Defining update_daily() method to automate the updation schedule

In [9]:
# Defining Update method 
def update_daily():
    print("Updating data...")
    amazon_scrape()


# Scheduling the update 

In [10]:
# Schedule the update to occur every day at 12:00 PM (adjust the time as needed)
schedule.every().day.at("12:00").do(update_daily)


Every 1 day at 12:00:00 do update_daily() (last run: [never], next run: 2023-11-08 12:00:00)

# Main Loop for Scheduling

In [11]:
# Main loop for Scheduling
while True:
    schedule.run_pending()
    time.sleep(1)
