In [18]:
# Importing necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import random
import re

In [None]:
# Function to determine the maximum number of pages available for the given search query.
def get_max_pages(url, headers):
    """
    Logic:
    1. Send a GET request to fetch the webpage content.
    2. Parse the HTML using BeautifulSoup.
    3. Identify pagination elements and extract the second last number (max page count).
    4. If no pagination found, return 1 (default to prevent errors).
    """
    try:
        response = requests.get(url, headers=headers)                    # Fetch the webpage
        response.raise_for_status()                                      # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, 'html.parser')            # Parse HTML content
        page_numbers = soup.select(".s-pagination-item")                 # Find pagination elements

        if page_numbers:
            return int(page_numbers[-2].text.strip())                    # Extract the second last page number
        return 1                                                         # Default to 1 page if pagination isn't found
    except requests.exceptions.RequestException as e:
        print(f"Error getting max pages: {e}")
        return 1                                                         # Return 1 in case of errors to continue execution safely



In [None]:
#Function to scrape laptop details from an Amazon page.
def scrape_amazon_laptops(page, headers):
    """
    Logic:
    1. Construct the URL for the given page number.
    2. Send a GET request to retrieve page content.
    3. Parse the HTML using BeautifulSoup.
    4. Extract required details: title, image, rating, price, and ad status.
    5. Format rating to extract only numeric value.
    6. Append extracted details to a list and return.
    """
    url = f"https://www.amazon.in/s?k=laptops&page={page}"               # URL for current page

    try:
        response = requests.get(url, headers=headers)                    # Fetch webpage content
        response.raise_for_status()                                      # Raise error for bad HTTP responses
    except requests.exceptions.RequestException as e:
        print(f"Skipping page {page}, request failed: {e}")
        return []                                                        # Return an empty list on failure

    soup = BeautifulSoup(response.content, 'html.parser')                # Parse HTML content
    laptops = []                                                         # List to store extracted laptop data

    for item in soup.select('.s-main-slot .s-result-item'):              # Loop through all laptop items
        # Extract the laptop title
        title_tag = item.find("h2")
        title = title_tag.text.strip() if title_tag else 'No Title'

        # Extract the laptop image URL
        image_tag = item.find("img", class_="s-image")
        image_url = image_tag.get('src', 'No Image') if image_tag else 'No Image'

        # Extract the rating and format it
        rating_tag = item.select_one(".a-icon-star-small .a-icon-alt")
        if rating_tag:
            """
            Extracts numeric rating from text format like '4.5 out of 5 stars'.

            Logic:
            - Use regex to find numeric value before "out of 5 stars".
            - If match found, extract the rating.
            - If no match, assign "No Rating".
            """
            rating_match = re.search(r"([\d.]+) out of 5 stars", rating_tag.text.strip())
            rating = rating_match.group(1) if rating_match else "No Rating"
        else:
            rating = "No Rating"

                # Extract price details (some products may have fraction part)
            price_whole = item.select_one(".a-price-whole")  # Whole part of price
            price_fraction = item.select_one(".a-price-fraction")  # Decimal part

        if price_whole and price_fraction:
            """
            Constructing the full price string without currency symbols.

            Logic:
            - If both whole and fraction parts exist, concatenate them to form a numeric value.
            - If only the whole part exists, use it as the price.
            - If neither is found, assign 'No Price'.
            """
            price = f"{price_whole.text.strip()}.{price_fraction.text.strip()}"
        elif price_whole:
            price = price_whole.text.strip()  # Use only whole part if fraction is missing
        else:
            price = 'No Price'


        # Check if the listing is a sponsored (ad) product
        ad_tag = item.find("span", string=lambda text: text and "Sponsored" in text)
        ad_result = "Sponsored" if ad_tag else "Organic"

        # Append extracted data to list
        laptops.append([image_url, title, rating, price, ad_result])

    return laptops  # Return scraped data for the page


[]

In [None]:
# Initial setup
url = "https://www.amazon.in/s?k=laptops"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}


In [None]:
# Get max number of pages
max_pages = get_max_pages(url, headers)
print(f"Found {max_pages} pages.")                                        # Print the total number of pages to be scraped


Found 20 pages.


In [None]:
# Scrape all pages
all_laptops = []

for page in range(1, max_pages + 1):
    print(f"Scraping page {page}...")
    """
    Adding a delay between requests.

    Logic:
    - Scraping too fast can lead to getting blocked by Amazon.
    - Using `random.uniform(2,5)` to introduce random delay between 2-5 seconds.
    """
    all_laptops.extend(scrape_amazon_laptops(page, headers))
    time.sleep(random.uniform(2, 5))                                        # Random delay to avoid bot detection


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...


In [None]:
# Convert list to DataFrame and clean up data
df = pd.DataFrame(all_laptops, columns=['Image', 'Title', 'Rating', 'Price', 'Ad/Organic Result'])

"""
Filtering out products with missing prices.

Logic:
- Some products may not have a displayed price.
- Keeping only products where price is available.
- Resetting index for a cleaner CSV output.
"""
df = df[df['Price'] != 'No Price']
df.reset_index(drop=True, inplace=True)

In [None]:
# for saving the file in drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Generate timestamped filename
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f'laptops_data_{timestamp}.csv'

# Save location inside Google Drive
drive_folder = "/content/drive/MyDrive/Globussoft Assignment/Task 1/"
csv_path = f"{drive_folder}{csv_filename}"


# Save extracted data to CSV file
df.to_csv(csv_path, index=False)


print(f"Data saved to Google Drive at: {csv_path}")



Data saved to Google Drive at: /content/drive/MyDrive/Globussoft Assignment/Task 1/laptops_data_20250324_153425.csv


In [None]:
df.head(5)

Unnamed: 0,Image,Title,Rating,Price,Ad/Organic Result
0,https://m.media-amazon.com/images/I/71EJdFiOw5...,"ASUS Zenbook A14, Snapdragon X Elite Processor...",No Rating,129990,Sponsored
1,https://m.media-amazon.com/images/I/81v5jg9AAK...,"ASUS Vivobook 16, Snapdragon X Processor,(Qual...",5.0,129990,Sponsored
2,https://m.media-amazon.com/images/I/71VRrc7V-P...,"ASUS Vivobook Go 14, AMD Ryzen 3 7320U, 8GB RA...",3.9,129990,Organic
3,https://m.media-amazon.com/images/I/61fDHkQ6Mq...,Acer Aspire Lite AMD Ryzen 5-5625U Premium Thi...,3.8,129990,Organic
4,https://m.media-amazon.com/images/I/41XYpjPaft...,"Lenovo V14 G3 14"" FHD Laptop, Intel Core i5-12...",3.0,129990,Organic


In [None]:
df.shape

(119, 5)

In [None]:
df_filtered = df[(df["Image"] == 'No Image') & (df['Title']=='No Title') & (df['Rating'] =='No Rating')]
df_filtered.shape

(1, 5)

In [None]:
df_irrelevent_items=df[(df['Rating']=='No Rating') & (df['Title'] == 'No Title') & (df["Image"]=='No Image')]
df_irrelevent_items.shape

(1, 5)

In [None]:
# Create a cleaned copy of df
cleaned_df = df.copy()

# Reset index of the cleaned DataFrame before dropping
cleaned_df.reset_index(drop=True, inplace=True)

# Now drop rows safely
cleaned_df.drop(df_irrelevent_items.index, inplace=True, errors="ignore")

# Save the cleaned DataFrame to a CSV file
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = f'Cleaned_laptops_data_{timestamp}.csv'
cleaned_df.to_csv(csv_filename, index=False)

print(f"Data saved to {csv_filename}")

Data saved to Cleaned_laptops_data_20250324_153208.csv


In [None]:
cleaned_df.shape

(118, 5)

In [None]:
cleaned_df.head(5)

Unnamed: 0,Image,Title,Rating,Price,Ad/Organic Result
0,https://m.media-amazon.com/images/I/71EJdFiOw5...,"ASUS Zenbook A14, Snapdragon X Elite Processor...",No Rating,129990,Sponsored
1,https://m.media-amazon.com/images/I/81v5jg9AAK...,"ASUS Vivobook 16, Snapdragon X Processor,(Qual...",5.0,129990,Sponsored
2,https://m.media-amazon.com/images/I/71VRrc7V-P...,"ASUS Vivobook Go 14, AMD Ryzen 3 7320U, 8GB RA...",3.9,129990,Organic
3,https://m.media-amazon.com/images/I/61fDHkQ6Mq...,Acer Aspire Lite AMD Ryzen 5-5625U Premium Thi...,3.8,129990,Organic
4,https://m.media-amazon.com/images/I/41XYpjPaft...,"Lenovo V14 G3 14"" FHD Laptop, Intel Core i5-12...",3.0,129990,Organic
