In [22]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [23]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("span", attrs={"class":'a-price-whole'}).text

    except AttributeError:

        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except:
            price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star-mini a-star-mini-3-5 mvt-cm-cr-review-stars-mini'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available

# Function to extract Brand
def get_brand(soup):
    try:
        brand = soup.find("span", attrs={"class":'a-size-base po-break-word'}).text.strip()
    except AttributeError:
        
        brand = "Not Available"	

    return brand

# Function to extract Offer
def get_off(soup):
    try:
       off = soup.find("span", attrs={"class":"a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage"}).text
    except AttributeError:
        off = "Not Available"

    return off
        

def get_bought(soup):
    try:
        bought=soup.find("span", attrs={"id":'social-proofing-faceout-title-tk_bought'}).text.strip()
    except AttributeError:
        
        bought = "Not Available"	

    return bought

In [24]:
if __name__ == '__main__':

    # Add your user agent
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    # Base URL for pagination
    BASE_URL = "https://www.amazon.in/s?k=laptop&crid=2R49WXP0T6MHX"

    # Number of pages to scrape
    NUM_PAGES = 4  

    # Store product links
    links_list = []

    # Loop through multiple pages
    for page in range(1, NUM_PAGES + 1):
        print(f"Scraping Page {page}...")

        # Construct paginated URL
        url = f"{BASE_URL}&page={page}"
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        # Fetch product links
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
        for link in links:
            links_list.append("https://www.amazon.in" + link.get('href'))

        # Delay to prevent getting blocked
        time.sleep(2)

    print(f"Total Products Found: {len(links_list)}")

    # Dictionary to store product data
    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [], "brand": [], "category": [], "off": [], "bought": []}

    # Loop for extracting product details from each link 
    for idx, link in enumerate(links_list):
        print(f"Scraping product {idx+1} of {len(links_list)}")

        new_webpage = requests.get(link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Append extracted data (ensuring consistent length)
        d['title'].append(get_title(new_soup) or "")
        d['price'].append(get_price(new_soup) or "")
        d['rating'].append(get_rating(new_soup) or "")
        d['reviews'].append(get_review_count(new_soup) or "")
        d['availability'].append(get_availability(new_soup) or "")
        d['brand'].append(get_brand(new_soup) or "")
        d['category'].append("Laptop")  # Manually assign 'Laptop' as category
        d['off'].append(get_off(new_soup) or "")
        d['bought'].append(get_bought(new_soup) or "")

        # Delay between product requests to avoid detection
        time.sleep(0.5)

    # Save data to CSV
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)

    print("Scraping completed. Data saved to 'amazon_data.csv'")


Scraping Page 1...
Scraping Page 2...
Scraping Page 3...
Scraping Page 4...
Total Products Found: 88
Scraping product 1 of 88
Scraping product 2 of 88
Scraping product 3 of 88
Scraping product 4 of 88
Scraping product 5 of 88
Scraping product 6 of 88
Scraping product 7 of 88
Scraping product 8 of 88
Scraping product 9 of 88
Scraping product 10 of 88
Scraping product 11 of 88
Scraping product 12 of 88
Scraping product 13 of 88
Scraping product 14 of 88
Scraping product 15 of 88
Scraping product 16 of 88
Scraping product 17 of 88
Scraping product 18 of 88
Scraping product 19 of 88
Scraping product 20 of 88
Scraping product 21 of 88
Scraping product 22 of 88
Scraping product 23 of 88
Scraping product 24 of 88
Scraping product 25 of 88
Scraping product 26 of 88
Scraping product 27 of 88
Scraping product 28 of 88
Scraping product 29 of 88
Scraping product 30 of 88
Scraping product 31 of 88
Scraping product 32 of 88
Scraping product 33 of 88
Scraping product 34 of 88
Scraping product 35 of 8

In [25]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,brand,category,off,bought
0,Lenovo V15 G4 AMD Ryzen 5 7520U 15.6 inch FHD ...,34990.,3.0 out of 5 stars,1 rating,In stock,Lenovo,Laptop,-42%,Not Available
1,"Apple 2025 MacBook Air (13-inch, Apple M4 chip...",99900.,Previous page,,In stock,Apple,Laptop,Not Available,Not Available
2,"Lenovo V15 G3 (2024), Intel Core i3 12th Gen 1...",32980.,4.2 out of 5 stars,4 ratings,In stock,Lenovo,Laptop,-60%,50+ bought in past month
3,"Acer Aspire Lite, AMD Ryzen 5 5625U Processor(...",34490.,4.0 out of 5 stars,"1,121 ratings",In stock,acer,Laptop,-42%,1K+ bought in past month
4,Lᥱnovo ThinkPad Touch Screen T490 Laptop Intᥱl...,22990.,Previous page,,In stock,Generic,Laptop,-8%,Not Available
...,...,...,...,...,...,...,...,...,...
83,Lenovo IdeaPad 3 14 Inch FHD Laptop (12th Gen ...,28891.,Previous page,,Only 1 left in stock.,Lenovo,Laptop,Not Available,Not Available
84,Apple 2024 MacBook Pro Laptop with M4 Pro chip...,191990.,5.0 out of 5 stars,5 ratings,In stock,Apple,Laptop,-4%,50+ bought in past month
85,Lenovo V15 G4 AMD Athlon Silver 7120U Laptop 8...,24990.,3.9 out of 5 stars,133 ratings,In stock,Lenovo,Laptop,-34%,50+ bought in past month
86,"Apple 2025 MacBook Air (13-inch, Apple M4 chip...",99900.,Previous page,,In stock,Apple,Laptop,Not Available,Not Available


In [26]:
if __name__ == '__main__':

    # Add your user agent
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    # Base URL for pagination
    BASE_URL = "https://www.amazon.in/s?k=phones&crid=Z3FWW1OLZ58"

    # Number of pages to scrape
    NUM_PAGES = 4  

    # Store product links
    links_list = []

    # Loop through multiple pages
    for page in range(1, NUM_PAGES + 1):
        print(f"Scraping Page {page}...")

        # Construct paginated URL
        url = f"{BASE_URL}&page={page}"
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        # Fetch product links
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
        for link in links:
            links_list.append("https://www.amazon.in" + link.get('href'))

        # Delay to prevent getting blocked
        time.sleep(2)

    print(f"Total Products Found: {len(links_list)}")

    # Dictionary to store product data
    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [], "brand": [], "category": [], "off": [], "bought": []}

    # Loop for extracting product details from each link 
    for idx, link in enumerate(links_list):
        print(f"Scraping product {idx+1} of {len(links_list)}")

        new_webpage = requests.get(link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Append extracted data (ensuring consistent length)
        d['title'].append(get_title(new_soup) or "")
        d['price'].append(get_price(new_soup) or "")
        d['rating'].append(get_rating(new_soup) or "")
        d['reviews'].append(get_review_count(new_soup) or "")
        d['availability'].append(get_availability(new_soup) or "")
        d['brand'].append(get_brand(new_soup) or "")
        d['category'].append("SmartPhone")  # Manually assign 'Laptop' as category
        d['off'].append(get_off(new_soup) or "")
        d['bought'].append(get_bought(new_soup) or "")

        # Delay between product requests to avoid detection
        time.sleep(0.5)

    # Save data to CSV
    amazon_df_2 = pd.DataFrame.from_dict(d)
    amazon_df_2.to_csv("amazon_data_2.csv", header=True, index=False)

    print("Scraping completed. Data saved to 'amazon_data_2.csv'")


Scraping Page 1...
Scraping Page 2...
Scraping Page 3...
Scraping Page 4...
Total Products Found: 88
Scraping product 1 of 88
Scraping product 2 of 88
Scraping product 3 of 88
Scraping product 4 of 88
Scraping product 5 of 88
Scraping product 6 of 88
Scraping product 7 of 88
Scraping product 8 of 88
Scraping product 9 of 88
Scraping product 10 of 88
Scraping product 11 of 88
Scraping product 12 of 88
Scraping product 13 of 88
Scraping product 14 of 88
Scraping product 15 of 88
Scraping product 16 of 88
Scraping product 17 of 88
Scraping product 18 of 88
Scraping product 19 of 88
Scraping product 20 of 88
Scraping product 21 of 88
Scraping product 22 of 88
Scraping product 23 of 88
Scraping product 24 of 88
Scraping product 25 of 88
Scraping product 26 of 88
Scraping product 27 of 88
Scraping product 28 of 88
Scraping product 29 of 88
Scraping product 30 of 88
Scraping product 31 of 88
Scraping product 32 of 88
Scraping product 33 of 88
Scraping product 34 of 88
Scraping product 35 of 8

In [27]:
if __name__ == '__main__':

    # Add your user agent
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    # Base URL for pagination
    BASE_URL = "https://www.amazon.in/s?k=headphones&crid=39EWCXSJBVWR4"

    # Number of pages to scrape
    NUM_PAGES = 4  

    # Store product links
    links_list = []

    # Loop through multiple pages
    for page in range(1, NUM_PAGES + 1):
        print(f"Scraping Page {page}...")

        # Construct paginated URL
        url = f"{BASE_URL}&page={page}"
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")

        # Fetch product links
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
        for link in links:
            links_list.append("https://www.amazon.in" + link.get('href'))

        # Delay to prevent getting blocked
        time.sleep(2)

    print(f"Total Products Found: {len(links_list)}")

    # Dictionary to store product data
    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [], "brand": [], "category": [], "off": [], "bought": []}

    # Loop for extracting product details from each link 
    for idx, link in enumerate(links_list):
        print(f"Scraping product {idx+1} of {len(links_list)}")

        new_webpage = requests.get(link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Append extracted data (ensuring consistent length)
        d['title'].append(get_title(new_soup) or "")
        d['price'].append(get_price(new_soup) or "")
        d['rating'].append(get_rating(new_soup) or "")
        d['reviews'].append(get_review_count(new_soup) or "")
        d['availability'].append(get_availability(new_soup) or "")
        d['brand'].append(get_brand(new_soup) or "")
        d['category'].append("HeadPhones")  # Manually assign 'Laptop' as category
        d['off'].append(get_off(new_soup) or "")
        d['bought'].append(get_bought(new_soup) or "")

        # Delay between product requests to avoid detection
        time.sleep(0.5)

    # Save data to CSV
    amazon_df_3 = pd.DataFrame.from_dict(d)
    amazon_df_3.to_csv("amazon_data_3.csv", header=True, index=False)

    print("Scraping completed. Data saved to 'amazon_data_3.csv'")


Scraping Page 1...
Scraping Page 2...
Scraping Page 3...
Scraping Page 4...
Total Products Found: 88
Scraping product 1 of 88
Scraping product 2 of 88
Scraping product 3 of 88
Scraping product 4 of 88
Scraping product 5 of 88
Scraping product 6 of 88
Scraping product 7 of 88
Scraping product 8 of 88
Scraping product 9 of 88
Scraping product 10 of 88
Scraping product 11 of 88
Scraping product 12 of 88
Scraping product 13 of 88
Scraping product 14 of 88
Scraping product 15 of 88
Scraping product 16 of 88
Scraping product 17 of 88
Scraping product 18 of 88
Scraping product 19 of 88
Scraping product 20 of 88
Scraping product 21 of 88
Scraping product 22 of 88
Scraping product 23 of 88
Scraping product 24 of 88
Scraping product 25 of 88
Scraping product 26 of 88
Scraping product 27 of 88
Scraping product 28 of 88
Scraping product 29 of 88
Scraping product 30 of 88
Scraping product 31 of 88
Scraping product 32 of 88
Scraping product 33 of 88
Scraping product 34 of 88
Scraping product 35 of 8

In [28]:
import pandas as pd
import glob

# List of CSV files to merge
csv_files = ["amazon_data.csv", "amazon_data_2.csv", "amazon_data_3.csv"]  # Replace with actual filenames

# Read and merge all CSV files
df_list = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(df_list, ignore_index=True)

# Save the merged file
merged_df.to_csv("tech_gadgets_data.csv", index=False)

print("CSV files merged successfully into 'merged_data.csv'")

CSV files merged successfully into 'merged_data.csv'
