# Amazon Web Scrapping Project

In [2]:
# importing Libraries 

from bs4 import BeautifulSoup
import requests
import pandas as pd 
import numpy as np 
import datetime

In [4]:
# Defining The Functions which we will be using to extract the required data from the webpage

def get_brand(soup): # Brand Name of the Product
    try :
        brand = soup.find(id="bylineInfo")
        return brand.text.strip()[6:]
        
    except AttributeError:
        return ""  
    
def get_title(soup): # Title of the Product
    try :
        title = soup.find(id="productTitle")
        return title.text.strip()

    except AttributeError :
        return ""

def get_price(soup): # Price of the product
    try :
        price = soup.find("span", attrs={"class":"a-price-whole"})
        return price.text.strip()

    except AttributeError :
        return ""

def get_review_count(soup) : # Review Count of the Product 
    try :
        review = soup.find("span", attrs={"class":"a-icon-alt"})
        return review.text.strip()

    except AttributeError:
     return ""
    
def get_color(soup): # Colour of the Product if Mentioned
    try :
        color = soup.find("span", attrs={"class":"selection"})
        return color.text.strip()

    except AttributeError :
        return ""

def get_availablity(soup) : # Availability of the Product
    try :
        availablity = soup.find("span", attrs={"class":"a-size-base a-color-price a-text-bold"})
        return availablity.text.strip()

    except AttributeError :
        return "Available "

def get_date(): # Date on which we are extracing data

    return datetime.date.today()

In [58]:
if __name__ == "__main__":

    current_page = 1
    proceed = True

    # Initialize a dataframe to hold all data
    d = {'Date':[], 'Brand':[], 'Title':[], 'Color':[], 'Price':[], 'Review':[], 'Availability':[], 'product_link':[]}

    while proceed:
        print("Currently scrapping page number: " + str(current_page))

        # Update URL to include the correct pagination query parameter '&page='
        URL = f"https://www.amazon.in/s?k=sneakers+for+men+nike&page={current_page}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            "Accept-Encoding": "gzip, deflate",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "DNT": "1",
            "Connection": "close",
            "Upgrade-Insecure-Requests": "1"
        }

        webpage = requests.get(URL, headers=headers)
        soup = BeautifulSoup(webpage.content, 'html.parser')
        
        # Check if there are no more results to stop the loop
        if not soup.find_all("a", attrs={"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"}):
            print("No more products found. Exiting.")
            proceed = False
            continue

        links = soup.find_all("a", attrs={"class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
        links_list = []

        for link in links:
            links_list.append(link.get('href'))

        for link in links_list:
            new_webpage = requests.get("https://www.amazon.in/" + link, headers=headers)
            new_soup = BeautifulSoup(new_webpage.content, 'html.parser')

            d['Date'].append(get_date())
            d['Brand'].append(get_brand(new_soup))
            d['Title'].append(get_title(new_soup))
            d['Price'].append(get_price(new_soup))
            d['Color'].append(get_color(new_soup))
            d['Review'].append(get_review_count(new_soup))
            d['Availability'].append(get_availablity(new_soup))
            d['product_link'].append("https://www.amazon.in/" + link)

        # Move to the next page
        current_page += 1

    # After all pages are scraped, save the data to csv
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['Title'] = amazon_df['Title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['Title'])
    
    # Save after scraping all pages
    amazon_df.to_csv(r"E:\Web_scrapping_Project\amazon_WebScrapping_Project.csv", header=True, index=False)

Currently scrapping page number: 1
Currently scrapping page number: 2
Currently scrapping page number: 3
No more products found. Exiting.


In [59]:
amazon_df

Unnamed: 0,Date,Brand,Title,Color,Price,Review,Availability,product_link
3,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Sneaker,GREY,1978,4.2 out of 5 stars,Available,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
4,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Canvas Sneaker,NAVY,2309,4.1 out of 5 stars,Available,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
5,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Sneaker,OFF WHITE,2063,4.1 out of 5 stars,Available,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
6,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Sneaker,Black,2751,5.0 out of 5 stars,Available,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
7,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Sneaker,OFF WHITE,2063,4.0 out of 5 stars,Available,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
12,2024-10-20,Nike,Nike Mens Court Vision Lo NnRunning Shoe,BLACK/WHITE-BLACK,3746,3.8 out of 5 stars,Available,https://www.amazon.in//Nike-Court-Vision-White...
17,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Canvas Sneaker,OFF WHITE,1700,3.6 out of 5 stars,Available,https://www.amazon.in//U-S-POLO-ASSN-Sneaker-W...
22,2024-10-20,the U.S. POLO ASSN. Store,U.S. POLO ASSN. Men's Canvas Sneaker,OFF WHITE,2376,4.1 out of 5 stars,Available,https://www.amazon.in//U-S-POLO-ASSN-Sneakers-...
33,2024-10-20,Nike,Nike Mens Blazer Low '77 VNTG Sneaker,WHITE/BLACK-SAIL,5396,4.1 out of 5 stars,Only 1 left in stock.,https://www.amazon.in//Nike-Blazer-Basketball-...
44,2024-10-20,Nike,NIKE mens Court Legacy Nn Running Shoe,WHITE/WHITE-BLACK,3018,3.8 out of 5 stars,Only 5 left in stock.,https://www.amazon.in//Nike-Court-Legacy-Numer...
