# Web Scraping Amazon Products [TV]

## Requests Module

In [1]:
!pip install requests

#The requests library is used for making HTTP requests to a specific URL and returns the response. 



*`The Python requests module has several built-in methods for making HTTP requests to a specified URI: GET, POST, PUT, PATCH, and HEAD.`*

Every HTTP response carries a status code that falls into one of five classes:

**1. Informational responses (100 – 199)**

**2. Successful responses (200 – 299)**

**3. Redirection messages (300 – 399)**

**4. Client error responses (400 – 499)**

**5. Server error responses (500 – 599)**

In [2]:
import requests

# Make a plain GET request to the Amazon search page; no custom headers are sent.
r = requests.get('https://www.amazon.in/s?k=TV&rh=n%3A1389396031&ref=nb_sb_noss/')

# Printing the Response object shows its status code in brackets,
# e.g. <Response [200]> for a successful request.
print(r)

# Print the raw response body (bytes) returned by the server.
print(r.content)


<Response [503]>
b'<html>\n<head>\n<title>503 - Service Unavailable Error</title>\n</head>\n<body bgcolor="#FFFFFF" text="#000000">\n\n<!--\n        To discuss automated access to Amazon data please contact api-services-support@amazon.com.\n        For information about migrating to our APIs refer to our Marketplace APIs at https://developer.amazonservices.in/ref=rm_5_sv, or our Product Advertising API at https://affiliate-program.amazon.in/gp/advertising/api/detail/main.html/ref=rm_5_ac for advertising use cases.\n-->\n\n<center>\n<a href="https://www.amazon.in/ref=cs_503_logo/">\n<img src="https://images-eu.ssl-images-amazon.com/images/G/31/x-locale/communities/people/logo.gif" width=200 height=45 alt="Amazon.in" border=0></a>\n<p align=center>\n<font face="Verdana,Arial,Helvetica">\n<font size="+2" color="#CC6600"><b>Oops!</b></font><br>\n<b>It\'s rush hour and traffic is piling up on that page. Please try again in a short while.<br>If you were trying to place an order, it will not 

In [3]:
import requests
from time import sleep
from random import randint

# Function to handle retries
def get_amazon_page(url, retries=3, timeout=10):
    """Fetch ``url`` with browser-like headers, retrying on HTTP 503.

    Parameters
    ----------
    url : str
        Address to request.
    retries : int, optional
        Maximum number of attempts (default 3).
    timeout : float, optional
        Seconds to wait for the server on each attempt before requests
        raises a timeout error (default 10).

    Returns
    -------
    bytes or None
        The response body when an attempt returns status 200; ``None`` when
        every attempt returned 503 or an unexpected status code was received.
    """
    # No explicit Accept-Encoding: requests then advertises only the encodings
    # it can actually decode, so response.content is never an undecoded
    # Brotli stream.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    }

    for attempt in range(1, retries + 1):
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code == 200:
            print("Request successful")
            return response.content

        if response.status_code != 503:
            # Anything other than 200/503 is not retried.
            print(f"Unexpected status code: {response.status_code}")
            break

        print(f"503 Service Unavailable. Retrying ({attempt}/{retries})...")
        # Back off for a random 1-5 seconds, but only when another attempt
        # will actually follow.
        if attempt < retries:
            sleep(randint(1, 5))

    return None

# Search page to fetch
url = 'https://www.amazon.in/s?k=TV&rh=n%3A1389396031&ref=nb_sb_noss/'

# Download it through the retrying helper; the result is None when nothing was retrieved
content = get_amazon_page(url)

# Report the outcome
print("Content retrieved successfully" if content else "Failed to retrieve content after retries")


Request successful
Content retrieved successfully


## BeautifulSoup Library

1. **Importing Libraries:** The code imports the requests library for making HTTP requests and the BeautifulSoup class from the bs4 library for parsing HTML.
1. **Making a GET Request:** It sends a GET request to the Amazon search-results URL (`https://www.amazon.in/s?k=TV&rh=n%3A1389396031&ref=nb_sb_noss`) with a browser-like `User-Agent` header and stores the response in the variable `response`.
1. **Checking Status Code:** It prints the status code of the response, typically 200 for success.
1. **Parsing the HTML:** The HTML content of the response is parsed using BeautifulSoup and stored in the variable soup.
1. **Printing the Prettified HTML:** It prints the prettified version of the parsed HTML content for readability and analysis.

In [4]:
!pip install beautifulsoup4



In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Amazon India search-results URL (query k=TV) that the next cell requests
url = "https://www.amazon.in/s?k=TV&rh=n%3A1389396031&ref=nb_sb_noss"

# Send a desktop-Chrome User-Agent so the request looks like it comes from a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


In [6]:
# Send the HTTP request; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, headers=headers, timeout=10)

# A blocked or throttled request comes back as an error page (e.g. 503). Parsing
# that page would silently yield an empty table, so make the failure visible.
if response.status_code != 200:
    print(f"Request failed with status code {response.status_code}")

# Parse the returned HTML with Python's built-in parser
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# Parallel lists holding the extracted data: the extraction loop appends one
# entry to each list per product, so index i of every list describes the same product.
product_names = []
ratings = []
base_prices = []
mrps = []
service_types = []

In [8]:
# Extracting data
# Rebind the result lists to fresh empty lists first, so re-running this cell
# replaces the previous results instead of appending duplicate rows to them.
product_names, ratings, base_prices, mrps, service_types = [], [], [], [], []

# Walk every result container on the page, then every result item inside it.
for item in soup.find_all('div', class_='s-main-slot s-result-list s-search-results sg-row'):
    for product in item.find_all('div', class_='s-result-item'):
        # Product Name ('N/A' when the item has no matching tag)
        name_tag = product.find('span', class_='a-text-normal')
        product_name = name_tag.text.strip() if name_tag else 'N/A'

        # Rating
        rating_tag = product.find('span', class_='a-icon-alt')
        rating = rating_tag.text.strip() if rating_tag else 'N/A'

        # Base Price
        base_price_tag = product.find('span', class_='a-price-whole')
        base_price = base_price_tag.text.strip() if base_price_tag else 'N/A'

        # MRP
        mrp_tag = product.find('span', class_='a-price a-text-price')
        mrp = mrp_tag.text.strip() if mrp_tag else 'N/A'

        # Service Type
        service_type = 'N/A'  # Amazon doesn't typically list a specific "Service Type" on this page

        # One entry per list per product keeps the five lists aligned.
        product_names.append(product_name)
        ratings.append(rating)
        base_prices.append(base_price)
        mrps.append(mrp)
        service_types.append(service_type)

In [9]:
# Assemble the five parallel lists into one table, one column per list
df = pd.DataFrame(dict(
    Product_name=product_names,
    Rating=ratings,
    Base_Price=base_prices,
    MRP=mrps,
    Service_type=service_types,
))

In [10]:
# Save the DataFrame to CSV; index=False leaves the row index out of the file
df.to_csv('amazon_tv_data.csv', index=False)

# print("Data saved to amazon_tv_data.csv")

## Extract Data from multiple Pages

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def extract_data_from_page(soup):
    """Collect product fields from one parsed search-results page.

    Parameters
    ----------
    soup
        Parsed page; it is searched with ``find_all`` for result containers,
        each container with ``find_all`` for result items, and each item
        with ``find`` for the individual ``span`` tags.

    Returns
    -------
    tuple of five lists
        ``(product_names, ratings, base_prices, mrps, service_types)``, all
        the same length with one entry per result item. A field whose tag is
        not found is recorded as ``'N/A'``; the service type is always
        ``'N/A'``.
    """
    def _text_or_na(tag):
        # Stripped text of a found tag, or the placeholder when it is missing.
        return tag.text.strip() if tag else 'N/A'

    # One (name, rating, price, mrp, service) tuple per result item.
    records = []
    for container in soup.find_all('div', class_='s-main-slot s-result-list s-search-results sg-row'):
        for card in container.find_all('div', class_='s-result-item'):
            records.append((
                _text_or_na(card.find('span', class_='a-text-normal')),
                _text_or_na(card.find('span', class_='a-icon-alt')),
                _text_or_na(card.find('span', class_='a-price-whole')),
                _text_or_na(card.find('span', class_='a-price a-text-price')),
                'N/A',  # Amazon doesn't typically list a specific "Service Type" on this page
            ))

    # Split the records back into five parallel lists.
    product_names = [record[0] for record in records]
    ratings = [record[1] for record in records]
    base_prices = [record[2] for record in records]
    mrps = [record[3] for record in records]
    service_types = [record[4] for record in records]

    return product_names, ratings, base_prices, mrps, service_types

def main(num_pages=10, output_path='amazon_tv_data_multiple_pages.csv', delay=2):
    """Scrape several Amazon search-result pages into one DataFrame.

    Parameters
    ----------
    num_pages : int, optional
        Number of result pages to request, starting from page 1 (default 10).
    output_path : str or None, optional
        CSV file the combined table is written to. Pass ``None`` to skip
        writing a file.
    delay : float, optional
        Seconds to wait between consecutive page requests (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per result item with the columns ``Product_name``,
        ``Rating``, ``Base_Price``, ``MRP`` and ``Service_type``.
    """
    base_url = "https://www.amazon.in/s?k=TV&rh=n%3A1389396031&page="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    all_product_names = []
    all_ratings = []
    all_base_prices = []
    all_mrps = []
    all_service_types = []

    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")
        response = requests.get(base_url + str(page), headers=headers, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            product_names, ratings, base_prices, mrps, service_types = extract_data_from_page(soup)

            all_product_names.extend(product_names)
            all_ratings.extend(ratings)
            all_base_prices.extend(base_prices)
            all_mrps.extend(mrps)
            all_service_types.extend(service_types)
        else:
            # An error page holds no results; report it instead of skipping silently.
            print(f"Page {page} returned status code {response.status_code}; skipping it")

        # To avoid being blocked by Amazon, pause between requests
        # (no pause is needed once the last page has been fetched).
        if page < num_pages:
            time.sleep(delay)

    # Create a DataFrame
    df = pd.DataFrame({
        'Product_name': all_product_names,
        'Rating': all_ratings,
        'Base_Price': all_base_prices,
        'MRP': all_mrps,
        'Service_type': all_service_types
    })

    # Persist and return the result: `df` is local to this function, so code
    # outside it cannot reach the scraped table any other way.
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df
    
    


In [12]:
# Save DataFrame to CSV
# NOTE(review): `df` here is the notebook-level variable, not the local `df`
# built inside main(), and this save runs before main() is called below —
# confirm which frame this file is meant to hold.
df.to_csv('amazon_tv_data_multiple_pages.csv', index=False)
# print("Data saved to amazon_tv_data_multiple_pages.csv")

# Run the multi-page scrape when this code is executed as the main program
if __name__ == "__main__":
    main()