<a href="https://colab.research.google.com/github/Dhruv1603/Web-Scraping/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WEB-SCRAPING
# Python
# Requests
# BeautifulSoup

## Scraping amazon.com to get the name, price, review rating, and purchase link for iPhones
This notebook is only for practicing web scraping.

In [1]:
# Import required lib

import time
import requests
import numpy as np
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup


In [2]:
# URL, headers, and cookies shared by every request in this notebook

# Amazon search-results URL for the query "iphone 15"
url = 'https://www.amazon.com/s?k=iphone+15&crid=20TB4HP0AFTUA&sprefix=iphone+15%2Caps%2C96&ref=nb_sb_noss_1'

# Browser-like request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'en' was previously listed twice with two different q-values
    'Accept-Language': 'en-US,en;q=0.9',
    # 'br' is deliberately not advertised: requests only decodes Brotli bodies
    # when an optional Brotli package is installed, and an undecoded body
    # cannot be parsed as HTML.
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'https://www.amazon.com/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'TE': 'Trailers',
}

# The 'your-...' values are placeholders; substitute real values locally if
# needed and never commit them.
cookies = {
    'session-id': 'your-session-id',
    'session-id-time': 'your-session-id-time',
    'i18n-prefs': 'USD',
    'sp-cdn': 'L5Z9:US',
    'ubid-main': 'your-ubid-main',
    'x-wl-uid': 'your-x-wl-uid',
}


In [3]:
# Send the GET request for the first results page.
# The timeout (in seconds) keeps this cell from hanging indefinitely when the
# server never answers; requests applies no timeout by default.

response = requests.get(url, headers=headers, cookies=cookies, timeout=30)

In [4]:
# Inspect the response; its repr shows the HTTP status code.
# If the status is 503, change the request headers and retry; if it is 200, proceed.

response

<Response [200]>

In [5]:
# Function to scrape a single page

def scrape_amazon_page(url):
    """Scrape one Amazon search-results page.

    Sends a GET request for ``url`` with the module-level ``headers`` and
    extracts one entry per search-result container found in the returned HTML.

    Parameters
    ----------
    url : str
        Address of the search-results page to fetch.

    Returns
    -------
    tuple of (list, list, list, list)
        ``(names, prices, reviews, links)``. The four lists always have the
        same length (one entry per result). A field that cannot be found is
        recorded as ``"Not Found"`` (name, price, link) or ``"No Reviews"``.
        All four lists are empty when the page contains no result containers.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the network level or exceeds the timeout.
    """
    # requests waits forever by default; bound the wait so one stalled
    # connection cannot freeze the whole scraping loop.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    items = soup.find_all('div', {'data-component-type': 's-search-result'})
    names = []
    prices = []
    reviews = []
    links = []

    for item in items:
        name_element = item.find('span', {'class': 'a-size-medium a-color-base a-text-normal'})
        price_whole = item.find('span', {'class': 'a-price-whole'})
        price_fraction = item.find('span', {'class': 'a-price-fraction'})
        review_element = item.find('span', {'class': 'a-icon-alt'})
        link_element = item.find('a', {'class': 'a-link-normal s-no-outline'})

        names.append(name_element.text if name_element else "Not Found")

        # A price is only recorded when both of its parts are present.
        if price_whole and price_fraction:
            prices.append(f"{price_whole.text}{price_fraction.text}")
        else:
            prices.append("Not Found")

        reviews.append(review_element.text if review_element else "No Reviews")

        # .get() avoids a KeyError when the anchor exists but has no href.
        href = link_element.get('href') if link_element else None
        if href:
            links.append("https://www.amazon.com" + href)
        else:
            links.append("Not Found")

    return names, prices, reviews, links

In [6]:
# Start at the first results page, with one empty accumulator per scraped field

page = 1
all_names, all_prices, all_reviews, all_links = [], [], [], []

In [7]:
# Walk through consecutive result pages until one comes back empty.
# MAX_PAGES is a safety cap: the stop condition depends entirely on what the
# remote site returns, so without a bound the loop could run indefinitely.
# Raise the cap if a search legitimately has more pages.

MAX_PAGES = 50

while page <= MAX_PAGES:
    print(f'Scraping page {page}...')
    names, prices, reviews, links = scrape_amazon_page(url + f'&page={page}')
    if not names:  # If no names were found, break the loop
        break
    all_names.extend(names)
    all_prices.extend(prices)
    all_reviews.extend(reviews)
    all_links.extend(links)
    page += 1
    time.sleep(1)  # Add a delay between requests
else:
    # Reached only when the cap ended the loop rather than an empty page.
    print(f'Stopped after reaching the MAX_PAGES limit ({MAX_PAGES}).')


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...


In [8]:
# Sanity check: print the length of each result list to see whether they match

print(*map(len, (all_names, all_prices, all_reviews, all_links)))

245 245 245 245


In [9]:
# Assemble the four scraped lists into a DataFrame, one column per field

df = pd.DataFrame(
    dict(Name=all_names, Price=all_prices, Reviews=all_reviews, Link=all_links)
)

In [10]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,Name,Price,Reviews,Link
0,"Apple iPhone 15, 256GB, Blue - Unlocked (Renewed)",754.95,4.5 out of 5 stars,https://www.amazon.com/Apple-iPhone-15-256GB-B...
1,"Apple iPhone 15 Plus, 128GB, Pink - T-Mobile (...",640.17,4.6 out of 5 stars,https://www.amazon.com/Apple-iPhone-15-Plus-12...
2,"Apple iPhone 15, 128GB, Black - Unlocked (Rene...",714.97,4.1 out of 5 stars,https://www.amazon.com/Apple-iPhone-15-128GB-B...
3,"Apple iPhone 15 Pro, 512GB, Blue Titanium - Un...",959.61,4.3 out of 5 stars,https://www.amazon.com/Apple-iPhone-15-Pro-Tit...
4,"Apple iPhone 15 Pro Max, 256GB, Blue Titanium ...",1038.0,4.2 out of 5 stars,https://www.amazon.com/Apple-iPhone-15-Pro-Max...


In [11]:
# Summarize the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     245 non-null    object
 1   Price    245 non-null    object
 2   Reviews  245 non-null    object
 3   Link     245 non-null    object
dtypes: object(4)
memory usage: 7.8+ KB


In [12]:
# Count missing values per column.
# NOTE: scrape_amazon_page records missing fields as the strings "Not Found" /
# "No Reviews" rather than NaN, so these counts stay at zero even when a field
# could not be scraped.
df.isnull().sum()

Name       0
Price      0
Reviews    0
Link       0
dtype: int64

In [13]:
# Visualize data
# Price and Reviews are scraped as text (e.g. "754.95" and "4.5 out of 5 stars",
# or the "Not Found" / "No Reviews" placeholders), so plotting them directly
# produces categorical axes. Convert both to numbers in a separate frame first;
# df itself is left untouched. Values that cannot be parsed become NaN and are
# left out of the chart.

plot_df = df.assign(
    Price=pd.to_numeric(
        # Strip thousands separators in case a price is rendered like "1,038.00".
        df["Price"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    ),
    Reviews=pd.to_numeric(
        # Keep only the leading rating number of "<rating> out of 5 stars".
        df["Reviews"].astype(str).str.extract(r"^(\d+(?:\.\d+)?) out of", expand=False),
        errors="coerce",
    ),
).dropna(subset=["Price", "Reviews"])

fig = px.scatter(plot_df, x="Price", y="Reviews", color="Name", hover_data=["Link"])

fig.update_layout(
    title="iPhone 15 Comparison: Price vs. Reviews",
    xaxis_title="Price",
    yaxis_title="Reviews",
)

fig.show()


In [14]:
# Write the table to an Excel workbook (index column omitted) and confirm

df.to_excel(excel_writer='Iphones.xlsx', index=False)
print('Scraping complete. Data saved to Iphones.xlsx.')

Scraping complete. Data saved to Iphones.xlsx.


### Thank you