# Web Scraping using BeautifulSoup

In [1]:
# Importing required libraries
import requests               #for requesting information through url
from bs4 import BeautifulSoup #for creating parse tree and extracting data
import sys                    #for exception handling
import time                   #for setting time delay while loading webpages

### Scraping data from Cricbuzz website

In [2]:
# Fetch the Cricbuzz home page; `response` is parsed by the following cell.
url = 'https://www.cricbuzz.com/'                            #storing the url in a variable
try:                                                         #the request itself may fail (DNS, connection, timeout)
    response = requests.get(url, timeout=30)                 #timeout so a stalled server cannot hang the cell
    if response.status_code == 200:                          #HTTP status 200 denotes success
        print('Request accepted')
    else:                                                    #report other statuses instead of staying silent
        print('Request failed with status code: ', response.status_code)
except requests.exceptions.RequestException as error:        #base class of the errors raised by requests
    print('Error in link: ', url)
    print(type(error))

Request accepted


In [3]:
#build the parse tree of the Cricbuzz page with Python's built-in HTML parser
mysoup = BeautifulSoup(markup=response.text, features='html.parser')

#### We want to extract the headlines of the posts. For example, the highlighted text shown below:

![alt text](cric.JPG "Screenshot from the website")


In [4]:
#headlines are the 'h2' tags whose class attribute is 'big-crd-hdln'
news = mysoup.find_all('h2', attrs={'class': 'big-crd-hdln'})

In [5]:
for headline in news:                                 #print each headline followed by two blank lines
    print(headline.text, end='\n\n\n')

	Rohit ton, Rahane fifty put India in control


	Cornwall ends BAN resistance


	Kedar, Harbhajan pick highest 2 crore auction bracket


	New-look South Africa out to save series




### Web scraping reviews of iPhone 12 Pro Max (256 GB) from Amazon 

##### Collecting reviews from one page

In [9]:
# Fetch the first page of product reviews; `response2` is parsed by the following cell.
amz = 'https://www.amazon.in/New-Apple-iPhone-Pro-256GB/product-reviews/B08L5T31M6/ref=cm_cr_arp_d_paging_btm?ie=UTF8&reviewerType=all_reviews' #storing the url in a variable
try:                                                          #the request itself may fail (DNS, connection, timeout)
    response2 = requests.get(amz, timeout=30)                 #timeout so a stalled server cannot hang the cell
    if response2.status_code == 200:                          #HTTP status 200 denotes success
        print('Request accepted')
    else:                                                     #report other statuses instead of staying silent
        print('Request failed with status code: ', response2.status_code)
except requests.exceptions.RequestException as error:         #base class of the errors raised by requests
    print('Error in link: ', amz)                             #report the url that was actually requested
    print(type(error))

Request accepted


In [10]:
#build the parse tree of the reviews page with Python's built-in HTML parser
mysoup2 = BeautifulSoup(markup=response2.text, features='html.parser')

#### We want to extract the reviews of iPhone. For example, the highlighted text shown below:

![alt text](amz.JPG "Screenshot of reviews from amazon")

In [11]:
#reviews are the 'div' tags whose class attribute is "a-row a-spacing-small review-data"
reviews = mysoup2.find_all('div', attrs={'class': "a-row a-spacing-small review-data"})

In [12]:
#write the text content of every review block, one after another
for review in reviews:
    print(review.get_text())



  Over priced. And without charger too.Not recommended. It is a waste of money. Be logical enough to make a call on this.




  Impressed! I’ve tested it with iphone11 and other iphones, it’s remarkable. It has unquestionably better video quality and touch sensitivity. The screen is bigger than I thought. I loved the Gold variant. But if u have a 11 max or a X max I wouldn’t suggest for an upgradation. I feel that the next iphone will have way more cooler features so wouldn’t hurt to wait. My previous iphone was 6, so in my case am totally peachy with this phone. Though it has a ceramic glass wouldn’t hurt to buy a screen guard. If u can afford it u can totally go 4 it.




  This is the best iPhone yes. The Max model gives you the better screen and overall is a camera beast. I would highly recommend this to anyone. Regarding value for money, its sad that we live in a country where Falling Rupee, Import Duty and High GST are the reasons for such high poricing. A rich country like USA

##### Collecting reviews from ALL pages

In [17]:
# Collect the review text from every page of the product-review listing.
# The loop ends at the first page that yields no reviews, or after MAX_FAILURES
# consecutive failed requests so a blocked or unreachable url cannot loop forever.
amz_reviews_list = []       #list that accumulates the review texts
i = 1                       #initializing web page number
MAX_FAILURES = 3            #consecutive failed requests tolerated before giving up
failures = 0                #consecutive failed requests seen so far
while True:                 #looping the code till reviews from all pages are collected
    time.sleep(2)           #setting time delay to avoid getting us tagged as a robot
    amz_url = 'https://www.amazon.in/New-Apple-iPhone-Pro-256GB/product-reviews/B08L5T31M6/ref=cm_cr_arp_d_paging_btm_next_'+str(i)+'?ie=UTF8&reviewerType=all_reviews&pageNumber='+str(i) #customizing the url to iterate and access different pages
    page_ok = False         #set to True only when this page was fetched successfully
    try:                    #sometimes url may not work and give error
        amz_resp = requests.get(amz_url, timeout=30)   #timeout so a stalled server cannot hang the loop
        if amz_resp.status_code == 200:                #HTTP status 200 denotes success
            page_ok = True
        else:                                          #report other statuses instead of silently retrying
            print('Request failed with status code: ', amz_resp.status_code)
    except requests.exceptions.RequestException as e:  #base class of the errors raised by requests
        print(type(e))
        print('problem with url: ', amz_url)

    if not page_ok:                                    #same page is retried, but only a bounded number of times
        failures += 1
        if failures >= MAX_FAILURES:
            print('Stopping after', failures, 'consecutive failed requests')
            break
        continue

    failures = 0                                       #a successful request resets the failure streak
    print('Page: ', i)                                 #printing the page number accessed
    print('Request Accepted')
    i = i + 1                                          #incrementing i to access next page
    amz_soup = BeautifulSoup(amz_resp.text, 'html.parser')    #parsing the html text
    amz_reviews = amz_soup.find_all('div', class_ = "a-row a-spacing-small review-data")   #the reviews are in 'div' tag and 'a-row a-spacing-small review-data' class
    amz_reviews_list.extend(amz_review.text.replace('\n\n', '') for amz_review in amz_reviews)   #adding each review to the list
    print('Number of reviews collected: ', len(amz_reviews_list))   #printing the number of reviews collected so far
    if not amz_reviews:                                #a page without reviews means all reviews are collected
        break
        

Page:  1
Request Accepted
Number of reviews collected:  10
Page:  2
Request Accepted
Number of reviews collected:  20
Page:  3
Request Accepted
Number of reviews collected:  30
Page:  4
Request Accepted
Number of reviews collected:  40
Page:  5
Request Accepted
Number of reviews collected:  50
Page:  6
Request Accepted
Number of reviews collected:  60
Page:  7
Request Accepted
Number of reviews collected:  70
Page:  8
Request Accepted
Number of reviews collected:  80
Page:  9
Request Accepted
Number of reviews collected:  90
Page:  10
Request Accepted
Number of reviews collected:  100
Page:  11
Request Accepted
Number of reviews collected:  107
Page:  12
Request Accepted
Number of reviews collected:  107


In [18]:
amz_reviews_list #displaying the whole list of collected review strings as the cell output

['  Over priced. And without charger too.Not recommended. It is a waste of money. Be logical enough to make a call on this.',
 '  Impressed! I’ve tested it with iphone11 and other iphones, it’s remarkable. It has unquestionably better video quality and touch sensitivity. The screen is bigger than I thought. I loved the Gold variant. But if u have a 11 max or a X max I wouldn’t suggest for an upgradation. I feel that the next iphone will have way more cooler features so wouldn’t hurt to wait. My previous iphone was 6, so in my case am totally peachy with this phone. Though it has a ceramic glass wouldn’t hurt to buy a screen guard. If u can afford it u can totally go 4 it.',
 '  This is the best iPhone yes. The Max model gives you the better screen and overall is a camera beast. I would highly recommend this to anyone. Regarding value for money, its sad that we live in a country where Falling Rupee, Import Duty and High GST are the reasons for such high poricing. A rich country like USA

##### Converting the list into a DataFrame

In [19]:
import pandas as pd

In [20]:
#one row per collected review, held in a single 'Reviews' column
amz_reviews_df = pd.DataFrame(amz_reviews_list, columns=['Reviews'])

In [21]:
#preview the first five rows of the reviews table
amz_reviews_df.head(n=5)

Unnamed: 0,Reviews
0,Over priced. And without charger too.Not rec...
1,Impressed! I’ve tested it with iphone11 and ...
2,This is the best iPhone yes. The Max model g...
3,"Pros: Great display, battery, cameras, usabi..."
4,\n Your browser does not support HTML5 video....


##### Saving as a CSV file

In [23]:
 amz_reviews_df.to_csv('amz_reviews.csv')