# Data Extraction

Customer review comments are extracted for the following products for the period FY2019 to FY2022:
 - Car Engine Oil
 - Car Fog Lamp
 - Car Tyre
 - Car Alloy Wheel
 - Car Windshield Glass Washer
 - Car Seat Cover
 - Car Windshield Wiper

In [1]:
# Importing required packages to scrape data and create dataframe
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# The requests adapter/Retry machinery used by get_data_from_url backs off
# between retried requests to the server.
# Browser-like headers sent with every page request.
my_headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OSX 10_14_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko)"
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
}

def get_data_from_url(url, timeout=None):
    """Download ``url`` with the notebook's browser-like headers.

    Failed connection attempts are retried up to 3 times with a backoff
    factor of 0.5; the retrying adapter is mounted for both ``http://`` and
    ``https://`` URLs.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``session.get``. The default ``None`` keeps the
        previous behaviour of waiting indefinitely for the server.

    Returns
    -------
    requests.Response
        Response of the GET request. The HTTP status is not checked here.
    """
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    # The context manager closes the session (and its pooled connections)
    # once the request is done; previously every call leaked a session.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        # The request is not streamed, so the body is already downloaded and
        # the response stays usable after the session is closed.
        return session.get(url, headers=my_headers, timeout=timeout)

In [3]:
#Function to parse main URL
def mainurl(url):
    """Fetch ``url``, print the response object and return the parsed page.

    The page is downloaded through ``get_data_from_url`` and its text is
    parsed with BeautifulSoup's ``html.parser``.
    """
    response = get_data_from_url(url)
    print(response)
    return BeautifulSoup(response.text, "html.parser")

In [4]:
# Root of the site; relative links scraped later are appended to it
base_url = "https://www.amazon.in"

# Search result pages, one per product category covered by this notebook
search_urls = [
    'https://www.amazon.in/s?k=car+engine+oil',
    'https://www.amazon.in/s?k=car+fog+lamp',
    'https://www.amazon.in/s?k=car+tyre',
    'https://www.amazon.in/s?k=alloy+car+wheels',
    'https://www.amazon.in/s?k=car+windshield+glass+washer',
    'https://www.amazon.in/s?k=car+seat+cover',
    'https://www.amazon.in/s?k=car+wiper',
]

# NOTE(review): every iteration rebinds `doc`, so only the last search page
# is still held in `doc` when this cell finishes.
for search_url in search_urls:
    doc = mainurl(search_url)

<Response [200]>


In [5]:
#Extracting URLs of first 4 pages of each product
mydoc = doc
pages_links = []
# The first stored link is the target of the "next" pagination button on `doc`
firsturl = doc.find('a',{'class':'s-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
pagelink1 = firsturl.get("href")
pages_links.append(base_url + pagelink1)
totalpages = doc.find('span',{'class':'s-pagination-item s-pagination-disabled'}).text
for i in range(1,4):
    try:
        # Locate the currently selected page marker, then the pagination button that follows it
        pagelink = mydoc.find('span',{'class':'s-pagination-item s-pagination-selected'})
        pagelink_next = pagelink.find_next('a',{'class':'s-pagination-item s-pagination-button'})
        pagelink_next1 = pagelink_next.get("href")
        pages_links.append(base_url + pagelink_next1)
        # Load the page just recorded so the next iteration continues from it
        resp1 = get_data_from_url(pages_links[i])
        mydoc = BeautifulSoup(resp1.text, "html.parser")
    except Exception as exc:
        # Still best-effort, but the failure is reported and Ctrl-C /
        # kernel interrupts are no longer swallowed as a bare `except` did.
        print("Pagination step", i, "skipped:", repr(exc))

In [6]:
# Collect the product page URLs listed on every gathered result page
product_anchor_class = 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'
links = []
for page_url in pages_links:
    page_doc = BeautifulSoup(get_data_from_url(page_url).text, "html.parser")
    # Each matching anchor carries a site-relative link to one product
    for anchor in page_doc.find_all('a', {'class': product_anchor_class}):
        links.append(base_url + anchor.get("href"))

In [7]:
#Extracting URL of the review page for each product
links1 = []
for product_url in links:
    try:
        response1 = get_data_from_url(product_url)
        doc1 = BeautifulSoup(response1.text, "html.parser")
        review_anchor = doc1.find('a',{'data-hook':'see-all-reviews-link-foot'})
        # Products without a "see all reviews" link have nothing to collect
        if review_anchor is None or review_anchor.get("href") is None:
            continue
        links1.append(base_url + review_anchor.get("href"))
    except Exception as exc:
        # Keep going on a failed product page, but say so; a bare `except`
        # would also have swallowed Ctrl-C / kernel interrupts.
        print("Skipping product page", product_url, "-", repr(exc))

In [8]:
# Empty frame with one column per review attribute collected below
review_columns = [
    'product_name',
    'product_link',
    'review_date',
    'review_title',
    'review_rating',
    'review_comment',
]
car_df = pd.DataFrame(columns=review_columns)

In [9]:
#Parsing each review URL and extracting review title, review date, review comment and review rating for each product
review_columns = ['product_name', 'product_link', 'review_date','review_title','review_rating','review_comment']
review_rows = []

for review_url in links1:
    try:
        response01 = get_data_from_url(review_url)
        doc01 = BeautifulSoup(response01.text, "html.parser")

        # Product name and link are shared by every review on the page
        pname = doc01.find('a',{'data-hook':'product-link'})
        pname1 = pname.text
        plink1 = base_url + pname.get("href")

        rcomment = doc01.find_all('span',{'data-hook':'review-body'})
        # The former third positional argument ('span') was taken as
        # find_all's `recursive` flag and did not filter anything.
        rtitle = doc01.find_all('a',{'data-hook':'review-title'})
        rdate = doc01.find_all('span',{'data-hook':'review-date'})
        rstar = doc01.find_all('i',{'data-hook':'review-star-rating'})
    except Exception as exc:
        # Best-effort: a page that cannot be fetched or lacks the product
        # link is skipped, but reported instead of silently ignored.
        print("Skipping review page", review_url, "-", repr(exc))
        continue

    # NOTE(review): the four lists are paired by position; this presumably
    # relies on every review exposing all four elements - confirm.
    for a in range(len(rcomment)):
        try:
            # The record is built completely before it is stored, so a
            # review with a missing field never leaves a partial row behind.
            review_rows.append({
                'product_name': pname1,
                'product_link': plink1,
                'review_date': rdate[a].text,
                'review_title': rtitle[a].text,
                'review_rating': rstar[a].find_next('span').text,
                'review_comment': rcomment[a].text,
            })
        except (IndexError, AttributeError):
            # Missing title/date/rating element for this review: skip it
            continue

# Build the frame once instead of growing it cell by cell
car_df = pd.DataFrame(review_rows, columns=review_columns)

In [10]:
# Display the raw scraped reviews before any cleaning is applied
car_df

Unnamed: 0,product_name,product_link,review_date,review_title,review_rating,review_comment
0,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,Reviewed in India on 11 June 2021,\nDecoits\n,1.0 out of 5 stars,\nThe product was not delivered to me but the ...
1,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,Reviewed in India on 15 December 2020,\nNice Product at Reasonable price\n,5.0 out of 5 stars,\nWell its a good product at very Discounted p...
2,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,Reviewed in India on 12 September 2021,\nOil is good but no coupon under cap\n,5.0 out of 5 stars,\nI'm very happy with oil product but disappoi...
3,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,Reviewed in India on 4 August 2022,\nDisappointed\n,1.0 out of 5 stars,\nबिक्रेता इमानदार नहीं है ढक्कन पहले से ही कू...
4,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,Reviewed in India on 18 December 2021,\nBest oil for swift\n,4.0 out of 5 stars,\nOil is price me bahut hi achha hai...pr coup...
...,...,...,...,...,...,...
405,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,Reviewed in India on 14 April 2022,\nNice\n,5.0 out of 5 stars,\nGood oil and nice packing . Oil quality is e...
406,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,Reviewed in India on 14 March 2022,\ntata safari smooth after using piston oil\n,5.0 out of 5 stars,"\nI replaced with this Oil, after quite a long..."
407,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,Reviewed in India on 17 March 2022,\nGreat Product\n,5.0 out of 5 stars,\nAmazing product. you can use it in your vehi...
408,PISTON 10W-30 Multi Grade Oil Synthetic Engine...,https://www.amazon.in/PISTON-10W-30-Synthetic-...,Reviewed in India on 14 April 2022,\nVery good engine oil\n,5.0 out of 5 stars,\nGood oil at this price . I recommend for you...


# Data Pre-processing

The following steps are performed as part of data pre-processing:

 - removed non-ASCII characters

 - removed Unicode characters

 - removed whitespaces

 - removed HTML tags

 - converted to lower case

 - replaced empty review comments with corresponding review title

 - trimmed the review date to its day-month-year text (example, changed 'Reviewed in India on 28 September 2021' to '28 September 2021')

 - changed review rating format (example, '5.0 out of 5 stars' to '5')

 - images uploaded along with review comments were extracted as the text "the media could not be loaded"; this text was removed from the review comments

In [11]:
#For review comments and review title - removing non-ASCII characters, html tags, special characters, whitespaces and finally convert to lower case
import re
def clean_string(text):
    """Normalise each string in ``text`` to lower-case ASCII words.

    For every item, in order:
      1. ``<U+XXXX>`` placeholders are removed,
      2. runs of non-word characters become a single space,
      3. the characters U+00E9 and U+00F2 are removed,
      4. every remaining character outside ``a-z``, ``A-Z``, ``0-9``
         becomes a space,
      5. the text is lower-cased and runs of spaces are collapsed,
      6. the phrase "the media could not be loaded" is removed,
      7. spaces are collapsed again and stripped from both ends.

    Parameters
    ----------
    text : iterable of str
        Strings to clean (a list, a pandas Series, a generator, ...).

    Returns
    -------
    list of str
        Cleaned strings, in input order. Items with no ASCII letters or
        digits come back as ``""``.
    """
    cleansing = []
    # Iterate over the values directly so the input does not need to support
    # len() or positional ``text[i]`` lookups.
    for raw in text:
        # Must run before the punctuation pass: once "<", "+" and ">" have
        # been replaced by spaces this pattern can never match.
        cleaned = re.sub(r"<U\+\w+>", "", raw)
        cleaned = re.sub(r"\W+", " ", cleaned)
        cleaned = re.sub("[\u00e9\u00f2]", "", cleaned)
        cleaned = re.sub(r"[^a-zA-Z0-9]", " ", cleaned)
        cleaned = cleaned.lower()
        cleaned = re.sub(r" +", " ", cleaned)
        cleaned = cleaned.replace("the media could not be loaded", "")
        # Removing the placeholder phrase can leave a double space, and the
        # inputs may end with whitespace: collapse again and trim both ends.
        cleaned = re.sub(r" +", " ", cleaned).strip(" ")
        cleansing.append(cleaned)
    return cleansing

In [12]:
#Converting review comment and review title columns to string to apply regex functions
# Missing values are first replaced by '' so they cannot end up as the
# literal text 'nan' (depending on the pandas version, astype(str) does that)
# and are instead handled as empty text by the later cells.
car_df['review_comment'] = car_df['review_comment'].fillna('').astype(str)
car_df['review_title'] = car_df['review_title'].fillna('').astype(str)

In [13]:
# Run the text cleansing over both free-text columns
for text_column in ("review_comment", "review_title"):
    car_df[text_column] = clean_string(car_df[text_column])

In [14]:
#Replacing empty review comments with review title. Mostly, review title is a summary of review comments, so instead of deleting the row, title is copied to comments for further analysis
# Missing values are treated the same as empty strings for both columns
comment_blank = car_df['review_comment'].isna() | car_df['review_comment'].eq('')
title_blank = car_df['review_title'].isna() | car_df['review_title'].eq('')

# Copy the title into the comment where only the comment is empty.
# .loc writes to the frame itself; the chained form car_df[col][i] = ...
# may assign to a temporary copy and leave the frame unchanged.
needs_title = comment_blank & ~title_blank
car_df.loc[needs_title, 'review_comment'] = car_df.loc[needs_title, 'review_title']

# Rows with neither a comment nor a title carry no text to analyse
car_df = car_df.loc[~(comment_blank & title_blank)].reset_index(drop=True)

In [15]:
#Trimming review date to its 'day month year' text and retaining data from 2019 onwards
# Keep whatever follows the last " on " (e.g. 'Reviewed in India on 11 June 2021'
# -> '11 June 2021'); unlike a fixed-position split this does not depend on
# how many words the country name has.
review_dates = (
    car_df['review_date']
    .astype(str)
    .str.rsplit(' on ', n=1)
    .str[-1]
    .str.strip()
)
# The year is the trailing four digits; text without one becomes NaN
review_years = pd.to_numeric(
    review_dates.str.extract(r'(\d{4})$', expand=False), errors='coerce'
)
car_df['review_date'] = review_dates
# NaN >= 2019 is False, so rows whose year cannot be read are dropped as well
car_df = car_df.loc[review_years >= 2019].reset_index(drop=True)

In [16]:
#Converting review rating to a format which can be used for further analysis. Example '5.0 out of 5 stars' is converted to '5'
# Keep the text before the first '.'; assigning the whole column avoids the
# chained car_df[col][i] = ... form, which may write to a temporary copy.
car_df['review_rating'] = car_df['review_rating'].str.split('.', n=1).str[0]

In [17]:
# Display the reviews after the pre-processing cells above have run
car_df

Unnamed: 0,product_name,product_link,review_date,review_title,review_rating,review_comment
0,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,11 June 2021,decoits,1,the product was not delivered to me but the we...
1,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,15 December 2020,nice product at reasonable price,5,well its a good product at very discounted pri...
2,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,12 September 2021,oil is good but no coupon under cap,5,i m very happy with oil product but disappoint...
3,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,4 August 2022,disappointed,1,disappointed
4,GULF ULTRASYNTH X SAE 5W-30 - Fully Synthetic ...,https://www.amazon.in/GULF-ULTRASYNTH-SAE-5W-3...,18 December 2021,best oil for swift,4,oil is price me bahut hi achha hai pr coupen p...
...,...,...,...,...,...,...
353,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,14 April 2022,nice,5,good oil and nice packing oil quality is excel...
354,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,14 March 2022,tata safari smooth after using piston oil,5,i replaced with this oil after quite a long ti...
355,PISTON 5W-40 API SN Synthetic Technology Engin...,https://www.amazon.in/PISTON-Synthetic-Technol...,17 March 2022,great product,5,amazing product you can use it in your vehicle...
356,PISTON 10W-30 Multi Grade Oil Synthetic Engine...,https://www.amazon.in/PISTON-10W-30-Synthetic-...,14 April 2022,very good engine oil,5,good oil at this price i recommend for your ca...


In [19]:
# Write the processed reviews to a CSV file, leaving out the row index
car_df.to_csv(path_or_buf='amazon_carproduct.csv', index=False)