# Team 6 - Web Scraping Amazon Reviews on TOZO W1 Wireless Charger

Team 6: Lai Leng Chan, Minsu Kim, Christopher Garcia

Objective:

## API Scraping & Data Ingestion

In [2]:
# Importing necessary libraries
import os
import pandas as pd
import datetime
import re
import requests
import time
import emoji

from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
from urllib.parse import urlencode

from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from wordcloud import WordCloud 

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
# After inspecting the HTML of the review page, we can see that the HTML line which we are trying 
# to scrape and extract follows the below format:
# <span class="a-profile-name">Nick</span>
# where span is the tag for the HTML line and the class is "a-profile-name"

# Define the list of URL that will be scraped

# Define the base URL
base_url = 'https://www.amazon.com/TOZO-Wireless-Upgraded-Sleep-Friendly-FastCharging/product-reviews/B07FM8R7J1/ref=cm_cr_getr_d_paging_btm_next_{}?ie=UTF8&reviewerType=all_reviews&pageNumber={}'

# Set the desired maximum number of pages
max_pages = 500  

# Generate the list of URLs
list_of_urls = [base_url.format(page_number, page_number) for page_number in range(1, max_pages + 1)]

In [3]:
# Retrieve each of the url's HTML data and convert the data into a beautiful soup object
# Find, extract and store reviewer names and review text into a list

# We will extract the data from the webpage via ScrapeAPI

reviews = []
star_ratings = []
data_string = ""

for url in list_of_urls: 
    params = {'api_key': "1327eb1bf20892837140fb2ade3f0714", 'url': url}
    response = requests.get('http://api.scraperapi.com/',   params=urlencode(params))
    soup = BeautifulSoup(response.text, 'html.parser')

    for item in soup.find_all("span", {"data-hook": "review-body"}):
      data_string = data_string + item.get_text()
      reviews.append(data_string)
      data_string = ""
    
    for item in soup.find_all("i", {"data-hook": "review-star-rating"}):
      star_ratings.append(item.get_text())

In [5]:
# Let's set up a dictionary to hold our customer names and reviews
reviews_dict = {'Reviews': reviews, 'Star Ratings': star_ratings}

# Print the lengths of each list.  
print(len(star_ratings), len(reviews))

4800 4800


In [6]:
# Convert the dictionary to a DataFrame
reviews_df = pd.DataFrame.from_dict(reviews_dict, orient='index')

# Convert dataframe to CSV file for easier assess in the future
reviews_df.to_csv('reviews.csv', index=False, header=True)

In [7]:
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
Reviews,"\nFirst, this charger looks fantastic. I got t...",\nUPDATE: Well...I was wrong about this wirel...,\nThis works perfectly on the side table while...,\nI purchased this charger for use in our camp...,"\nGot this for my Samsung Galaxy S23 phone, an...",\nI ordered this charger after using one at my...,\nCame here to see if there was any tips on ho...,\nDevice must be placed carefully or it will f...,\nCharges like advertised. I did have to take ...,\nI wanted something to use instead of the typ...,...,"\nEven with Magsafe case on, charging is super...",\nGreat charger! Able to charge my phone easil...,\nBought two of these and one was giving me pr...,\nI saw this on one of my colleagues desk at w...,\nAn excellent product. It’s in our kitchen as...,\nIt’s PERFECT for traveling. No more differen...,\n\n\n\n\n The media could ...,\nImpressed taking it out of the box. All met...,\nI love the design of this charger. Color on ...,\nkinda finnicky cuz ur phone has to be in the...
Star Ratings,4.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,4.0 out of 5 stars,5.0 out of 5 stars,4.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,...,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,3.0 out of 5 stars,4.0 out of 5 stars


## Data Proprocessing

In [32]:
# Reading the downloaded raw data into a dataframe
reviews_df = pd.read_csv('C:\\Users\\annie\\Documents\\GitHub\\SMS_SpamClassification\\reviews.csv')
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799
0,"\nFirst, this charger looks fantastic. I got t...",\nUPDATE: Well...I was wrong about this wirel...,\nThis works perfectly on the side table while...,\nI purchased this charger for use in our camp...,"\nGot this for my Samsung Galaxy S23 phone, an...",\nI ordered this charger after using one at my...,\nCame here to see if there was any tips on ho...,\nDevice must be placed carefully or it will f...,\nCharges like advertised. I did have to take ...,\nI wanted something to use instead of the typ...,...,"\nEven with Magsafe case on, charging is super...",\nGreat charger! Able to charge my phone easil...,\nBought two of these and one was giving me pr...,\nI saw this on one of my colleagues desk at w...,\nAn excellent product. It’s in our kitchen as...,\nIt’s PERFECT for traveling. No more differen...,\n\n\n\n\n The media could ...,\nImpressed taking it out of the box. All met...,\nI love the design of this charger. Color on ...,\nkinda finnicky cuz ur phone has to be in the...
1,4.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,4.0 out of 5 stars,5.0 out of 5 stars,4.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,...,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,5.0 out of 5 stars,3.0 out of 5 stars,4.0 out of 5 stars


In [33]:
# Transposing the dataframe for easier visualization
reviews_df = reviews_df.transpose()
reviews_df = reviews_df.reset_index(drop=True)
reviews_df.columns = ['Review', 'Star Rating']
reviews_df

Unnamed: 0,Review,Star Rating
0,"\nFirst, this charger looks fantastic. I got t...",4.0 out of 5 stars
1,\nUPDATE: Well...I was wrong about this wirel...,5.0 out of 5 stars
2,\nThis works perfectly on the side table while...,5.0 out of 5 stars
3,\nI purchased this charger for use in our camp...,5.0 out of 5 stars
4,"\nGot this for my Samsung Galaxy S23 phone, an...",5.0 out of 5 stars
...,...,...
4795,\nIt’s PERFECT for traveling. No more differen...,5.0 out of 5 stars
4796,\n\n\n\n\n The media could ...,5.0 out of 5 stars
4797,\nImpressed taking it out of the box. All met...,5.0 out of 5 stars
4798,\nI love the design of this charger. Color on ...,3.0 out of 5 stars


In [34]:
# Place any addtional functions or constants you need here. 

# Some punctuation variations
punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation - {"#"}

# Stopwords
sw = stopwords.words("english")

# Two useful regex
whitespace_pattern = re.compile(r"\s+")
hashtag_pattern = re.compile(r"^#[0-9a-zA-Z]+")

# It's handy to have a full set of emojis
all_language_emojis = set()

for country in emoji.EMOJI_DATA : 
    for em in emoji.EMOJI_DATA[country] : 
        all_language_emojis.add(em)

In [35]:
# Descriptive Statistics Function

def descriptive_stats(tokens, num_tokens = 5, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity, and num_tokens most common
        tokens. Return a list of 
    """
    
    if verbose :        
        print(f"There are {len(tokens)} tokens in the data.")
        print(f"There are {len(set(tokens))} unique tokens in the data.")
        print(f"There are {len(''.join(tokens))} characters in the data.")
        print(f"The lexical diversity is {len(set(tokens))/len(tokens):.3f} in the data.")
    
        # print the five most common tokens
        counts = Counter(tokens)
        
        if num_tokens > 0:
            print(counts.most_common(num_tokens))
        
    return([len(tokens), 
            len(set(tokens)),
            len("".join(tokens)),
            len(set(tokens))/len(tokens)])    
    
    return(0)

In [36]:
# Data cleaning functions
def contains_emoji(s):
    
    s = str(s)
    emojis = [ch for ch in s if emoji.is_emoji(ch)]

    return(len(emojis) > 0)


def remove_stop(tokens) :
    # modify this function to remove stopwords
    return(tokens)
 
def remove_punctuation(text, punct_set=tw_punct) : 
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    """ Splitting on whitespace rather than the book's tokenize function. That 
        function will drop tokens like '#hashtag' or '2A', which we need for Twitter. """
    
    # modify this function to return tokens
    tokens = text.split()
    tokens = [token.strip() for token in tokens]
    return tokens

def prepare(text, pipeline) : 
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

In [37]:
# Apply the `pipeline` to the dataframe
my_pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]

reviews_df["tokens"] = reviews_df["Review"].apply(prepare,pipeline=my_pipeline)
reviews_df["num_tokens"] = reviews_df["tokens"].map(len) 

In [38]:
reviews_df

Unnamed: 0,Review,Star Rating,tokens,num_tokens
0,"\nFirst, this charger looks fantastic. I got t...",4.0 out of 5 stars,"[first, this, charger, looks, fantastic, i, go...",354
1,\nUPDATE: Well...I was wrong about this wirel...,5.0 out of 5 stars,"[update, welli, was, wrong, about, this, wirel...",822
2,\nThis works perfectly on the side table while...,5.0 out of 5 stars,"[this, works, perfectly, on, the, side, table,...",65
3,\nI purchased this charger for use in our camp...,5.0 out of 5 stars,"[i, purchased, this, charger, for, use, in, ou...",49
4,"\nGot this for my Samsung Galaxy S23 phone, an...",5.0 out of 5 stars,"[got, this, for, my, samsung, galaxy, s23, pho...",154
...,...,...,...,...
4795,\nIt’s PERFECT for traveling. No more differen...,5.0 out of 5 stars,"[it’s, perfect, for, traveling, no, more, diff...",22
4796,\n\n\n\n\n The media could ...,5.0 out of 5 stars,"[the, media, could, not, be, loaded, this, is,...",48
4797,\nImpressed taking it out of the box. All met...,5.0 out of 5 stars,"[impressed, taking, it, out, of, the, box, all...",47
4798,\nI love the design of this charger. Color on ...,3.0 out of 5 stars,"[i, love, the, design, of, this, charger, colo...",47


In [39]:
# Calls to descriptive_stats
descriptive_stats(reviews_df, num_tokens=10)

There are 4800 tokens in the data.
There are 4 unique tokens in the data.
There are 33 characters in the data.
The lexical diversity is 0.001 in the data.
[('Review', 1), ('Star Rating', 1), ('tokens', 1), ('num_tokens', 1)]


[4800, 4, 33, 0.0008333333333333334]