In [1]:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import pickle
import requests
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import re
import subprocess
from selenium import webdriver

# Ratings Over Time.
# Frequency of Review Words (Minus Stop Words).
# Out of All Positive Reviews - Number of People Who Found it Helpful; Out of All Negative Reviews - "".
# Sentiment Analysis: Buy, Caution, Don't Buy (Average Out the Sentiment).
# Useful Review Classification.


In [2]:
def setup_docker_scrapy():
    subprocess.call("docker pull scrapinghub/splash")
    subprocess.call("docker run -dit -p 8050:8050 scrapinghub/splash")

def end_docker_image():
    subprocess.call("docker system prune -a --all -f")

In [3]:
# Setup docker image to enable scraping on Amazon.
setup_docker_scrapy()

In [9]:
def get_data(review_total_data, url, current_page):
    print(current_page)
    resp = requests.get("http://localhost:8050/render.html", params={'url': url+str(current_page), 'wait': 3})
    html = resp.content
    soup = bs.BeautifulSoup(html, "lxml")
    reached_limit = False
    
    # Get all reviews.
    reviews = soup.find_all('div', {'data-hook': 'review'})
    print(len(reviews))
    review_number = 1
    for review in reviews:
        print("- " + str(review_number))
        # Extract Review ID
        id_pattern = re.compile("[A-Z0-9_]{1,100}-review-card")
        # Extract Datetime from String.
        datetime_index = review.find('span', {'data-hook': 'review-date'}).text.find("on")
        datetime = review.find('span', {'data-hook': 'review-date'}).text[datetime_index + 2:]
        # Extract Review Helpfulness.
        try:
            helpful = re.sub("[^0-9]", "", review.find('span', {'data-hook': 'helpful-vote-statement'}).text).strip()
        except:
            helpful = "0"
        # Extract Review Ratings.
        try:
            rating = float(review.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip())
        except:
            rating = float(review.find('i', {'data-hook': 'cmps-review-star-rating'}).text.replace('out of 5 stars', '').strip())
        review_data = {
            'id': review.find('div', id=id_pattern)["id"].replace("-review-card", "").lower(),
            'review': review.find('span', {'data-hook': 'review-body'}).text.strip(),
            'rating': rating,
            'helpfulness': helpful,
            'datetime': datetime.strip(),
            'additional': review.find('a', {'data-hook': 'format-strip'}).text,
        }
        review_total_data.append(review_data)
        review_number += 1
    # Pagination.
    url = None
    next_page_ul = soup.find("ul", {"class": "a-pagination"})
    if next_page_ul.find("li", {"class": "a-disabled a-last"}):
        reached_limit = True
    return reached_limit, review_total_data

In [6]:
url = "https://www.amazon.com/Apple-iPhone-Pro-512GB-Gold/product-reviews/B08BHSCF3N/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber="
current_page=2
review_total_data = []
resp = requests.get("http://localhost:8050/render.html", params={'url': url+str(current_page), 'wait': 3})
html = resp.content
soup = bs.BeautifulSoup(html, "lxml")
reached_limit = False

# Get all reviews.
reviews = soup.find_all('div', {'data-hook': 'review'})
print(len(reviews))
review_number = 1
for review in reviews:
    print("- " + str(review_number))
    # Extract Review ID
    id_pattern = re.compile("[A-Z0-9_]{1,100}-review-card")
    # Extract Datetime from String.
    datetime_index = review.find('span', {'data-hook': 'review-date'}).text.find("on")
    datetime = review.find('span', {'data-hook': 'review-date'}).text[datetime_index + 2:]
    # Extract Review Helpfulness.
    try:
        helpful = re.sub("[^0-9]", "", review.find('span', {'data-hook': 'helpful-vote-statement'}).text).strip()
    except:
        helpful = "0"
    # Extract Review Ratings.
    try:
        rating = float(review.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip())
    except:
        rating = float(review.find('i', {'data-hook': 'cmps-review-star-rating'}).text.replace('out of 5 stars', '').strip())
    review_data = {
        'id': review.find('div', id=id_pattern)["id"].replace("-review-card", "").lower(),
        'review': review.find('span', {'data-hook': 'review-body'}).text.strip(),
        'rating': rating,
        'helpfulness': helpful,
        'datetime': datetime.strip(),
        'additional': review.find('a', {'data-hook': 'format-strip'}).text,
    }
    review_total_data.append(review_data)

10
- 1
- 1
- 1
- 1
- 1
- 1
- 1
- 1
- 1
- 1


In [10]:
review_total_data[5]

{'id': 'r365nnqol3v3gg',
 'review': 'Video Player is loading.Play VideoPlayMuteCurrent Time 0:00/Duration 0:00Loaded: 0%Stream Type LIVESeek to live, currently playing liveLIVERemaining Time -0:00 Playback Rate1xChaptersChaptersDescriptionsdescriptions off, selectedCaptionscaptions and subtitles off, selectedAudio TrackFullscreenThis is a modal window.\n\n\n\n  \xa0Espectacular diseño todo funciona 100% prácticamente nuevo . Lo malo es el valor y se pierden los accesorios por que solo viene con su cargador y de paso no es original . Pero 100% recomendado muy buen articulo y llego en 2 días a chile .',
 'rating': 5.0,
 'helpfulness': '9',
 'datetime': 'August 2, 2020',
 'additional': 'Size: 256GBColor: GoldService Provider: UnlockedProduct grade: Renewed'}

In [5]:
# Get Data.
sort_by = "top"
review_total_data = []
current_page = 1
reached_limit = False
url = "https://www.amazon.com/Apple-MacBook-13-inch-256GB-Storage/product-reviews/B08N5LNQCX/ref=cm_cr_arp_d_viewopt_rvwer?ie=UTF8&reviewerType=avp_only_reviews&pageNumber="

while not reached_limit:
    try:
        reached_limit, review_total_data = get_data(review_total_data, url, current_page)
        current_page += 1
    except:
        reached_limit, review_total_data = get_data(review_total_data, url, current_page)
        current_page += 1

1
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
2
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
3
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
4
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
5
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
6
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
7
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
8
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
9
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
10
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
11
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
12
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
13
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
14
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
15
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
16
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
17
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
18
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
19
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
20
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
21
10
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
22
10
- 1
- 2
- 3
- 4


In [10]:
len(review_total_data)

1234

In [11]:
review_total_data

[{'id': 'r2ma3lkg5j5jb3',
  'review': 'Beautiful laptop. I’ve upgraded from the MacBook Air of old...so far this laptop has blown it away. It wasn’t overly pricey to be honest and I feel it’s worth the price. Overall look is great and keyboard feel is nice. Larger touchpad is great, I’ve paired my Magic Trackpad 2 with it as well.',
  'rating': 5.0,
  'helpfulness': '289',
  'datetime': 'November 29, 2020',
  'additional': 'Capacity: 256GBColor: Space Gray'},
 {'id': 'r2xa6kr3rqgrma',
  'review': 'I have two MacBook Airs (MBA), a 13" and a 11, both with the previous form factor (the wide, body colored bezel around the screen, 1280x800 resolution) and this new 13" MBA improves upon it in every way (almost) and makes it nearly perfect.Good:- SCREEN looks great!  2560x1600 resolution is bright and sharp and looks good with the thin black bezel.  I usually don\'t go above 50% brightness.- KEYBOARD is good.  Some have mentioned that it still sucks, but I find it much better than the older M

In [12]:
# Clean Data.
df = pd.DataFrame(review_total_data)
df.columns = ["review_id", "review_text", "review_rating", "rating_helpfulness", "review_datetime", "review_add"]
df["review_text"] = df["review_text"].apply(lambda x: str(x).strip())
df["review_id"] = df["review_id"].apply(lambda x: str(x).replace("-review-card", "").lower())

# Extract Additional Data.
df["product_capacity"] = df.review_add.apply(lambda x: re.findall(r"[0-9]{1,4}GB", x)[0])
df["product_color"] = df.review_add.apply(lambda x: re.sub(r"Capacity: [0-9]{1,4}GBColor: ", "", x))

# Convert Column Types.
df["rating_helpfulness"] = pd.to_numeric(df["rating_helpfulness"])
df["review_datetime"] = pd.to_datetime(df["review_datetime"])

df.drop_duplicates(subset=["review_id", "review_datetime"], keep="first", inplace=True)
df["review_text"].replace('', np.nan, inplace=True)
df = df[df["review_text"].notna()]
df

Unnamed: 0,review_id,review_text,review_rating,rating_helpfulness,review_datetime,review_add,product_capacity,product_color
0,r2ma3lkg5j5jb3,Beautiful laptop. I’ve upgraded from the MacBo...,5.0,289.0,2020-11-29,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1,r2xa6kr3rqgrma,"I have two MacBook Airs (MBA), a 13"" and a 11,...",5.0,202.0,2020-12-02,Capacity: 256GBColor: Space Gray,256GB,Space Gray
2,r2u37mcgx4amd6,No complains so far. Very fast and performs g...,5.0,231.0,2020-11-26,Capacity: 256GBColor: Gold,256GB,Gold
3,r1mdj8zt1bgs62,"Lightweight, blazing fast, easy to use, batter...",3.0,106.0,2020-12-03,Capacity: 256GBColor: Silver,256GB,Silver
4,r5ueguatd3ejm,Bought this laptop bc i needed it for spring s...,5.0,78.0,2021-01-30,Capacity: 512GBColor: Gold,512GB,Gold
...,...,...,...,...,...,...,...,...
1228,ro07n30dhf1fm,Ótimo,5.0,0.0,2021-06-22,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1229,r3n011wq9ssw6t,Tudo funciona,5.0,0.0,2021-09-01,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1230,r3cfkp54tfinl1,Eficiência,5.0,0.0,2021-08-11,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1231,r2gyisauplomdb,Muito bom!,5.0,0.0,2021-09-16,Capacity: 256GBColor: Space Gray,256GB,Space Gray


In [83]:
def translate_reviews(df):
    # Translate all reviews to English.
    from deep_translator import GoogleTranslator

    review_text = list(df["review_text"].values)
    review_text_translated = GoogleTranslator(source='auto', target='english').translate_batch(review_text)
    return review_text_translated

In [85]:
def detect_language(df, review_text_translated):
    # Detect Language.
    from langdetect import detect

    language_full_name = {'aa':'Afar','ab':'Abkhazian','af':'Afrikaans','ak':'Akan','sq':'Albanian','am':'Amharic','ar':'Arabic',
    'an':'Aragonese','hy':'Armenian','as':'Assamese','av':'Avaric','ae':'Avestan','ay':'Aymara','az':'Azerbaijani','ba':'Bashkir',
    'bm':'Bambara','eu':'Basque','be':'Belarusian','bn':'Bengali','bh':'Bihari languages','bi':'Bislama','bo':'Tibetan','bs':'Bosnian',
    'br':'Breton','bg':'Bulgarian','my':'Burmese','ca':'Catalan; Valencian','cs':'Czech','ch':'Chamorro','ce':'Chechen','zh':'Chinese',
    'cu':'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic','cv':'Chuvash','kw':'Cornish','co':'Corsican',
    'cr':'Cree','cy':'Welsh','cs':'Czech','da':'Danish','de':'German','dv':'Divehi; Dhivehi; Maldivian','nl':'Dutch; Flemish','dz':'Dzongkha',
    'el':'Greek-Modern (1453-)','en':'English','eo':'Esperanto','et':'Estonian','eu':'Basque','ee':'Ewe','fo':'Faroese','fa':'Persian',
    'fj':'Fijian','fi':'Finnish','fr':'French','fy':'Western Frisian','ff':'Fulah','Ga':'Georgian','gd':'Gaelic; Scottish Gaelic','ga':'Irish',
    'gl':'Galician','gv':'Manx','el':'Greek-Modern (1453-)','gn':'Guarani','gu':'Gujarati','ht':'Haitian; Haitian Creole','ha':'Hausa',
    'he':'Hebrew','hz':'Herero','hi':'Hindi','ho':'Hiri Motu','hr':'Croatian','hu':'Hungarian','hy':'Armenian','ig':'Igbo','is':'Icelandic',
    'io':'Ido','ii':'Sichuan Yi; Nuosu','iu':'Inuktitut','ie':'Interlingue; Occidental',
    'ia':'Interlingua (International Auxiliary Language Association)','id':'Indonesian','ik':'Inupiaq','is':'Icelandic','it':'Italian',
    'jv':'Javanese','ja':'Japanese','kl':'Kalaallisut; Greenlandic','kn':'Kannada','ks':'Kashmiri','ka':'Georgian','kr':'Kanuri','kk':'Kazakh',
    'km':'Central Khmer','ki':'Kikuyu; Gikuyu','rw':'Kinyarwanda','ky':'Kirghiz; Kyrgyz','kv':'Komi','kg':'Kongo','ko':'Korean',
    'kj':'Kuanyama; Kwanyama','ku':'Kurdish','lo':'Lao','la':'Latin','lv':'Latvian','li':'Limburgan; Limburger; Limburgish','ln':'Lingala',
    'lt':'Lithuanian','lb':'Luxembourgish; Letzeburgesch','lu':'Luba-Katanga','lg':'Ganda','mk':'Macedonian','mh':'Marshallese',
    'ml':'Malayalam','mi':'Maori','mr':'Marathi','ms':'Malay','Mi':'Micmac','mk':'Macedonian','mg':'Malagasy','mt':'Maltese',
    'mn':'Mongolian','mi':'Maori','ms':'Malay','my':'Burmese','na':'Nauru','nv':'Navajo; Navaho','nr':'Ndebele-South; South Ndebele',
    'nd':'Ndebele-North; North Ndebele','ng':'Ndonga','ne':'Nepali','nl':'Dutch; Flemish','nn':'Norwegian Nynorsk; Nynorsk:Norwegian',
    'nb':'Bokmål-Norwegian; Norwegian Bokmål','no':'Norwegian','oc':'Occitan (post 1500)','oj':'Ojibwa','or':'Oriya','om':'Oromo',
    'os':'Ossetian; Ossetic','pa':'Panjabi; Punjabi','fa':'Persian','pi':'Pali','pl':'Polish','pt':'Portuguese','ps':'Pushto; Pashto',
    'qu':'Quechua','rm':'Romansh','ro':'Romanian; Moldavian; Moldovan','rn':'Rundi','ru':'Russian','sg':'Sango','sa':'Sanskrit',
    'si':'Sinhala; Sinhalese','sk':'Slovak','sk':'Slovak','sl':'Slovenian','se':'Northern Sami','sm':'Samoan','sn':'Shona','sd':'Sindhi',
    'so':'Somali','st':'Sotho-Southern','es':'Spanish; Castilian','sq':'Albanian','sc':'Sardinian','sr':'Serbian','ss':'Swati',
    'su':'Sundanese','sw':'Swahili','sv':'Swedish','ty':'Tahitian','ta':'Tamil','tt':'Tatar','te':'Telugu','tg':'Tajik','tl':'Tagalog',
    'th':'Thai','bo':'Tibetan','ti':'Tigrinya','to':'Tonga (Tonga Islands)','tn':'Tswana','ts':'Tsonga','tk':'Turkmen','tr':'Turkish',
    'tw':'Twi','ug':'Uighur; Uyghur','uk':'Ukrainian','ur':'Urdu','uz':'Uzbek','ve':'Venda','vi':'Vietnamese','vo':'Volapük','cy':'Welsh',
    'wa':'Walloon','wo':'Wolof','xh':'Xhosa','yi':'Yiddish','yo':'Yoruba','za':'Zhuang; Chuang','zh':'Chinese','zu':'Zulu'}

    df["review_text_translated"] = review_text_translated
    foreign_reviewers_df = df[df["review_text"] != df["review_text_translated"]][["review_id", "review_text"]]
    foreign_reviewers_df["review_lang"] = [language_full_name[detect(text)] for text in list(foreign_reviewers_df.review_text.values)]
    df = df.merge(foreign_reviewers_df, on=["review_id", "review_text"], how='left')
    df["review_lang"].fillna("English", inplace=True)

    df = df[[
        "review_id", 
        "review_text", 
        "review_text_translated", 
        "review_lang",
        "review_rating", 
        "rating_helpfulness", 
        "review_datetime",
        "review_add",
        "product_capacity",
        "product_color"]]

    return df


In [81]:
df

Unnamed: 0,review_id,review_text,review_text_translated,review_lang,review_rating,rating_helpfulness,review_datetime,review_add,product_capacity,product_color
0,r2ma3lkg5j5jb3,Beautiful laptop. I’ve upgraded from the MacBo...,Beautiful laptop. I’ve upgraded from the MacBo...,English,5.0,289.0,2020-11-29,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1,r2xa6kr3rqgrma,"I have two MacBook Airs (MBA), a 13"" and a 11,...","I have two MacBook Airs (MBA), a 13"" and a 11,...",English,5.0,202.0,2020-12-02,Capacity: 256GBColor: Space Gray,256GB,Space Gray
2,r2u37mcgx4amd6,No complains so far. Very fast and performs g...,No complains so far. Very fast and performs g...,English,5.0,231.0,2020-11-26,Capacity: 256GBColor: Gold,256GB,Gold
3,r1mdj8zt1bgs62,"Lightweight, blazing fast, easy to use, batter...","Lightweight, blazing fast, easy to use, batter...",English,3.0,106.0,2020-12-03,Capacity: 256GBColor: Silver,256GB,Silver
4,r5ueguatd3ejm,Bought this laptop bc i needed it for spring s...,Bought this laptop bc i needed it for spring s...,English,5.0,78.0,2021-01-30,Capacity: 512GBColor: Gold,512GB,Gold
...,...,...,...,...,...,...,...,...,...,...
1226,ro07n30dhf1fm,Ótimo,Excellent,Lithuanian,5.0,0.0,2021-06-22,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1227,r3n011wq9ssw6t,Tudo funciona,everything works,Spanish; Castilian,5.0,0.0,2021-09-01,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1228,r3cfkp54tfinl1,Eficiência,Efficiency,Portuguese,5.0,0.0,2021-08-11,Capacity: 256GBColor: Space Gray,256GB,Space Gray
1229,r2gyisauplomdb,Muito bom!,Very good!,Portuguese,5.0,0.0,2021-09-16,Capacity: 256GBColor: Space Gray,256GB,Space Gray


In [82]:
# Save to CSV
df.to_csv("product_review_data.csv")