# Team 6 - Web Scraping Amazon Reviews on TOZO W1 Wireless Charger
### Lai Ieng Chan
### Christopher Garcia
### Minsu Kim

## API Scraping & Data Ingestion

In [23]:
# Importing necessary libraries
import os
import pandas as pd
import datetime
import re
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
from urllib.parse import urlencode

In [56]:
# After inspecting the HTML of the review page, we can see that the HTML line which we are trying 
# to scrape and extract follows the below format:
# <span class="a-profile-name">Nick</span>
# where span is the tag for the HTML line and the class is "a-profile-name"

# Define the list of URL that will be scraped

# Define the base URL
base_url = 'https://www.amazon.com/TOZO-Wireless-Upgraded-Sleep-Friendly-FastCharging/product-reviews/B07FM8R7J1/ref=cm_cr_getr_d_paging_btm_next_{}?ie=UTF8&reviewerType=all_reviews&pageNumber={}'

# Set the desired maximum number of pages
max_pages = 500  

# Generate the list of URLs
list_of_urls = [base_url.format(page_number, page_number) for page_number in range(1, max_pages + 1)]

In [57]:
# Retrieve each of the url's HTML data and convert the data into a beautiful soup object
# Find, extract and store reviewer names and review text into a list

# We will extract the data from the webpage via ScrapeAPI

names = []
reviews = []
data_string = ""

for url in list_of_urls: 
    params = {'api_key': "b7b98ea4369cf7a4e9b3dbf94634738e", 'url': url}
    response = requests.get('http://api.scraperapi.com/',   params=urlencode(params))
    soup = BeautifulSoup(response.text, 'html.parser')

    for item in soup.find_all("span", class_="a-profile-name"):
      data_string = data_string + item.get_text()
      names.append(data_string)
      data_string = ""  
    
    for item in soup.find_all("span", {"data-hook": "review-body"}):
      data_string = data_string + item.get_text()
      reviews.append(data_string)
      data_string = ""

In [58]:
# Let's set up a dictionary to hold our customer names and reviews
reviews_dict = {'Reviewer Name': names, 'Reviews': reviews}

# Print the lengths of each list.  
print(len(names), len(reviews))

6358 4800


In [59]:
reviews_dict

{'Reviewer Name': ['Nick',
  'Lifeseeker',
  'Nick',
  'AaronR',
  'AaronR',
  'William',
  'Gabba Hey',
  'Zellie',
  'Leticia Ramos',
  'Pizzle',
  'GenericAmazonShopper',
  'LPS',
  'Urquialuis',
  'Nick',
  'Lifeseeker',
  'Paul S',
  'TN',
  'TN',
  'bigmac',
  'Amazon Customer',
  'Blue Sky',
  'Nats Lady',
  'ChancesR',
  'Mary Jane Rickman',
  'Krissi',
  'TJ',
  'Nick',
  'Lifeseeker',
  'Carol R.',
  'Robert Martinez',
  'Jim M',
  'Kathy peek',
  'Amazon Customer',
  'Gpa Jer',
  'Thomas Hixson',
  'Valentina Pevec',
  'Amazon Customer',
  'Jeffrey Howard',
  'Nick',
  'Lifeseeker',
  'Roberta S',
  'CrashnTiger',
  'Abigail',
  'Phil',
  'Krysta',
  'Amazon Customer',
  'Lifeseeker',
  'Bob in Vegas',
  'Alex G.',
  'Justin Bartrum',
  'Nick',
  'Lifeseeker',
  'Tiffany',
  'Quinn',
  'Dai Low',
  'Aaron',
  'Alex and Heather Chapman',
  'Erica B',
  'Erica B',
  'Richard Lynn',
  'Charles Schneider',
  'jgj',
  'prncez1921',
  'Nick',
  'Lifeseeker',
  'Paul W Currie',
  '

In [60]:
# Convert the dictionary to a DataFrame
reviews_df = pd.DataFrame.from_dict(reviews_dict, orient='index')

# Convert dataframe to CSV file for easier assess in the future
reviews_df.to_csv('reviews.csv', index=False, header=True)

In [61]:
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6348,6349,6350,6351,6352,6353,6354,6355,6356,6357
Reviewer Name,Nick,Lifeseeker,Nick,AaronR,AaronR,William,Gabba Hey,Zellie,Leticia Ramos,Pizzle,...,Amazon Customer,prat,Jim Lloyd,Jaritza Prado Paredes,Sherzod,J,Stefannie F,Chase Brock,Madrussian81,Gerardo
Reviews,"\nFirst, this charger looks fantastic. I got t...",\nUPDATE: Well...I was wrong about this wirel...,\nI purchased this charger for use in our camp...,"\nGot this for my Samsung Galaxy S23 phone, an...",\nCame here to see if there was any tips on ho...,\nI ordered this charger after using one at my...,\nCharges like advertised. I did have to take ...,\nDevice must be placed carefully or it will f...,\nI wanted something to use instead of the typ...,\nCumple su trabajo\n,...,,,,,,,,,,


## Data Proprocessing

In [62]:
# Reading the downloaded raw data into a dataframe
reviews_df = pd.read_csv('C:\\Users\\annie\\Downloads\\reviews.csv')
reviews_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6348,6349,6350,6351,6352,6353,6354,6355,6356,6357
0,Nick,Lifeseeker,Nick,AaronR,AaronR,William,Gabba Hey,Zellie,Leticia Ramos,Pizzle,...,Amazon Customer,prat,Jim Lloyd,Jaritza Prado Paredes,Sherzod,J,Stefannie F,Chase Brock,Madrussian81,Gerardo
1,"\nFirst, this charger looks fantastic. I got t...",\nUPDATE: Well...I was wrong about this wirel...,\nI purchased this charger for use in our camp...,"\nGot this for my Samsung Galaxy S23 phone, an...",\nCame here to see if there was any tips on ho...,\nI ordered this charger after using one at my...,\nCharges like advertised. I did have to take ...,\nDevice must be placed carefully or it will f...,\nI wanted something to use instead of the typ...,\nCumple su trabajo\n,...,,,,,,,,,,
