<a href="https://colab.research.google.com/github/Chootana/AmazonReviewsScraping/blob/master/amazon_reviews_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup (notebook shell commands, run once per runtime).
# Refresh the apt package index, then install the chromium-chromedriver package.
!apt-get update 
!apt install chromium-chromedriver 

# Python packages used by the scraper: selenium (browser automation)
# and beautifulsoup4 (HTML parsing).
!pip install selenium 
!pip install beautifulsoup4

# Copy the chromedriver binary into /usr/bin
# (presumably so it can be found on PATH by selenium -- confirm).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
# import 
import sys 
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from bs4 import BeautifulSoup

import time 
import pandas as pd 

In [None]:
class AmazonScraping():
    """Scrape the customer reviews of an Amazon product and save them as CSV.

    Pages are rendered with a headless Chrome driven by selenium and parsed
    with BeautifulSoup. The typical entry point is :meth:`run`.
    """

    @staticmethod
    def _to_reviews_url(url):
        """Return *url* with its ``/dp/`` path segment turned into ``/product-reviews/``.

        Only the first ``/dp/`` segment is rewritten, so a "dp" that merely
        appears inside the product slug or the query string is left alone.
        A URL without a ``/dp/`` segment is returned unchanged.
        """
        return url.replace('/dp/', '/product-reviews/', 1)

    def get_page_from_(self, url):
        """
        Load a page in headless Chrome and return its HTML.

        Parameters
        -------
        url: str

        Returns
        -------
        text: str 
            source code for this url
        """

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Do not pass the driver path positionally: newer Selenium 4 releases
        # no longer accept it there. Without a path, selenium looks up
        # "chromedriver" on PATH, which is what the original call asked for.
        wd = webdriver.Chrome(options=chrome_options)

        try:
            wd.get(url)
            # NOTE(review): an implicit wait configures element lookups, and
            # none happen here -- kept from the original; confirm it is needed.
            wd.implicitly_wait(10)
            return wd.page_source
        finally:
            # Always stop the browser, even when loading the page fails,
            # so no headless Chrome process is left behind.
            wd.quit()

    def get_all_reviews(self, url):
        """
        Collect the reviews from every page of a product's review listing.

        Parameters
        --------
        url: str
            product page url (containing ``/dp/``) or review page url

        Returns
        --------
        reviews: dict
            ratings, titles, texts
        """
        # Local import keeps this class self-contained with respect to the
        # file's import cell.
        from urllib.parse import urljoin

        reviews = {
            'ratings': [],
            'titles': [],
            'texts': [],
        }

        # change the page to product-reviews page
        url = self._to_reviews_url(url)
        print('[URL] {}'.format(url))

        idx = 1
        while True:
            res = self.get_page_from_(url)
            soup_amazon = BeautifulSoup(res, features='lxml')

            print(f'# {idx} searching')
            review_ratings = soup_amazon.select('.review-rating')
            review_titles = soup_amazon.select('.review-title-content')
            review_texts = soup_amazon.select('.review-text')

            if not (len(review_ratings) == len(review_titles) == len(review_texts)):
                # The three selectors are matched independently; when their
                # counts differ only the common prefix is kept instead of
                # failing with an IndexError.
                print('[Warning] page {}: {} ratings, {} titles, {} texts'.format(
                    idx, len(review_ratings), len(review_titles), len(review_texts)))

            for rating, title, text in zip(review_ratings, review_titles, review_texts):
                reviews['ratings'].append(rating.text.split(" ")[0])
                reviews['titles'].append(title.text)
                reviews['texts'].append(text.text)

            next_page = soup_amazon.select('li.a-last a')
            href = next_page[0].attrs.get('href') if next_page else None
            if not href:
                break

            # Resolve the (usually relative) link against the current page so
            # the scraper follows whichever Amazon domain it started on.
            url = urljoin(url, href)
            print('[URL] {}'.format(url))

            idx += 1
            # Pause between page loads.
            time.sleep(1)

        print('[Scraping] finish.')
        return reviews

    def save_reviews_as_csv(self, reviews, path_csv='./reviews.csv'):
        """
        Write the collected reviews to a CSV file.

        Parameters
        --------
        reviews: dict
            column name -> list of values (ratings, titles, texts)
        path_csv: str
            destination path of the CSV file
        """

        # Building row-wise and transposing tolerates columns of unequal length.
        df_reviews = pd.DataFrame.from_dict(reviews, orient='index').T
        # utf_8_sig writes a BOM at the start of the file.
        df_reviews.to_csv(path_csv, encoding='utf_8_sig')
        print('[Save] save to {}'.format(path_csv))

    def run(self, url, path_csv='./reviews.csv'):
        """
        Scrape all reviews for *url* and save them to *path_csv*.

        Parameters
        --------
        url: str
            product page url
        path_csv: str
            destination path of the CSV file

        Raises
        --------
        AssertionError
            if any of the ratings / titles / texts lists came back empty
        """

        reviews = self.get_all_reviews(url)

        # Raised explicitly (rather than with `assert`) so the check still
        # runs under `python -O`; the exception type is unchanged.
        if not all(reviews.values()):
            raise AssertionError('[False] There are no results.')

        self.save_reviews_as_csv(reviews, path_csv)
        print('[Finish]')
        



In [None]:
#@title ## Amazon Reviews Scraping

#@markdown --- 
#@markdown ### Enter URL
url = "https://www.amazon.co.jp/%E3%83%8B%E3%83%99%E3%82%A2-%E3%83%AA%E3%83%83%E3%83%97%E3%82%B1%E3%82%A2-%E3%83%93%E3%82%BF%E3%83%9F%E3%83%B3E-3-9g/dp/B001PM2L72/ref=sr_1_7_mod_primary_new?dchild=1&keywords=%E3%83%8B%E3%83%99%E3%82%A2+%E3%83%AA%E3%83%83%E3%83%97&qid=1608550669&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sr=8-7" #@param {type:"string"}


#@markdown --- 
#@markdown ### Enter Save Path
#@markdown ex.) ./reviews.csv
path_csv = "./test.csv" #@param {type:"string"}

#@markdown --- 


# Scrape every review page reachable from `url` and write the result to `path_csv`.
amazon_scraping = AmazonScraping()
amazon_scraping.run(url, path_csv)
