In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# required for scraper with vpn
import time
import subprocess

import glob

# Trustpilot Scraper

In [71]:
# Scraper configuration.
# To avoid being blocked by Trustpilot, reduce the number of entries in
# website_list or lower max_pages.

# Maximum number of review pages to scrape per website
# (+ 1 because the value is used as the exclusive upper bound of range()).
max_pages = 500 + 1

# Sites to scrape; each entry is appended to https://www.trustpilot.com/review/
website_list = [
    'www.apple.com',
    'www.google.com',
    'www.microsoft.com',
    'www.facebook.com',
    'www.twitter.com',
    'www.amazon.com',
    'www.ebay.com',
    'www.porkbun.com',
    'www.godaddy.com',
    'www.1password.com',
    'nordvpn.com',
]

In [72]:
# scrape reviews from page
def get_reviews_from_page(page, website):
    """Extract all reviews found on one parsed review page.

    Parameters
    ----------
    page
        Parsed HTML document (a BeautifulSoup object in this notebook) that
        is searched for ``<article>`` elements.
    website : str
        Value stored in the ``site`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per ``<article>`` with the columns ``site``, ``date``,
        ``stars``, ``title``, ``text`` and ``location``. An empty frame
        without columns is returned when the page contains no article.
    """
    # Collect plain dicts and build the frame once at the end; concatenating
    # a new DataFrame per review copies all previous rows every time.
    records = []

    for article in page.find_all('article'):
        # the last <span> of the following <aside> is used as the location
        country = article.find_next('aside').find_all('span')[-1].string

        article_section = article.find_next('section')
        # first number in the rating image's alt text
        stars = re.search(r'\d+', article_section.find('img')['alt']).group()
        date = article_section.find('time')['datetime']
        title = article_section.find('h2').string

        # A section without a <p> (presumably a review that has a title but
        # no body - TODO confirm) used to raise AttributeError and abort the
        # whole scrape; store an empty text for it instead.
        paragraph = article_section.find('p')
        text_raw = paragraph.contents if paragraph is not None else []
        text = ' '.join(line.text for line in text_raw if line.text != '')

        records.append(
            {'site': website, 'date': date, 'stars': stars, 'title': title, 'text': text, 'location': country})

    return pd.DataFrame(records)

In [73]:
# get page and scrape reviews
def scrape_reviews_for_website(url):
    """Scrape the Trustpilot reviews of one website and save them as CSV.

    The first review page is fetched to read the number of available pages
    from the "last page" pagination button. Pages 2..N are then fetched,
    where N is capped by the notebook-level ``max_pages`` setting. All
    reviews are written to ``data/<url> - Trustpilot.csv`` (including the
    DataFrame index as the first, unnamed column).

    Parameters
    ----------
    url : str
        Website identifier appended to ``https://www.trustpilot.com/review/``.

    Raises
    ------
    requests.HTTPError
        If the first page is answered with an error status.
    requests.Timeout
        If a request does not respond within the timeout.
    """
    review_url = 'https://www.trustpilot.com/review/' + url
    # requests waits forever by default; bound every request so a stalled
    # connection cannot hang the notebook.
    timeout_seconds = 30

    first_response = requests.get(review_url, timeout=timeout_seconds)
    # The page count is read from this page, so an error response here must
    # fail loudly instead of silently producing an empty CSV.
    first_response.raise_for_status()
    soup = BeautifulSoup(first_response.text, 'lxml')

    last_page_link = soup.find('a', {'name': 'pagination-button-last'})
    if last_page_link is None:
        # No "last page" button: treat the site as having a single page of
        # reviews (this used to crash with AttributeError).
        max_available_pages = 1 + 1
    else:
        max_available_pages = int(last_page_link.find('span').string) + 1

    # already fetched site - no need for duplicate fetching
    page_frames = [get_reviews_from_page(soup, url)]

    # Scrape up to max_pages - 1 pages, or fewer if the site has fewer pages.
    for index in range(2, min(max_pages, max_available_pages)):
        # URL is composed of www.trustpilot.com/review + the website + the page number of the reviews as the query param page
        response = requests.get(
            review_url + '?page=' + str(index), timeout=timeout_seconds)
        page = BeautifulSoup(response.text, 'lxml')
        page_frames.append(get_reviews_from_page(page, url))

    # Concatenate once instead of re-copying the growing frame for every page.
    result_df = pd.concat(page_frames, ignore_index=True)

    result_df.to_csv('data/' + url + ' - Trustpilot.csv')

In [74]:
# Scrape every configured site in turn; each call writes that site's reviews
# to its own CSV file under data/.
for website in website_list:
    scrape_reviews_for_website(website)

In [None]:
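# scraper with PIA VPN - resetting the VPN connection before each website prevents blocking
# (disabled by default: uncomment the loop below to use it instead of the plain loop)
# works on Windows ONLY and requires the PIA Desktop Client as well as a PIA subscription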
# for website in website_list:
#
#     subprocess.call(
#         '"C:\Program Files\Private Internet Access\piactl.exe" disconnect')
#     subprocess.call(
#         '"C:\Program Files\Private Internet Access\piactl.exe" connect')
#
#     while True:
#         time.sleep(1)
#
#         if re.sub('\\r\\n', '',
#                   subprocess.run('"C:\Program Files\Private Internet Access\piactl.exe" get connectionstate',
#                                  stdout=subprocess.PIPE).stdout.decode('utf-8'),
#                   ) != 'Disconnected':
#             break
#
#     scrape_reviews_for_website(website)

# Combining all data sets

In [5]:
# Merge all per-site CSV files in data/ into data/all.csv.
output_file = 'data/all.csv'

# Skip a previously written all.csv: including it would duplicate every
# review when this cell is re-run. Separators are normalised so the check
# also works with Windows-style paths. Sorting makes the row order of the
# combined file independent of the order in which the OS lists the files.
filenames = sorted(
    name for name in glob.glob("data/*.csv")
    if name.replace('\\', '/') != output_file)

# The per-site files carry the DataFrame index as their first, unnamed
# column. Reading it as the index (which ignore_index discards below) drops
# it per file, instead of deleting whatever column ends up first after the
# concatenation.
dataframes = [pd.read_csv(name, index_col=0) for name in filenames]

result = pd.concat(dataframes, ignore_index=True)

result.to_csv(output_file, index=False)