### Import libraries

We are going to scrape the **Trustpilot** website.

In [1]:
# Turn off Jedi-based tab completion for this IPython session.
%config Completer.use_jedi=False

In [2]:
import json
import time

from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import InvalidSessionIdException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from tqdm.notebook import tqdm

In [3]:
# Site root; the relative paths used in later cells are appended to it.
base_url = "https://trustpilot.com"

In [4]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with the ``lxml`` parser.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the raw response bytes.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP 4xx/5xx status (instead of silently parsing an error page).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')

* **Fetching category and sub-category URLs**

In [5]:
# Build {category name: {sub-category name: relative URI}} from the /categories page.
data_full = {}

# Class string identifying the sub-category anchors; kept in one place so the
# anchor is looked up once per item instead of twice.
sub_category_link_class = 'link_internal__YpiJI typography_typography__23IQz typography_weight-inherit__2IsoB typography_fontstyle-inherit__PIgau link_navigation__2cxCi'

soup = get_soup(base_url + '/categories')
for category in soup.find_all('div', {'class': 'categories_subCategory__3OxUx'}):
    header = category.find('h3', {'class': 'categories_subCategoryHeader__3Bd4c'})
    if header is None:
        # No header means no usable key for this card; skip rather than crash.
        continue
    name = header.text.strip()
    data_full[name] = {}

    sub_categories = category.find('div', {'class': 'categories_subCategoryList__1FB-L'})
    if sub_categories is None:
        continue

    for sub_category in sub_categories.find_all('div', {'class': 'categories_subCategoryItem__2Qwj8'}):
        link = sub_category.find('a', {'class': sub_category_link_class})
        if link is None:
            continue
        sub_category_uri = link.get('href')
        if sub_category_uri is None:
            continue
        data_full[name][link.text] = sub_category_uri

In [6]:
# Number of top-level entries collected in data_full.
len(data_full)

22

* **Function to fetch company URLs referenced in a given subcategory**

In [7]:
def extract_company_urls_form_page():
    """Collect the company review URLs linked from the page currently loaded in ``driver``.

    Reads the module-level Selenium ``driver``. Anchors are selected by their
    CSS class, and only hrefs starting with the Trustpilot review prefix are
    kept, because the same class is also used for other links.

    Returns:
        list[str]: Unique review URLs in first-seen order. Anchors without an
        ``href`` attribute are ignored.
    """
    review_prefix = 'https://www.trustpilot.com/review/'
    anchors = driver.find_elements(By.XPATH, '//a[@class="link_internal__YpiJI link_wrapper__LEdx5"]')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(
        href for href in hrefs if href and href.startswith(review_prefix)
    ))

This function checks whether the current page has a "next page" pagination link:

In [8]:
def go_next_page():
    """Look for the "next page" pagination link on the page loaded in ``driver``.

    Despite the name, this does not navigate anywhere; it only locates the link.
    Uses the ``find_element(By.XPATH, ...)`` form, which is available in both
    Selenium 3 and Selenium 4 (the ``find_element_by_*`` helpers were removed).

    Returns:
        tuple: ``(True, element)`` when the link is present, ``(False, None)``
        when Selenium raises ``NoSuchElementException``.
    """
    next_link_xpath = '//a[@class="link_internal__YpiJI button_button__3sN8k button_large__3HOoE button_primary__2eJ8_ link_button__13BH6 pagination-link_next__1ld6a pagination-link_rel__3ZMei pagination-link_wideOnMobile__7WJ4L"]'
    try:
        button = driver.find_element(By.XPATH, next_link_xpath)
    except NoSuchElementException:
        return False, None
    return True, button

We start by initializing Selenium with a headless Chromedriver:

In [9]:
# Chrome options for the shared Selenium driver used by the scraping cells.
options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    'start-maximized',
    'disable-infobars',
    '--disable-extensions',
):
    options.add_argument(chrome_flag)

# Presumably value 2 blocks image loading to speed up page loads -- TODO confirm.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# NOTE(review): passing the chromedriver path positionally is the Selenium 3
# style; newer Selenium releases expect a Service object -- confirm the pinned version.
driver = webdriver.Chrome('driver/chromedriver', options=options)

# Seconds WebDriverWait is given when waiting for page content.
timeout = 4

Let's launch our web scraping!

In [11]:
def _open_listing_page(url):
    """Load ``url`` in the shared driver and wait up to ``timeout`` seconds for a company card."""
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'styles_businessUnitCard__3nW5f')))
    except TimeoutException:
        # Best-effort wait: a page without cards is still scraped and simply
        # yields no URLs. Other errors (dead session, Ctrl-C) now propagate
        # instead of being swallowed by a bare ``except``.
        pass


# NOTE(review): `data` is not defined in any cell shown here; presumably it is a
# subset of `data_full` created in a cell that was removed -- confirm before re-running.
company_urls = {}
for category in tqdm(data):
    for sub_category in tqdm(data[category], leave=False):
        company_urls[sub_category] = []

        # Listing URL for page 1; later pages append "&page=N" to the same URL.
        listing_url = base_url + data[category][sub_category] + "?numberofreviews=0&timeperiod=0&status=all"
        _open_listing_page(listing_url)

        page_number = 1
        while True:
            company_urls[sub_category] += extract_company_urls_form_page()

            # Only the presence flag is needed; navigation is done by URL, not by clicking.
            has_next_page = go_next_page()[0]
            if not has_next_page:
                break

            page_number += 1
            _open_listing_page(listing_url + f'&page={page_number}')

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# Flatten the nested results into one (category, sub_category, company_url) row per URL.
consolidated_data = [
    (category, sub_category, url)
    for category in data
    for sub_category in data[category]
    for url in company_urls[sub_category]
]

df_consolidated_data = pd.DataFrame(
    consolidated_data,
    columns=['category', 'sub_category', 'company_url'],
)

# Persist the table without the index column.
df_consolidated_data.to_csv('consolidate_company_urls.csv', index=False)