In [163]:
import requests
from bs4 import BeautifulSoup
import pandas
from selenium import webdriver
import time


In [164]:
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(os.path.join(sys.path[0], "chromedriver"), options=options)


In [176]:
# constants here
CATEGORIES = ['activewear', 'tops', 'bottoms', 'denim', 'dresses', 'knitwear', 'outerwear', 'suits', 'basics', 'sleepwear', 'swimwear', 'maternity', 'plus-size', 'shoes', 'bags', 'accessories']

CATEGORY_ROOT_URL = 'https://directory.goodonyou.eco/categories/'

BRAND_ROOT_URL = 'https://directory.goodonyou.eco'

In [177]:
# brand page specifics
# maps field name to ['div type', 'class name'], #example text
BRAND_SCRAPING_MAP = {
    'name': ['h1', 'sc-kjoXOD hxRXn StyledHeading-sc-1rdh4aw-0 iadRcX'], # 'Article 22'
    'rating': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Rated: Good'
    'price': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Price : $$'
    'location': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Location: Australia'
    'planet': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # '5 out of 5'
    'people': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # '4 out of 5', 'Not applicable'
    'animals': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # same as above
    'description': ['div', 'sc-kgAjT kqcGSN'], # all text comes in chunks of <p>'s
}

In [178]:
# if a store has already been scraped, just add the category to this map and do not "rescrape" to save time
# we assume there is only one unique page per brand

STORE_CATEGORY_MAP = {}

In [179]:
def scroll(driver, timeout=2):
    scroll_pause_time = timeout

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(scroll_pause_time)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If heights are the same it will exit the function
            break
        last_height = new_height

# first scrape all of the brand names
def scrape_brand_names():
    for c in CATEGORIES:
        category_url = CATEGORY_ROOT_URL + c
        print(category_url)

        driver.get(category_url)

        scroll(driver)

        page = driver.page_source

        soup = BeautifulSoup(page, "html.parser")

        for card in soup.find_all('div', attrs={'class': 'sc-ksYbfQ gBVJBH'}):
            brand_suffix = card.find('div').find('a')['href']
            brand_link = BRAND_ROOT_URL + brand_suffix

            # add category to that brand
            if brand_link in STORE_CATEGORY_MAP:
                STORE_CATEGORY_MAP[brand_link].append(c)
            else:
                STORE_CATEGORY_MAP[brand_link] = [c,]



In [180]:
scrape_brand_names()
print(STORE_CATEGORY_MAP)

.goodonyou.eco/brand/liewood': ['knitwear', 'swimwear', 'shoes', 'bags'], 'https://directory.goodonyou.eco/brand/jennifer-fisher': ['knitwear'], 'https://directory.goodonyou.eco/brand/f-wd': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/arch-4': ['knitwear'], 'https://directory.goodonyou.eco/brand/gloria-coelho': ['knitwear'], 'https://directory.goodonyou.eco/brand/fay': ['knitwear'], 'https://directory.goodonyou.eco/brand/mackintosh': ['knitwear', 'suits'], 'https://directory.goodonyou.eco/brand/361-degrees': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/lanidor': ['knitwear'], 'https://directory.goodonyou.eco/brand/ai-riders': ['knitwear'], 'https://directory.goodonyou.eco/brand/fillity': ['knitwear'], 'https://directory.goodonyou.eco/brand/mcm': ['knitwear', 'shoes', 'bags'], 'https://directory.goodonyou.eco/brand/soia-and-kyo': ['knitwear'], 'https://directory.goodonyou.eco/brand/ash': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/the

In [183]:
import csv
with open('brand_list.csv', mode='w') as brand_file:
    fieldnames = ['brand_name', 'brand_url', 'types']

    brand_writer = csv.writer(brand_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for (brand_url, types) in STORE_CATEGORY_MAP.items():
        brand = brand_url.split('/')[-1]
        empty = ", "
        types_string = empty.join(types)
        brand_writer.writerow([brand, brand_url, types_string])


In [184]:
print(len(STORE_CATEGORY_MAP.keys()))

2181
