In [1]:
import requests
from bs4 import BeautifulSoup
import pandas
from selenium import webdriver
import time
import csv


In [2]:
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(os.path.join(sys.path[0], "chromedriver"), options=options)


In [3]:
# constants here
CATEGORIES = ['activewear', 'tops', 'bottoms', 'denim', 'dresses', 'knitwear', 'outerwear', 'suits', 'basics', 'sleepwear', 'swimwear', 'maternity', 'plus-size', 'shoes', 'bags', 'accessories']

CATEGORY_ROOT_URL = 'https://directory.goodonyou.eco/categories/'

BRAND_ROOT_URL = 'https://directory.goodonyou.eco'

In [4]:
# brand page specifics
# maps field name to ['div type', 'class name'], #example text
BRAND_SCRAPING_MAP = {
    'name': ['h1', 'sc-kjoXOD hxRXn StyledHeading-sc-1rdh4aw-0 iadRcX'], # 'Article 22'
    'rating': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Rated: Good'
    'price': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Price : $$'
    'location': ['span', 'StyledText-sc-1sadyjn-0 bBUTWf'], # 'Location: Australia'
    'planet': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # '5 out of 5'
    'people': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # '4 out of 5', 'Not applicable'
    'animals': ['span', 'StyledText-sc-1sadyjn-0 bVvIwM'], # same as above
    'description': ['div', 'sc-kgAjT kqcGSN'], # all text comes in chunks of <p>'s
}

In [5]:
# if a store has already been scraped, just add the category to this map and do not "rescrape" to save time
# we assume there is only one unique page per brand

STORE_CATEGORY_MAP = {}

In [6]:
def scroll(driver, timeout=2):
    scroll_pause_time = timeout

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(scroll_pause_time)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If heights are the same it will exit the function
            break
        last_height = new_height

# first scrape all of the brand names
def scrape_brand_names():
    for c in CATEGORIES:
        category_url = CATEGORY_ROOT_URL + c
        print(category_url)

        driver.get(category_url)

        scroll(driver)

        page = driver.page_source

        soup = BeautifulSoup(page, "html.parser")

        for card in soup.find_all('div', attrs={'class': 'sc-ksYbfQ gBVJBH'}):
            brand_suffix = card.find('div').find('a')['href']
            brand_link = BRAND_ROOT_URL + brand_suffix

            # add category to that brand
            if brand_link in STORE_CATEGORY_MAP:
                STORE_CATEGORY_MAP[brand_link].append(c)
            else:
                STORE_CATEGORY_MAP[brand_link] = [c,]



In [7]:
scrape_brand_names()
print(STORE_CATEGORY_MAP)

https://directory.goodonyou.eco/categories/activewear


KeyboardInterrupt: 

In [186]:
# write the brand list
# with open('brand_list_2.csv', mode='w') as brand_file:
#     fieldnames = ['brand_name', 'brand_url', 'types']

#     brand_writer = csv.writer(brand_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     brand_writer.writerow(fieldnames)
#     for (brand_url, types) in STORE_CATEGORY_MAP.items():
#         
#         brand = brand_url.split('/')[-1]
#         empty = ", "
#         types_string = empty.join(types)
#         brand_writer.writerow([brand, brand_url, types_string])


In [12]:
with open('brand_list.csv', mode='r') as brand_file:
    brand_reader = csv.DictReader(brand_file)
    line_count = 0
    for row in brand_reader:
        print(row)
        if line_count != 0:
            brand_url = row['brand_url']
            if brand_url not in STORE_CATEGORY_MAP:
                STORE_CATEGORY_MAP[brand_url] = row['types'].split(", ")
        line_count += 1

print(STORE_CATEGORY_MAP)

.goodonyou.eco/brand/liewood': ['knitwear', 'swimwear', 'shoes', 'bags'], 'https://directory.goodonyou.eco/brand/jennifer-fisher': ['knitwear'], 'https://directory.goodonyou.eco/brand/f-wd': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/arch-4': ['knitwear'], 'https://directory.goodonyou.eco/brand/gloria-coelho': ['knitwear'], 'https://directory.goodonyou.eco/brand/fay': ['knitwear'], 'https://directory.goodonyou.eco/brand/mackintosh': ['knitwear', 'suits'], 'https://directory.goodonyou.eco/brand/361-degrees': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/lanidor': ['knitwear'], 'https://directory.goodonyou.eco/brand/ai-riders': ['knitwear'], 'https://directory.goodonyou.eco/brand/fillity': ['knitwear'], 'https://directory.goodonyou.eco/brand/mcm': ['knitwear', 'shoes', 'bags'], 'https://directory.goodonyou.eco/brand/soia-and-kyo': ['knitwear'], 'https://directory.goodonyou.eco/brand/ash': ['knitwear', 'shoes'], 'https://directory.goodonyou.eco/brand/the

In [42]:
# now scrape the brand pages themselves

def scrape_brand_info(soup):
    # ====== NAME ======= 
    try:
        name = soup.find('div', attrs={'class': 'heABzf'}).find('h1').text
    except:
        name = 'null'

    # ====== RATING + PRICE =======
    try:
        top_info = soup.find('div', attrs={'class': 'jtgCKU'}).find('div', attrs={'class': 'IRSNj'}).find('div', attrs={'class': 'IRSNj'})

        # ====== RATING ======
        try:
            r = top_info.find_all('span')[0].text
            rating = r.split(": ")[1]
        except:
            rating = 'null'
        
        # ====== PRICE ======
        try:
            p = top_info.find_all('span')[1].text
            price = p.split(" : ")[1]
        except:
            price = 'null'

    except:
        rating, price = 'null', 'null'

    # ====== LOCATION + SHIPS TO =======
    try:
        middle_info = soup.find('div', attrs={'class': 'gYNhOv'}).find('div')

        # ====== LOCATION ======
        try:
            l = middle_info.find_all('span', attrs={'class': 'bBUTWf'})[0].text
            location = l.split(": ")[1]
        except:
            location = 'null'

        # ====== SHIPS TO ======
        try:
            s = middle_info.find_all('span', attrs={'class': 'bBUTWf'})[1].text
            ships = s.split(": ")[1]
        except:
            ships = 'null'
    
    except:
        location, ships = 'null', 'null'

    # ====== DESCRIPTION =======
    try:
        description = soup.find('h4', attrs={'class': 'sc-gisBJw cLYslc StyledHeading-sc-1rdh4aw-0 cTTfWw'}).text
    
    except:
        description = 'null'

    # ===== SUB RATINGS ========
    try:
        pl, pe, an = [info.text.lower() for info in soup.find_all('span', attrs={'class': 'StyledText-sc-1sadyjn-0 bVvIwM'})][:3]
        
        planet = pl[0] if pl[0] in [str(val) for val in range(6)] else 'null'
        people = pe[0] if pe[0] in [str(val) for val in range(6)] else 'null'
        animals = an[0] if an[0] in [str(val) for val in range(6)] else 'null'
    
    except:
        planet, people, animals = 'null', 'null', 'null'

    # ====== EXPLANATION, LAST UPDATED =======

    try:
        # ====== EXPLANATION =======
        explain_info = soup.find('div', attrs={'class':'sc-kgAjT kqcGSN'})
        sep = " "
        explanation = sep.join([paragraph.text for paragraph in explain_info.find_all('p')][:-1])

        # ====== LAST UPDATED ======
        updated = explain_info.find_all('p')[-1].find('i').text.split('Last Updated: ')[1]

        try:
            updated_month, updated_year = updated.split()
        except:
            updated_month, updated_year = 'null', 'null'

    except:
        explanation, updated, updated_month, updated_year = 'null', 'null', 'null', 'null'

    brand_info = {
        'name': name,
        'rating': rating,
        'price': price,
        'location': location,
        'ships': ships,
        'description': description,
        'planet': planet,
        'people': people,
        'animals': animals,
        'explanation': explanation,
        'updated': updated,
        'updated_month': updated_month,
        'updated_year': updated_year
    }

    return brand_info

def scrape_brand_page(brand_url, types):
    # scrapes all of the info from the brand's html page
    print(brand_url)

    # get page source
    driver.get(brand_url)
    scroll(driver)
    brand_page = driver.page_source
    soup = BeautifulSoup(brand_page, "html.parser")

    brand_info = scrape_brand_info(soup)

    # add brand_url to dict
    brand_info['brand_url'] = brand_url

    # convert types into a string and add to dict
    sep = ", "
    brand_info['types'] = sep.join(types)

    return brand_info

def scrape_all_brand_pages():

    with open('brand_page_info.csv', mode='w') as csv_file:
        fieldnames = ['brand_url', 'types', 'name', 'rating', 'price', 'location', 'ships', 'description', 'planet', 'people', 'animals', 'explanation', 'updated', 'updated_month', 'updated_year']

        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()

        for (brand_url, types) in STORE_CATEGORY_MAP.items():
            brand_info = scrape_brand_page(brand_url, types)

            writer.writerow(brand_info)

In [43]:
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(os.path.join(sys.path[0], "chromedriver"), options=options)

scrape_all_brand_pages()

https://directory.goodonyou.eco/brand/bleed
https://directory.goodonyou.eco/brand/bhumi
https://directory.goodonyou.eco/brand/dharma-bums
https://directory.goodonyou.eco/brand/nube
https://directory.goodonyou.eco/brand/paapii-design
https://directory.goodonyou.eco/brand/living-crafts
https://directory.goodonyou.eco/brand/girlfriend-collective
https://directory.goodonyou.eco/brand/organic-basics
https://directory.goodonyou.eco/brand/elle-evans
https://directory.goodonyou.eco/brand/casagin
https://directory.goodonyou.eco/brand/nu-in
https://directory.goodonyou.eco/brand/purusha-people
https://directory.goodonyou.eco/brand/sage-larock
https://directory.goodonyou.eco/brand/ambiletics
https://directory.goodonyou.eco/brand/silou
https://directory.goodonyou.eco/brand/finisterre
https://directory.goodonyou.eco/brand/presca
https://directory.goodonyou.eco/brand/boob
https://directory.goodonyou.eco/brand/houdini
https://directory.goodonyou.eco/brand/dk-active
https://directory.goodonyou.eco/bran

UnboundLocalError: local variable 'updated_month' referenced before assignment