# Import

In [19]:
import argparse
import csv
import os
import re
import sys
import time
import uuid
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser Settings

In [12]:
# Location of the chromedriver binary, relative to the notebook's working directory.
driver_path = './chromedriver-mac-x64/chromedriver'

options = webdriver.ChromeOptions()
options.add_argument('disable-gpu')
options.add_argument('--log-level=3')  # reduce Chrome's console logging
# options.add_argument('headless')  # uncomment to run without a visible browser window

# Drive Chrome with the Chrome-specific Service class. The bare `Service` name
# imported at the top of the notebook is the Edge one and is meant for msedgedriver.
driver = webdriver.Chrome(service=ChromeService(executable_path=driver_path), options=options)
# driver.set_window_size(1800, 1400)  # optional: fixed window size

# Visiting and Crawling

## Get Artists

In [20]:
def get_artist_page():
    """Scrape the artist listing currently open in the browser and save it as CSV.

    Every child of the ``ArtistsByLetter`` container on the page loaded in the
    module-level ``driver`` becomes one row with a freshly generated GUID, the
    element's text as name and the ``href`` of its first child as link. The
    rows are written to ``./artists/Artist-<letter>-<page>.csv`` with the
    columns ``Guid``, ``Name``, ``Link``; a page without artists produces a
    header-only file.

    Returns:
        tuple[str, int]: the last space-separated token of the element that
        precedes the breadcrumb (the letter), and the page number taken from
        the ``page`` query parameter of the current URL (1 when absent).
    """
    # Readiness condition: wait (up to 10 s) until the XPath below matches
    # fewer than two elements.
    WebDriverWait(driver, 10).until(
        lambda driver: len(driver.find_elements('xpath', '//*[@id="main"]/div/div[3]/*')) < 2
    )
    container = driver.find_element('css selector', '[class*=ArtistsByLetter]')

    # Collect plain records and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    records = []
    for artist in container.find_elements('xpath', '*'):
        link = artist.find_element('xpath', '*').get_attribute('href')
        records.append({
            'Guid': uuid.uuid4().hex,
            'Name': artist.text,
            'Link': link,
        })
    # Passing `columns` keeps the schema even when `records` is empty.
    artists_data = pd.DataFrame(records, columns=['Guid', 'Name', 'Link']).set_index('Guid')

    breadcrumb = driver.find_element('css selector', '[aria-label="Breadcrumb"]')
    letter = breadcrumb.find_element('xpath', './preceding::*[1]').text.split(' ')[-1]

    # Read the `page` parameter explicitly so other query parameters cannot be
    # mistaken for the page number.
    page_match = re.search(r'[?&]page=(\d+)', driver.current_url)
    page = page_match.group(1) if page_match else '1'

    artist_path = './artists'
    os.makedirs(artist_path, exist_ok=True)
    artists_data.to_csv(os.path.join(artist_path, 'Artist-{}-{}.csv'.format(letter, page)))
    print('Artists list obtained with initial {} on page-{}'.format(letter, page))
    return letter, int(page)
    
def click_next_page():
    """Advance the listing to its next page when one is available.

    Looks at the last child of the ``Pagination`` element on the page loaded in
    the module-level ``driver``. When that child carries an ``href`` it is
    clicked, the function pauses for one second and returns ``True``;
    otherwise a notice is printed and ``False`` is returned.
    """
    nav = driver.find_element('css selector', '[aria-label="Pagination"]')
    last_control = nav.find_elements('xpath', '*')[-1]

    # No href on the last control: nothing further to navigate to.
    if not last_control.get_attribute('href'):
        print('No more pages in current letter.')
        return False

    last_control.click()
    time.sleep(1)
    return True

In [21]:
# Crawl parameters: which letter to list, and the (inclusive) page range to save.
starting_letter = 'a'
starting_page = 1
stopping_page = 3
url = 'https://www.artsy.net/artists/artists-starting-with-{}?page={}'.format(starting_letter, starting_page)
driver.get(url)
while True:
    letter, page = get_artist_page()
    # Test the page limit first: once the last wanted page is saved there is no
    # reason to click through to (and load) a page that will never be scraped.
    if page >= stopping_page or not click_next_page():
        break

Artists list obtained with initial A on page-1
Artists list obtained with initial A on page-2
Artists list obtained with initial A on page-3


## Get Artworks & Artist Info


In [22]:
def get_artist_info(artist_link):
    """Open an artist page and read the artist's name, info line and artwork count.

    Args:
        artist_link: URL of the artist page; it is loaded in the module-level
            ``driver``, which stays on that page when the function returns.

    Returns:
        tuple: ``(artist_name, artist_info, art_num)``. ``artist_info`` is
        ``None`` when the header block has a single child or no text after the
        first line. ``art_num`` is 0 when a ``Message__Container`` element is
        present, otherwise the number of ``artworkGridItem`` elements on the page.
    """
    driver.get(artist_link)
    artist_block = driver.find_element('xpath', '//*[@id="main"]/div/div[2]/div/div[1]/div/div[2]')
    block_text = artist_block.text

    if len(artist_block.find_elements('xpath', '*')) > 1:
        # Split on the first newline only: a strict two-way unpack of
        # split('\n') raises ValueError whenever the block text does not
        # consist of exactly two lines.
        artist_name, _, remainder = block_text.partition('\n')
        artist_info = remainder or None
    else:
        artist_name = block_text
        artist_info = None

    if driver.find_elements('css selector', '[class*="Message__Container"]'):
        art_num = 0
    else:
        # TODO (kept from the original author): find fresnel-at-xs &
        # fresnel-greaterThan-xs. Presumably the grid is rendered once per
        # breakpoint and items may be counted twice; confirm against the page.
        art_num = len(driver.find_elements('css selector', '[data-test="artworkGridItem"]'))
    return artist_name, artist_info, art_num

def get_art_data(art_link, artist):
    """Open an artwork page and return its details as one record.

    Args:
        art_link: URL of the artwork page, loaded in the module-level ``driver``.
        artist: value stored unchanged under the ``Author`` key.

    Returns:
        dict: keys ``Id`` (fresh GUID), ``Title`` (text of the ``<i>`` inside
        the sidebar ``<h1>``), ``Author``, ``Locale`` (last ``', '``-separated
        piece of the ``<h1>`` text), ``Link``, ``Price`` (first ``SaleMessage``
        text, or ``''``) and ``Description`` (text of the first
        ``ReadMore__Container`` after clicking it, or ``''``), in that order.
    """
    driver.get(art_link)
    sidebar = driver.find_element('css selector', '[data-test="artworkSidebar"]')

    title = sidebar.find_element('xpath', './h1/i').text
    heading_tail = sidebar.find_element('xpath', './h1').text.split(', ')[-1]

    sale_nodes = sidebar.find_elements('css selector', '[data-test="SaleMessage"]')
    price = sale_nodes[0].text if sale_nodes else ''

    # Click the collapsible description, when present, before reading its text.
    description = ''
    read_more = driver.find_elements('css selector', '[class*="ReadMore__Container"]')
    if read_more:
        expander = read_more[0]
        expander.click()
        description = expander.text

    # Key order matters: callers use it as the CSV column order.
    return {
        'Id': uuid.uuid4().hex,
        'Title': title,
        'Author': artist,
        'Locale': heading_tail,
        'Link': art_link,
        'Price': price,
        'Description': description,
    }

In [23]:
# Start each output file afresh so that it contains nothing but its header row.
# Opening in 'w' mode truncates whatever an earlier run left behind.
for csv_name, columns in (
    ('CMOA_artist.csv', ['Id','Full Name','Webpage','Number of Arts','Description']),
    ('CMOA_asset.csv', ['Id','Title','Author','Locale','Link','Price','Description']),
    ('CMOA_relationship.csv', ['Id','AuthorId','Art_id']),
):
    with open(csv_name, 'w', newline='') as handle:
        csv.DictWriter(handle, fieldnames=columns).writeheader()

In [24]:
# Column layouts of the three output files. They are spelled out here (rather
# than taken from each dict's key order) so that appended rows always line up
# with the header rows of the CMOA_*.csv files, and so that an unexpected key
# raises instead of silently shifting columns.
ARTIST_FIELDS = ['Id', 'Full Name', 'Webpage', 'Number of Arts', 'Description']
ASSET_FIELDS = ['Id', 'Title', 'Author', 'Locale', 'Link', 'Price', 'Description']
RELATIONSHIP_FIELDS = ['Id', 'AuthorId', 'Art_id']


def _append_csv_row(csv_path, fieldnames, row):
    """Append one dict as a row of ``csv_path`` using the given column order."""
    # The file is reopened per row so that everything scraped so far is on disk
    # if the long-running crawl is interrupted. UTF-8 is set explicitly because
    # scraped names and descriptions are not limited to the platform encoding.
    with open(csv_path, 'a', newline='', encoding='utf-8') as out_file:
        csv.DictWriter(out_file, fieldnames=fieldnames).writerow(row)


# sorted(): glob yields files in no guaranteed order; sorting makes runs repeatable.
for artist_csv in sorted(Path('./artists/').glob('*.csv')):
    artists = pd.read_csv(artist_csv)
    for _, artist_row in artists.iterrows():
        # Access by column name instead of relying on the column order of the file.
        artist_id = artist_row['Guid']
        artist_name = artist_row['Name']
        artist_link = artist_row['Link']
        _, artist_info, art_num = get_artist_info(artist_link)

        _append_csv_row('CMOA_artist.csv', ARTIST_FIELDS, {
            'Id': artist_id,
            'Full Name': artist_name,
            'Webpage': artist_link,
            'Number of Arts': art_num,
            'Description': artist_info,
        })
        if art_num == 0:
            continue

        # get_artist_info leaves the browser on the artist page; collect every
        # artwork link now, because get_art_data navigates away from it.
        assert driver.current_url == artist_link, 'browser is not on the artist page'
        art_items = driver.find_elements('css selector', '[data-test="artworkGridItem"]')
        art_links = [item.find_element('xpath', './a').get_attribute('href') for item in art_items]
        assert len(art_links) == art_num, 'artwork count changed since get_artist_info'

        for art_link in art_links:
            art_data = get_art_data(art_link, artist_name)
            _append_csv_row('CMOA_asset.csv', ASSET_FIELDS, art_data)
            _append_csv_row('CMOA_relationship.csv', RELATIONSHIP_FIELDS, {
                'Id': uuid.uuid4().hex,
                'AuthorId': artist_id,
                'Art_id': art_data['Id'],
            })

KeyboardInterrupt: 