# Use after prepare_csv

## Fills the artists dataset

In [1]:
import sys
try:
    sys.path.insert(0, "/usr/lib/python3.7/site-packages")
except FileNotFoundError:
    pass

import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from difflib import SequenceMatcher

In [None]:
# Source CSV and the file the enriched dataset is later written to.
inpath = "arts.csv"
outpath = "artsnew.csv"

# Load the dataset and discard the "Unnamed: 0" column
# (presumably the index column saved by an earlier to_csv -- confirm).
df = pd.read_csv(inpath)
df.pop("Unnamed: 0")
df.head(5)

In [5]:
# returns the artist name from a row of the dataset
def get_artist(row: pd.Series) -> str:
    """Return the artist name of ``row`` in the slug form used for searching.

    The name is taken from the first column of the row *by position*
    (``.iloc``), so it works no matter how the columns are labelled.
    Surrounding whitespace is stripped, the text is lower-cased, dots and
    commas are removed and spaces are replaced by hyphens.

    Raises:
        AttributeError: if the cell is not a string (for example NaN).
            ``process`` catches this to skip such rows.
    """
    name = row.iloc[0].strip().lower()
    return name.replace(".", "").replace(",", "").replace(" ", "-")

# returns the artwork name from a row of the dataset
def get_art(row: pd.Series) -> str:
    """Return the artwork title of ``row`` in the slug form used for searching.

    The title is taken from the second column of the row *by position*
    (``.iloc``), so it works no matter how the columns are labelled.
    Surrounding whitespace is stripped, the text is lower-cased, dots and
    commas are removed and spaces are replaced by hyphens.

    Raises:
        AttributeError: if the cell is not a string (for example NaN).
            ``process`` catches this to skip such rows.
    """
    title = row.iloc[1].strip().lower()
    return title.replace(".", "").replace(",", "").replace(" ", "-")

# makes the search URL from artist and artwork names
def url(artist: str, art: str) -> str:
    """Build the Christie's lot-finder search URL for ``artist`` and ``art``.

    The search term is ``"<artist>-<art>"``. It is percent-encoded before
    being placed in the query string, so characters such as ``&``, ``#``,
    ``+`` or ``%`` in a name cannot terminate or corrupt the ``entry``
    parameter. Letters, digits and ``-._~`` are left untouched, so plain
    slugs produce the same URL as before.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    entry = quote("{}-{}".format(artist, art), safe="")
    return ('https://www.christies.com/lotfinder/'
            'searchresults.aspx?sc_lang=eng&lid=1&searchFrom=searchresults'
            '&entry={}&searchtype=p&action=search'.format(entry))

# parses the search results on Christie's, looking for the best-fitting lot
def get_correct_url(s: requests.Session, artist: str, art: str, *,
                    min_artist_ratio: float = 0.7,
                    min_title_ratio: float = 0.3,
                    timeout: float = 30.0):
    """Return the URL of the search result that best matches the artwork.

    The search page for ``artist``/``art`` is downloaded and every element
    with class ``image-container`` is treated as one candidate:

    * the candidate title is the text of its first ``<h3>``; candidates
      without an ``<h3>`` are skipped;
    * the candidate artist is the first ``alt`` attribute found in the
      element, cut at the first ``(``;
    * the candidate link is the first ``href`` attribute found in the element.

    Among candidates whose artist similarity is at least
    ``min_artist_ratio``, the one with the highest title similarity wins
    (``difflib.SequenceMatcher.ratio``).

    Args:
        s: session used for the HTTP request.
        artist: artist slug, as produced by ``get_artist``.
        art: artwork slug, as produced by ``get_art``.
        min_artist_ratio: minimum artist similarity for a candidate.
        min_title_ratio: minimum title similarity for the winning candidate.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The link of the best candidate, or ``None`` when no candidate has a
        link or the best title similarity is below ``min_title_ratio``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    page = s.get(url(artist, art),
                 headers={"User-Agent": UserAgent().random},
                 timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    best_ratio = 0.0
    best_url = ""
    for card in soup.find_all(attrs={"class": "image-container"}):
        heading = card.find("h3")
        if heading is None:
            continue
        title = heading.get_text().strip().lower()

        # Read attributes through the parser instead of searching the
        # serialized HTML: this does not depend on attribute order and the
        # values come back already unescaped ("&amp;" -> "&").
        alt_tag = card if card.has_attr("alt") else card.find(attrs={"alt": True})
        alt_text = alt_tag["alt"] if alt_tag is not None else ""
        # Keep only the part before the first "(" as the artist name.
        candidate_artist = alt_text.split("(", 1)[0].strip().lower()

        title_ratio = SequenceMatcher(None, art, title).ratio()
        artist_ratio = SequenceMatcher(None, artist, candidate_artist).ratio()
        if artist_ratio >= min_artist_ratio and title_ratio > best_ratio:
            link_tag = card if card.has_attr("href") else card.find(attrs={"href": True})
            best_ratio = title_ratio
            best_url = link_tag["href"] if link_tag is not None else ""

    # Reject when nothing usable was found OR the best title match is too
    # weak. The previous "and" made the title threshold ineffective.
    if not best_url or best_ratio < min_title_ratio:
        return None
    return best_url

# when the corresponding URL is found by get_correct_url,
# parses information from the lot page
def parce(s: requests.Session, URL: str, timeout: float = 30.0):
    """Download a lot page and extract its estimate, description and provenance.

    Args:
        s: session used for the HTTP request.
        URL: address of the lot page.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list ``[estimate, description, provenance]`` with the text of the
        elements whose ids are listed below. A field is ``""`` when its
        element is not present on the page.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    page = s.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    # Element ids in the order of the returned fields.
    element_ids = (
        "main_center_0_lblPriceEstimatedPrimary",
        "main_center_0_lblLotDescription",
        "main_center_0_lblLotProvenance",
    )
    fields = []
    for element_id in element_ids:
        element = soup.find(attrs={"id": element_id})
        fields.append(element.get_text() if element is not None else "")
    return fields

# collects the data returned by parce for each row of df[st:en]
# st - start position, en - end position (exclusive)
def process(st: int, en: int, s: requests.Session, df: pd.DataFrame):
    """Scrape estimate, description and provenance for the rows ``df.iloc[st:en]``.

    For every row the artist and artwork slugs are built, the best matching
    lot page is looked up with ``get_correct_url`` and parsed with ``parce``.
    ``df`` itself is not modified.

    Args:
        st: position of the first row to process.
        en: position one past the last row to process.
        s: session used for all HTTP requests.
        df: dataset; the first two columns are read by ``get_artist`` and
            ``get_art``.

    Returns:
        A list with one ``[estimate, description, provenance]`` entry per
        processed row, in row order. The entry is ``["", "", ""]`` when a name
        cell is not a string or when no matching lot was found.
    """
    empty = ["", "", ""]
    results = []
    for i, row in df.iloc[st:en].iterrows():
        # Single-line progress indicator: blank the line, then rewrite it.
        # ``i`` is the row's index label.
        print("\r                     ", end="")
        print("\r" + str(i) + "/" + "[" + str(st) + ":" + str(en) + "]", end="")
        try:
            artist = get_artist(row)
            art = get_art(row)
        except AttributeError:
            # Non-string cell (e.g. NaN): nothing to search for.
            results.append(list(empty))
            continue
        URL = get_correct_url(s, artist, art)
        if URL is not None:
            results.append(parce(s, URL))
        else:
            results.append(list(empty))
    return results

#### Processing the dataset takes a long time, so we split it into parts and process it in blocks of 50 rows

In [None]:
# One session for all requests; failed connection attempts are retried
# up to 3 times with a backoff factor of 0.5.
s = requests.Session()
s.mount("https://", HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5)))
# Initial request whose response is not used afterwards.
# NOTE(review): presumably primes the session (e.g. cookies) -- confirm it is needed.
page = s.get(url(get_artist(df.iloc[1]), get_art(df.iloc[1])),
             headers={"User-Agent": UserAgent().random}, timeout=30)

# Columns filled by the scraper, in the order of the values returned by parce.
result_columns = ["Estimate", "Description", "Provenance"]
for column in result_columns:
    df[column] = ""

batch_size = 50  # rows scraped between two saves
n_batches = 20   # at most batch_size * n_batches rows are processed

for i in range(n_batches):
    lo, hi = batch_size * i, batch_size * (i + 1)
    arr = process(lo, hi, s, df)
    if not arr:
        # Past the end of the dataset: nothing left to do.
        break
    for k, column in enumerate(result_columns):
        # Assign through df.iloc in one step. Chained assignment
        # (df[column][lo:hi] = ...) may write to a temporary copy and leave
        # df unchanged.
        df.iloc[lo:hi, df.columns.get_loc(column)] = [x[k] for x in arr]
    # Save after every batch so that an interruption loses at most one batch.
    df.to_csv(outpath)
    