# Library

In [1]:
import logging
import requests
from bs4 import BeautifulSoup
from urllib.error import HTTPError

import http.client as httplib  # or http.client if you're on Python 3 # httplib
httplib._MAXHEADERS = 10000
# ChunkedEncodingError

import time
import datetime
from datetime import datetime, timedelta

import re

import json

import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import sys

import random 
from random import sample

In [2]:
# pathes
art_project_path = '/content/drive/MyDrive/Art/Art_Project'
art_project_path_backups = '/content/drive/MyDrive/Art/Art_Project/parsing_backups'
art_project_path_tests = '/content/drive/MyDrive/Art/Art_Project/parsing_tests'

url_add = 'https://www.artsy.net'

# files
cat_dict = '/artsy_genom_cat_dict.txt'
gen_dict = '/artsy_genom_gen_dict.txt'
artwork_dict = '/artsy_genom_artwork_dict.txt'
artist_dict = '/artsy_genom_artist_dict.txt'
temp_data = '/artsy_temp_data.csv'

# id length
cat_id_len = 2
gen_id_len = 5
artist_id_len = 5

# for addition and updating
recently_added_pref = '?sort=-published_at'
recently_updated_pref = '?sort=-partner_updated_at'

# Functions

## Openings and savings

In [3]:
# 1. Save json dict
def save_dict(tree_dict, file, folder):
  if folder=='live':
    with open(art_project_path + file, 'w') as outfile:
      json.dump(tree_dict, outfile)
  elif folder=='backup':
    with open(art_project_path_backups + file, 'w') as outfile:
      json.dump(tree_dict, outfile)

# 2. Open json dict
def open_dict(file, folder):
  if folder=='live':
    with open(art_project_path + file, 'rb') as infile:
      tree_dict = json.load(infile)
  elif folder=='backup':
    with open(art_project_path_backups + file, 'rb') as infile:
      tree_dict = json.load(infile)
  return tree_dict

# 3. Open file.csv
def open_file_csv(file, folder): # folder: live, backup or test
  if folder=='live':
    df = pd.read_csv(art_project_path + file)
  elif folder=='backup':
    df = pd.read_csv(art_project_path_backups + file)
  elif folder=='test':
    df = pd.read_csv(art_project_path_tests + file)
  return df

# 4. Save df to csv file
def save_df_file(df, file, folder): # folder: live, backup or test
  if folder=='live':
    df.to_csv(art_project_path + file, index=False)
  elif folder=='backup':
    df.to_csv(art_project_path_backups + file, index=False)
  elif folder=='test':
    df.to_csv(art_project_path_tests + file, index=False)


## Parsing

### get_page

In [4]:
# 5. get_page
logging.basicConfig(level=logging.DEBUG, format="%(message)s")

def get_page(url, session):
  #response = requests.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
  try:
    response = session.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
  except: # requests.exceptions.ChunkedEncodingError:
    time.sleep(0.5)
    session = requests.Session()
    response = session.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})

  try:
    response.raise_for_status()
  except requests.exceptions.HTTPError: # for correction
    sys.exit(print('HTTPError: ' + str(url)))

  bs_page = BeautifulSoup(response.text, 'html.parser')
  return bs_page

### get next pages links list

In [5]:
# 6. get next pages links list
def get_next_page_list(url, page):
  try:
    last_page_data = page.find('main').find_all('nav', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 ibHUpM gzwvJD')[0].find_all('a')[-2]
    last_page = last_page_data.get_text()
    page_link = last_page_data.get('href')[:-len(last_page)]
    next_page_link_list_new = [url + page_link + str(p) for p in range(2, int(last_page)+1)]
  except:
    next_page_link_list_new = []

  return next_page_link_list_new

### get data from pages

In [6]:
# 7. get data from pages
def get_page_data(page):
  code_block = page.find('main').find_all('div', class_='ArtworkGrid-sc-1jsqquq-0 djwrUe')
  try:
    object_block = code_block[0].find_all('div', relay='[object Object]')
  
    # for the beggining into a DataFrame
    artsy_id_list = [obj.get('data-id') for obj in object_block]
    imgage_list = [obj.find_all('a')[0].find('img').get('src') for obj in object_block]
    artist_list = [obj.find_all('a')[1].find_all('div')[0].get_text() for obj in object_block]
  
    gallery_list = []
    for obj in object_block:
      try:
        gallery_list.append(obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn hENCPo').get_text())
      except:
        gallery_list.append(np.nan)
  
    price_list = []
    for obj in object_block:
      try:
        price_list.append(obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 eXbAnU jkuGdd').get_text())
      except:
        price_list.append(np.nan)
  
    # can be no year info !!!
    name_list = []
    year_list = []
    for obj in object_block:
      text = obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn kySEpG').get_text()
      name = obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn kySEpG').find('i').get_text()
      year = text.replace(name, '').replace(', ', '')
      if len(year)==0:
        year = np.nan
      name_list.append(name)
      year_list.append(year)

    df_data_new = pd.DataFrame({'artsy_id': artsy_id_list,
                                'name': name_list, 
                                'link': imgage_list, 
                                'year': year_list,
                                'artist': artist_list,
                                'gallery': gallery_list,
                                'price': price_list
                                })
  except:
    df_data_new = pd.DataFrame({'artsy_id': np.nan,
                                'name': np.nan, 
                                'link': np.nan, 
                                'year': np.nan,
                                'artist': np.nan,
                                'gallery': np.nan,
                                'price': np.nan
                                }, index=[0])
  
  return df_data_new

### get category data into df - aggregator

In [7]:
# Function aggregator
# 8. Get the all category data into DataFrame
def get_df_data(df_input, genes_links_list, genes_ids_list, cat):
  start_time = time.time()
  print(datetime.now().time())
  df_output = df_input.copy()
  df_output = df_output[0:0]

  for link in genes_links_list:
    session = requests.Session()
    total_link = url_add + link
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has started')
    # 5. get_page
    gene_page = get_page(total_link, session)
    #           get_page(url, session) -> 
    # -> bs_page
    
    # 6. get next pages links list
    next_page_link_list = get_next_page_list(url_add, gene_page)
    #                     get_next_page_list(url, page) -> 
    # -> next_page_link_list_new
    print(len(next_page_link_list)+1, 'pages for parsing')

    # 7. get data from pages
    df_data = get_page_data(gene_page)
    #         get_page_data(page) -> 
    # -> df_data_new
    df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
    df_data['category_id'] = cat
    df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')

    df_output = pd.concat([df_output, df_data])

    # Get data from next pages
    if len(next_page_link_list) > 0:
      for page_link in next_page_link_list:
        # 5. get_page
        gene_page = get_page(page_link, session)
        #           get_page(url, session) -> 
        # -> bs_page

        # 7. get data from pages
        df_data = get_page_data(gene_page)
        #         get_page_data(page) -> 
        # -> df_data_new
        df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
        df_data['category_id'] = cat
        df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
        df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')

        df_output = pd.concat([df_output, df_data])
      
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has finished')
    print((time.time() - start_time), 'seconds have passed')
    print(datetime.now().time())
    print('')
    time.sleep(0.5)

  return df_output

In [8]:
# Function aggregator
# 8.1 Get the first pages category data into DataFrame
def get_df_data_add_up(df_input, genes_links_list, genes_ids_list, cat):
  start_time = time.time()
  print(datetime.now().time())
  df_output = df_input.copy()
  df_output['pages_count'] = np.nan
  df_output['second_page_link'] = np.nan
  df_output = df_output[0:0]

  for link in genes_links_list:
    session = requests.Session()
    total_link = url_add + link
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has started')
    # 5. get_page
    gene_page = get_page(total_link, session)
    #           get_page(url, session) -> 
    # -> bs_page
    
    # 6. get next pages links list
    next_page_link_list = get_next_page_list(url_add, gene_page)
    #                     get_next_page_list(url, page) -> 
    # -> next_page_link_list_new
    print(len(next_page_link_list)+1, 'pages for parsing')

    # 7. get data from pages
    df_data = get_page_data(gene_page)
    #         get_page_data(page) -> 
    # -> df_data_new
    df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
    df_data['category_id'] = cat
    df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['pages_count'] = len(next_page_link_list)+1
    if len(next_page_link_list) > 0:
      df_data['second_page_link'] = next_page_link_list[0]
    else:
      df_data['second_page_link'] = np.nan

    df_output = pd.concat([df_output, df_data])
      
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has finished')
    print((time.time() - start_time), 'seconds have passed')
    print(datetime.now().time())
    print('')
    time.sleep(0.5)

  return df_output

### correct image link

In [9]:
# 9. correct image link
def img_link_corr(img):
  parts = img.split('.jpg')[0].replace('?', '/').replace('%2F', '/').split('/')
  img_new = parts[0] + '//' + parts[5] + '/' + parts[6] + '/' + parts[7]  + '.jpg'
  return img_new

### filtering

In [10]:
# 10. Filtering
def get_cat_id(cat):
  return cat

def gene_links_samples(gen_key):
  cat_id = get_cat_id(cat)
  return genes_dict[gen_key]['category_id'] == cat_id

# Gens dict

In [None]:
url = 'https://www.artsy.net/categories'

In [None]:
response = requests.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
genome_page = BeautifulSoup(response.text, 'html.parser')
genome_page.title.string

'Artsy — Discover, Buy, and Sell Fine Art'

## categories_dict

In [None]:
# !!!!! For the first time !!!!!
confirmation = input('Confirm the categories_dict re/seting (y/n)')
if confirmation == 'y':
  categories_dict = {}
  print('categories_dict has been re/set {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict re/seting (y/n)y
categories_dict has been re/set 2022-03-30 06:01
dictionary's length is - 0


In [None]:
# !!!!! All next times !!!!!
confirmation = input('Confirm the categories_dict opening (y/n)')
if confirmation == 'y':
  categories_dict = open_dict(cat_dict) # 2. Open json dict
  print('categories_dict has been opened {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict opening (y/n)n
nothing has happened


In [None]:
categories_block = genome_page.find('div', class_='sticky-inner-wrapper').find_all('div', class_='Box-sc-15se88d-0 Flex-cw39ct-0 elJsUD')

#categories_dict = {'artistic_disciplines': {'id': '02', 'name': 'Artistic Disciplines', 'link': 'jump--artistic-disciplines'}}
id = len(categories_dict) + 1
for cat in categories_block:
  key = cat.find('a').get('href').replace('#jump--', '').replace('-', '_')
  if key not in categories_dict.keys():
    categories_dict.update({key: {'id': str(id) if len(str(id))>1 else '0'+str(id), 'name': cat.get_text(), 'link': cat.find('a').get('href').replace('#', ''), 'genes_id': [], 'parsing_date': datetime.now().date().strftime('%Y-%m-%d'), 'updating_date': datetime.now().date().strftime('%Y-%m-%d')}})
    id += 1
    print('New key has been added - {}'.format(key))
  else:
    if categories_dict[key]['link'] != cat.find('a').get('href').replace('#', '') or categories_dict[key]['name'] != cat.get_text():
      print('an Old name - {}\nor an Old link - {}'.format(categories_dict[key]['name'], categories_dict[key]['link']))
      categories_dict[key].update({'link': cat.find('a').get('href').replace('#', '')})
      categories_dict[key].update({'name': cat.get_text()})
      categories_dict[key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})
      print('has been updated with \na New name - {}\nor a New link - {}'.format(categories_dict[key]['name'], categories_dict[key]['link']))
      print('')

New key has been added - artistic_disciplines
New key has been added - subject_matter
New key has been added - styles_and_movements
New key has been added - design_movements
New key has been added - visual_qualities
New key has been added - medium_and_techniques
New key has been added - time_periods
New key has been added - geographic_regions
New key has been added - materials
New key has been added - design_concepts_and_techniques
New key has been added - cultural_and_religious_styles
New key has been added - furniture_and_lighting
New key has been added - textiles
New key has been added - jewelry_and_fashion_object_types
New key has been added - tableware_vessels_and_objects
New key has been added - antiquities_artifacts_and_religious_objects


In [None]:
len(categories_dict)

16

In [None]:
categories_dict['artistic_disciplines']

{'genes_id': [],
 'id': '01',
 'link': 'jump--artistic-disciplines',
 'name': 'Artistic Disciplines',
 'parsing_date': '2022-03-30',
 'updating_date': '2022-03-30'}

In [None]:
confirmation = input('Confirm the categories_dict saving (y/n)')
if confirmation == 'y':
  save_dict(categories_dict, cat_dict) # 1. Save json dict
  print('categories_dict has been saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict saving (y/n)y
categories_dict has been saved 2022-03-30 06:02
dictionary's length is - 16


## genes_dict

In [None]:
# !!!!! For the first time !!!!!
confirmation = input('Confirm the genes_dict re/seting (y/n)')
if confirmation == 'y':
  genes_dict = {}
  print('genes_dict has been re/set {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict re/seting (y/n)y
genes_dict was re/set


In [None]:
# !!!!! All next times !!!!!
confirmation = input('Confirm the genes_dict opening (y/n)')
if confirmation == 'y':
  genes_dict = open_dict(gen_dict) # 2. Open json dict
  print('genes_dict has been opened {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict opening (y/n)y
genes_dict has been opened 2022-03-30 04:12
dictionary's length is - 1032


In [None]:
# !!!!! All next times !!!!!
# Determining the count of genes in the category for creating an id (using () instead of [] is also possible)
count_list = [sum([1 for v in genes_dict.values() if v['category_id']==value['id']]) for value in categories_dict.values()]
id_len = len(str(max(count_list)))
print(id_len)
print(count_list)

3
[7, 213, 252, 26, 75, 175, 20, 68, 39, 22, 69, 27, 6, 7, 19, 7]


In [None]:
#genes_dict = {'architecture_1': {'id': '0201', 'name': 'Architecture', 'link': '/gene/architecture-1', 'category_id': '02'}}
# !!! working link is 'https://www.artsy.net' + link !!!
for value in categories_dict.values():
  category = genome_page.find('div', id=value['link']).find('h2').text
  if category == value['name']:
    count = sum([1 for v in genes_dict.values() if v['category_id'] == value['id']])
    id = count+1
    gens_block = genome_page.find('div', id=value['link']).find_all('a')
    for a in gens_block:
      gen_key = a.get('href').replace('/gene/', '').replace('-', '_')
      if gen_key not in genes_dict.keys():
        genes_dict.update({gen_key: {'id': value['id']+('0'*(id_len-len(str(id)))+str(id)), 'name': a.get_text(), 'link': a.get('href'), 'category_id': value['id'], 'artsy_id': [], 'artist_id': [], 'parsing_date': datetime.now().date().strftime('%Y-%m-%d'), 'updating_date': datetime.now().date().strftime('%Y-%m-%d')}})
        id += 1
        print('New key has been added - {}'.format(gen_key)) # !!! Not for the first time
      else:
        if genes_dict[gen_key]['link'] != a.get('href') or genes_dict[gen_key]['name'] != a.get_text():
          print('an Old name - {}\nor an Old link - {}'.format(genes_dict[gen_key]['name'], genes_dict[gen_key]['link']))
          genes_dict[gen_key].update({'link': a.get('href')})
          genes_dict[gen_key].update({'name': a.get_text()})
          genes_dict[gen_key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})
          print('has been updated with \na New name - {}\nor a New link - {}'.format(genes_dict[gen_key]['name'], genes_dict[gen_key]['link']))
          print('')
  else:
    print("Category name doesn't match")

In [None]:
len(genes_dict)

1032

In [None]:
for value in genes_dict.values():
  if value['category_id'] == '01': # '02'
    print(value)

{'id': '01001', 'name': 'Architecture', 'link': '/gene/architecture-1', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01002', 'name': 'Design', 'link': '/gene/design', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01003', 'name': 'Fashion Design and Wearable Art', 'link': '/gene/fashion-design-and-wearable-art', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01004', 'name': 'Graphic Design', 'link': '/gene/graphic-design', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01005', 'name': 'Jewelry', 'link': '/gene/jewelry', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01006', 'name': 'Music', 'link': '/gene/music', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01007', 'name': 'Written Word', 'link': '/gene/written-word', 'category_

In [None]:
confirmation = input('Confirm the genes_dict saving (y/n)')
if confirmation == 'y':
  save_dict(genes_dict, gen_dict) # 1. Save json dict
  print('genes_dict was saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict saving (y/n)y
genes_dict was saved 2022-03-30 04:16
dictionary's length is - 1032


### addition/updating lists of genes' id into the categories_dict

In [None]:
# Dict of genes' id for the categories_dict
#cat_gen_id_dict = {}
#for value in categories_dict.values():
#  gen_id_list = [v['id'] for v in genes_dict.values() if v['category_id'] == value['id']]
#  cat_gen_id_dict.update({value['id']: gen_id_list})

#len(cat_gen_id_dict)

16

In [None]:
#cat_gen_id_dict['01']

['01001', '01002', '01003', '01004', '01005', '01006', '01007']

In [None]:
#for k, v in categories_dict.items():
#  for key, value in cat_gen_id_dict.items():
#    if v['id'] == key and categories_dict[k]['genes_id'] != value:
#      categories_dict[k].update({'genes_id': value})
#      categories_dict[k].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})

#len(categories_dict)

16

In [None]:
for key, value in categories_dict.items():
  gen_id_list = [v['id'] for v in genes_dict.values() if v['category_id'] == value['id']]
  if categories_dict[key]['genes_id'] != gen_id_list:
      categories_dict[key].update({'genes_id': gen_id_list})
      categories_dict[key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})

len(categories_dict)

In [None]:
categories_dict['artistic_disciplines']

{'genes_id': ['01001', '01002', '01003', '01004', '01005', '01006', '01007'],
 'id': '01',
 'link': 'jump--artistic-disciplines',
 'name': 'Artistic Disciplines',
 'parsing_date': '2022-03-30',
 'updating_date': '2022-03-30'}

In [None]:
confirmation = input('Confirm the categories_dict saving (y/n)')
if confirmation == 'y':
  save_dict(categories_dict, cat_dict) # 1. Save json dict
  print('categories_dict has been saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict saving (y/n)y
categories_dict has been saved 2022-03-30 06:03
dictionary's length is - 16


# Artworks and Artists dicts

## Filtering

The fastest and simplest way to collect data is to go through the genes_dict and read data from a tape of objects like from a news feed. On a tape they have:
- artist's name - to the artists_dict !!!
- artwork name - to the artworks_dict
- artwork year - to the artworks_dict
- Galleries and Institutions name - to the galleries_dict !!!
- price - to the artworks_dict
- image link - to the artworks_dict

That's anough for me.

In parallel I'm going to collect artists_dict and galleries_dict.

Artwork can belong to different genes, but I don't need this repetition in the artworks_dict. So I can't create a composed id for it (like gene id). Artwork id should be end-to-end.

Seems like I should choose the most comprehensive category, collect its database, and then add to it only new information.

But how much artworks are in Artsy database? Via their API I collected around 26 000 once.

In [None]:
categories_dict = open_dict(cat_dict, 'live') # 2. Open json dict
print("dictionary's length is - {}".format(len(categories_dict)))

dictionary's length is - 16


In [None]:
def by_value(item):
  return len(item[1]['genes_id'])

for k, v in sorted(categories_dict.items(), key=by_value, reverse=True):
  print(k, '->', 'id', categories_dict[k]['id'], '->', 'genes count' , len(categories_dict[k]['genes_id']))

styles_and_movements -> id 03 -> genes count 252
subject_matter -> id 02 -> genes count 213
medium_and_techniques -> id 06 -> genes count 175
visual_qualities -> id 05 -> genes count 75
cultural_and_religious_styles -> id 11 -> genes count 69
geographic_regions -> id 08 -> genes count 68
materials -> id 09 -> genes count 39
furniture_and_lighting -> id 12 -> genes count 27
design_movements -> id 04 -> genes count 26
design_concepts_and_techniques -> id 10 -> genes count 22
time_periods -> id 07 -> genes count 20
tableware_vessels_and_objects -> id 15 -> genes count 19
artistic_disciplines -> id 01 -> genes count 7
jewelry_and_fashion_object_types -> id 14 -> genes count 7
antiquities_artifacts_and_religious_objects -> id 16 -> genes count 7
textiles -> id 13 -> genes count 6


03 styles_and_movements looks like the the most comprehensive category.
Next will be:
- subject_matter -> 02
- medium_and_techniques -> 06

In [None]:
genes_dict = open_dict(gen_dict, 'live') # 2. Open json dict
print("dictionary's length is - {}".format(len(genes_dict)))

dictionary's length is - 1032


In [148]:
cat = '12'
#prefix = recently_added_pref
#prefix = recently_updated_pref

filtered_dict = {k: v for k, v in genes_dict.items() if v['category_id'] == cat}
genes_links_list = [v['link'] for v in filtered_dict.values()]
# for addition or updating
#genes_links_list = [v['link']+prefix for v in filtered_dict.values()]
genes_ids_list = [v['id'] for v in filtered_dict.values()]
print(len(genes_links_list))
print(len(genes_ids_list))

27
27


## Data collecting

### Parsing

#### First collection
- reparse categories: 12, 10 !!!

In [194]:
len(genes_ids_list_er)

2

In [195]:
# too many genes -> let's collect by 50 at a time
backup_names = []

In [196]:
df_temp_data_old = open_file_csv(temp_data, 'live') # 3. Open file.csv
try:
  # 8. Get the all category data into DataFrame    !!!!! [:] or _er         !!!!! [:] or _er 
  df_temp_data_new = get_df_data(df_temp_data_old, genes_links_list_er, genes_ids_list_er, cat)
  #                  get_df_data(df_input, genes_links_list, genes_ids_list, cat) ->
  # -> df_output
except SystemExit:
  pass

save_df_file(df_temp_data_new, temp_data, 'live') # 4. Save df to csv file

new_name_csv = temp_data.replace('.csv', '') + '_' + str(datetime.now()) + '.csv'
backup_names.append(new_name_csv)
save_df_file(df_temp_data_new, new_name_csv, 'backup') # 4. Save df to csv file

Starting new HTTPS connection (1): www.artsy.net:443


10:49:07.460917
Parsing of gene 12014 https://www.artsy.net/gene/lighting has started


https://www.artsy.net:443 "GET /gene/lighting HTTP/1.1" 200 None


69 pages for parsing


https://www.artsy.net:443 "GET /gene/lighting?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/lighting?page=15 HTTP/

Parsing of gene 12014 https://www.artsy.net/gene/lighting has finished
136.2954502105713 seconds have passed
10:51:23.756482



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 12011 https://www.artsy.net/gene/dining-table has started


https://www.artsy.net:443 "GET /gene/dining-table HTTP/1.1" 200 None


11 pages for parsing


https://www.artsy.net:443 "GET /gene/dining-table?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/dining-table?page=11 HTTP/1.1" 200 None


Parsing of gene 12011 https://www.artsy.net/gene/dining-table has finished
155.91847801208496 seconds have passed
10:51:43.379939



Saving blocks of 50 genes' data in separate files in beckup folder. Then we'll join them together and check if some pages have been missed during the parsing.

In [197]:
backup_names

['/artsy_temp_data_2022-04-19 10:51:43.926261.csv']

In [None]:
backup_names_cat_06 = ['/artsy_temp_data_2022-04-06 07:01:26.758130.csv',
                       '/artsy_temp_data_2022-04-06 07:19:01.562291.csv',
                       '/artsy_temp_data_2022-04-06 08:35:48.400500.csv',
                       '/artsy_temp_data_2022-04-06 10:31:00.254918.csv',
                       '/artsy_temp_data_2022-04-06 11:58:44.750998.csv',
                       '/artsy_temp_data_2022-04-06 12:41:26.025146.csv',
                       '/artsy_temp_data_2022-04-06 13:34:44.996171.csv']
                       + '/artsy_temp_data_2022-04-06 14:58:06.086053.csv'

backup_names_cat_05 = ['/artsy_temp_data_2022-04-06 16:39:34.236815.csv',
                       '/artsy_temp_data_2022-04-06 17:49:37.250209.csv']
                       + 
                      ['/artsy_temp_data_2022-04-15 07:46:32.183547.csv',
                       '/artsy_temp_data_2022-04-15 07:56:14.144004.csv']

backup_names_cat_11 = ['/artsy_temp_data_2022-04-08 06:51:14.997844.csv',
                       '/artsy_temp_data_2022-04-08 07:08:01.411081.csv']
                       +
                       '/artsy_temp_data_2022-04-18 09:50:37.600100.csv'

backup_names_cat_08 = ['/artsy_temp_data_2022-04-08 08:57:02.093364.csv',
                       '/artsy_temp_data_2022-04-08 10:40:21.922750.csv']
                       +
                       '/artsy_temp_data_2022-04-18 12:02:55.479950.csv',
                       '/artsy_temp_data_2022-04-18 12:43:06.028716.csv'
                       +
                       '/artsy_temp_data_2022-04-18 14:06:13.761778.csv'

backup_names_cat_09 = ['/artsy_temp_data_2022-04-08 12:02:03.429937.csv',
                       '/artsy_temp_data_2022-04-08 12:20:19.139091.csv',
                       '/artsy_temp_data_2022-04-08 12:50:03.398655.csv',
                       '/artsy_temp_data_2022-04-08 13:09:14.315180.csv']
                       +
                       '/artsy_temp_data_2022-04-18 15:58:00.276252.csv',
                       +
                       '/artsy_temp_data_2022-04-18 16:07:15.086943.csv'

backup_names_cat_12 = ['/artsy_temp_data_2022-04-08 13:24:06.923816.csv',
                       '/artsy_temp_data_2022-04-08 13:40:26.582642.csv',
                       '/artsy_temp_data_2022-04-08 14:08:48.410197.csv']
                       or
                       ['/artsy_temp_data_2022-04-19 09:23:10.775068.csv',
                        '/artsy_temp_data_2022-04-19 09:44:44.394285.csv']
                        +
                        '/artsy_temp_data_2022-04-19 10:51:43.926261.csv'

backup_names_cat_04 = ['/artsy_temp_data_2022-04-08 15:20:14.101746.csv']
                       +
                       '/artsy_temp_data_2022-04-19 04:08:43.691739.csv'

backup_names_cat_10 = ['/artsy_temp_data_2022-04-19 07:35:01.334592.csv',
                       '/artsy_temp_data_2022-04-19 07:48:03.359223.csv']
                       +
                       '/artsy_temp_data_2022-04-19 08:11:46.261977.csv'

backup_names_cat_07 = ['/artsy_temp_data_2022-04-09 04:15:24.907317.csv',
                       '/artsy_temp_data_2022-04-09 05:05:55.273163.csv']
                       +
                       '/artsy_temp_data_2022-04-19 04:48:56.712756.csv'

backup_names_cat_15 = ['/artsy_temp_data_2022-04-09 05:29:21.939929.csv']
                       +
                       '/artsy_temp_data_2022-04-19 05:22:25.192914.csv'

backup_names_cat_01 = ['/artsy_temp_data_2022-04-09 05:43:38.333522.csv']

backup_names_cat_14 = ['/artsy_temp_data_2022-04-09 06:01:02.568000.csv']

backup_names_cat_16 = ['/artsy_temp_data_2022-04-09 06:22:33.619264.csv']

backup_names_cat_13 = ['/artsy_temp_data_2022-04-09 06:44:32.738152.csv']
                       +
                       '/artsy_temp_data_2022-04-19 07:06:37.297601.csv'

#### Addition and Updating

In [None]:
df_temp_data_old = open_file_csv(temp_data, 'live') # 3. Open file.csv
#  8.1 Get the first pages category data into DataFrame
df_temp_data_new = get_df_data_add_up(df_temp_data_old, genes_links_list, genes_ids_list, cat)
#                  get_df_data_add_up(df_input, genes_links_list, genes_ids_list, cat) ->
# -> df_output
save_df_file(df_temp_data_new, temp_data, 'live') # 4. Save df to csv file

new_name_csv = temp_data.replace('.csv', '') + '_' + str(datetime.now()) + '.csv'
backup_names.append(new_name_csv)
save_df_file(df_temp_data_new, new_name_csv, 'backup') # 4. Save df to csv file

In [None]:
backup_names

### Errors' parsing checking

Checking the completeness of a new data after collecting error genes

In [None]:
# !!!!!!!!!!!!!!!!!

In [None]:
# cat 09
backup_names = ['/artsy_temp_data_2022-04-18 15:58:00.276252.csv',
                '/artsy_temp_data_2022-04-18 16:07:15.086943.csv']

In [None]:
df_temp_data_new = open_file_csv(backup_names[0], 'backup') # 3. Open file.csv
for name in backup_names[1:]:
  df_temp_data_new_part = open_file_csv(name, 'backup') # 3. Open file.csv
  df_temp_data_new = pd.concat([df_temp_data_new, df_temp_data_new_part])

#df_temp_data_new = df_temp_data_new.dropna(subset=['artsy_id'], axis=0) # !!!
df_temp_data_new.drop_duplicates(subset = ['gene_id', 'artsy_id'], keep = 'first', inplace = True)
df_temp_data_new = df_temp_data_new.reset_index().drop(['index'], axis=1) # !!!


In [None]:
df_temp_data_new['category_id'] = df_temp_data_new['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
df_temp_data_new['gene_id'] = df_temp_data_new['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

In [None]:
# !!!!!!!!!!!!!

In [198]:
df_temp_data_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2373 entries, 0 to 7
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       2373 non-null   object
 1   name           2373 non-null   object
 2   link           2373 non-null   object
 3   year           2315 non-null   object
 4   artist         2373 non-null   object
 5   gallery        2357 non-null   object
 6   price          2373 non-null   object
 7   gene_id        2373 non-null   object
 8   category_id    2373 non-null   object
 9   parsing_date   2373 non-null   object
 10  updating_date  2373 non-null   object
dtypes: object(11)
memory usage: 222.5+ KB


Everything is correct in case of empty df. -> Mergening

In [199]:
df_temp_data_new.loc[df_temp_data_new['artsy_id'].isna()==True]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date


In [None]:
df_temp_data_new.loc[df_temp_data_new['gene_id']=='04016'].head()

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
0,6234540a64caba000b7d3d9e,Osvaldo Borsani & Eugenio Gerli for Tecno Oval...,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1968,"Osvaldo Borsani, Eugenio GerliOsvaldo Borsani ...",MORENTZ,"US$6,450",4016,4,2022-04-19,2022-04-19
1,623b0890478704000b678259,Desk,https://d7hftxdivxxvm.cloudfront.net?resize_to...,ca. 1960,"Marco ZanusoDesk , ca. 1960Gokelaere & Robinso...",Gokelaere & Robinson,"€25,000",4016,4,2022-04-19,2022-04-19
2,545a72c57261692cdfe90100,"""Nitor,"" Bronze Candlestick",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2014,"Aldus""Nitor,"" Bronze Candlestick, 2014Maison G...",Maison Gerard,"US$6,200",4016,4,2022-04-19,2022-04-19
3,5592bc6e7261693ebf00003d,Try Tray,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1995,"Gaetano PesceTry Tray, 1995The Modern ArchiveU...",The Modern Archive,US$595,4016,4,2022-04-19,2022-04-19
4,53eac6aa7261691149ef0000,Octopus Chandelier,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2014,"Achille SalvagniOctopus Chandelier, 2014Maison...",Maison Gerard,"US$147,500",4016,4,2022-04-19,2022-04-19


In [None]:
print(url_add + genes_links_list[15])
print(genes_ids_list[15])

https://www.artsy.net/gene/italian-furniture-and-design
04016


In [None]:
genes_ids_list_er = [genes_ids_list[15]]
genes_links_list_er = [genes_links_list[15]]

In [None]:
# or

In [None]:
genes_ids_list_er = [g_id for g_id in df_temp_data_new.loc[df_temp_data_new['artsy_id'].isna()==True]['gene_id']]
genes_ids_list_er = list(set(genes_ids_list_er))
genes_ids_list_er

['09039', '09031']

In [None]:
genes_links_list_er = [genes_links_list[genes_ids_list.index(g_id)] for g_id in genes_ids_list_er]
genes_links_list_er

['/gene/wood', '/gene/stone']

In [None]:
df_temp_data_new = df_temp_data_new.dropna(subset=['artsy_id'], axis=0)

In [None]:
save_df_file(df_temp_data_new, new_name_csv, 'backup') # 4. Save df to csv file

### Primary parsing checking

Checking the completeness of the data

In [154]:
# cat 12
#backup_names = ['/artsy_temp_data_2022-04-09 06:44:32.738152.csv']
backup_names

['/artsy_temp_data_2022-04-19 09:23:10.775068.csv',
 '/artsy_temp_data_2022-04-19 09:44:44.394285.csv']

In [162]:
df_cat_dat = open_file_csv(backup_names[0], 'backup') # 3. Open file.csv
for name in backup_names[1:]:
  df_cat_dat_part = open_file_csv(name, 'backup') # 3. Open file.csv
  df_cat_dat = pd.concat([df_cat_dat, df_cat_dat_part])

In [None]:
# or
df_cat_dat = open_file_csv(backup_names[0], 'backup') # 3. Open file.csv

In [163]:
df_cat_dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23352 entries, 0 to 13960
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artsy_id       23349 non-null  object 
 1   name           23349 non-null  object 
 2   link           23347 non-null  object 
 3   year           22847 non-null  object 
 4   artist         23347 non-null  object 
 5   gallery        23093 non-null  object 
 6   price          23347 non-null  object 
 7   gene_id        23350 non-null  float64
 8   category_id    23350 non-null  object 
 9   parsing_date   23350 non-null  object 
 10  updating_date  23348 non-null  object 
dtypes: float64(1), object(10)
memory usage: 2.1+ MB


In [None]:
# !!!!!!!!!!!!!!!!

In [164]:
df_cat_dat['category_id'].unique()

array(['12', nan, '2022-04-19'], dtype=object)

In [171]:
df_cat_dat_nan = df_cat_dat.loc[df_cat_dat['category_id'].isna()==True]
df_cat_dat_nan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 2611 to 4705
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artsy_id       2 non-null      object 
 1   name           2 non-null      object 
 2   link           0 non-null      object 
 3   year           0 non-null      object 
 4   artist         0 non-null      object 
 5   gallery        0 non-null      object 
 6   price          0 non-null      object 
 7   gene_id        0 non-null      float64
 8   category_id    0 non-null      object 
 9   parsing_date   0 non-null      object 
 10  updating_date  0 non-null      object 
dtypes: float64(1), object(10)
memory usage: 192.0+ bytes


In [172]:
df_cat_dat_nan

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
2611,61698fb5299f47000de9a839,‘Ero’ chairs by Starck for Kartell,,,,,,,,,
4705,61698fb5299f47000de9a839,‘Ero’ chairs by Starck for Kartell,,,,,,,,,


In [173]:
artsy_id_list = df_cat_dat_nan['artsy_id'].tolist()
name_list = df_cat_dat_nan['name'].tolist()

In [169]:
df_cat_dat_date = df_cat_dat.loc[df_cat_dat['category_id']=='2022-04-19']
df_cat_dat_date.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 2612 to 4706
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artsy_id       2 non-null      object 
 1   name           2 non-null      object 
 2   link           2 non-null      object 
 3   year           2 non-null      object 
 4   artist         2 non-null      object 
 5   gallery        2 non-null      object 
 6   price          2 non-null      object 
 7   gene_id        2 non-null      float64
 8   category_id    2 non-null      object 
 9   parsing_date   2 non-null      object 
 10  updating_date  0 non-null      object 
dtypes: float64(1), object(10)
memory usage: 192.0+ bytes


In [170]:
df_cat_dat_date

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
2612,(Italy),https://d7hftxdivxxvm.cloudfront.net?resize_to...,Circa 2000,Philippe Starck‘Ero’ chairs by Starck for Kart...,Eclectica Contemporary,Price on request,12005,12.0,2022-04-19,2022-04-19,
4706,(Italy),https://d7hftxdivxxvm.cloudfront.net?resize_to...,Circa 2000,Philippe Starck‘Ero’ chairs by Starck for Kart...,Eclectica Contemporary,Price on request,12017,12.0,2022-04-19,2022-04-19,


In [174]:
imgage_list = df_cat_dat_date['name'].tolist()
year_list = df_cat_dat_date['link'].tolist()
artist_list = df_cat_dat_date['year'].tolist()
gallery_list = df_cat_dat_date['artist'].tolist()
price_list = df_cat_dat_date['gallery'].tolist()
gene_list = df_cat_dat_date['price'].tolist()
cat_list = df_cat_dat_date['gene_id'].tolist()
parsing_list = df_cat_dat_date['parsing_date'].tolist()

In [175]:
df_data_new = pd.DataFrame({'artsy_id': artsy_id_list,
                            'name': name_list, 
                            'link': imgage_list, 
                            'year': year_list,
                            'artist': artist_list,
                            'gallery': gallery_list,
                            'price': price_list,
                            'gene_id': gene_list,
                            'category_id': cat_list,
                            'parsing_date': parsing_list,
                            'updating_date': parsing_list,
                            })
df_data_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artsy_id       2 non-null      object 
 1   name           2 non-null      object 
 2   link           2 non-null      object 
 3   year           2 non-null      object 
 4   artist         2 non-null      object 
 5   gallery        2 non-null      object 
 6   price          2 non-null      object 
 7   gene_id        2 non-null      object 
 8   category_id    2 non-null      float64
 9   parsing_date   2 non-null      object 
 10  updating_date  2 non-null      object 
dtypes: float64(1), object(10)
memory usage: 304.0+ bytes


In [177]:
df_data_new['category_id'].unique()

array([12.])

In [168]:
df_cat_dat_12 = df_cat_dat.loc[df_cat_dat['category_id']=='12']
df_cat_dat_12.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23348 entries, 0 to 13960
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artsy_id       23345 non-null  object 
 1   name           23345 non-null  object 
 2   link           23345 non-null  object 
 3   year           22845 non-null  object 
 4   artist         23345 non-null  object 
 5   gallery        23091 non-null  object 
 6   price          23345 non-null  object 
 7   gene_id        23348 non-null  float64
 8   category_id    23348 non-null  object 
 9   parsing_date   23348 non-null  object 
 10  updating_date  23348 non-null  object 
dtypes: float64(1), object(10)
memory usage: 2.1+ MB


In [176]:
df_cat_dat_12['category_id'].unique()

array(['12'], dtype=object)

In [178]:
df_cat_dat = df_cat_dat_12.copy()
df_cat_dat = pd.concat([df_cat_dat, df_data_new])
df_cat_dat = df_cat_dat.reset_index().drop(['index'], axis=1)
df_cat_dat['category_id'] = 12

df_cat_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23350 entries, 0 to 23349
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       23347 non-null  object
 1   name           23347 non-null  object
 2   link           23347 non-null  object
 3   year           22847 non-null  object
 4   artist         23347 non-null  object
 5   gallery        23093 non-null  object
 6   price          23347 non-null  object
 7   gene_id        23350 non-null  object
 8   category_id    23350 non-null  int64 
 9   parsing_date   23350 non-null  object
 10  updating_date  23350 non-null  object
dtypes: int64(1), object(10)
memory usage: 2.0+ MB


In [179]:
df_cat_dat['category_id'].unique()

array([12])

In [None]:
# !!!!!!!!

In [188]:
#df_cat_dat['category_id'] = df_cat_dat['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
#df_cat_dat['gene_id'] = df_cat_dat['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

df_cat_dat['category_id'] = df_cat_dat['category_id'].dropna().apply(lambda x: str(int(float(x))) if len(str(int(float(x))))==cat_id_len else ('0'*(cat_id_len-len(str(int(float(x)))))+str(int(float(x)))))
df_cat_dat['gene_id'] = df_cat_dat['gene_id'].dropna().apply(lambda x: str(int(float(x))) if len(str(int(float(x))))==gen_id_len else ('0'*(gen_id_len-len(str(int(float(x)))))+str(int(float(x)))))

In [189]:
df_cat_dat.loc[df_cat_dat['artsy_id'].isna()==True]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
5851,,,,,,,,12011,12,2022-04-19,2022-04-19
10649,,,,,,,,12014,12,2022-04-19,2022-04-19
11250,,,,,,,,12014,12,2022-04-19,2022-04-19


In [190]:
df_cat_dat.loc[df_cat_dat['gene_id']=='12011'].head()

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
5611,624d61a5e25714000c32bde3,Spanish 18th Century Refectory Dining Table,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1700-1750,Unknown ArtistSpanish 18th Century Refectory D...,AVANTIQUES,"US$4,500",12011,12,2022-04-19,2022-04-19
5612,5eb1d70b3e52a6000df08763,"""Cloud"" CenterTable",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2020,"Jacques Jarrige""Cloud"" CenterTable, 2020Valeri...",Valerie Goodman Gallery,"US$27,000",12011,12,2022-04-19,2022-04-19
5613,59da74309c18db249fcdbccd,"Lacquered desk table ""Luca""",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2017,"Jacques JarrigeLacquered desk table ""Luca"", 20...",Valerie Goodman Gallery,"US$23,750",12011,12,2022-04-19,2022-04-19
5614,5bae2d5b1f74731ca8eac04a,Oval dining table with leaves,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1952,"Hans J. WegnerOval dining table with leaves, 1...",Dansk Møbelkunst Gallery,Price on request,12011,12,2022-04-19,2022-04-19
5615,620258c734e7fc000b5e722f,Lovö dining table,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1930,"Axel Einar HjorthLovö dining table, 1930Gokela...",Gokelaere & Robinson,"€14,000",12011,12,2022-04-19,2022-04-19


Getting list of genes with missing pages parsing and lounch parsing of these genes (not pages !!!) again.

In [191]:
genes_ids_list_er = [str(g_id) for g_id in df_cat_dat.loc[df_cat_dat['artsy_id'].isna()==True]['gene_id']]
genes_ids_list_er = list(set(genes_ids_list_er))
genes_ids_list_er

['12014', '12011']

In [192]:
genes_links_list_er = [genes_links_list[genes_ids_list.index(g_id)] for g_id in genes_ids_list_er]
genes_links_list_er

['/gene/lighting', '/gene/dining-table']

In [193]:
df_cat_dat = df_cat_dat.dropna(subset=['artsy_id'], axis=0)

It would be better for faster errors' parsing if I got the page number info...

!!! One item can belong not only to different categoryes but also to different genes into one category. That's very important for joining with reparsed data !!!

In [None]:
print(df_cat_dat['artsy_id'].count())
print(df_cat_dat['artsy_id'].nunique())

110253
73696


In [None]:
print(df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].count().max())
print(df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].nunique().max())

2
1


In [None]:
gene_artsy_groupped = df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].agg(['count']).reset_index()
gene_artsy_groupped.loc[gene_artsy_groupped['count']>1]

Unnamed: 0,gene_id,artsy_id,count
4309,06007,540f40cd7261692d6d550000,2
18541,06021,5ae9db4a9c18db166e639c7e,2
20019,06021,606d59a4b58b09000e0f0fed,2
20209,06021,60e05ea715fca20013f5854a,2
20657,06021,61c0ee1e845711000bf8163c,2
...,...,...,...
247062,06174,5f172c2dea92a20011267465,2
250018,06175,618bea8b5180b7000e0faba9,2
250139,06175,61ba11e097ec3a000b017724,2
251094,06175,623b41fed562fe000c49e3ba,2


In [None]:
df_cat_dat.loc[df_cat_dat['artsy_id']=='6248b64fa3259b000c24e910']

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
39366,6248b64fa3259b000c24e910,Time Is Movin’,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2021,"Mitsuyo OkadaTime Is Movin’, 2021Space 776US$290",Space 776,US$290,6175,6,2022-04-06,2022-04-06
39369,6248b64fa3259b000c24e910,Time Is Movin’,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2021,"Mitsuyo OkadaTime Is Movin’, 2021Space 776US$290",Space 776,US$290,6175,6,2022-04-06,2022-04-06


In [None]:
for link in df_cat_dat.loc[df_cat_dat['artsy_id']=='4d8b93484eb68a1b2c00125b']['link']:
  print(link)

https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&qua

There can be item duplicates inside one gene items' list also. Delete them.

### Mergening of all the data

In [201]:
# cat 10
df_cat_dat_total = pd.concat([df_cat_dat, df_temp_data_new])
df_cat_dat_total.drop_duplicates(subset = ['gene_id', 'artsy_id'], keep = 'first', inplace = True)
df_cat_dat_total = df_cat_dat_total.reset_index().drop(['index'], axis=1) # !!!

In [None]:
# or
df_cat_dat_total = df_cat_dat.copy()

In [202]:
df_cat_dat_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23433 entries, 0 to 23432
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       23433 non-null  object
 1   name           23433 non-null  object
 2   link           23433 non-null  object
 3   year           22927 non-null  object
 4   artist         23433 non-null  object
 5   gallery        23177 non-null  object
 6   price          23433 non-null  object
 7   gene_id        23433 non-null  object
 8   category_id    23433 non-null  object
 9   parsing_date   23433 non-null  object
 10  updating_date  23433 non-null  object
dtypes: object(11)
memory usage: 2.0+ MB


Genes checking: sometimes there may be a problem (like with cat 03)

In [203]:
print(len(genes_ids_list))
df_cat_dat_total['gene_id'].dropna().nunique()

27


27

In [204]:
check_gene_id = df_cat_dat_total['gene_id'].dropna().unique().tolist()
len(check_gene_id)

27

In [205]:
save_df_file(df_cat_dat_total, '/artsy_temp_data_12.csv', 'backup') # 4. Save df to csv file
#df_cat_dat_total = open_file_csv('/artsy_temp_data_09.csv', 'backup') # 3. Open file.csv

In [None]:
# !!! cat 10 !!!
genes_ids_list_er = list(set(genes_ids_list) - set(check_gene_id))
genes_ids_list_er

['10021',
 '10008',
 '10001',
 '10013',
 '10006',
 '10017',
 '10020',
 '10004',
 '10009',
 '10003',
 '10014',
 '10015',
 '10012',
 '10016',
 '10005',
 '10010',
 '10011',
 '10018',
 '10002',
 '10007',
 '10022',
 '10019']

In [None]:
genes_links_list_er = [genes_links_list[genes_ids_list.index(g_id)] for g_id in genes_ids_list_er]
for link in genes_links_list_er:
  print(url_add + link)

https://www.artsy.net/gene/whimsical-design
https://www.artsy.net/gene/handcrafted-furniture-and-design
https://www.artsy.net/gene/cad-computer-aided-design
https://www.artsy.net/gene/modular-design
https://www.artsy.net/gene/engineering-slash-construction-and-design
https://www.artsy.net/gene/restrained-forms
https://www.artsy.net/gene/timber-frame-construction
https://www.artsy.net/gene/design-by-artists
https://www.artsy.net/gene/iconic-works-of-design
https://www.artsy.net/gene/design-by-architects
https://www.artsy.net/gene/non-functional-design
https://www.artsy.net/gene/reinforced-concrete-construction
https://www.artsy.net/gene/masonry-construction
https://www.artsy.net/gene/religious-building
https://www.artsy.net/gene/emerging-design
https://www.artsy.net/gene/industrial-and-product-design
https://www.artsy.net/gene/jewelry-by-painters-and-sculptors
https://www.artsy.net/gene/steel-slash-iron-frame-construction
https://www.artsy.net/gene/childrens-furniture-and-design
https:/

Pages of these genes are empty, so everything's allright.

## Data studying

In [206]:
#df_cat_dat_total = open_file_csv('/artsy_temp_data_05.csv', 'backup') # 3. Open file.csv
df_cat_dat_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23433 entries, 0 to 23432
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       23433 non-null  object
 1   name           23433 non-null  object
 2   link           23433 non-null  object
 3   year           22927 non-null  object
 4   artist         23433 non-null  object
 5   gallery        23177 non-null  object
 6   price          23433 non-null  object
 7   gene_id        23433 non-null  object
 8   category_id    23433 non-null  object
 9   parsing_date   23433 non-null  object
 10  updating_date  23433 non-null  object
dtypes: object(11)
memory usage: 2.0+ MB


In [207]:
print(df_cat_dat_total.groupby(['gene_id', 'artsy_id'])['artsy_id'].count().max())
print(df_cat_dat_total.groupby(['gene_id', 'artsy_id'])['artsy_id'].nunique().max())

1
1


Transform id from int into str and link into a working one.

In [208]:
#df_cat_dat_total['category_id'] = df_cat_dat_total['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
#df_cat_dat_total['gene_id'] = df_cat_dat_total['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

df_cat_dat_total['link'] = df_cat_dat_total['link'].dropna().apply(lambda x: img_link_corr(x))

df_cat_dat_total.sample(5)

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
8363,6010b930607b5600105e53fb,NULL Hot Pink Bench,https://d32dm0rphc51dk.cloudfront.net/76S39HpT...,2020,"Studio BUZAONULL Hot Pink Bench, 2020Gallery A...",Gallery ALL,"US$16,500",12013,12,2022-04-19,2022-04-19
567,5acfc4f8275b244eaaddc622,Set of 10 Regency Dining Chairs,https://d32dm0rphc51dk.cloudfront.net/SPlwYlva...,1980s,"Karl SpringerSet of 10 Regency Dining Chairs, ...",Todd Merrill Studio,Sold,12001,12,2022-04-19,2022-04-19
13948,5a1a064acd530e4e2c8f1629,Palle Suenson Windsor Chairs,https://d32dm0rphc51dk.cloudfront.net/QU0FdLLl...,ca. 1945,"Palle SuensonPalle Suenson Windsor Chairs, ca....",Københavns Møbelgalleri,"US$2,500",12017,12,2022-04-19,2022-04-19
8684,62053630994cf1000b4f1e01,Affix Console - Pillar,https://d32dm0rphc51dk.cloudfront.net/Mf1OPpBJ...,2020,"Charlotte JockheerAffix Console - Pillar , 202...",Carwan Gallery,"€2,700",12013,12,2022-04-19,2022-04-19
15712,5f57594501c6c1000e50f402,Pair of wall-mounted unit - SAS Royal Hotel,https://d32dm0rphc51dk.cloudfront.net/KLLjn-ND...,1958,Arne JacobsenPair of wall-mounted unit - SAS R...,Gokelaere & Robinson,Sold,12018,12,2022-04-19,2022-04-19


In [209]:
for link in df_cat_dat_total['link'].sample(5):
  print(link)

https://d32dm0rphc51dk.cloudfront.net/zDBzPgwc7ar0OLeafXDVtg/large.jpg
https://d32dm0rphc51dk.cloudfront.net/Hz6PtR84xoj17JyO9qsH2w/large.jpg
https://d32dm0rphc51dk.cloudfront.net/I6Qt7zHCEJOT_buiNd3z8g/large.jpg
https://d32dm0rphc51dk.cloudfront.net/RBDn0CBYgH50-8lPBggZ9A/large.jpg
https://d32dm0rphc51dk.cloudfront.net/l3Y83NOcx9FZw076Riv-wQ/large.jpg


Count of unique artsy_id and link should be equal. But sometimes they're not (like in cat 03).

In [210]:
print(df_cat_dat_total['artsy_id'].nunique())
print(df_cat_dat_total['link'].nunique())

11253
11253


In [None]:
# !!! links' problem solving !!!

In [None]:
artsy_id_link = df_cat_dat_total[['artsy_id', 'link']]
artsy_id_link.drop_duplicates(subset = ['link', 'artsy_id'], keep = 'first', inplace = True)
artsy_id_link.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138883 entries, 0 to 179744
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   artsy_id  138883 non-null  object
 1   link      138883 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [None]:
artsy_link_groupped = artsy_id_link.groupby(['artsy_id'])['link'].agg(['count']).reset_index()
links_prob_list = artsy_link_groupped.loc[artsy_link_groupped['count']>1]['artsy_id'].tolist()
len(links_prob_list)

1

In [None]:
links_prob_list

['6247862634f6b2000b712474']

In [None]:
id = links_prob_list[0]
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
132903,6247862634f6b2000b712474,PORTRAIT STYLISÉ DE JACQUELINE (TÊTE DE FEMME)...,https://d32dm0rphc51dk.cloudfront.net/dIHPcpGC...,1962,Pablo PicassoPORTRAIT STYLISÉ DE JACQUELINE (T...,Robert Fontaine Gallery,Price on request,8052,8,2022-04-08,2022-04-08
162343,6247862634f6b2000b712474,PORTRAIT STYLISÉ DE JACQUELINE (TÊTE DE FEMME)...,https://d32dm0rphc51dk.cloudfront.net/3gxxZMXE...,1962,Pablo PicassoPORTRAIT STYLISÉ DE JACQUELINE (T...,Robert Fontaine Gallery,Price on request,8062,8,2022-04-08,2022-04-08


In [None]:
for link in df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id]['link'].unique():
  print(link)

https://d32dm0rphc51dk.cloudfront.net/dIHPcpGCPw8QyU-YRGQRMA/large.jpg
https://d32dm0rphc51dk.cloudfront.net/3gxxZMXEYPjh8hhouRj9lQ/large.jpg


These pictures are the same but shots were done with different light. Replace the last one.

In [None]:
df_cat_dat_total.at[162343, 'link'] = 'https://d32dm0rphc51dk.cloudfront.net/dIHPcpGCPw8QyU-YRGQRMA/large.jpg'

In [None]:
print(df_cat_dat_total['artsy_id'].nunique())
print(df_cat_dat_total['link'].nunique())

138882
138882


In [None]:
save_df_file(df_cat_dat_total, '/artsy_temp_data_08.csv', 'backup') # 4. Save df to csv file
#df_cat_dat_total = open_file_csv('/artsy_temp_data_05.csv', 'backup') # 3. Open file.csv

In [None]:
# !!! links' problem solving !!!

### Artists

We need to create artist id for artworks dict. Artists seem to be ok.

In [None]:
#df_cat_dat_total = open_file_csv('/artsy_temp_data_05.csv', 'backup') # 3. Open file.csv

In [211]:
print(df_cat_dat_total['artist'].dropna().nunique())
artists_list = sorted(df_cat_dat_total['artist'].dropna().unique().tolist())
print(len(artists_list))

10905
10905


In [212]:
artists_list[:10]

[' Unknown,  Athens, Greece, EuropeBlack-Figure Neck Amphora, ca. 510 BCEJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Athens, Greece, EuropeCockleshell Aryballos, 525 -475 BCEJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Athens, Greece, EuropeExaleiptron, ca. 500 BCEJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Corinth, Greece, EuropeCorinthian Aryballos,  first quarter of 6th century B.C.J. Paul Getty MuseumPermanent collection ',
 ' Unknown,  East GreeceAlabastron,  2nd century B.C.J. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Eastern MediterraneanAryballos,  6th centuryJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Eastern MediterraneanFlask,  3rd centuryJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Eastern MediterraneanSprinkler Flask,  3rd -4th centuryJ. Paul Getty MuseumPermanent collection ',
 ' Unknown,  Greece, Cyclades, EuropeCollared Jar of the Grotta-Pelos Group, 3000 BCE -2800 B.C.J. Paul Getty MuseumPermane

In [213]:
artists_list[-10:]

['touche—toucheOnyx, 2020Everyday GallerySold ',
 'touche—touchePlanet B, 2020Everyday Gallery€2,020 ',
 "touche—toucheSectional sofa 'BBC's', 2019Everyday GallerySold ",
 'touche—toucheTale of Frac, 2020Everyday Gallery€6,400 ',
 'touche—toucheThe Colour of Honey, 2020Everyday Gallery€6,700 ',
 "touche—toucheWall Mirror 'Abby S.', 2019Everyday Gallery€3,150 ",
 'wells coatesBauhaus Oak Desk, ca. 1930Fears and KahnSold ',
 'Élisabeth JouliaLamp, ca. 1955Galerie Pascal Cuisinier ',
 'Émile-Jacques RuhlmannModele Sultzer, Writing Table, ca. circa 1932Maison GerardUS$100,000–US$150,000 ',
 'Óscar Tusquets BlancaSet of Six Oscar Tusquets “Lucas” Dining Chairs for Driade, ca. 1987Almond & Co.Price on request ']

In [214]:
artists_list[(len(artists_list)//2):(len(artists_list)//2+10)]

['Joaquim TenreiroSide table, 1950Mercado ModernoPrice on request ',
 'Joaquim TenreiroSide table, ca. 1950Mercado ModernoSold ',
 'Joaquim TenreiroSofa and Armchairs, 1958Mercado ModernoPrice on request ',
 'Joaquim TenreiroSofa, c. 1950sSalon 94 DesignSold ',
 'Joaquim TenreiroSoft-edged rectangular dining table in jacaranda with black underpainted glass top and curved legs. , 1949Jeff Lincoln Art+DesignPrice on request ',
 'Joaquim TenreiroThree seat sofa, 1958Gokelaere & RobinsonSold ',
 'Joaquim TenreiroTriangular Table, 1960 / 2021ETELPrice on request ',
 'Joaquim TenreiroWhite Circular Table, 1960 / 2021ETELPrice on request ',
 'Joaquim TenreiroYelloe Circular Table, 1960 / 2021ETELPrice on request ',
 'Joaquim Tenreirolong bench, 1950sMercado ModernoPrice on request ']

Not ok actually. If it become important, I'll do somthing with it. -> It became. Look 'artists name revision' block

In [None]:
#decoding_table = (
#'\x00'     #  0x00 -> NULL
#'\x01'     #  0x01 -> START OF HEADING
#'\x02'     #  0x02 -> START OF TEXT
#'\x03'     #  0x03 -> END OF TEXT
#'\x04'     #  0x04 -> END OF TRANSMISSION
#'\x05'     #  0x05 -> ENQUIRY
#'\x06'     #  0x06 -> ACKNOWLEDGE
#'\x07'     #  0x07 -> BELL
#'\x08'     #  0x08 -> BACKSPACE
#'\t'       #  0x09 -> HORIZONTAL TABULATION
#'\n'       #  0x0A -> LINE FEED
#'\x0b'     #  0x0B -> VERTICAL TABULATION
#'\x0c'     #  0x0C -> FORM FEED
#'\r'       #  0x0D -> CARRIAGE RETURN
#'\x0e'     #  0x0E -> SHIFT OUT
#'\x0f'     #  0x0F -> SHIFT IN
#'\x10'     #  0x10 -> DATA LINK ESCAPE
#'\x11'     #  0x11 -> DE
#add the character code here
#'\u200b' #add this in the file and save it.

### Galleries

We also need to create gallery id for artworks dict.

In [None]:
print(df_cat_dat_total['gallery'].nunique())
gallery_list = sorted(df_cat_dat_total['gallery'].dropna().unique().tolist())
print(len(gallery_list))

4104
4104


In [None]:
gallery_list[:10]

[' "A Brush with Reality: Calligraphic Paintings by Lobsang Choephel" at Tibet House US, New York (2015)',
 ' "Disguise: Masks and Global African Art" at Seattle Art Museum, Seattle (2015)',
 '"2015 Wolfgang Hahn Prize: Michael Krebber and R. H. Quaytman" at Museum Ludwig, Cologne (2015)',
 '"21er Raum: Iman Issa - Material" at 21er Haus, Vienna',
 '"4 REAL & TRUE 2. Wim Wenders. Landscapes. Photographs." at Museum Kunstpalast, Düsseldorf (2015)',
 '"A Beautiful Lie – Eckersberg" at Statens Museum for Kunst, Copenhagen ',
 '"A New Dynasty – Created in China"  Venue: ARoS Aarhus Museum of Art, Aarhus ',
 '"Abstract/Object" at Art Institute of Chicago, Chicago',
 '"Accrochage 3: Pop & Music/Sound" at Fondation Louis Vuitton, Paris',
 '"Adventures of the Black Square: Abstract Art and Society 1915-2015" at Whitechapel Gallery, London (2015)']

In [None]:
gallery_list[-10:]

['°CLAIRbyKahn Galerie',
 'ÆRENA Galleries and Gardens',
 'Österreichische Galerie Belvedere, Vienna',
 '“Clifford Ross: Landscape Seen & Imagined” at MASS MoCA, North Adams (2015)',
 '“Collectionism and Modernity. Two Case Studies: The Im Obersteg and Rudolf Staechelin Collections” at the Museo Reina Sofía, Madrid',
 '“Created by a Hand with but a Chisel Armed…” Sculpture in St Petersburg’s Palaces in the 19th Century at The State Hermitage Museum, 2016',
 '“L’image volée” at Fondazione Prada, Milan (2016)',
 '√K Contemporary',
 '首都藝術中心 Capital Art Center',
 '鳩ノ森美術 / HATONOMORI ART']

Seems like in some cases there's a exhibition name in a gellery name box. Let's try to divide them.

In [None]:
exhib_list = []
gal_list = []

for g in gallery_list:
  try:
    gal = g.split(' at ')[-1]
    gal_list.append(gal)
    exhib_list.append(g.replace(' at ' + gal, ''))
  except:
    gal_list.append(g)

gal_list = sorted(list(set(gal_list)))
print(len(gal_list))
exhib_list = sorted(list(set(exhib_list)))
print(len(exhib_list))

5256
5360


In [None]:
gal_list[:10]

[' Museum Ludwig, Cologne',
 '"A New Dynasty – Created in China"  Venue: ARoS Aarhus Museum of Art, Aarhus ',
 '"Anselm Kiefer"at Centre Pompidou, Paris',
 '"Barbie"at Musée des Arts Décoratifs, Paris',
 '"Chen Zhen: Without going to New York and Paris, life could be internationalized"  Venue: Rockbund Art Museum, Shanghai',
 '"Inhuman"at Fridericianum, Kassel, Germany',
 '"Joaquín Torres-García: The Arcadian Modern"at the Museum of Modern Art, New York',
 '"Landscapes of Belgium" Musée d\'Ixelles, Brussels (2015)',
 '"Philippe Parreno: Anywhere, Anywhere, Out Of The World", Palais de Tokyo, Paris (2013)',
 '"This Art is Your Art" Competition: The White House Historical Association, Artsy, and the Robert Rauschenberg Foundation']

The data is not clear and this info is not critical, leave it as it is, won't do gallery id.

### Artworks

In [None]:
#df_cat_dat_total = open_file_csv('/artsy_temp_data_05.csv', 'backup') # 3. Open file.csv

In [None]:
df_cat_dat_total.columns

Index(['artsy_id', 'name', 'link', 'year', 'artist', 'gallery', 'price',
       'gene_id', 'category_id', 'parsing_date', 'updating_date'],
      dtype='object')

For ***one to one connections***

In [215]:
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'year', 'artist', 'gallery', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index()
one_to_one.sample(5)

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,parsing_date,updating_date,count
1484,54aeaa537261695c3bb00a00,The Wild Shall Wild Remain,https://d32dm0rphc51dk.cloudfront.net/P1Fnnd0r...,2014,"Jaclyn KainThe Wild Shall Wild Remain, 2014Gal...",Gallery NAGA,US$600,2022-04-19,2022-04-19,1
5564,5d8f8bbc12f011000e8ea6a9,Rectangular side table,https://d32dm0rphc51dk.cloudfront.net/ttVe8uJ8...,Circa 1930,"André SornayRectangular side table, Circa 1930...",Galerie Alain Marcelpoil,Sold,2022-04-19,2022-04-19,2
4827,5c1146d723a3b8768cc5c799,Trèfle low table,https://d32dm0rphc51dk.cloudfront.net/LKR5gqgd...,ca. 1955,"Jean RoyèreTrèfle low table, ca. 1955Galerie P...",Galerie Patrick Seguin,Price on request,2022-04-19,2022-04-19,2
6437,5e8631e10bf924000e6e50cb,Two-tier Side Table,https://d32dm0rphc51dk.cloudfront.net/g8nVCnRB...,1955,"Maison LeleuTwo-tier Side Table, 1955Maison Ge...",Maison Gerard,"US$28,000",2022-04-19,2022-04-19,2
4487,5b8efbebc2fefa1f43b2ef87,Nike Deadstock,https://d32dm0rphc51dk.cloudfront.net/NqufsAAG...,2013,"TF DUTCHMANNike Deadstock, 2013Deep Space Gall...",Deep Space Gallery,Sold,2022-04-19,2022-04-19,2


In [216]:
print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

11253
10854
10854


In [217]:
print('artsy_id count must be', df_cat_dat_total['artsy_id'].nunique())
print('')
for col in ['name', 'link', 'year', 'artist', 'gallery', 'price', 'parsing_date', 'updating_date']:
  group_df = df_cat_dat_total.groupby(['artsy_id', col])['category_id'].agg(['count']).reset_index()
  print('by column', col)
  print('artsy_id count is', group_df['artsy_id'].count())
  print('artsy_id nunique is', group_df['artsy_id'].nunique())
  print('count and nunique is the same - ', group_df['artsy_id'].count()==group_df['artsy_id'].nunique())
  print(list(set(df_cat_dat_total['artsy_id'].unique()) - set(group_df['artsy_id'].unique())))
  print('')

artsy_id count must be 11253

by column name
artsy_id count is 11253
artsy_id nunique is 11253
count and nunique is the same -  True
[]

by column link
artsy_id count is 11253
artsy_id nunique is 11253
count and nunique is the same -  True
[]

by column year
artsy_id count is 10989
artsy_id nunique is 10989
count and nunique is the same -  True
['53907dd9275b2474bf0014e9', '5b05aac41a1e864553fdb3d6', '52418812c9dc2415da000577', '590a00e9c9dc24694a2f1ee3', '54184c9372616957a02b0500', '5c9317684f31f536d095988f', '53f642e37261692d733e0200', '553ea9dd726169189cd60000', '57191a02cd530e6593000834', '57191a02139b214a0f001e13', '581ccc529c18db3e71000fe4', '529506579c18dba92600006b', '546159497261692d43710900', '5c9315a89c63792aaedeceba', '544532f3726169636acd1200', '5a60c2128b3b8143e549b606', '58af1674b202a302a02ca508', '57238bf6cd530e186c000198', '5ba7fe14036cf52540e32c32', '50ef217c11668218e20008c5', '51a5c091275b245a2900002d', '5b7bd9aeda075b64022b342e', '5c931633495e6d7285075c98', '51a0602

In [None]:
df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(['5ade13f79c18db6586720662', '57e27e8a7622dd65fc000380', '57ee50cbcd530e65fa00003f'])]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
27348,57ee50cbcd530e65fa00003f,"A Large White-ground Lekythos, Attributed to t...",https://d32dm0rphc51dk.cloudfront.net/91vHFCn1...,Attic-ca. 410 B.C.,"Unknown GreekA Large White-ground Lekythos, At...",,,7012,7,2022-04-09,2022-04-09
27360,57e27e8a7622dd65fc000380,A Torso of an Athlete,https://d32dm0rphc51dk.cloudfront.net/rgHMZzri...,Late HellenisticRoman1st cent. B.C.,"Unknown GreekA Torso of an Athlete, Late Helle...",,Sold,7012,7,2022-04-09,2022-04-09
28164,5ade13f79c18db6586720662,An Attic Bilingual Eye-cup,https://d32dm0rphc51dk.cloudfront.net/5qmgVLxc...,,Unknown GreekAn Attic Bilingual Eye-cup,,,7012,7,2022-04-09,2022-04-09


Due to NaNs in 'name' (sometimes), 'year' and 'gallery' we can loose ids. And we have problems with prices, sometimes in years and artists.

#### name

In [None]:
artsy_id_name = df_cat_dat_total[['artsy_id', 'name']]
artsy_id_name.drop_duplicates(subset = ['name', 'artsy_id'], keep = 'first', inplace = True)
id_name_g = artsy_id_name.groupby(['artsy_id'])['name'].agg(['count']).reset_index()

id_prob_name = id_name_g.loc[id_name_g['count']>1]['artsy_id'].tolist()
len(id_prob_name)

1

In [None]:
inx = 0
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_name[inx]]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
91463,62346eea92005e000e7ed5c0,Chasing Goof Fortune - Hiroshima Sleepless Nig...,https://d32dm0rphc51dk.cloudfront.net/iaGiNT0W...,2010,Ori GershtChasing Goof Fortune - Hiroshima Sle...,Galleria Bianconi,"€13,000–€15,000",8037,8,2022-04-08,2022-04-08
111536,62346eea92005e000e7ed5c0,Chasing Good Fortune - Hiroshima Sleepless Nig...,https://d32dm0rphc51dk.cloudfront.net/iaGiNT0W...,2010,Ori GershtChasing Good Fortune - Hiroshima Sle...,Galleria Bianconi,"€13,000–€15,000",8043,8,2022-04-08,2022-04-08
122202,62346eea92005e000e7ed5c0,Chasing Good Fortune - Hiroshima Sleepless Nig...,https://d32dm0rphc51dk.cloudfront.net/iaGiNT0W...,2010,Ori GershtChasing Good Fortune - Hiroshima Sle...,Galleria Bianconi,"€13,000–€15,000",8048,8,2022-04-08,2022-04-08


In [None]:
inx_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_name[inx]].index.tolist()

name_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_name[inx]]['name'].unique()
name_list

array(['Chasing Goof Fortune - Hiroshima Sleepless Nights - Never Again 01',
       'Chasing Good Fortune - Hiroshima Sleepless Nights - Never Again 01'],
      dtype=object)

In [None]:
for i in inx_list:
  df_cat_dat_total.at[i, 'name'] = name_list[0]

#### price

In [None]:
artsy_id_price = df_cat_dat_total[['artsy_id', 'price']]
artsy_id_price.drop_duplicates(subset = ['price', 'artsy_id'], keep = 'first', inplace = True)
id_price_g = artsy_id_price.groupby(['artsy_id'])['price'].agg(['count']).reset_index()

id_prob_price = id_price_g.loc[id_price_g['count']>1]['artsy_id'].tolist()
len(id_prob_price)

1

In different genes mentioned price could be different. In most cases prices are very similar -> put the mode or one that in the interval. Sometims there isn't a mode or it's 'Price on request'. Put the maximum. We don't need history here.

In [None]:
inx = 0
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_price[inx]]
#df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(id_prob_price)]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
5106,5c75c7b07f2e1f1daf553e74,Homage to the Square: Full (Rug),https://d32dm0rphc51dk.cloudfront.net/Y-veD8oj...,Current production based on 1962 work,"Josef AlbersHomage to the Square: Full (Rug), ...",Artware Editions,Sold,13003,13,2022-04-09,2022-04-09
14367,5c75c7b07f2e1f1daf553e74,Homage to the Square: Full (Rug),https://d32dm0rphc51dk.cloudfront.net/Y-veD8oj...,Current production based on 1962 work,"Josef AlbersHomage to the Square: Full (Rug), ...",Artware Editions,"US$1,250",13005,13,2022-04-19,2022-04-19


In [None]:
inx_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_price[inx]].index.tolist()

price_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_price[inx]]['price'].unique()
price_list

array(['Sold ', 'US$1,250 '], dtype=object)

In [None]:
for i in inx_list:
  df_cat_dat_total.at[i, 'price'] = price_list[1]

In [None]:
#for i in inx_list:
#  print(df_cat_dat_total['price'].iloc[[i]].index.values[0])

#### year

In [None]:
artsy_id_year = df_cat_dat_total[['artsy_id', 'year']]
artsy_id_year.drop_duplicates(subset = ['year', 'artsy_id'], keep = 'first', inplace = True)
id_year_g = artsy_id_year.groupby(['artsy_id'])['year'].agg(['count']).reset_index()

id_prob_year = id_year_g.loc[id_year_g['count']>1]['artsy_id'].tolist()
len(id_prob_year)

0

In [None]:
inx = 1
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_year[inx]]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
65426,6229f8fddf1d11000d69c4f4,Caballero I,https://d32dm0rphc51dk.cloudfront.net/KDZ7OhTx...,2015,"Manolo ValdésCaballero I, 2015Proyecto H€7,500...",Proyecto H,"€9,000–€11,000",6054,6,2022-04-06,2022-04-06
187793,6229f8fddf1d11000d69c4f4,Caballero I,https://d32dm0rphc51dk.cloudfront.net/KDZ7OhTx...,2006,"Manolo ValdésCaballero I, 2006Proyecto H€9,000...",Proyecto H,"€9,000–€11,000",6131,6,2022-04-06,2022-04-06


In [None]:
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_year[inx]]['year'].unique()

array(['2015', '2006'], dtype=object)

In [None]:
df_cat_dat_total.at[65609, 'year'] = '2006'
df_cat_dat_total.at[65426, 'year'] = '2006'

#### gallery

In [None]:
artsy_id_gallery = df_cat_dat_total[['artsy_id', 'gallery']]
artsy_id_gallery.drop_duplicates(subset = ['gallery', 'artsy_id'], keep = 'first', inplace = True)
id_gallery_g = artsy_id_gallery.groupby(['artsy_id'])['gallery'].agg(['count']).reset_index()

id_prob_gallery = id_gallery_g.loc[id_gallery_g['count']>1]['artsy_id'].tolist()
len(id_prob_gallery)

1

In [None]:
inx = 0
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_gallery[inx]]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
22221,61e7987ef82abd000c0fb200,Salute (White) Artist Proof,https://d32dm0rphc51dk.cloudfront.net/wkoV4skP...,2021,"Shen Jingdong 沈敬东Salute (White) Artist Proof, ...",SimpleArte,US$700,8008,8,2022-04-08,2022-04-08
179437,61e7987ef82abd000c0fb200,Salute (White) Artist Proof,https://d32dm0rphc51dk.cloudfront.net/wkoV4skP...,2021,"Shen Jingdong 沈敬东Salute (White) Artist Proof, ...",iCare4U,US$700,8005,8,2022-04-18,2022-04-18


In [None]:
inx_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_gallery[inx]].index.tolist()

gallery_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_gallery[inx]]['gallery'].unique()
gallery_list

array(['SimpleArte', 'iCare4U'], dtype=object)

In [None]:
for i in inx_list:
  df_cat_dat_total.at[i, 'gallery'] = gallery_list[0]

#### artist

In [None]:
artsy_id_artist = df_cat_dat_total[['artsy_id', 'artist']]
artsy_id_artist.drop_duplicates(subset = ['artist', 'artsy_id'], keep = 'first', inplace = True)
id_artist_g = artsy_id_artist.groupby(['artsy_id'])['artist'].agg(['count']).reset_index()

id_prob_artist = id_artist_g.loc[id_artist_g['count']>1]['artsy_id'].tolist()
len(id_prob_artist)

1

In [None]:
inx = 0
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_artist[inx]]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
5106,5c75c7b07f2e1f1daf553e74,Homage to the Square: Full (Rug),https://d32dm0rphc51dk.cloudfront.net/Y-veD8oj...,Current production based on 1962 work,"Josef AlbersHomage to the Square: Full (Rug), ...",Artware Editions,"US$1,250",13003,13,2022-04-09,2022-04-09
14367,5c75c7b07f2e1f1daf553e74,Homage to the Square: Full (Rug),https://d32dm0rphc51dk.cloudfront.net/Y-veD8oj...,Current production based on 1962 work,"Josef AlbersHomage to the Square: Full (Rug), ...",Artware Editions,"US$1,250",13005,13,2022-04-19,2022-04-19


In [None]:
inx_list = df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_artist[inx]].index.tolist()

df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_artist[inx]]['artist'].unique()

array(['Josef AlbersHomage to the Square: Full (Rug), Current production based on 1962 workArtware EditionsSold ',
       'Josef AlbersHomage to the Square: Full (Rug), Current production based on 1962 workArtware EditionsUS$1,250 '],
      dtype=object)

In [None]:
for i in inx_list:
  df_cat_dat_total.at[i, 'artist'] = 'Josef Albers'

#### dates

In [None]:
df_cat_dat_total['parsing_date'].max()

'2022-04-19'

In [None]:
df_cat_dat_total['parsing_date'].min()

'2022-04-09'

In [None]:
date = df_cat_dat_total['parsing_date'].unique().min()
df_cat_dat_total['parsing_date'] = date
df_cat_dat_total['updating_date'] = date

#### check

In [218]:
# 'name', - ???
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'artist', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index()

print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

11253
11253
11253


In [219]:
save_df_file(df_cat_dat_total, '/artsy_temp_data_12.csv', 'backup') # 4. Save df to csv file

## Dict creating

In [220]:
#df_cat_dat_total = open_file_csv('/artsy_temp_data_11.csv', 'backup') # 3. Open file.csv
df_cat_dat_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23433 entries, 0 to 23432
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       23433 non-null  object
 1   name           23433 non-null  object
 2   link           23433 non-null  object
 3   year           22927 non-null  object
 4   artist         23433 non-null  object
 5   gallery        23177 non-null  object
 6   price          23433 non-null  object
 7   gene_id        23433 non-null  object
 8   category_id    23433 non-null  object
 9   parsing_date   23433 non-null  object
 10  updating_date  23433 non-null  object
dtypes: object(11)
memory usage: 2.0+ MB


In [None]:
#df_cat_dat_total['category_id'] = df_cat_dat_total['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
#df_cat_dat_total['gene_id'] = df_cat_dat_total['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

In [221]:
df_cat_dat_total['category_id'].unique()

array(['12'], dtype=object)

### artworks_dict

#### initiation

In [None]:
#artworks_dict = {'artsy_id': {'name': 'Portrait of a Man in Armour', 
#                              'link': 'https://d32dm0rphc51dk.cloudfront.net/yaB__0LXFvbXC4jiUK4Vpw/large.jpg', 
#                              'year': 'ca. 1560',
#                              'artist_id': '000000', # Jacopo Bassano
#                              -- 'gallery_id': '000000', # Robilant+Voena
#                              'price': 'Price on request',
#                              'gene_id': ['21603', ''],
#                              'category_id': ['03', '']}}

***One to one connections***

Let's create a dict from complete data.

In [222]:
# 'name' - ??? !!!
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'artist', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index().drop('count', axis=1)

check_id_list = one_to_one['artsy_id'].sample(10).tolist()

print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

11253
11253
11253


In [223]:
# Dict creation
artworks_dict = one_to_one.set_index('artsy_id').to_dict('index')
len(artworks_dict)

11253

In [224]:
for id in check_id_list:
  print(artworks_dict[id])

{'name': "Wim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands", 'link': 'https://d32dm0rphc51dk.cloudfront.net/jlr236hnEO9JaBYOCLMDyQ/large.jpg', 'artist': "Wim RietveldWim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands, 1954MORENTZUS$1,275 ", 'price': 'US$1,275 ', 'parsing_date': '2022-04-19', 'updating_date': '2022-04-19'}
{'name': 'Relief Lamp 1', 'link': 'https://d32dm0rphc51dk.cloudfront.net/9kT5TdBiaJcvt9LTlCE_2A/large.jpg', 'artist': 'Kasper KjeldgaardRelief Lamp 1, 2017Patrick Parrish GallerySold ', 'price': 'Sold ', 'parsing_date': '2022-04-19', 'updating_date': '2022-04-19'}
{'name': 'Collective Gymnastics', 'link': 'https://d32dm0rphc51dk.cloudfront.net/luGGBqmQ2T_hhxJlXX1oSg/large.jpg', 'artist': 'Barbora ŽilinskaitėCollective Gymnastics, 2021Friedman BendaUS$22,000 ', 'price': 'US$22,000 ', 'parsing_date': '2022-04-19', 'updating_date': '2022-04-19'}
{'name': 'Simply Stainless Pedestal Stand', 'link': 'https://d32dm0rphc51dk.cloudfront.

#### expansion

In [None]:
#artworks_dict = open_dict(artwork_dict)

Columns with NaNs:
- 'name' (not always), 'year', 'gallery'
- ***one to one connection***
- info is in the **df**

In [62]:
# one to one connection from df
def artwork_dict_expan_oto(df, main_dict, param):
  group_df = df.groupby(['artsy_id', param])['category_id'].agg(['count']).reset_index().drop('count', axis=1)
  group_dict = group_df.set_index('artsy_id').to_dict('index')

  for k in main_dict:
    if k in group_dict.keys():
      main_dict[k].update(group_dict[k])
    else:
      main_dict[k].update({param: ' '}) # !!!!

# one to many connection from df
def artwork_dict_expan_otm(df, main_dict, param):
  group_df = df.groupby(['artsy_id'])[param].apply(list).apply(set).apply(list).reset_index(name=param)
  group_dict = group_df.set_index('artsy_id').to_dict('index')

  for k in main_dict:
    if k in group_dict.keys():
      main_dict[k].update(group_dict[k])
    else:
      main_dict[k].update({param: [' ']}) # !!!!

In [225]:
# name - !!!!!!
#artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'name')
# year
artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'year')
# gallery
artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'gallery')

In [226]:
for id in check_id_list:
  print('name:', artworks_dict[id]['name'], 'year:', artworks_dict[id]['year'], 'gallery:', artworks_dict[id]['gallery'])

name: Wim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands year: 1954 gallery: MORENTZ
name: Relief Lamp 1 year: 2017 gallery: Patrick Parrish Gallery
name: Collective Gymnastics year: 2021 gallery: Friedman Benda
name: Simply Stainless Pedestal Stand year: 2010 gallery:  
name: CHARLESTON Floor Lamp year: 1984 gallery: the Office.
name: Shelf 'Dune 04' year: 2007 gallery: David Gill Gallery
name: Another Land year: 2016 gallery: Patrick Parrish Gallery
name: Unusual Bench with Curved Frame year: ca. 1946 gallery: Maison Gerard
name: Circle Mirror with Black Chainmail year: 2018 gallery: Carvalho Park
name: SOAP Column (Toffee) year: 2018 gallery: Etage Projects


Columns with repeating values:
- 'gene_id', 'category_id'
- list type parametr - ***one to many connection***
- id info is in the **df**

In [227]:
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()

#artworks_dict = {k: v.update({'category_id': [cat[0]]}) for k, v in artworks_dict.items()} - doesn't work correctly
for k in artworks_dict:
  artworks_dict[k].update({'category_id': [cat[0]]})

# gene_id
artwork_dict_expan_otm(df_cat_dat_total, artworks_dict, 'gene_id')

In [228]:
for id in check_id_list:
  print('category_id:', artworks_dict[id]['category_id'], 'gene_id:', artworks_dict[id]['gene_id'])

category_id: ['12'] gene_id: ['12017', '12005']
category_id: ['12'] gene_id: ['12014', '12004']
category_id: ['12'] gene_id: ['12013', '12018']
category_id: ['12'] gene_id: ['12020', '12025']
category_id: ['12'] gene_id: ['12014', '12012']
category_id: ['12'] gene_id: ['12013', '12018']
category_id: ['12'] gene_id: ['12013', '12020', '12025']
category_id: ['12'] gene_id: ['12023', '12003', '12017']
category_id: ['12'] gene_id: ['12015']
category_id: ['12'] gene_id: ['12023', '12017']


In [229]:
save_dict(artworks_dict, '/artsy_genom_artwork_dict_cat12.txt', 'backup') # 1. Save json dict

#### artists name revision

In [None]:
#artworks_dict = open_dict(artwork_dict, 'live')
#print(len(artworks_dict))

215336


In [230]:
filtered_dict = {k: v for k, v in artworks_dict.items() if k == check_id_list[0]} # v['artist_id'] == '00047'
filtered_dict

{'5f51eb20e62c3100102ec722': {'artist': "Wim RietveldWim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands, 1954MORENTZUS$1,275 ",
  'category_id': ['12'],
  'gallery': 'MORENTZ',
  'gene_id': ['12017', '12005'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/jlr236hnEO9JaBYOCLMDyQ/large.jpg',
  'name': "Wim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands",
  'parsing_date': '2022-04-19',
  'price': 'US$1,275 ',
  'updating_date': '2022-04-19',
  'year': '1954'}}

In [231]:
# Old data
artists_name_list_old = [v['artist'] for v in artworks_dict.values()]

artists_name_list_old_ = list(set(artists_name_list_old))
print(len(artists_name_list_old_))

10905


In [232]:
sample(artists_name_list_old, 10)

['Bjarne MelgaardCabinett Man 1, 2007Galeria SendaPrice on request ',
 "Pia Maria Raeder'Sea Anemone' Side Table (Bright Yellow), 2016Galerie BSL€9,000 ",
 'ZanottaKarelia, 1966THEFOURTHSold ',
 'Humberto and Fernando CampanaNoah Bench, 2017Carpenters Workshop GalleryPrice on request ',
 'Sebastian ErrazurizDelta Chair, 2008Cristina Grajales Gallery ',
 'Cristian AndersenSeaside 1990 / Large, 2020Etage ProjectsPrice on request ',
 'Pierre JeanneretIndian Rosewood Desk, ca. 1958Peter Blake GallerySold ',
 'Roberto Giulio RidaSettimanile tall chest of drawers, 2014Nilufar Gallery ',
 'Astrid KroghIkat II, 2011"Crafted: Objects in Flux" at Museum of Fine Arts, Boston ',
 'Aksel Bender Madsen and Ejner LarsenDesk with leather top, 1966Dansk Møbelkunst GalleryPrice on request ']

In [233]:
for v in artworks_dict.values():
  v.update({'artist': v['artist'].replace(v['name'], '') if v['name']!=' ' else v['artist']})
  v.update({'artist': v['artist'].replace(v['year'], '') if v['year']!=' ' else v['artist']})
  v.update({'artist': v['artist'].replace(v['gallery'], '') if v['gallery']!=' ' else v['artist']})
  v.update({'artist': v['artist'].replace(v['price'], '') if v['price']!=' ' else v['artist']})
  v.update({'artist': 'Unknown' if len(v['artist'])==0 else v['artist']})
  v.update({'artist': v['artist'].strip(u'\u200b')})
  v.update({'artist': re.sub(r'\xa0|\n|\t', ' ', v['artist'])})
  v.update({'artist': v['artist'].strip()})
  v.update({'artist': v['artist'][:-1] if v['artist'][-1:]==',' else v['artist']})
  try:
    v.update({'artist': v['artist'][1:] if v['artist'][0]==',' else v['artist']})
  except:
    pass
  v.update({'artist': v['artist'].strip()})
  v.update({'artist': 'Unknown' if len(v['artist'])==0 else v['artist']})

In [234]:
filtered_dict = {k: v for k, v in artworks_dict.items() if k == check_id_list[0]} # v['artist_id'] == '00047'
filtered_dict

{'5f51eb20e62c3100102ec722': {'artist': 'Wim Rietveld',
  'category_id': ['12'],
  'gallery': 'MORENTZ',
  'gene_id': ['12017', '12005'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/jlr236hnEO9JaBYOCLMDyQ/large.jpg',
  'name': "Wim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands",
  'parsing_date': '2022-04-19',
  'price': 'US$1,275 ',
  'updating_date': '2022-04-19',
  'year': '1954'}}

In [235]:
# New data
artists_name_list_new = [v['artist'] for v in artworks_dict.values()]

artists_name_list_new_ = list(set(artists_name_list_new))
print(len(artists_name_list_new_))

2142


In [236]:
sample(artists_name_list_new, 10)

['Gary Magakis',
 'Samy Rio',
 'Charles and Ray Eames',
 'Poul Kjærholm',
 'Zhoujie Zhang',
 'Marco Iannicelli',
 'Unknown',
 'Finn Juhl',
 'Lukas Janitsch',
 'Frank Gehry']

In [237]:
artists_name_list_diff_1 = list(set(artists_name_list_old_) - set(artists_name_list_new_))
len(artists_name_list_diff_1)

10905

In [238]:
sample(artists_name_list_diff_1, 10)

["Mattia BonettiCoffee Table 'Yo-Yo', 2008David Gill GalleryPrice on request ",
 'Tanya AguiñigaMonadnock, 2013Volume Gallery ',
 'Nanna Ditzel"Ring" Chair by Nanna Ditzel, 1950-1959Almond & Co.Sold ',
 'Brian ThoreenMixed Marble Coffee Table - Green, 2015Patrick Parrish GalleryPrice on request ',
 'Benjamin Rollins CaldwellFencing Side Chair, 2011 ',
 'Amanda RichardsPrism Sconce ( Double ), 2017Tuleste FactoryUS$16,000 ',
 'Albert PaleyComet Table Lamp, 2005-2010Wexler GalleryUS$9,500 ',
 'Henry KrokatsisUntitled (Braitrim 01), 2010Vigo Gallery£15,400 ',
 'Paul CocksedgeSlump Rock Coffee Table 2, 2019Carpenters Workshop GalleryPrice on request ',
 'Egg CollectiveHaynes Mirror - Round, ContemporaryEgg CollectivePrice on request ']

In [239]:
artists_name_list_diff_2 = list(set(artists_name_list_new_) - set(artists_name_list_old_))
len(artists_name_list_diff_2)

2142

In [240]:
sample(artists_name_list_diff_2, 10)

['Piero Bottoni',
 'Carlo Zen',
 ':mentalKLINIK',
 'Fredrik Paulsen',
 'Nick Weddell',
 'Council',
 'André Dubreuil',
 'Fabio Hendry',
 'Hank Willis Thomas',
 'Ettore Sottsass, Memphis Group']

In [241]:
not_title_name = []
for n in artists_name_list_new:
  if n[0].isupper()==False and n[0].isdigit()==False: # n.istitle() doesn't work correctly
    not_title_name.append(n)
not_title_name = [n for n in not_title_name if n != 'unknown']
not_title_name = list(set(not_title_name))
len(not_title_name)

14

In [242]:
sample(not_title_name, 14) # 10

['nendo,',
 'wells coates',
 'david/nicolas',
 'bahraini—danish',
 'antoine phillippon',
 'gt2P',
 'nendo',
 ':mentalKLINIK',
 'chmara.rosinke',
 'elias sequra pasquale',
 'touche—touche',
 'gerrit rietveldt',
 'morito ebine',
 '&Tradition']

In [None]:
# !!!!!!!!!

In [243]:
not_title_name

['gerrit rietveldt',
 ':mentalKLINIK',
 'gt2P',
 'david/nicolas',
 'bahraini—danish',
 'chmara.rosinke',
 'nendo',
 'antoine phillippon',
 'wells coates',
 'nendo,',
 'elias sequra pasquale',
 '&Tradition',
 'morito ebine',
 'touche—touche']

In [252]:
n = not_title_name[11]
n
# ', Florence'
# ', Venice'
# 'nendo,'

'&Tradition'

In [245]:
for l in n:
  print(l, l==',')
print(n[-1]==',') # 0

n False
e False
n False
d False
o False
, True
True


In [253]:
filtered_dict = {k: v for k, v in artworks_dict.items() if v['artist']=='&Tradition'}
len(filtered_dict)

3

In [254]:
filtered_dict

{'5fe1f9d309672600114e2bfa': {'artist': '&Tradition',
  'category_id': ['12'],
  'gallery': 'THEFOURTH',
  'gene_id': ['12020', '12025'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/sra72EpyMQR11YmeYACbfw/large.jpg',
  'name': 'Lato LN9 Table ',
  'parsing_date': '2022-04-19',
  'price': '€860 ',
  'updating_date': '2022-04-19',
  'year': '2010'},
 '5fe1fd20dca897001014a1eb': {'artist': '&Tradition',
  'category_id': ['12'],
  'gallery': 'THEFOURTH',
  'gene_id': ['12004'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/pZp33YMFbCaZphlb7ptG6A/large.jpg',
  'name': 'Flowerpot VP7 Pendant ',
  'parsing_date': '2022-04-19',
  'price': '€670 ',
  'updating_date': '2022-04-19',
  'year': '1968'},
 '5fe302deb114a000129e5495': {'artist': '&Tradition',
  'category_id': ['12'],
  'gallery': 'THEFOURTH',
  'gene_id': ['12020', '12025'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/qAcfIoVZ_l6CybIGW-mzQw/large.jpg',
  'name': 'Shuffle Table ',
  'parsing_date': '2022-04-19',
  'price':

In [247]:
filtered_dict = {k: v for k, v in artworks_dict.items() if v['artist']=='nendo,'}
len(filtered_dict)

1

In [248]:
filtered_dict

{'59f065077622dd4d9e4b6418': {'artist': 'nendo,',
  'category_id': ['12'],
  'gallery': 'Sèvres Porcelain Manufactory',
  'gene_id': ['12013', '12008', '12025'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/UiOTOwq7gJ5PGDfQYRLwjA/large.jpg',
  'name': 'Hasu',
  'parsing_date': '2022-04-19',
  'price': 'Price on request ',
  'updating_date': '2022-04-19',
  'year': '2017'}}

In [249]:
for k in filtered_dict.keys():
  print(k)

59f065077622dd4d9e4b6418


In [250]:
artworks_dict['59f065077622dd4d9e4b6418'].update({'artist': 'nendo'}) # Unknown
artworks_dict['59f065077622dd4d9e4b6418']

{'artist': 'nendo',
 'category_id': ['12'],
 'gallery': 'Sèvres Porcelain Manufactory',
 'gene_id': ['12013', '12008', '12025'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/UiOTOwq7gJ5PGDfQYRLwjA/large.jpg',
 'name': 'Hasu',
 'parsing_date': '2022-04-19',
 'price': 'Price on request ',
 'updating_date': '2022-04-19',
 'year': '2017'}

In [None]:
# !!!!!!!!!!!!!

In [251]:
real_not_title_name = {}
for n in not_title_name:
  filt_dict_1 = {k: v for k, v in artworks_dict.items() if v['artist'] == n}
  new_n_list = list(set([v['name'] + v['artist'] for v in filt_dict_1.values()]))
  for n_n in new_n_list:
    filt_dict_2 = {k: v for k, v in artworks_dict.items() if v['artist'] == n_n}
    if len(filt_dict_2)>0:
      real_not_title_name.update({n: n_n})

print(len(real_not_title_name))

0


In [None]:
# !!!!!!!!!!!!!!

In [106]:
real_not_title_name
# 'n and William Ladd' must be 'Steven and William Ladd' (name: Steve)
# 'horsten Kirchhoff' must be 'Thorsten Kirchhoff' (name: T)

{'n and William Ladd': 'Steven and William Ladd'}

In [107]:
filtered_dict = {k: v for k, v in artworks_dict.items() if v['artist'] in real_not_title_name.keys()}
filtered_dict

{'56296c0a726169642d000160': {'artist': 'n and William Ladd',
  'category_id': ['10'],
  'gallery': 'Saint Louis Art Museum',
  'gene_id': ['10005'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/ywNOVSVs7bot7KElvh_TBw/large.jpg',
  'name': 'Steve',
  'parsing_date': '2022-04-19',
  'price': ' ',
  'updating_date': '2022-04-19',
  'year': '2014'},
 '564b5bf04b848064aa0000e8': {'artist': 'n and William Ladd',
  'category_id': ['10'],
  'gallery': 'Mingei International Museum',
  'gene_id': ['10005'],
  'link': 'https://d32dm0rphc51dk.cloudfront.net/awaOMMtcpWF8vgt_3MKpMA/large.jpg',
  'name': 'Steve',
  'parsing_date': '2022-04-19',
  'price': ' ',
  'updating_date': '2022-04-19',
  'year': '2012'}}

In [108]:
for v in artworks_dict.values():
  if v['artist'] in real_not_title_name.keys():
    v.update({'artist': real_not_title_name[v['artist']]})
    v.update({'name': 'Untitled'})

In [109]:
for id in filtered_dict.keys():
  print(artworks_dict[id])

{'name': 'Untitled', 'link': 'https://d32dm0rphc51dk.cloudfront.net/ywNOVSVs7bot7KElvh_TBw/large.jpg', 'artist': 'Steven and William Ladd', 'price': ' ', 'parsing_date': '2022-04-19', 'updating_date': '2022-04-19', 'year': '2014', 'gallery': 'Saint Louis Art Museum', 'category_id': ['10'], 'gene_id': ['10005']}
{'name': 'Untitled', 'link': 'https://d32dm0rphc51dk.cloudfront.net/awaOMMtcpWF8vgt_3MKpMA/large.jpg', 'artist': 'Steven and William Ladd', 'price': ' ', 'parsing_date': '2022-04-19', 'updating_date': '2022-04-19', 'year': '2012', 'gallery': 'Mingei International Museum', 'category_id': ['10'], 'gene_id': ['10005']}


In [None]:
# !!!!!!!!!!!!!

In [255]:
for v in artworks_dict.values():
  v.update({'artist': v['artist'].title()}) # bad for not names

In [256]:
# New data
artists_name_list_new = [v['artist'] for v in artworks_dict.values()]

artists_name_list_new_ = list(set(artists_name_list_new))
print(len(artists_name_list_new_))

2141


In [257]:
sample(artists_name_list_new_, 10)

['Tobia Scarpa',
 'Honma Kazuaki',
 'Kenzo Tange',
 'Eric Shaw',
 'Andrea Blum',
 'Geraldo De Barros',
 'Alexandre Chapelin',
 'Stephen Chase',
 'Francesca Dimattio',
 'Adalberto Dal Lago']

In [258]:
#save_dict(artworks_dict, artwork_dict, 'live') # 1. Save json dict
save_dict(artworks_dict, '/artsy_genom_artwork_dict_cat12.txt', 'backup') # 1. Save json dict

### artists_dict

#### initiation

In [None]:
#artists_dict = {'000000': {'name': 'Jacopo Bassano', 
#                           'artwork_id: ['000000', ''], # Portrait of a Man in Armour
#                           -- 'gallery_id': ['000000', ''], # Robilant+Voena
#                           'gene_id': ['21603', ''],
#                           'category_id': ['03', '']}}

In [None]:
artworks_dict = open_dict(artwork_dict, 'live')
print(len(artworks_dict))

8094


In [None]:
artists_name_list = list(set([v['artist'] for v in artworks_dict.values()]))
print(len(artists_name_list))

1176


In [None]:
# artist_id creation
at_id_len = len(str(len(artists_name_list)))
at_id_list = []
c = 1
for a in artists_name_list:
  id = '0'*(at_id_len-len(str(c)))+str(c)
  at_id_list.append(id)
  c += 1

print(len(at_id_list))
print(len(artists_name_list))

1176
1176


***One to one connection***

In [None]:
# Dict creation
artists_dict = pd.DataFrame({'id': at_id_list, 'name': artists_name_list}).set_index('id').to_dict('index')
len(artists_dict)

1176

In [None]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'R.C. Gorman'}
artists_dict_samp

{'0468': {'name': 'R.C. Gorman'}}

##### add 'artist_id'  in artworks_dict

In [None]:
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]

for value in artworks_dict.values():
  if value['artist'] in values_list:
    value.update({'artist_id': keys_list[values_list.index(value['artist'])]})
  else:
    value.update({'artist_id': ' '})

In [None]:
artworks_dict['623f201d30b258000cc3053a']

{'artist': 'Tuukka Tammisaari',
 'artist_id': '33527',
 'category_id': ['03'],
 'gallery': 'Kristof De Clercq',
 'gene_id': ['03056', '03013', '03090'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/hrEVTH4ekjqm_iKE1HStzg/large.jpg',
 'name': 'Untitled',
 'parsing_date': '2022-04-01',
 'price': '€6,600 ',
 'updating_date': '2022-04-01',
 'year': '2021'}

In [None]:
artists_dict['33527']

{'name': 'Tuukka Tammisaari'}

In [None]:
# Replacing
#artworks_dict = {k: artworks_dict[k] for k in artworks_dict.keys() - {'artist'}}
#artworks_dict['623f201d30b258000cc3053a']

In [None]:
save_dict(artworks_dict, artwork_dict, 'live') # 1. Save json dict

#### expansion

with 'artwork_id', 'gene_id' and 'category_id'
- ***one to many connection***
- id info is in the artists_dict

In [None]:
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()

#artists_dict = {k: v.update({'category_id': [cat[0]]}) for k, v in artists_dict.items()}
for k in artists_dict:
  artists_dict[k].update({'category_id': [cat[0]]})

In [None]:
# one to many connection from df
def artist_dict_expan_otm(param):
  group_df = df_cat_dat_total.groupby(['artist'])[param].apply(list).apply(set).apply(list).reset_index(name=param)
  group_dict = group_df.set_index('artist').to_dict('index')

  for k, v in artists_dict.items():
    if v['name'] in group_dict.keys():
      artists_dict[k].update(group_dict[v['name']])
    else:
      artists_dict[k].update({param: [' ']})

In [None]:
# gene_id
artist_dict_expan_otm('gene_id')

In [None]:
# artsy_id
artist_dict_expan_otm('artsy_id')

In [None]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Jacopo Bassano'}
artists_dict_samp

In [None]:
save_dict(artists_dict, artist_dict, 'live') # 1. Save json dict

##### correction - not regular action

In [None]:
#artworks_dict = open_dict(artwork_dict, 'live')
#print(len(artworks_dict))
# 215336

215336


In [None]:
def artist_dict_expan(param):
  param_dict = {}
  for v in artworks_dict.values():
    if v['artist_id'] not in param_dict.keys():
      param_dict.update({v['artist_id']: v[param]})
    else:
      v_new = list(set(param_dict[v['artist_id']] + v[param]))
      param_dict.update({v['artist_id']: v_new})
  print(len(param_dict))

  for k in artists_dict:
    if k in param_dict.keys():
      artists_dict[k].update({param: param_dict[k]})
    else:
      artists_dict[k].update({param: ' '})
  print(len(artists_dict))

In [None]:
artist_dict_expan('category_id')
artist_dict_expan('gene_id')

36975
36975
36975
36975


In [None]:
#'artsy_id'

artsy_id_dict = {}
for k, v in artworks_dict.items():
  if v['artist_id'] not in artsy_id_dict.keys():
    artsy_id_dict.update({v['artist_id']: [k]})
  else:
    v_new = list(set(artsy_id_dict[v['artist_id']] + [k]))
    artsy_id_dict.update({v['artist_id']: v_new})
print(len(artsy_id_dict))

for key in artists_dict:
  if key in artsy_id_dict.keys():
    artists_dict[key].update({'artsy_id': artsy_id_dict[key]})
  else:
    artists_dict[key].update({'artsy_id': ' '})
print(len(artists_dict))

36975
36975


In [None]:
artists_dict['33527']

{'artsy_id': ['5cfb7cb33741d9000f585a5e',
  '5dab31b68291a100122e8426',
  '5dab3142ff80ed000d869223',
  '5de11d7934dce00012f2186e',
  '5e26bfeee2b26b000dd2d623',
  '6056178c0d266100129a121b',
  '5de0f3458d9e2a00127034f0',
  '623f217a6d2c31000b9603b4',
  '604ca1dfb66fe900111318de',
  '5de140b78d19ce0012ce2e2b',
  '623f2108b8401b000c745e13',
  '623f201d30b258000cc3053a',
  '5de14c1b7fe05700136f442e',
  '5dab33258110c4000e2bf1ab',
  '5b3cd2cf139b21230e9f297b',
  '5dab308fb8e1a20011c39112',
  '5de11db47b22d6001202a6df',
  '5de1274647800700116a0973',
  '5de126b190b4bf0012548c16',
  '5de11e017b22d6001202a6f2',
  '5cfb7e2ed989710012310932',
  '5de1410a8533dd000ed36e8f',
  '60560b53bb73b30011a8b3f6',
  '5de13f1d8533dd00120eeede',
  '5de127ab34dce0000e3ca625',
  '5de12731f574de000d808f01',
  '623f22356eee8f000db6c352',
  '5de14bea7fe057000f189094',
  '5de14bc042f3b30012089d16',
  '5de0fd0456a016000e1f3a8a',
  '5dab341eea2ef7000f6a01dc',
  '623f20555f2256000dc92b3a',
  '5dab3278301916000e790690'

In [None]:
len(artists_dict['33527']['artsy_id'])

50

In [None]:
filtered_dict = {k: v for k, v in artworks_dict.items() if v['artist_id'] == '33527'}
len(filtered_dict)

50

In [None]:
save_dict(artists_dict, artist_dict, 'live') # 1. Save json dict

#### addition

In [259]:
#artworks_dict = open_dict('/artsy_genom_artwork_dict_cat08.txt', 'backup')
print(len(artworks_dict))

11253


In [260]:
artists_dict = open_dict(artist_dict, 'live')
len(artists_dict)

46682

In [261]:
artists_samp = sample(list(artists_dict.keys()), 3)
for id in artists_samp:
  print(artists_dict[id])

{'name': 'Mark Frygell', 'category_id': ['03', '08', '06'], 'gene_id': ['03117', '03090', '08056', '06075'], 'artsy_id': ['61fbe302c320e7000d3cb612']}
{'name': 'Moisès Villèlia', 'category_id': ['06', '05'], 'gene_id': ['05015', '05007', '06054', '06012'], 'artsy_id': ['5f89e66b3dad4b00145cf188', '5f919d6a45d834001538c3bb']}
{'name': 'Joan Konkel', 'category_id': ['06', '03', '09', '02', '13', '05'], 'gene_id': ['09001', '03248', '03012', '05051', '05041', '06097', '05074', '06086', '02103', '06137', '02102', '03010', '03071', '03008', '03221', '13005', '13006', '05049', '05020'], 'artsy_id': ['57ffb52d9c18db2c5c00145f', '5435950e7261690710bc0400', '5f6e2c70231378000eee0ddc', '62378f82c45031000c0bde6c', '622909efb28924000bf7b130', '59e67c61c9dc24387d0e7f67', '543571e972616942cef50300', '53d3dbd97261692d39aa0000']}


In [262]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Moisès Villèlia'}
artists_dict_samp

{'01617': {'artsy_id': ['5f89e66b3dad4b00145cf188',
   '5f919d6a45d834001538c3bb'],
  'category_id': ['06', '05'],
  'gene_id': ['05015', '05007', '06054', '06012'],
  'name': 'Moisès Villèlia'}}

In [118]:
def artist_dict_add(param):
  param_dict = {}

  if param == 'artsy_id':
    for k, v in artworks_dict.items():
      if v['artist'] not in param_dict.keys():
        param_dict.update({v['artist']: [k]})
      else:
        v_new = list(set(param_dict[v['artist']] + [k]))
        param_dict.update({v['artist']: v_new})
  
  else:
    for v in artworks_dict.values():
      if v['artist'] not in param_dict.keys():
        param_dict.update({v['artist']: v[param]})
      else:
        v_new = list(set(param_dict[v['artist']] + v[param]))
        param_dict.update({v['artist']: v_new})

  return param_dict

In [263]:
# Dict updating
# {'id': {'artsy_id': [''], 'category_id': [''], 'gene_id': [''], 'name': ''}}

# Old dict data
c = len(artists_dict)+1 # for id creation
id_list = list(artists_dict.keys()) # list of str
name_list = [v['name'] for v in artists_dict.values()] # list of str

# New artworks_dict data
artists_list = list(set([v['artist'] for v in artworks_dict.values()]))
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()
# gene_id
gene_dict = artist_dict_add('gene_id')
# artsy_id
artwork_dict = artist_dict_add('artsy_id')

# New dict
new_artists_id = []
cross_artists_id = []
for name in artists_list:
  if name not in name_list:
    id = '0'*(artist_id_len-len(str(c)))+str(c)
    new_artists_id.append(id)
    artists_dict.update({id: {'artsy_id': artwork_dict[name], 'category_id': [cat[0]], 'gene_id': gene_dict[name], 'name': name}})
    c += 1
  else:
    id = id_list[name_list.index(name)]
    cross_artists_id.append(id)
    new_cat = list(set(artists_dict[id]['category_id'] + cat))
    artists_dict[id].update({'category_id': new_cat})
    new_gene = list(set(artists_dict[id]['gene_id'] + gene_dict[name]))
    artists_dict[id].update({'gene_id': new_gene})
    new_artwork = list(set(artists_dict[id]['artsy_id'] + artwork_dict[name]))
    artists_dict[id].update({'artsy_id': new_artwork})

print(len(artists_dict))
print(len(new_artists_id))
print(len(cross_artists_id))

46737
476
1665


In [264]:
artists_dict[new_artists_id[len(new_artists_id)//2]]

{'artsy_id': ['60f7168e6c1d5d000f9c3b01'],
 'category_id': ['12'],
 'gene_id': ['12017', '12005'],
 'name': 'Philippe Starck, Eugeni Quitllet'}

In [265]:
artists_dict[cross_artists_id[len(cross_artists_id)//2]]

{'artsy_id': ['5d88b03e8ff6a400106b0012', '609a9807552ec600136da453'],
 'category_id': ['12', '09'],
 'gene_id': ['12018', '09018', '12024', '12005', '12017'],
 'name': 'Preben Fabricius And Jørgen Kastholm'}

In [266]:
save_dict(artists_dict, artist_dict, 'live') # 1. Save json dict

##### add 'artist_id'  in artworks_dict

In [267]:
len(artworks_dict)

11253

In [268]:
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]

for value in artworks_dict.values():
  if value['artist'] in values_list:
    value.update({'artist_id': keys_list[values_list.index(value['artist'])]})
  else:
    value.update({'artist_id': ' '})

In [269]:
artworks_dict[check_id_list[0]]

{'artist': 'Wim Rietveld',
 'artist_id': '42412',
 'category_id': ['12'],
 'gallery': 'MORENTZ',
 'gene_id': ['12017', '12005'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/jlr236hnEO9JaBYOCLMDyQ/large.jpg',
 'name': "Wim Rietveld and W.H. Gispen Blue '205' Chairs for Kembo, Netherlands",
 'parsing_date': '2022-04-19',
 'price': 'US$1,275 ',
 'updating_date': '2022-04-19',
 'year': '1954'}

In [270]:
artists_dict[artworks_dict[check_id_list[0]]['artist_id']]

{'artsy_id': ['5f51eb20e62c3100102ec722'],
 'category_id': ['12', '09'],
 'gene_id': ['12005', '12017', '09018'],
 'name': 'Wim Rietveld'}

In [None]:
# Replacing
#artworks_dict = {k: artworks_dict[k] for k in artworks_dict.keys() - {'artist'}}
#artworks_dict['623f201d30b258000cc3053a']

In [271]:
#save_dict(artworks_dict, artwork_dict, 'live') # 1. Save json dict
save_dict(artworks_dict, '/artsy_genom_artwork_dict_cat12.txt', 'backup') # 1. Save json dict

### artworks_dict addition

In [273]:
artworks_dict_new = open_dict('/artsy_genom_artwork_dict_cat12.txt', 'backup')
print(len(artworks_dict_new))
artworks_dict = open_dict(artwork_dict, 'live')
print(len(artworks_dict))

11253
447649


Let's divide new dict ids data on two parts: artsy_id is already in the dict and not.

In [274]:
old_id_list = list(artworks_dict.keys()) # list of str

# New df data

new_id_list = list(artworks_dict_new.keys()) # list of str
cat = [v['category_id'] for v in artworks_dict_new.values()][0] # list of lists with str
print('New category:', cat[0])
print('')

new_artwork_id = list(set(new_id_list) - set(old_id_list))
cross_artwork_id = list(set(old_id_list) & set(new_id_list))
print('New id count:', len(new_artwork_id))
print('Cross id count:', len(cross_artwork_id))
print('Check the division:', len(new_id_list)==len(new_artwork_id)+len(cross_artwork_id))
print('')

artworks_dict_total_len = len(old_id_list) + len(new_artwork_id)
print('New dict length must be:', artworks_dict_total_len)

New category: 12

New id count: 2925
Cross id count: 8328
Check the division: True

New dict length must be: 450574


#### cross_artwork_id updating

Name, link and artist of artsy_id that is already in the dict shoild be the same. Let's check it.

In [275]:
dif_name = []
dif_link = []
dif_artist = []

for id in cross_artwork_id:
  if artworks_dict[id]['name'] != artworks_dict_new[id]['name']:
    dif_name.append(id)
  if artworks_dict[id]['link'] != artworks_dict_new[id]['link']:
    dif_link.append(id)
  if artworks_dict[id]['artist'] != artworks_dict_new[id]['artist']:
    dif_artist.append(id)

print('Cross id count:', len(cross_artwork_id))
print(len(dif_name), 'ids have different name')
print(len(dif_link), 'ids have different link')
print(len(dif_artist), 'ids have different artist')

Cross id count: 8328
0 ids have different name
1 ids have different link
128 ids have different artist


In [132]:
for id in sample(dif_name, 6): # 10
  print(id, ':', artworks_dict[id]['name'], '->', artworks_dict_new[id]['name'])

61f2f30591e3e8000cf2cfc5 : All is Fine -> Everything is Going to be Fine
6256297628165e000e237715 : Reading Can Seriously Damage Your Innocence -> Reading Can Seriously Damage Your Ignorance
54b0f7d27261695a02cd0500 : Kanata He (Far Beyond) I -> Kanata He (Far Beyond)
621e0719ace6f6000ca0c3d1 : Untitled -> A
618ad3108253e8000dd9484c : She is in the Bathroom -> A Room of Her Own
615359daef467f000b74265e : Tired -> Tired Shopper


In [276]:
for id in sample(dif_artist, 10):
  print(id, ':', artworks_dict[id]['artist'], '->', artworks_dict_new[id]['artist'])

5d889871fdf95f0012048daf : Aksel Bender Madsen and Ejner Larsen -> Aksel Bender Madsen And Ejner Larsen
534dd6688b3b81b6300001c0 : Roy McMakin -> Roy Mcmakin
51a5d3748b3b816a7b00039e : André Monpoix and Alain Richard -> André Monpoix And Alain Richard
54adb7567261695c37410700 : Gerald McCabe -> Gerald Mccabe
5c4788dc2ebe312aada594bb : KAWS -> Kaws
519784748b3b81132d000038 : Geneviève Dangles and Christian Defrance -> Geneviève Dangles And Christian Defrance
5a1a03918b3b810ef3b3ead7 : Hvidt and Mølgaard Nielsen -> Hvidt And Mølgaard Nielsen
609a8debcadb240013aacc18 : Hvidt and Mølgaard Nielsen -> Hvidt And Mølgaard Nielsen
56373b6e7261696fb7000746 : Alberto Biagetti and Laura Baldassari -> Alberto Biagetti And Laura Baldassari
61852613cff7d0000c6f043e : Guillerme et Chambron -> Guillerme Et Chambron


In [277]:
for id in sample(dif_link, 1): # 10
  print(id, ':', artworks_dict[id]['link'], '->', artworks_dict_new[id]['link'])

624c243c1921db000e9153af : https://d32dm0rphc51dk.cloudfront.net/_I0WSM1SSE9Ub72ShrMnMQ/large.jpg -> https://d32dm0rphc51dk.cloudfront.net/QiE6n5L-JpRKf1Q82cDStw/large.jpg


In [None]:
# excaption
artworks_dict['62280e48bd9211000cc4b47d'].update({'link': 'https://d32dm0rphc51dk.cloudfront.net/S9Gn8hpcc24pnRBCrghPwA/large.jpg'})

!!! But all of them can be different. At the same time, it's seen that changes aren't dromatic -> So, not good, but we won't change or add them.

It would be better for checking the info if I got artwork's page link...

And later we'll clean the artists_dict.

Also we won't change or add different price, year and gallery. In fact, we just add the category_id and gene_id info.

In [278]:
# Dict updating

# {'artsy_id': {'artist': ' ', 'artist_id': ' ', 'category_id': [''], 'gallery': ' ', 'gene_id': [''], 
#               'link': ' ', 'name': ' ', 'parsing_date': '', 'price': ' ', 'updating_date': '', 'year': ' '}}

for id in cross_artwork_id:
  new_cat = list(set(artworks_dict[id]['category_id'] + cat))
  artworks_dict[id].update({'category_id': new_cat})
  new_gene = list(set(artworks_dict[id]['gene_id'] + artworks_dict_new[id]['gene_id']))
  artworks_dict[id].update({'gene_id': new_gene})

print(len(artworks_dict))

447649


In [279]:
# cross_artwork_id sample
samp_id = sample(cross_artwork_id, 1)[0]
print(samp_id)
artworks_dict_new[samp_id]

61547a659e1628000c474a79


{'artist': 'Angelo Lelii',
 'artist_id': '41239',
 'category_id': ['12'],
 'gallery': 'Dimoregallery',
 'gene_id': ['12014', '12016'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/ac-tQDovR1gplNHBBDY72Q/large.jpg',
 'name': 'Pair of wall lights by Angelo Lelii',
 'parsing_date': '2022-04-19',
 'price': '€6,750–€7,750 ',
 'updating_date': '2022-04-19',
 'year': '1960’s'}

In [280]:
artworks_dict[samp_id]

{'artist': 'Angelo Lelii',
 'artist_id': '41239',
 'category_id': ['12', '04'],
 'gallery': 'Dimoregallery',
 'gene_id': ['12014', '04016', '12016'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/ac-tQDovR1gplNHBBDY72Q/large.jpg',
 'name': 'Pair of wall lights by Angelo Lelii',
 'parsing_date': '2022-04-08',
 'price': '€6,750–€7,750 ',
 'updating_date': '2022-04-08',
 'year': '1960’s'}

#### new_artwork_id addition

In [281]:
# {'artsy_id': {'artist': ' ', 'artist_id': ' ', 'category_id': [''], 'gallery': ' ', 'gene_id': [''], 
#               'link': ' ', 'name': ' ', 'parsing_date': '', 'price': ' ', 'updating_date': '', 'year': ' '}}

for id in new_artwork_id:
  artworks_dict.update({id: {'artist': artworks_dict_new[id]['artist'], 'artist_id': artworks_dict_new[id]['artist_id'], 'category_id': artworks_dict_new[id]['category_id'], 'gallery': artworks_dict_new[id]['gallery'], 'gene_id': artworks_dict_new[id]['gene_id'], 'link': artworks_dict_new[id]['link'], 'name': artworks_dict_new[id]['name'], 'parsing_date': artworks_dict_new[id]['parsing_date'], 'price': artworks_dict_new[id]['price'], 'updating_date': artworks_dict_new[id]['updating_date'], 'year': artworks_dict_new[id]['year']}})

print('New dict length must be:', artworks_dict_total_len)
print('New dict length is:', len(artworks_dict))

New dict length must be: 450574
New dict length is: 450574


In [282]:
# new_artwork_id sample
samp_id = sample(new_artwork_id, 1)[0]
print(samp_id)
artworks_dict_new[samp_id]

611659b36c3e63000d562178


{'artist': 'Aniela Ovadiuc',
 'artist_id': '40872',
 'category_id': ['12'],
 'gallery': 'Galeria Senso',
 'gene_id': ['12014', '12012'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/Xc3mksb8wMMkGxWBYYF_Pw/large.jpg',
 'name': 'The Drop Series – “The Beneficial Harm”',
 'parsing_date': '2022-04-19',
 'price': '€1,470 ',
 'updating_date': '2022-04-19',
 'year': '2019'}

In [283]:
artworks_dict[samp_id]

{'artist': 'Aniela Ovadiuc',
 'artist_id': '40872',
 'category_id': ['12'],
 'gallery': 'Galeria Senso',
 'gene_id': ['12014', '12012'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/Xc3mksb8wMMkGxWBYYF_Pw/large.jpg',
 'name': 'The Drop Series – “The Beneficial Harm”',
 'parsing_date': '2022-04-19',
 'price': '€1,470 ',
 'updating_date': '2022-04-19',
 'year': '2019'}

In [284]:
save_dict(artworks_dict, artwork_dict, 'live') # 1. Save json dict

### artists_dict cleaning

In [285]:
dif_artist_id = []
c = 0
for id in cross_artwork_id:
  if artworks_dict[id]['artist'] != artworks_dict_new[id]['artist']:
    dif_artist_id.append(artworks_dict_new[id]['artist_id'])
    dif_artist_id = list(set(dif_artist_id))
    c += 1

print(c, 'ids have different artist')
print(len(dif_artist_id), 'artists are exstra new')

128 ids have different artist
61 artists are exstra new


In [286]:
#artists_dict = open_dict(artist_dict, 'live')
len(artists_dict)

46737

In [287]:
# Replacing
print(len(artists_dict) - len(dif_artist_id))
artists_dict = {k: v for k, v in artists_dict.items() if k not in dif_artist_id}
print(len(artists_dict))

46676
46676


In [288]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Matt Connors'}
artists_dict_samp

{'21007': {'artsy_id': ['61fe8114b9ff6c000b32bbb6',
   '622cb6e45463fe000bb811af'],
  'category_id': ['03', '05', '02', '06', '08'],
  'gene_id': ['02155', '08049', '03008', '06045', '05011', '06175'],
  'name': 'Matt Connors'}}

In [289]:
save_dict(artists_dict, artist_dict, 'live') # 1. Save json dict

### genes_dict - at the very end

#### expansion

In [11]:
genes_dict = open_dict(gen_dict, 'live') # 2. Open json dict
genes_dict_old = open_dict(gen_dict, 'live') # 2. Open json dict
len(genes_dict)

1032

In [12]:
artists_dict = open_dict(artist_dict, 'live') # 2. Open json dict
len(artists_dict)

46676

In [13]:
artworks_dict = open_dict(artwork_dict, 'live') # 2. Open json dict
len(artworks_dict)

450574

In [15]:
genes_dict_old['bad_painting']

{'artist_id': ['10640', '05253'],
 'artsy_id': ['54f0c97a7261693fa3ea1400',
  '553a8e737261697635630000',
  '54f0c97c726169731f9f1400',
  '553a8ea872616909c03a0000',
  '553a8e61726169623a4f0000',
  '553be2df7261692b035d0400'],
 'category_id': '03',
 'id': '03001',
 'link': '/gene/bad-painting',
 'name': '"Bad Painting"',
 'parsing_date': '2022-03-29',
 'updating_date': '2022-03-29'}

Update genes_dict with 'artist_id' and 'artwork_id':
- list type parametr - ***one to many connection***
- id info is in the **dict**

In [16]:
artist_id_list_old = genes_dict_old['bad_painting']['artist_id']
artsy_id_list_old = genes_dict_old['bad_painting']['artsy_id']

In [17]:
#param = 'artist_id'

for v in genes_dict.values():
  v.update({'artist_id': []})
  for key, value in artists_dict.items():
    if v['id'] in value['gene_id']:
      v_new = list(set(v['artist_id'] + [key]))
      v.update({'artist_id': v_new})

print(len(genes_dict))

1032


In [18]:
#param = 'artsy_id'

for v in genes_dict.values():
  v.update({'artsy_id': []})
  for key, value in artworks_dict.items():
    if v['id'] in value['gene_id']:
      v_new = list(set(v['artsy_id'] + [key]))
      v.update({'artsy_id': v_new})

print(len(genes_dict))

1032


In [23]:
artist_id_list_new = genes_dict['bad_painting']['artist_id']
artsy_id_list_new = genes_dict['bad_painting']['artsy_id']

print(list(set(artist_id_list_new))==list(set(artist_id_list_old)))
print(list(set(artsy_id_list_new))==list(set(artsy_id_list_old)))

False
True


In [20]:
genes_dict['bad_painting']
#'William Nelson Copley'
#'Joan Brown'

{'artist_id': ['25055', '21189'],
 'artsy_id': ['553be2df7261692b035d0400',
  '553a8ea872616909c03a0000',
  '54f0c97a7261693fa3ea1400',
  '54f0c97c726169731f9f1400',
  '553a8e737261697635630000',
  '553a8e61726169623a4f0000'],
 'category_id': '03',
 'id': '03001',
 'link': '/gene/bad-painting',
 'name': '"Bad Painting"',
 'parsing_date': '2022-03-29',
 'updating_date': '2022-03-29'}

In [26]:
artists_dict['25055']

{'artsy_id': ['553a8ea872616909c03a0000',
  '54f0c97a7261693fa3ea1400',
  '5da7b49cf52b8c00129b9694',
  '54f0c97c726169731f9f1400',
  '553a8e61726169623a4f0000',
  '553a8e737261697635630000'],
 'category_id': ['01', '03'],
 'gene_id': ['03001', '01005'],
 'name': 'Joan Brown'}

In [27]:
save_dict(genes_dict, gen_dict, 'live') # 1. Save json dict

#### addition

In [None]:
genes_dict = open_dict(gen_dict, 'live') # 2. Open json dict
len(genes_dict)

1032

In [None]:
genes_dict['abstract_landscape']

{'artist_id': ' ',
 'artsy_id': ' ',
 'category_id': '02',
 'id': '02001',
 'link': '/gene/abstract-landscape',
 'name': 'Abstract Landscape',
 'parsing_date': '2022-03-29',
 'updating_date': '2022-03-29'}

In [None]:
# artsy_id
group_df = df_cat_dat_total.groupby(['gene_id'])['artsy_id'].apply(list).apply(set).apply(list).reset_index(name='artsy_id')
group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])

In [None]:
# artist_id
artists_dict = open_dict(artist_dict)
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]

group_df = df_cat_dat_total.groupby(['gene_id'])['artist'].apply(list).apply(set).apply(list).reset_index(name='artist')
group_df['artist_id'] = group_df['artist'].dropna().apply(lambda x: [keys_list[values_list.index(y)] for y in x])
group_df = group_df.drop(['artist'], axis=1)

group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])

In [None]:
#genes_dict['abstract_landscape']

In [None]:
save_dict(genes_dict, gen_dict, 'live') # 1. Save json dict

## Backup saving

In [28]:
save_dict(artworks_dict, artwork_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict
save_dict(artists_dict, artist_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict
save_dict(genes_dict, gen_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict

# Images

In [None]:
# Sample from API tables

# 'image': {'href': 'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/{image_version}.jpg',
#                      'templated': True}
#i = 'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/normalized.jpg'
#for p in i.split('.jpg')[0].split('/'):
#  print(len(p), p)

6 https:
0 
29 d32dm0rphc51dk.cloudfront.net
22 NOpIAwQa-3r51Cg9qXKbfA
10 normalized


In [None]:
#! pip install pillow --user

import PIL
from PIL import Image
import os
import cv2

%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

Populating the interactive namespace from numpy and matplotlib


In [None]:
image = requests.get(img, headers={'User-Agent': 'Chrome/97.0.4692.71'})
name = art_project_path + '/' + 'portrait_of_a_man_in_armour' + '.jpg'

img = open(name, 'wb')
img.write(image.content)
img.close()

img = Image.open(name)
plt.imshow(img)
plt.axis('off')
plt.show()

img.close()