# Library

In [48]:
import logging
import requests
from bs4 import BeautifulSoup
from urllib.error import HTTPError

import http.client as httplib  # or http.client if you're on Python 3 # httplib
httplib._MAXHEADERS = 10000
# ChunkedEncodingError

import time
import datetime
from datetime import datetime, timedelta

import re

import json

import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import sys

In [2]:
# pathes
art_project_path = '/content/drive/MyDrive/Art/Art_Project'
art_project_path_backups = '/content/drive/MyDrive/Art/Art_Project/parsing_backups'
art_project_path_tests = '/content/drive/MyDrive/Art/Art_Project/parsing_tests'

url_add = 'https://www.artsy.net'

# files
cat_dict = '/artsy_genom_cat_dict.txt'
gen_dict = '/artsy_genom_gen_dict.txt'
artwork_dict = '/artsy_genom_artwork_dict.txt'
artist_dict = '/artsy_genom_artist_dict.txt'
temp_data = '/artsy_temp_data.csv'

# id length
cat_id_len = 2
gen_id_len = 5
artist_id_len = 5

# for addition and updating
recently_added_pref = '?sort=-published_at'
recently_updated_pref = '?sort=-partner_updated_at'

# Functions

## Openings and savings

In [3]:
# 1. Save json dict
def save_dict(tree_dict, file, folder):
  if folder=='live':
    with open(art_project_path + file, 'w') as outfile:
      json.dump(tree_dict, outfile)
  elif folder=='backup':
    with open(art_project_path_backups + file, 'w') as outfile:
      json.dump(tree_dict, outfile)

# 2. Open json dict
def open_dict(file, folder):
  if folder=='live':
    with open(art_project_path + file, 'rb') as infile:
      tree_dict = json.load(infile)
  elif folder=='backup':
    with open(art_project_path_backups + file, 'rb') as infile:
      tree_dict = json.load(infile)
  return tree_dict

# 3. Open file.csv
def open_file_csv(file, folder): # folder: live, backup or test
  if folder=='live':
    df = pd.read_csv(art_project_path + file)
  elif folder=='backup':
    df = pd.read_csv(art_project_path_backups + file)
  elif folder=='test':
    df = pd.read_csv(art_project_path_tests + file)
  return df

# 4. Save df to csv file
def save_df_file(df, file, folder): # folder: live, backup or test
  if folder=='live':
    df.to_csv(art_project_path + file, index=False)
  elif folder=='backup':
    df.to_csv(art_project_path_backups + file, index=False)
  elif folder=='test':
    df.to_csv(art_project_path_tests + file, index=False)


## Parsing

### get_page

In [70]:
# 5. get_page
logging.basicConfig(level=logging.DEBUG, format="%(message)s")

def get_page(url, session):
  #response = requests.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
  try:
    response = session.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
  except: # requests.exceptions.ChunkedEncodingError:
    time.sleep(0.5)
    session = requests.Session()
    response = session.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})

  try:
    response.raise_for_status()
  except requests.exceptions.HTTPError: # for correction
    sys.exit(print('HTTPError: ' + str(url)))

  bs_page = BeautifulSoup(response.text, 'html.parser')
  return bs_page

### get next pages links list

In [62]:
# 6. get next pages links list
def get_next_page_list(url, page):
  try:
    last_page_data = page.find('main').find_all('nav', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 ibHUpM gzwvJD')[0].find_all('a')[-2]
    last_page = last_page_data.get_text()
    page_link = last_page_data.get('href')[:-len(last_page)]
    next_page_link_list_new = [url + page_link + str(p) for p in range(2, int(last_page)+1)]
  except:
    next_page_link_list_new = []

  return next_page_link_list_new

### get data from pages

In [63]:
# 7. get data from pages
def get_page_data(page):
  code_block = page.find('main').find_all('div', class_='ArtworkGrid-sc-1jsqquq-0 djwrUe')
  try:
    object_block = code_block[0].find_all('div', relay='[object Object]')
  
    # for the beggining into a DataFrame
    artsy_id_list = [obj.get('data-id') for obj in object_block]
    imgage_list = [obj.find_all('a')[0].find('img').get('src') for obj in object_block]
    artist_list = [obj.find_all('a')[1].find_all('div')[0].get_text() for obj in object_block]
  
    gallery_list = []
    for obj in object_block:
      try:
        gallery_list.append(obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn hENCPo').get_text())
      except:
        gallery_list.append(np.nan)
  
    price_list = []
    for obj in object_block:
      try:
        price_list.append(obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 eXbAnU jkuGdd').get_text())
      except:
        price_list.append(np.nan)
  
    # can be no year info !!!
    name_list = []
    year_list = []
    for obj in object_block:
      text = obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn kySEpG').get_text()
      name = obj.find_all('a')[1].find('div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 caIGcn kySEpG').find('i').get_text()
      year = text.replace(name, '').replace(', ', '')
      if len(year)==0:
        year = np.nan
      name_list.append(name)
      year_list.append(year)

    df_data_new = pd.DataFrame({'artsy_id': artsy_id_list,
                                'name': name_list, 
                                'link': imgage_list, 
                                'year': year_list,
                                'artist': artist_list,
                                'gallery': gallery_list,
                                'price': price_list
                                })
  except:
    df_data_new = pd.DataFrame({'artsy_id': np.nan,
                                'name': np.nan, 
                                'link': np.nan, 
                                'year': np.nan,
                                'artist': np.nan,
                                'gallery': np.nan,
                                'price': np.nan
                                }, index=[0])
  
  return df_data_new

### get category data into df - aggregator

In [64]:
# Function aggregator
# 8. Get the all category data into DataFrame
def get_df_data(df_input, genes_links_list, genes_ids_list, cat):
  start_time = time.time()
  print(datetime.now().time())
  df_output = df_input.copy()
  df_output = df_output[0:0]

  for link in genes_links_list:
    session = requests.Session()
    total_link = url_add + link
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has started')
    # 5. get_page
    gene_page = get_page(total_link, session)
    #           get_page(url, session) -> 
    # -> bs_page
    
    # 6. get next pages links list
    next_page_link_list = get_next_page_list(url_add, gene_page)
    #                     get_next_page_list(url, page) -> 
    # -> next_page_link_list_new
    print(len(next_page_link_list)+1, 'pages for parsing')

    # 7. get data from pages
    df_data = get_page_data(gene_page)
    #         get_page_data(page) -> 
    # -> df_data_new
    df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
    df_data['category_id'] = cat
    df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')

    df_output = pd.concat([df_output, df_data])

    # Get data from next pages
    if len(next_page_link_list) > 0:
      for page_link in next_page_link_list:
        # 5. get_page
        gene_page = get_page(page_link, session)
        #           get_page(url, session) -> 
        # -> bs_page

        # 7. get data from pages
        df_data = get_page_data(gene_page)
        #         get_page_data(page) -> 
        # -> df_data_new
        df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
        df_data['category_id'] = cat
        df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
        df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')

        df_output = pd.concat([df_output, df_data])
      
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has finished')
    print((time.time() - start_time), 'seconds have passed')
    print(datetime.now().time())
    print('')
    time.sleep(0.5)

  return df_output

In [None]:
# Function aggregator
# 8.1 Get the first pages category data into DataFrame
def get_df_data_add_up(df_input, genes_links_list, genes_ids_list, cat):
  start_time = time.time()
  print(datetime.now().time())
  df_output = df_input.copy()
  df_output['pages_count'] = np.nan
  df_output['second_page_link'] = np.nan
  df_output = df_output[0:0]

  for link in genes_links_list:
    session = requests.Session()
    total_link = url_add + link
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has started')
    # 5. get_page
    gene_page = get_page(total_link, session)
    #           get_page(url, session) -> 
    # -> bs_page
    
    # 6. get next pages links list
    next_page_link_list = get_next_page_list(url_add, gene_page)
    #                     get_next_page_list(url, page) -> 
    # -> next_page_link_list_new
    print(len(next_page_link_list)+1, 'pages for parsing')

    # 7. get data from pages
    df_data = get_page_data(gene_page)
    #         get_page_data(page) -> 
    # -> df_data_new
    df_data['gene_id'] = genes_ids_list[genes_links_list.index(link)]
    df_data['category_id'] = cat
    df_data['parsing_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['updating_date'] = datetime.now().date().strftime('%Y-%m-%d')
    df_data['pages_count'] = len(next_page_link_list)+1
    if len(next_page_link_list) > 0:
      df_data['second_page_link'] = next_page_link_list[0]
    else:
      df_data['second_page_link'] = np.nan

    df_output = pd.concat([df_output, df_data])
      
    print('Parsing of gene', genes_ids_list[genes_links_list.index(link)], total_link, 'has finished')
    print((time.time() - start_time), 'seconds have passed')
    print(datetime.now().time())
    print('')
    time.sleep(0.5)

  return df_output

### correct image link

In [65]:
# 9. correct image link
def img_link_corr(img):
  parts = img.split('.jpg')[0].replace('?', '/').replace('%2F', '/').split('/')
  img_new = parts[0] + '//' + parts[5] + '/' + parts[6] + '/' + parts[7]  + '.jpg'
  return img_new

### filtering

In [66]:
# 10. Filtering
def get_cat_id(cat):
  return cat

def gene_links_samples(gen_key):
  cat_id = get_cat_id(cat)
  return genes_dict[gen_key]['category_id'] == cat_id

# Gens dict

In [None]:
url = 'https://www.artsy.net/categories'

In [None]:
response = requests.get(url, headers={'User-Agent': 'Chrome/97.0.4692.71'})
genome_page = BeautifulSoup(response.text, 'html.parser')
genome_page.title.string

'Artsy — Discover, Buy, and Sell Fine Art'

## categories_dict

In [None]:
# !!!!! For the first time !!!!!
confirmation = input('Confirm the categories_dict re/seting (y/n)')
if confirmation == 'y':
  categories_dict = {}
  print('categories_dict has been re/set {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict re/seting (y/n)y
categories_dict has been re/set 2022-03-30 06:01
dictionary's length is - 0


In [None]:
# !!!!! All next times !!!!!
confirmation = input('Confirm the categories_dict opening (y/n)')
if confirmation == 'y':
  categories_dict = open_dict(cat_dict) # 2. Open json dict
  print('categories_dict has been opened {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict opening (y/n)n
nothing has happened


In [None]:
categories_block = genome_page.find('div', class_='sticky-inner-wrapper').find_all('div', class_='Box-sc-15se88d-0 Flex-cw39ct-0 elJsUD')

#categories_dict = {'artistic_disciplines': {'id': '02', 'name': 'Artistic Disciplines', 'link': 'jump--artistic-disciplines'}}
id = len(categories_dict) + 1
for cat in categories_block:
  key = cat.find('a').get('href').replace('#jump--', '').replace('-', '_')
  if key not in categories_dict.keys():
    categories_dict.update({key: {'id': str(id) if len(str(id))>1 else '0'+str(id), 'name': cat.get_text(), 'link': cat.find('a').get('href').replace('#', ''), 'genes_id': [], 'parsing_date': datetime.now().date().strftime('%Y-%m-%d'), 'updating_date': datetime.now().date().strftime('%Y-%m-%d')}})
    id += 1
    print('New key has been added - {}'.format(key))
  else:
    if categories_dict[key]['link'] != cat.find('a').get('href').replace('#', '') or categories_dict[key]['name'] != cat.get_text():
      print('an Old name - {}\nor an Old link - {}'.format(categories_dict[key]['name'], categories_dict[key]['link']))
      categories_dict[key].update({'link': cat.find('a').get('href').replace('#', '')})
      categories_dict[key].update({'name': cat.get_text()})
      categories_dict[key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})
      print('has been updated with \na New name - {}\nor a New link - {}'.format(categories_dict[key]['name'], categories_dict[key]['link']))
      print('')

New key has been added - artistic_disciplines
New key has been added - subject_matter
New key has been added - styles_and_movements
New key has been added - design_movements
New key has been added - visual_qualities
New key has been added - medium_and_techniques
New key has been added - time_periods
New key has been added - geographic_regions
New key has been added - materials
New key has been added - design_concepts_and_techniques
New key has been added - cultural_and_religious_styles
New key has been added - furniture_and_lighting
New key has been added - textiles
New key has been added - jewelry_and_fashion_object_types
New key has been added - tableware_vessels_and_objects
New key has been added - antiquities_artifacts_and_religious_objects


In [None]:
len(categories_dict)

16

In [None]:
categories_dict['artistic_disciplines']

{'genes_id': [],
 'id': '01',
 'link': 'jump--artistic-disciplines',
 'name': 'Artistic Disciplines',
 'parsing_date': '2022-03-30',
 'updating_date': '2022-03-30'}

In [None]:
confirmation = input('Confirm the categories_dict saving (y/n)')
if confirmation == 'y':
  save_dict(categories_dict, cat_dict) # 1. Save json dict
  print('categories_dict has been saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict saving (y/n)y
categories_dict has been saved 2022-03-30 06:02
dictionary's length is - 16


## genes_dict

In [None]:
# !!!!! For the first time !!!!!
confirmation = input('Confirm the genes_dict re/seting (y/n)')
if confirmation == 'y':
  genes_dict = {}
  print('genes_dict has been re/set {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict re/seting (y/n)y
genes_dict was re/set


In [None]:
# !!!!! All next times !!!!!
confirmation = input('Confirm the genes_dict opening (y/n)')
if confirmation == 'y':
  genes_dict = open_dict(gen_dict) # 2. Open json dict
  print('genes_dict has been opened {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict opening (y/n)y
genes_dict has been opened 2022-03-30 04:12
dictionary's length is - 1032


In [None]:
# !!!!! All next times !!!!!
# Determining the count of genes in the category for creating an id (using () instead of [] is also possible)
count_list = [sum([1 for v in genes_dict.values() if v['category_id']==value['id']]) for value in categories_dict.values()]
id_len = len(str(max(count_list)))
print(id_len)
print(count_list)

3
[7, 213, 252, 26, 75, 175, 20, 68, 39, 22, 69, 27, 6, 7, 19, 7]


In [None]:
#genes_dict = {'architecture_1': {'id': '0201', 'name': 'Architecture', 'link': '/gene/architecture-1', 'category_id': '02'}}
# !!! working link is 'https://www.artsy.net' + link !!!
for value in categories_dict.values():
  category = genome_page.find('div', id=value['link']).find('h2').text
  if category == value['name']:
    count = sum([1 for v in genes_dict.values() if v['category_id'] == value['id']])
    id = count+1
    gens_block = genome_page.find('div', id=value['link']).find_all('a')
    for a in gens_block:
      gen_key = a.get('href').replace('/gene/', '').replace('-', '_')
      if gen_key not in genes_dict.keys():
        genes_dict.update({gen_key: {'id': value['id']+('0'*(id_len-len(str(id)))+str(id)), 'name': a.get_text(), 'link': a.get('href'), 'category_id': value['id'], 'artsy_id': [], 'artist_id': [], 'parsing_date': datetime.now().date().strftime('%Y-%m-%d'), 'updating_date': datetime.now().date().strftime('%Y-%m-%d')}})
        id += 1
        print('New key has been added - {}'.format(gen_key)) # !!! Not for the first time
      else:
        if genes_dict[gen_key]['link'] != a.get('href') or genes_dict[gen_key]['name'] != a.get_text():
          print('an Old name - {}\nor an Old link - {}'.format(genes_dict[gen_key]['name'], genes_dict[gen_key]['link']))
          genes_dict[gen_key].update({'link': a.get('href')})
          genes_dict[gen_key].update({'name': a.get_text()})
          genes_dict[gen_key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})
          print('has been updated with \na New name - {}\nor a New link - {}'.format(genes_dict[gen_key]['name'], genes_dict[gen_key]['link']))
          print('')
  else:
    print("Category name doesn't match")

In [None]:
len(genes_dict)

1032

In [None]:
for value in genes_dict.values():
  if value['category_id'] == '01': # '02'
    print(value)

{'id': '01001', 'name': 'Architecture', 'link': '/gene/architecture-1', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01002', 'name': 'Design', 'link': '/gene/design', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01003', 'name': 'Fashion Design and Wearable Art', 'link': '/gene/fashion-design-and-wearable-art', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01004', 'name': 'Graphic Design', 'link': '/gene/graphic-design', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01005', 'name': 'Jewelry', 'link': '/gene/jewelry', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01006', 'name': 'Music', 'link': '/gene/music', 'category_id': '01', 'parsing_date': '2022-03-29', 'updating_date': '2022-03-29'}
{'id': '01007', 'name': 'Written Word', 'link': '/gene/written-word', 'category_

In [None]:
confirmation = input('Confirm the genes_dict saving (y/n)')
if confirmation == 'y':
  save_dict(genes_dict, gen_dict) # 1. Save json dict
  print('genes_dict was saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(genes_dict)))
else:
  print('nothing has happened')

Confirm the genes_dict saving (y/n)y
genes_dict was saved 2022-03-30 04:16
dictionary's length is - 1032


### addition/updating lists of genes' id into the categories_dict

In [None]:
# Dict of genes' id for the categories_dict
#cat_gen_id_dict = {}
#for value in categories_dict.values():
#  gen_id_list = [v['id'] for v in genes_dict.values() if v['category_id'] == value['id']]
#  cat_gen_id_dict.update({value['id']: gen_id_list})

#len(cat_gen_id_dict)

16

In [None]:
#cat_gen_id_dict['01']

['01001', '01002', '01003', '01004', '01005', '01006', '01007']

In [None]:
#for k, v in categories_dict.items():
#  for key, value in cat_gen_id_dict.items():
#    if v['id'] == key and categories_dict[k]['genes_id'] != value:
#      categories_dict[k].update({'genes_id': value})
#      categories_dict[k].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})

#len(categories_dict)

16

In [None]:
for key, value in categories_dict.items():
  gen_id_list = [v['id'] for v in genes_dict.values() if v['category_id'] == value['id']]
  if categories_dict[key]['genes_id'] != gen_id_list:
      categories_dict[key].update({'genes_id': gen_id_list})
      categories_dict[key].update({'updating_date': datetime.now().date().strftime('%Y-%m-%d')})

len(categories_dict)

In [None]:
categories_dict['artistic_disciplines']

{'genes_id': ['01001', '01002', '01003', '01004', '01005', '01006', '01007'],
 'id': '01',
 'link': 'jump--artistic-disciplines',
 'name': 'Artistic Disciplines',
 'parsing_date': '2022-03-30',
 'updating_date': '2022-03-30'}

In [None]:
confirmation = input('Confirm the categories_dict saving (y/n)')
if confirmation == 'y':
  save_dict(categories_dict, cat_dict) # 1. Save json dict
  print('categories_dict has been saved {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M')))
  print("dictionary's length is - {}".format(len(categories_dict)))
else:
  print('nothing has happened')

Confirm the categories_dict saving (y/n)y
categories_dict has been saved 2022-03-30 06:03
dictionary's length is - 16


# Artworks and Artists dicts

## Filtering

The fastest and simplest way to collect data is to go through the genes_dict and read data from a tape of objects like from a news feed. On a tape they have:
- artist's name - to the artists_dict !!!
- artwork name - to the artworks_dict
- artwork year - to the artworks_dict
- Galleries and Institutions name - to the galleries_dict !!!
- price - to the artworks_dict
- image link - to the artworks_dict

That's anough for me.

In parallel I'm going to collect artists_dict and galleries_dict.

Artwork can belong to different genes, but I don't need this repetition in the artworks_dict. So I can't create a composed id for it (like gene id). Artwork id should be end-to-end.

Seems like I should choose the most comprehensive category, collect its database, and then add to it only new information.

But how much artworks are in Artsy database? Via their API I collected around 26 000 once.

In [14]:
categories_dict = open_dict(cat_dict, 'live') # 2. Open json dict
print("dictionary's length is - {}".format(len(categories_dict)))

dictionary's length is - 16


In [26]:
def by_value(item):
  return len(item[1]['genes_id'])

for k, v in sorted(categories_dict.items(), key=by_value, reverse=True):
  print(k, '->', 'id', categories_dict[k]['id'], '->', 'genes count' , len(categories_dict[k]['genes_id']))

styles_and_movements -> 03 -> 252
subject_matter -> 02 -> 213
medium_and_techniques -> 06 -> 175
visual_qualities -> 05 -> 75
cultural_and_religious_styles -> 11 -> 69
geographic_regions -> 08 -> 68
materials -> 09 -> 39
furniture_and_lighting -> 12 -> 27
design_movements -> 04 -> 26
design_concepts_and_techniques -> 10 -> 22
time_periods -> 07 -> 20
tableware_vessels_and_objects -> 15 -> 19
artistic_disciplines -> 01 -> 7
jewelry_and_fashion_object_types -> 14 -> 7
antiquities_artifacts_and_religious_objects -> 16 -> 7
textiles -> 13 -> 6


03 styles_and_movements looks like the the most comprehensive category.
Next will be:
- subject_matter -> 02
- medium_and_techniques -> 06

In [13]:
genes_dict = open_dict(gen_dict, 'live') # 2. Open json dict
print("dictionary's length is - {}".format(len(genes_dict)))

dictionary's length is - 1032


In [15]:
cat = '06'
#prefix = recently_added_pref
#prefix = recently_updated_pref

filtered_dict = {k: v for k, v in genes_dict.items() if v['category_id'] == cat}
genes_links_list = [v['link'] for v in filtered_dict.values()]
# for addition or updating
#genes_links_list = [v['link']+prefix for v in filtered_dict.values()]
genes_ids_list = [v['id'] for v in filtered_dict.values()]
print(len(genes_links_list))
print(len(genes_ids_list))

175
175


## Data collecting

### Parsing

In [50]:
# too many genes -> let's collect by 50 at a time
backup_names = []

#### First collection

In [78]:
df_temp_data_old = open_file_csv(temp_data, 'live') # 3. Open file.csv
# 8. Get the all category data into DataFrame                     !!!!!                    !!!!!
df_temp_data_new = get_df_data(df_temp_data_old, genes_links_list[150:], genes_ids_list[150:], cat)
#                  get_df_data(df_input, genes_links_list, genes_ids_list, cat) ->
# -> df_output
save_df_file(df_temp_data_new, temp_data, 'live') # 4. Save df to csv file

new_name_csv = temp_data.replace('.csv', '') + '_' + str(datetime.now()) + '.csv'
backup_names.append(new_name_csv)
save_df_file(df_temp_data_new, new_name_csv, 'backup') # 4. Save df to csv file

Starting new HTTPS connection (1): www.artsy.net:443


12:42:13.323624
Parsing of gene 06151 https://www.artsy.net/gene/soft-sculpture has started


https://www.artsy.net:443 "GET /gene/soft-sculpture HTTP/1.1" 200 None


8 pages for parsing


https://www.artsy.net:443 "GET /gene/soft-sculpture?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/soft-sculpture?page=8 HTTP/1.1" 200 None


Parsing of gene 06151 https://www.artsy.net/gene/soft-sculpture has finished
15.884018898010254 seconds have passed
12:42:29.207693



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06152 https://www.artsy.net/gene/sound-art has started


https://www.artsy.net:443 "GET /gene/sound-art HTTP/1.1" 200 None


28 pages for parsing


https://www.artsy.net:443 "GET /gene/sound-art?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sound-art

Parsing of gene 06152 https://www.artsy.net/gene/sound-art has finished
86.20177984237671 seconds have passed
12:43:39.525750



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06153 https://www.artsy.net/gene/splattered-slash-dripped has started


https://www.artsy.net:443 "GET /gene/splattered-slash-dripped HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/splattered-slash-dripped?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GE

Parsing of gene 06153 https://www.artsy.net/gene/splattered-slash-dripped has finished
324.0691194534302 seconds have passed
12:47:37.393420



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06154 https://www.artsy.net/gene/spray-paint has started


https://www.artsy.net:443 "GET /gene/spray-paint HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/spray-paint?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/spray-paint?page=14 HTTP/1.1" 200 None
https://www.artsy.ne

Parsing of gene 06154 https://www.artsy.net/gene/spray-paint has finished
579.6360769271851 seconds have passed
12:51:52.959755



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06155 https://www.artsy.net/gene/staged-photography has started


https://www.artsy.net:443 "GET /gene/staged-photography HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/staged-photography?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/staged-photography?page=13 HTTP/1.1" 200 None
https://www.

Parsing of gene 06155 https://www.artsy.net/gene/staged-photography has finished
795.3020813465118 seconds have passed
12:55:28.626103



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06156 https://www.artsy.net/gene/stains-slash-washes has started


https://www.artsy.net:443 "GET /gene/stains-slash-washes HTTP/1.1" 200 None


72 pages for parsing


https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/stains-slash-washes?page=13 HTTP/1.1" 200 None


Parsing of gene 06156 https://www.artsy.net/gene/stains-slash-washes has finished
958.3741602897644 seconds have passed
12:58:11.698190



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06157 https://www.artsy.net/gene/study has started


https://www.artsy.net:443 "GET /gene/study HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/study?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/study?page=15 HTTP/1.1" 200 None
https://www.artsy.net:443 "G

Parsing of gene 06157 https://www.artsy.net/gene/study has finished
1189.454053401947 seconds have passed
13:02:02.778081



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06158 https://www.artsy.net/gene/suspended-slash-hanging has started


https://www.artsy.net:443 "GET /gene/suspended-slash-hanging HTTP/1.1" 200 None


15 pages for parsing


https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/suspended-slash-hanging?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/sus

Parsing of gene 06158 https://www.artsy.net/gene/suspended-slash-hanging has finished
1220.555858373642 seconds have passed
13:02:33.879550



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06159 https://www.artsy.net/gene/tempera has started


https://www.artsy.net:443 "GET /gene/tempera HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/tempera?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tempera?page=15 HTTP/1.1" 200 None


Parsing of gene 06159 https://www.artsy.net/gene/tempera has finished
1443.4725539684296 seconds have passed
13:06:16.796556



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06160 https://www.artsy.net/gene/time-lapse-photography-and-film has started


https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film HTTP/1.1" 200 None


14 pages for parsing


https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-photography-and-film?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/time-lapse-

Parsing of gene 06160 https://www.artsy.net/gene/time-lapse-photography-and-film has finished
1481.912610054016 seconds have passed
13:06:55.236575



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06161 https://www.artsy.net/gene/tondo has started


https://www.artsy.net:443 "GET /gene/tondo HTTP/1.1" 200 None


10 pages for parsing


https://www.artsy.net:443 "GET /gene/tondo?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/tondo?page=10 HTTP/1.1" 200 None


Parsing of gene 06161 https://www.artsy.net/gene/tondo has finished
1505.1503546237946 seconds have passed
13:07:18.474346



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06162 https://www.artsy.net/gene/topographic-photography has started


https://www.artsy.net:443 "GET /gene/topographic-photography HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/topographic-photography?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/topographic-photography?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/top

Parsing of gene 06162 https://www.artsy.net/gene/topographic-photography has finished
1738.5851016044617 seconds have passed
13:11:11.908780



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06163 https://www.artsy.net/gene/traditional-photographic-techniques has started


https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques HTTP/1.1" 200 None


41 pages for parsing


https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-photographic-techniques?page=11 HTTP/1.1" 200 None
https://

Parsing of gene 06163 https://www.artsy.net/gene/traditional-photographic-techniques has finished
1828.803575515747 seconds have passed
13:12:42.127253



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06164 https://www.artsy.net/gene/traditional-scroll-painting has started


https://www.artsy.net:443 "GET /gene/traditional-scroll-painting HTTP/1.1" 200 None


3 pages for parsing


https://www.artsy.net:443 "GET /gene/traditional-scroll-painting?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/traditional-scroll-painting?page=3 HTTP/1.1" 200 None


Parsing of gene 06164 https://www.artsy.net/gene/traditional-scroll-painting has finished
1835.8468935489655 seconds have passed
13:12:49.170876



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06165 https://www.artsy.net/gene/transfer has started


https://www.artsy.net:443 "GET /gene/transfer HTTP/1.1" 200 None


7 pages for parsing


https://www.artsy.net:443 "GET /gene/transfer?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/transfer?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/transfer?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/transfer?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/transfer?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/transfer?page=7 HTTP/1.1" 200 None


Parsing of gene 06165 https://www.artsy.net/gene/transfer has finished
1849.1209788322449 seconds have passed
13:13:02.445709



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06166 https://www.artsy.net/gene/trompe-loeil has started


https://www.artsy.net:443 "GET /gene/trompe-loeil HTTP/1.1" 200 None


19 pages for parsing


https://www.artsy.net:443 "GET /gene/trompe-loeil?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/trompe-loeil?page=14 HTTP/1.1" 200 None
https:/

Parsing of gene 06166 https://www.artsy.net/gene/trompe-loeil has finished
1888.2874670028687 seconds have passed
13:13:41.611568



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06167 https://www.artsy.net/gene/typography has started


https://www.artsy.net:443 "GET /gene/typography HTTP/1.1" 200 None


29 pages for parsing


https://www.artsy.net:443 "GET /gene/typography?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typography?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /g

Parsing of gene 06167 https://www.artsy.net/gene/typography has finished
1951.8204081058502 seconds have passed
13:14:45.144095



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06168 https://www.artsy.net/gene/typologies has started


https://www.artsy.net:443 "GET /gene/typologies HTTP/1.1" 200 None


37 pages for parsing


https://www.artsy.net:443 "GET /gene/typologies?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/typologies?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /g

Parsing of gene 06168 https://www.artsy.net/gene/typologies has finished
2033.4553456306458 seconds have passed
13:16:06.779432



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06169 https://www.artsy.net/gene/use-of-traditional-techniques has started


https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/use-of-traditional-techniques?p

Parsing of gene 06169 https://www.artsy.net/gene/use-of-traditional-techniques has finished
2249.3334391117096 seconds have passed
13:19:42.657110



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06170 https://www.artsy.net/gene/virtual-and-augmented-reality has started


https://www.artsy.net:443 "GET /gene/virtual-and-augmented-reality HTTP/1.1" 200 None


1 pages for parsing
Parsing of gene 06170 https://www.artsy.net/gene/virtual-and-augmented-reality has finished
2251.3585743904114 seconds have passed
13:19:44.682648



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06171 https://www.artsy.net/gene/wall-sculpture-and-installation has started


https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpture-and-installation?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/wall-sculpt

Parsing of gene 06171 https://www.artsy.net/gene/wall-sculpture-and-installation has finished
2476.0005984306335 seconds have passed
13:23:29.324278



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06172 https://www.artsy.net/gene/watercolor has started


https://www.artsy.net:443 "GET /gene/watercolor HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/watercolor?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/watercolor?page=14 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /g

Parsing of gene 06172 https://www.artsy.net/gene/watercolor has finished
2693.979080915451 seconds have passed
13:27:07.302755



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06173 https://www.artsy.net/gene/website has started


https://www.artsy.net:443 "GET /gene/website HTTP/1.1" 200 None


1 pages for parsing
Parsing of gene 06173 https://www.artsy.net/gene/website has finished
2696.3552882671356 seconds have passed
13:27:09.678971



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06174 https://www.artsy.net/gene/woodcut-and-linocut has started


https://www.artsy.net:443 "GET /gene/woodcut-and-linocut HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/woodcut-and-linocut?page=13 HTTP/1.1" 200 None


Parsing of gene 06174 https://www.artsy.net/gene/woodcut-and-linocut has finished
2924.5840270519257 seconds have passed
13:30:57.907705



Starting new HTTPS connection (1): www.artsy.net:443


Parsing of gene 06175 https://www.artsy.net/gene/work-on-paper has started


https://www.artsy.net:443 "GET /gene/work-on-paper HTTP/1.1" 200 None


100 pages for parsing


https://www.artsy.net:443 "GET /gene/work-on-paper?page=2 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=3 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=4 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=5 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=6 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=7 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=8 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=9 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=10 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=11 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=12 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=13 HTTP/1.1" 200 None
https://www.artsy.net:443 "GET /gene/work-on-paper?page=14 HTTP/1.1" 200

Parsing of gene 06175 https://www.artsy.net/gene/work-on-paper has finished
3150.753273963928 seconds have passed
13:34:44.078153



Saving blocks of 50 genes' data in separate files in beckup folder. Then we'll join them together and check if some pages have been missed during the parsing.

In [80]:
backup_names

['/artsy_temp_data_2022-04-06 07:01:26.758130.csv',
 '/artsy_temp_data_2022-04-06 07:19:01.562291.csv',
 '/artsy_temp_data_2022-04-06 08:35:48.400500.csv',
 '/artsy_temp_data_2022-04-06 10:31:00.254918.csv',
 '/artsy_temp_data_2022-04-06 11:58:44.750998.csv',
 '/artsy_temp_data_2022-04-06 12:41:26.025146.csv',
 '/artsy_temp_data_2022-04-06 13:34:44.996171.csv']

#### Addition and Updating

In [None]:
df_temp_data_old = open_file_csv(temp_data, 'live') # 3. Open file.csv
#  8.1 Get the first pages category data into DataFrame
df_temp_data_new = get_df_data_add_up(df_temp_data_old, genes_links_list, genes_ids_list, cat)
#                  get_df_data_add_up(df_input, genes_links_list, genes_ids_list, cat) ->
# -> df_output
save_df_file(df_temp_data_new, temp_data, 'live') # 4. Save df to csv file

new_name_csv = temp_data.replace('.csv', '') + '_' + str(datetime.now()) + '.csv'
backup_names.append(new_name_csv)
save_df_file(df_temp_data_new, new_name_csv, 'backup') # 4. Save df to csv file

In [None]:
backup_names

### Errors' parsing checking

Checking the completeness of a new data after collecting error genes

In [None]:
#df_temp_data_new_1 = open_file_csv('/artsy_temp_data_2022-04-02 10:08:12.779350.csv', 'backup') # 3. Open file.csv
#df_temp_data_new_2 = open_file_csv('/artsy_temp_data_2022-04-04 11:49:43.338279.csv', 'backup') # 3. Open file.csv

In [None]:
#df_temp_data_new_1 = df_temp_data_new_1.dropna(subset=['artsy_id'], axis=0)
#df_temp_data_new = pd.concat([df_temp_data_new_1, df_temp_data_new_2])

#df_temp_data_new['category_id'] = df_temp_data_new['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
#df_temp_data_new['gene_id'] = df_temp_data_new['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

In [59]:
df_temp_data_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1062 entries, 0 to 11
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   artsy_id       1062 non-null   object
 1   name           1062 non-null   object
 2   link           1062 non-null   object
 3   year           1054 non-null   object
 4   artist         1062 non-null   object
 5   gallery        1062 non-null   object
 6   price          1062 non-null   object
 7   gene_id        1062 non-null   object
 8   category_id    1062 non-null   object
 9   parsing_date   1062 non-null   object
 10  updating_date  1062 non-null   object
dtypes: object(11)
memory usage: 99.6+ KB


Everything is correct in case of empty df.

In [None]:
df_temp_data_new.loc[df_temp_data_new['artsy_id'].isna()==True]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date


In [None]:
#df_temp_data_new['category_id'] = df_temp_data_new['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
#df_temp_data_new['gene_id'] = df_temp_data_new['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

df_temp_data_new.loc[df_temp_data_new['gene_id']=='02061'].head()

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
4809,56eaaa62cd530e658b00023c,Untitled 730,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2015,Françoise Nielly,Villa del Arte Galleries,"€27,600",2061,2,2022-04-02,2022-04-02
4810,56eaaa45b202a3658800039f,Wesley,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2008,Françoise Nielly,Villa del Arte Galleries,"€48,490",2061,2,2022-04-02,2022-04-02
4811,56bceac76d932d46410000d1,Selves Portrait,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2015,Phillip Thomas,RJD Gallery,"US$28,000",2061,2,2022-04-02,2022-04-02
4812,5425efc672616935b3af0400,Marisa Virgin Gorda,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2002,Russell James,NTW Gallery,"US$6,000–US$12,000",2061,2,2022-04-02,2022-04-02
4813,56bceac5258faf175e0000b2,Pic,https://d7hftxdivxxvm.cloudfront.net?resize_to...,2015,Phillip Thomas,RJD Gallery,"US$28,000",2061,2,2022-04-02,2022-04-02


In [None]:
print(url_add + genes_links_list[60])
print(genes_ids_list[60])

https://www.artsy.net/gene/eye-contact
02061


In [None]:
genes_ids_list_er = [genes_ids_list[60]]
genes_links_list_er = [genes_links_list[60]]

### Primary parsing checking

Checking the completeness of the data

In [None]:
backup_names

['/artsy_temp_data_2022-04-02 05:00:45.611793.csv',
 '/artsy_temp_data_2022-04-02 06:13:31.079731.csv',
 '/artsy_temp_data_2022-04-02 07:33:47.918499.csv',
 '/artsy_temp_data_2022-04-02 08:46:26.043191.csv',
 '/artsy_temp_data_2022-04-02 09:13:50.131075.csv']

In [None]:
backup_names = ['/artsy_temp_data_2022-04-02 05:00:45.611793.csv',
                '/artsy_temp_data_2022-04-02 06:13:31.079731.csv',
                '/artsy_temp_data_2022-04-02 07:33:47.918499.csv',
                '/artsy_temp_data_2022-04-02 08:46:26.043191.csv',
                '/artsy_temp_data_2022-04-02 09:13:50.131075.csv']

In [None]:
df_cat_dat = open_file_csv(backup_names[0], 'backup') # 3. Open file.csv
for name in backup_names[1:]:
  df_cat_dat_part = open_file_csv(name, 'backup') # 3. Open file.csv
  df_cat_dat = pd.concat([df_cat_dat, df_cat_dat_part])

df_cat_dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275680 entries, 0 to 20027
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   artsy_id       275675 non-null  object
 1   name           275675 non-null  object
 2   link           275675 non-null  object
 3   year           262858 non-null  object
 4   artist         275675 non-null  object
 5   gallery        275554 non-null  object
 6   price          275675 non-null  object
 7   gene_id        275680 non-null  int64 
 8   category_id    275680 non-null  int64 
 9   parsing_date   275680 non-null  object
 10  updating_date  275680 non-null  object
dtypes: int64(2), object(9)
memory usage: 25.2+ MB


In [None]:
df_cat_dat['category_id'] = df_cat_dat['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
df_cat_dat['gene_id'] = df_cat_dat['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

In [None]:
df_cat_dat.loc[df_cat_dat['artsy_id'].isna()==True]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
6398,,,,,,,,2007,2,2022-04-02,2022-04-02
6683,,,,,,,,2061,2,2022-04-02,2022-04-02
12080,,,,,,,,2062,2,2022-04-02,2022-04-02
51117,,,,,,,,2137,2,2022-04-02,2022-04-02
31734,,,,,,,,2173,2,2022-04-02,2022-04-02


In [None]:
df_cat_dat.loc[df_cat_dat['gene_id']=='02007'].head()

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
4868,558acb14726169705a0000a3,Statue of Liberty,https://d7hftxdivxxvm.cloudfront.net?resize_to...,1970's,Andy Warhol,Galerie Andrea Caratsch,"US$35,000",2007,2,2022-04-02,2022-04-02
4869,4fd66ea94c2e7b00010001a2,"American House, Ghost Town, Colorado",https://d7hftxdivxxvm.cloudfront.net?resize_to...,1931,Paul Strand,Aperture Foundation,"Under US$1,000",2007,2,2022-04-02,2022-04-02
4870,52d39d258b3b81a1070000c0,"VJ Day, Times Square",https://d7hftxdivxxvm.cloudfront.net?resize_to...,1945,Alfred Eisenstaedt,GALLERY M,"US$48,000",2007,2,2022-04-02,2022-04-02
4871,520d02d78b3b816617000489,"Motel Drive, Fresno, California",https://d7hftxdivxxvm.cloudfront.net?resize_to...,1991,Jeff Brouws,Robert Klein Gallery,"US$2,500–US$7,500",2007,2,2022-04-02,2022-04-02
4872,528fe0578b3b81821000024e,Untitled (Van Door 3),https://d7hftxdivxxvm.cloudfront.net?resize_to...,2007,Richard Prince,Two Palms,"US$350,000",2007,2,2022-04-02,2022-04-02


Getting list of genes with missing pages parsing and lounch parsing of these genes (not pages !!!) again.

In [None]:
genes_ids_list_er = [g_id for g_id in df_cat_dat.loc[df_cat_dat['artsy_id'].isna()==True]['gene_id']]
genes_ids_list_er = list(set(genes_ids_list_er))
genes_ids_list_er

['02007', '02061', '02137', '02173', '02062']

In [None]:
genes_links_list_er = [genes_links_list[genes_ids_list.index(g_id)] for g_id in genes_ids_list_er]
genes_links_list_er

['/gene/americana',
 '/gene/eye-contact',
 '/gene/popular-culture',
 '/gene/sports-slash-athletics',
 '/gene/face']

In [None]:
df_cat_dat = df_cat_dat.dropna(subset=['artsy_id'], axis=0)

!!! One item can belong not only to different categoryes but also to different genes into one category. That's very important for joining with reparsed data !!!

In [None]:
print(df_cat_dat['artsy_id'].count())
print(df_cat_dat['artsy_id'].nunique())

275675
115444


In [None]:
print(df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].count().max())
print(df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].nunique().max())

2
1


In [None]:
gene_artsy_groupped = df_cat_dat.groupby(['gene_id', 'artsy_id'])['artsy_id'].agg(['count']).reset_index()
gene_artsy_groupped.loc[gene_artsy_groupped['count']>1]

Unnamed: 0,gene_id,artsy_id,count
8533,02010,5f7f7b2237a6ab000f953573,2
10467,02010,6244aef50dd5b0000ed1f51d,2
29265,02029,575157e28b3b812764000dae,2
29677,02030,538314569c18db033e00048f,2
29977,02030,5813fad8275b2454c0000472,2
...,...,...,...
237125,02181,623eca70c61a06000cafa3ee,2
243498,02191,54494d887261692d60500300,2
247421,02192,575c5feb7622dd660700083f,2
260906,02203,54b3d567726169345a490000,2


In [None]:
df_cat_dat.loc[df_cat_dat['artsy_id']=='4d8b93484eb68a1b2c00125b']

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
14694,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2016,2,2022-04-02,2022-04-02
36082,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2081,2,2022-04-02,2022-04-02
59502,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2098,2,2022-04-02,2022-04-02
2289,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2102,2,2022-04-02,2022-04-02
31397,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2122,2,2022-04-02,2022-04-02
14150,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2210,2,2022-04-02,2022-04-02
14164,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2210,2,2022-04-02,2022-04-02
16898,4d8b93484eb68a1b2c00125b,"Mendocino County Coast, Pacific Ocean",https://d7hftxdivxxvm.cloudfront.net?resize_to...,2009,Chip Hooper,Robert Mann Gallery,Price on request,2211,2,2022-04-02,2022-04-02


In [None]:
for link in df_cat_dat.loc[df_cat_dat['artsy_id']=='4d8b93484eb68a1b2c00125b']['link']:
  print(link)

https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&quality=80
https://d7hftxdivxxvm.cloudfront.net?resize_to=fit&src=https%3A%2F%2Fd32dm0rphc51dk.cloudfront.net%2F8NpaJhOeMqPmSGH2IdLcWw%2Flarge.jpg&width=445&height=353&qua

There can be item duplicates inside one gene items' list also. Delete them.

### Mergening of all the data

In [None]:
df_cat_dat_total = pd.concat([df_cat_dat, df_temp_data_new])
df_cat_dat_total.drop_duplicates(subset = ['gene_id', 'artsy_id'], keep = 'first', inplace = True)
df_cat_dat_total = df_cat_dat_total.reset_index().drop(['index'], axis=1) # !!!
df_cat_dat_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275417 entries, 0 to 790
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   artsy_id       275417 non-null  object
 1   name           275417 non-null  object
 2   link           275417 non-null  object
 3   year           262595 non-null  object
 4   artist         275417 non-null  object
 5   gallery        275296 non-null  object
 6   price          275417 non-null  object
 7   gene_id        275417 non-null  object
 8   category_id    275417 non-null  object
 9   parsing_date   275417 non-null  object
 10  updating_date  275417 non-null  object
dtypes: object(11)
memory usage: 25.2+ MB


In [None]:
save_df_file(df_cat_dat_total, temp_data, 'live') # 4. Save df to csv file
#df_cat_dat_total = open_file_csv(temp_data, 'live') # 3. Open file.csv

Genes checking: sometimes there may be a problem (like with cat 03)

In [None]:
# !!!!!!!!!!!!!!!
print(len(genes_ids_list))
df_cat_dat_total['gene_id'].dropna().nunique()

213


213

In [None]:
check_gene_id = df_cat_dat_total['gene_id'].dropna().unique().tolist()
len(check_gene_id)

213

In [None]:
# !!! cat 03 !!!
genes_ids_list_er = list(set(genes_ids_list) - set(check_gene_id))
genes_ids_list_er

['03133', '03185']

In [None]:
genes_links_list_er = [genes_links_list[genes_ids_list.index(g_id)] for g_id in genes_ids_list_er]
for link in genes_links_list_er:
  print(url_add + link)

['/gene/group-material', '/gene/nul-group']

Pages of these genes are empty, so everything's allright.

## Data studying

In [None]:
df_cat_dat_total = open_file_csv(temp_data, 'live') # 3. Open file.csv
df_cat_dat_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275417 entries, 0 to 275416
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   artsy_id       275417 non-null  object
 1   name           275417 non-null  object
 2   link           275417 non-null  object
 3   year           262595 non-null  object
 4   artist         275417 non-null  object
 5   gallery        275296 non-null  object
 6   price          275417 non-null  object
 7   gene_id        275417 non-null  int64 
 8   category_id    275417 non-null  int64 
 9   parsing_date   275417 non-null  object
 10  updating_date  275417 non-null  object
dtypes: int64(2), object(9)
memory usage: 23.1+ MB


In [None]:
print(df_cat_dat_total.groupby(['gene_id', 'artsy_id'])['artsy_id'].count().max())
print(df_cat_dat_total.groupby(['gene_id', 'artsy_id'])['artsy_id'].nunique().max())

1
1


Transform id from int into str and link into a working one.

In [None]:
df_cat_dat_total['category_id'] = df_cat_dat_total['category_id'].dropna().apply(lambda x: str(x) if len(str(x))==cat_id_len else ('0'*(cat_id_len-len(str(x)))+str(x)))
df_cat_dat_total['gene_id'] = df_cat_dat_total['gene_id'].dropna().apply(lambda x: str(x) if len(str(x))==gen_id_len else ('0'*(gen_id_len-len(str(x)))+str(x)))

df_cat_dat_total['link'] = df_cat_dat_total['link'].dropna().apply(lambda x: img_link_corr(x))

df_cat_dat_total.sample(5)

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
142589,544e5463726169293b3e0f00,Our Generation: Bye-Bye!,https://d32dm0rphc51dk.cloudfront.net/DZGKYSRa...,2007,Gao Xiaowu 高孝午,Ode to Art,"US$30,000–US$40,000",2117,2,2022-04-02,2022-04-02
260909,55034eab72616915b38e3b00,"Woman with Heavy Bundles of Clothing, New York...",https://d32dm0rphc51dk.cloudfront.net/FrgMDYRi...,1912,Lewis Wickes Hine,Robert Klein Gallery,"US$1,000–US$2,500",2203,2,2022-04-02,2022-04-02
252486,533dc5f47622dd9f8300018a,March 10 2014: U.S. scientists have developed ...,https://d32dm0rphc51dk.cloudfront.net/aKzjzjgJ...,2014,Ben McLaughlin,Hosfelt Gallery,Price on request,2198,2,2022-04-02,2022-04-02
219641,5cab9b4897da0817e6f3718b,Black Crows,https://d32dm0rphc51dk.cloudfront.net/wulEAOPo...,2017,John Alexander,McClain Gallery,Price on request,2170,2,2022-04-02,2022-04-02
253472,57179a717622dd65990004f3,"Mallard's Reach, Midday",https://d32dm0rphc51dk.cloudfront.net/xbtYHPkv...,2016,Michael Jackson (b.1966),photo-eye Gallery,On hold,2198,2,2022-04-02,2022-04-02


In [None]:
for link in df_cat_dat_total['link'].sample(5):
  print(link)

https://d32dm0rphc51dk.cloudfront.net/565FKKDYEaIfulVqHNqsBQ/large.jpg
https://d32dm0rphc51dk.cloudfront.net/GVIL9qS-4Gg2hM9THrem7g/large.jpg
https://d32dm0rphc51dk.cloudfront.net/xYS3euE107bqQXJqCC7qdA/large.jpg
https://d32dm0rphc51dk.cloudfront.net/yaG9pLm6TKttVxRCQTIiiw/large.jpg
https://d32dm0rphc51dk.cloudfront.net/16G8Mh4xIYx4omXgPNkraw/large.jpg


Count of unique artsy_id and link should be equal. But sometimes they're not (like in cat 03).

In [None]:
print(df_cat_dat_total['artsy_id'].nunique())
print(df_cat_dat_total['link'].nunique())

115482
115482


In [None]:
# !!! links' problem solving !!!
artsy_id_link = df_cat_dat_total[['artsy_id', 'link']]
artsy_id_link.drop_duplicates(subset = ['link', 'artsy_id'], keep = 'first', inplace = True)
artsy_id_link.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168754 entries, 0 to 255790
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   artsy_id  168754 non-null  object
 1   link      168754 non-null  object
dtypes: object(2)
memory usage: 3.9+ MB


In [None]:
# !!! links' problem solving !!!

In [None]:
artsy_link_groupped = artsy_id_link.groupby(['artsy_id'])['link'].agg(['count']).reset_index()
artsy_link_groupped.loc[artsy_link_groupped['count']>1]

Unnamed: 0,artsy_id,count
167397,62448553758a48000c81676a,2


In [None]:
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']=='62448553758a48000c81676a']

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
6995,62448553758a48000c81676a,Perspectiva,https://d32dm0rphc51dk.cloudfront.net/6OibNMhJ...,2022,Rita Moreno Mir,ENCANT,"€1,700",3008,3,2022-04-01,2022-04-01
51793,62448553758a48000c81676a,Perspectiva,https://d32dm0rphc51dk.cloudfront.net/6OibNMhJ...,2022,Rita Moreno Mir,ENCANT,"€1,700",3056,3,2022-04-01,2022-04-01
243946,62448553758a48000c81676a,Perspectiva,https://d32dm0rphc51dk.cloudfront.net/_u9EMRf_...,2022,Rita Moreno Mir,ENCANT,"€1,700",3248,3,2022-04-01,2022-04-01


In [None]:
for link in df_cat_dat_total.loc[df_cat_dat_total['artsy_id']=='62448553758a48000c81676a']['link']:
  print(link)

https://d32dm0rphc51dk.cloudfront.net/6OibNMhJeHc6LyGTagxHGA/large.jpg
https://d32dm0rphc51dk.cloudfront.net/6OibNMhJeHc6LyGTagxHGA/large.jpg
https://d32dm0rphc51dk.cloudfront.net/6OibNMhJeHc6LyGTagxHGA/large.jpg


These pictures are the same but shots were done with different light. Replace the last one.

In [None]:
df_cat_dat_total.at[243946, 'link'] = 'https://d32dm0rphc51dk.cloudfront.net/6OibNMhJeHc6LyGTagxHGA/large.jpg'

In [None]:
print(df_cat_dat_total['artsy_id'].nunique())
print(df_cat_dat_total['link'].nunique())

168753
168753


In [None]:
# !!! links' problem solving !!!

### Artists

We need to create artist id for artworks dict. Artists seem to be ok.

In [None]:
print(df_cat_dat_total['artist'].dropna().nunique())
artists_list = sorted(df_cat_dat_total['artist'].dropna().unique().tolist())
print(len(artists_list))

19738
19738


In [None]:
artists_list[:10]

[' \t Guru Dragpo, a wrathful form of Padmasambhava, 15th centuryRubin Museum of Art',
 ' Bwa Culture, Burkina Faso',
 ' Possibly by Protarchos',
 ' Possibly the Kontoleon Master, Cycladic, active 2700 - 2600 B.C.,  Greece, Cyclades, Europe',
 ' Probably by Aegisthus Painter, Greek (Attic), active about 480 - about 460 B.C.,  Athens, Greece, Europe',
 ' Psiax, Greek (Attic), active about 525 - 510 B.C.,  Athens, Greece, Europe',
 ' Signed by Apelles, Greek, active about 100 - 1 B.C.,  Rome, Italy, Lazio, Europe',
 ' Signed by Aristodamos of Argos, Greek (Argive), active 590 - 570 B.C.,  Argos, Greece, Europe',
 ' Unknown',
 ' Unknown,  Alexandria, Egypt, Africa']

In [None]:
artists_list[-10:]

['Øystein Aasan',
 'Česlovas Lukenskas',
 'İbrahim Örs',
 'İnci Eviner',
 'İrem İncedayı',
 'Şükran Moral',
 'Žygimantas Augustinas',
 'Ștefan Tănase',
 '“Striking of his head” (Kwe’ata’re’esu) Triptych, Mid 17th-early 18th CenturyHarn Museum of Art',
 '付经岩 Fu Jingyan']

In [None]:
artists_list[(len(artists_list)//2):(len(artists_list)//2+10)]

['João Maria Gusmão & Pedro Paiva',
 'João Onofre',
 'João Pedro Vale',
 'João Penalva',
 'João Tabarra',
 'Joël Andrianomearisoa',
 'Joël Degbo',
 'Joël Stein',
 'Jože Meglič',
 'Ju Ming 朱銘']

### Galleries

We also need to create gallery id for artworks dict.

In [None]:
print(df_cat_dat_total['gallery'].nunique())
gallery_list = sorted(df_cat_dat_total['gallery'].dropna().unique().tolist())
print(len(gallery_list))

4746
4746


In [None]:
gallery_list[:10]

[' "A Brush with Reality: Calligraphic Paintings by Lobsang Choephel" at Tibet House US, New York (2015)',
 ' "Disguise: Masks and Global African Art" at Seattle Art Museum, Seattle (2015)',
 ' "Yoko Ono: One Woman Show, 1960-1971" at Museum of Modern Art, New York (2015)',
 '"2015 Wolfgang Hahn Prize: Michael Krebber and R. H. Quaytman" at Museum Ludwig, Cologne (2015)',
 '"2016 Recognition of Art by Women (RAW) Artist: Njideka Akunyili Crosby" at Norton Museum of Art, West Palm Beach',
 '"21er Raum: Iman Issa - Material" at 21er Haus, Vienna',
 '"4 REAL & TRUE 2. Wim Wenders. Landscapes. Photographs." at Museum Kunstpalast, Düsseldorf (2015)',
 '"A Beautiful Lie – Eckersberg" at Statens Museum for Kunst, Copenhagen ',
 '"A Mind of Winter: Photographs by Abelardo Morell" at the Bowdoin College Museum of Art',
 '"A New Dynasty – Created in China"  Venue: ARoS Aarhus Museum of Art, Aarhus ']

In [None]:
gallery_list[-10:]

['Österreichische Galerie Belvedere, Vienna',
 'Österreichische Nationalbibliothek, Vienna',
 '“Clifford Ross: Landscape Seen & Imagined” at MASS MoCA, North Adams (2015)',
 '“Collectionism and Modernity. Two Case Studies: The Im Obersteg and Rudolf Staechelin Collections” at the Museo Reina Sofía, Madrid',
 '“Created by a Hand with but a Chisel Armed…” Sculpture in St Petersburg’s Palaces in the 19th Century at The State Hermitage Museum, 2016',
 '“In the Eye of the Thunderstorm: Effervescent Practices from the Arab World” at ArsCulture, Venice (2015)',
 '“L’image volée” at Fondazione Prada, Milan (2016)',
 '√K Contemporary',
 '首都藝術中心 Capital Art Center',
 '鳩ノ森美術 / HATONOMORI ART']

Seems like in some cases there's a exhibition name in a gellery name box. Let's try to divide them.

In [None]:
exhib_list = []
gal_list = []

for g in gallery_list:
  try:
    gal = g.split(' at ')[-1]
    gal_list.append(gal)
    exhib_list.append(g.replace(' at ' + gal, ''))
  except:
    gal_list.append(g)

gal_list = sorted(list(set(gal_list)))
print(len(gal_list))
exhib_list = sorted(list(set(exhib_list)))
print(len(exhib_list))

5256
5360


In [None]:
gal_list[:10]

[' Museum Ludwig, Cologne',
 '"A New Dynasty – Created in China"  Venue: ARoS Aarhus Museum of Art, Aarhus ',
 '"Anselm Kiefer"at Centre Pompidou, Paris',
 '"Barbie"at Musée des Arts Décoratifs, Paris',
 '"Chen Zhen: Without going to New York and Paris, life could be internationalized"  Venue: Rockbund Art Museum, Shanghai',
 '"Inhuman"at Fridericianum, Kassel, Germany',
 '"Joaquín Torres-García: The Arcadian Modern"at the Museum of Modern Art, New York',
 '"Landscapes of Belgium" Musée d\'Ixelles, Brussels (2015)',
 '"Philippe Parreno: Anywhere, Anywhere, Out Of The World", Palais de Tokyo, Paris (2013)',
 '"This Art is Your Art" Competition: The White House Historical Association, Artsy, and the Robert Rauschenberg Foundation']

The data is not clear and this info is not critical, leave it as it is, won't do gallery id.

In [None]:
#save_df_file(df_cat_dat_total, temp_data, 'live') # 4. Save df to csv file

### Artworks

In [None]:
df_cat_dat_total.columns

Index(['artsy_id', 'name', 'link', 'year', 'artist', 'gallery', 'price',
       'gene_id', 'category_id', 'parsing_date', 'updating_date'],
      dtype='object')

For ***one to one connections***

In [None]:
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'year', 'artist', 'gallery', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index()
one_to_one.sample(5)

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,parsing_date,updating_date,count
35214,54cff4c37261697c8bfe0000,"Huashan Project – Performance,",https://d32dm0rphc51dk.cloudfront.net/BjzXkBGx...,2014,Shi Jinsong 史金淞,Eli Klein Gallery,Price on request,2022-04-02,2022-04-02,2
48931,560d4f2b7261696c5a00007f,Les Mondes Caches,https://d32dm0rphc51dk.cloudfront.net/D_J71ZFG...,2015,Yoël Benharrouche,Eden Gallery,Price on request,2022-04-02,2022-04-02,3
91390,613f74c305858d000c3680b6,"Teddy Bear, Intense Yellow and Carnation Pink",https://d32dm0rphc51dk.cloudfront.net/EA60nfqn...,2019,Judy Ledgerwood,1301PE,"US$15,000–US$20,000",2022-04-02,2022-04-02,1
29766,5449619b7261696393e61400,Latent Horizon,https://d32dm0rphc51dk.cloudfront.net/q1iLUtXw...,ca. 2014,Anne Packard,Quidley & Company,Sold,2022-04-02,2022-04-02,2
41515,554c8c29726169698bbb0400,Marc Per Un Paisatge,https://d32dm0rphc51dk.cloudfront.net/FI2eQUf8...,2002,Joan Hernández Pijuan,Mario Mauroner Contemporary Art Salzburg,Price on request,2022-04-02,2022-04-02,2


In [None]:
print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

115482
111130
111127


In [None]:
print('artsy_id count must be', df_cat_dat_total['artsy_id'].nunique())
print('')
for col in ['name', 'link', 'year', 'artist', 'gallery', 'price', 'parsing_date', 'updating_date']:
  group_df = df_cat_dat_total.groupby(['artsy_id', col])['category_id'].agg(['count']).reset_index()
  print('by column', col)
  print('artsy_id count is', group_df['artsy_id'].count())
  print('artsy_id nunique is', group_df['artsy_id'].nunique())
  print('count and nunique is the same - ', group_df['artsy_id'].count()==group_df['artsy_id'].nunique())
  print(list(set(df_cat_dat_total['artsy_id'].unique()) - set(group_df['artsy_id'].unique())))
  print('')

artsy_id count must be 115482

by column name
artsy_id count is 115482
artsy_id nunique is 115482
count and nunique is the same -  True
[]

by column link
artsy_id count is 115482
artsy_id nunique is 115482
count and nunique is the same -  True
[]

by column year
artsy_id count is 111191
artsy_id nunique is 111191
count and nunique is the same -  True
['533b5b9c8b3b81a12a0001f0', '547b558b7261692d652e0000', '547f86227261695738000700', '5445717872616973b7b60100', '561ffd7172616969b300012c', '50fdd0afd0c2ebe891000107', '5b240db9a6ca6d169be09b72', '50eefb11d0c2eb50680007a9', '55b15d7d72616912980002ef', '555f8c0b7261694535300400', '52f2a0f57622ddd9c000006e', '53ab52557261692d42680300', '532df89e275b244bc40002a8', '54f0b35f72616927db861200', '52aa0f55cd530ed529000238', '5a0b66b4c9dc24225614bf07', '541aa033726169452cce0700', '57ba6dfd275b24564d001e95', '5733c6758b3b813b34000476', '53a61ce572616956de170500', '5aaa9d409c18db520ca10202', '50eefa50d0c2eb5068000459', '52bdd4659c18db18e8000560', '

In [None]:
df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(['528a7d17b202a3596800026d', '5526fdc37261690beecb0700', '5163f0afe6222a52c1000075'])]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
42192,5163f0afe6222a52c1000075,Maostache,https://d32dm0rphc51dk.cloudfront.net/bvh4MJrC...,2012,Oliver Jeffers,,,2038,2,2022-04-02,2022-04-02
53571,528a7d17b202a3596800026d,Binary Sofa,https://d32dm0rphc51dk.cloudfront.net/qVMlvTla...,2013,Benjamin Rollins Caldwell,,,2046,2,2022-04-02,2022-04-02
8332,5163f0afe6222a52c1000075,Maostache,https://d32dm0rphc51dk.cloudfront.net/bvh4MJrC...,2012,Oliver Jeffers,,,2061,2,2022-04-02,2022-04-02
42671,5163f0afe6222a52c1000075,Maostache,https://d32dm0rphc51dk.cloudfront.net/bvh4MJrC...,2012,Oliver Jeffers,,,2084,2,2022-04-02,2022-04-02
21038,5526fdc37261690beecb0700,Flight Take Off,https://d32dm0rphc51dk.cloudfront.net/DZpIUoat...,2008,Geoff Mann,,,2116,2,2022-04-02,2022-04-02
38271,528a7d17b202a3596800026d,Binary Sofa,https://d32dm0rphc51dk.cloudfront.net/qVMlvTla...,2013,Benjamin Rollins Caldwell,,,2126,2,2022-04-02,2022-04-02
59733,5526fdc37261690beecb0700,Flight Take Off,https://d32dm0rphc51dk.cloudfront.net/DZpIUoat...,2008,Geoff Mann,,,2197,2,2022-04-02,2022-04-02


Due to NaNs in 'name' (sometimes), 'year' and 'gallery' we can loose ids. And we have problems with prices.

In [None]:
artsy_id_price = df_cat_dat_total[['artsy_id', 'price']]
artsy_id_price.drop_duplicates(subset = ['price', 'artsy_id'], keep = 'first', inplace = True)
id_price_g = artsy_id_price.groupby(['artsy_id'])['price'].agg(['count']).reset_index()

id_prob_price = id_price_g.loc[id_price_g['count']>1]['artsy_id'].tolist()
len(id_prob_price)

3

In [None]:
id_prob_price

['56ce217976143f18e700032f',
 '56fb2abc9c18db1b5c00573c',
 '6128eb8a1ed761000dcaa2e4']

In different genes mentioned price could be different. In most cases prices are very similar -> put the mode or one that in the interval. Sometims there isn't a mode or it's 'Price on request'. Put the maximum. We don't need history here.

In [None]:
inx = 2
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_price[inx]]
#df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(id_prob_price)]

Unnamed: 0,artsy_id,name,link,year,artist,gallery,price,gene_id,category_id,parsing_date,updating_date
199493,6128eb8a1ed761000dcaa2e4,"Sharpies (Blue, Red) - 캐서린 베른하르트",https://d32dm0rphc51dk.cloudfront.net/JWVhPBjo...,2019,Katherine Bernhardt,Frank Fluegel Gallery,"US$12,500",2155,2,2022-04-02,2022-04-02
235160,6128eb8a1ed761000dcaa2e4,"Sharpies (Blue, Red) - 캐서린 베른하르트",https://d32dm0rphc51dk.cloudfront.net/JWVhPBjo...,2019,Katherine Bernhardt,Frank Fluegel Gallery,"US$13,000",2181,2,2022-04-02,2022-04-02


In [None]:
df_cat_dat_total.loc[df_cat_dat_total['artsy_id']==id_prob_price[inx]]['price'].unique()

array(['US$12,500 ', 'US$13,000 '], dtype=object)

In [None]:
df_cat_dat_total.at[221047, 'price'] = 'Bidding closed '
df_cat_dat_total.at[221046, 'price'] = 'Bidding closed '
df_cat_dat_total.at[199493, 'price'] = 'US$13,000 '

In [None]:
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'artist', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index()

print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

115482
115482
115482


In [None]:
save_df_file(df_cat_dat_total, '/artsy_cat_data.csv', 'backup') # 4. Save df to csv file

## Dict creating

### artists_dict

#### initiation

In [None]:
#artists_dict = {'000000': {'name': 'Jacopo Bassano', 
#                           'artwork_id: ['000000', ''], # Portrait of a Man in Armour
#                           -- 'gallery_id': ['000000', ''], # Robilant+Voena
#                           'gene_id': ['21603', ''],
#                           'category_id': ['03', '']}}

In [None]:
print(df_cat_dat_total['artist'].dropna().nunique())
artists_list = sorted(df_cat_dat_total['artist'].dropna().unique().tolist())
print(len(artists_list))

In [None]:
# artist_id creation
at_id_len = len(str(len(artists_list)))
at_id_list = []
c = 1
for a in artists_list:
  id = '0'*(at_id_len-len(str(c)))+str(c)
  at_id_list.append(id)
  c += 1

print(len(at_id_list))
print(df_cat_dat_total['artist'].dropna().nunique())

20563
20563


***One to one connection***

In [None]:
# Dict creation
artists_dict = pd.DataFrame({'id': at_id_list, 'name': artists_list}).set_index('id').to_dict('index')
len(artists_dict)

In [None]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Jacopo Bassano'}
artists_dict_samp

#### expansion

with 'artwork_id', 'gene_id' and 'category_id'
- ***one to many connection***
- id info is in the artists_dict

In [None]:
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()

#artists_dict = {k: v.update({'category_id': [cat[0]]}) for k, v in artists_dict.items()}
for k in artists_dict:
  artists_dict[k].update({'category_id': [cat[0]]})

In [None]:
# one to many connection from df
def artist_dict_expan_otm(param):
  group_df = df_cat_dat_total.groupby(['artist'])[param].apply(list).apply(set).apply(list).reset_index(name=param)
  group_dict = group_df.set_index('artist').to_dict('index')

  for k, v in artists_dict.items():
    if v['name'] in group_dict.keys():
      artists_dict[k].update(group_dict[v['name']])
    else:
      artists_dict[k].update({param: [' ']})

In [None]:
# gene_id
artist_dict_expan_otm('gene_id')

In [None]:
# artsy_id
artist_dict_expan_otm('artsy_id')

In [None]:
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Jacopo Bassano'}
artists_dict_samp

In [None]:
save_dict(artists_dict, artist_dict) # 1. Save json dict

#### addition

In [None]:
artists_dict = open_dict(artist_dict, 'live')
print(len(artists_dict))
print('')
artists_dict_samp = {k: v for k, v in artists_dict.items() if v['name'] == 'Joseph Henry Sharp'}
artists_dict_samp

26678



{'10100': {'artsy_id': ['55073e2972616942f7760200',
   '5a6260c1a09a6715fd6355bc',
   '55073e2972616942ed960100'],
  'category_id': ['02', '03'],
  'gene_id': ['02184', '03162', '02069', '02015', '02168', '02189', '03004'],
  'name': 'Joseph Henry Sharp'}}

In [None]:
print(df_cat_dat_total['artist'].dropna().nunique())
artists_list = sorted(df_cat_dat_total['artist'].dropna().unique().tolist())
print(len(artists_list))

19738
19738


In [None]:
# Dict updating

# Old dict data
# {'id': {'artsy_id': [''], 'category_id': [''], 'gene_id': [''], 'name': ''}}
c = len(artists_dict)+1 # for id creation
id_list = list(artists_dict.keys()) # list of str
name_list = [v['name'] for v in artists_dict.values()] # list of str
artwork_list = [v['artsy_id'] for v in artists_dict.values()] # list of lists with str
cat_list = [v['category_id'] for v in artists_dict.values()] # list of lists with str
gene_list = [v['gene_id'] for v in artists_dict.values()] # list of lists with str

# New df data
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()
# gene_id
gene_df = df_cat_dat_total.groupby(['artist'])['gene_id'].apply(list).apply(set).apply(list).reset_index(name='gene_id')
gene_dict = gene_df.set_index('artist').to_dict('index')
# artsy_id
artwork_df = df_cat_dat_total.groupby(['artist'])['artsy_id'].apply(list).apply(set).apply(list).reset_index(name='artsy_id')
artwork_dict = artwork_df.set_index('artist').to_dict('index')

# New dict
new_artists_id = []
cross_artists_id = []
for name in artists_list:
  if name not in name_list:
    id = '0'*(artist_id_len-len(str(c)))+str(c)
    new_artists_id.append(id)
    artists_dict.update({id: {'artsy_id': artwork_dict[name]['artsy_id'], 'category_id': [cat[0]], 'gene_id': gene_dict[name]['gene_id'], 'name': name}})
    c += 1
  else:
    id = id_list[name_list.index(name)]
    cross_artists_id.append(id)
    new_cat = list(set(cat_list[name_list.index(name)] + cat))
    artists_dict[id].update({'category_id': new_cat})
    new_gene = list(set(gene_list[name_list.index(name)] + gene_dict[name]['gene_id']))
    artists_dict[id].update({'gene_id': new_gene})
    new_artwork = list(set(artwork_list[name_list.index(name)] + artwork_dict[name]['artsy_id']))
    artists_dict[id].update({'artsy_id': new_artwork})

print(len(artists_dict))
print(len(new_artists_id))
print(len(cross_artists_id))

26678
6115
13623


In [None]:
artists_dict[new_artists_id[len(new_artists_id)//2]]

{'artsy_id': ['5c48bd6255146100270b0086'],
 'category_id': ['02'],
 'gene_id': ['02173'],
 'name': 'KangHee Kim'}

In [None]:
artists_dict[cross_artists_id[len(cross_artists_id)//2]]

{'artsy_id': ['55073e2972616942f7760200',
  '5a6260c1a09a6715fd6355bc',
  '55073e2972616942ed960100'],
 'category_id': ['02', '03'],
 'gene_id': ['02184', '03162', '02069', '02015', '02168', '02189', '03004'],
 'name': 'Joseph Henry Sharp'}

In [None]:
save_dict(artists_dict, artist_dict, 'live') # 1. Save json dict

### genes_dict

#### expansion

In [None]:
genes_dict = open_dict(gen_dict) # 2. Open json dict
len(genes_dict)

Update genes_dict with 'artwork_id':
- list type parametr - ***one to many connection***
- id info is in the **df**

In [None]:
group_df = df_cat_dat_total.groupby(['gene_id'])['artsy_id'].apply(list).apply(set).apply(list).reset_index(name='artsy_id')
group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])
  else:
    genes_dict[k].update({'artsy_id': [' ']})

Fill in genes_dict with 'artist_id':
- list type parametr - ***one to many connection***
- id info is in the **artists_dict**

In [None]:
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]

group_df = df_cat_dat_total.groupby(['gene_id'])['artist'].apply(list).apply(set).apply(list).reset_index(name='artist')
group_df['artist_id'] = group_df['artist'].dropna().apply(lambda x: [keys_list[values_list.index(y)] for y in x])
group_df = group_df.drop(['artist'], axis=1)

group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])
  else:
    genes_dict[k].update({'artist_id': [' ']})

In [None]:
group_dict['03001']

In [None]:
#bad-painting
#abstract-landscape
genes_dict['bad_painting']

In [None]:
save_dict(genes_dict, gen_dict) # 1. Save json dict

#### addition

In [None]:
genes_dict = open_dict(gen_dict, 'live') # 2. Open json dict
len(genes_dict)

1032

In [None]:
genes_dict['abstract_landscape']

{'artist_id': ' ',
 'artsy_id': ' ',
 'category_id': '02',
 'id': '02001',
 'link': '/gene/abstract-landscape',
 'name': 'Abstract Landscape',
 'parsing_date': '2022-03-29',
 'updating_date': '2022-03-29'}

In [None]:
# artsy_id
group_df = df_cat_dat_total.groupby(['gene_id'])['artsy_id'].apply(list).apply(set).apply(list).reset_index(name='artsy_id')
group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])

In [None]:
# artist_id
artists_dict = open_dict(artist_dict)
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]

group_df = df_cat_dat_total.groupby(['gene_id'])['artist'].apply(list).apply(set).apply(list).reset_index(name='artist')
group_df['artist_id'] = group_df['artist'].dropna().apply(lambda x: [keys_list[values_list.index(y)] for y in x])
group_df = group_df.drop(['artist'], axis=1)

group_dict = group_df.set_index('gene_id').to_dict('index')

for k, v in genes_dict.items():
  if v['id'] in group_dict.keys():
    genes_dict[k].update(group_dict[v['id']])

In [None]:
#genes_dict['abstract_landscape']

In [None]:
save_dict(genes_dict, gen_dict, 'live') # 1. Save json dict

### artworks_dict

#### initiation

In [None]:
#artworks_dict = {'artsy_id': {'name': 'Portrait of a Man in Armour', 
#                              'link': 'https://d32dm0rphc51dk.cloudfront.net/yaB__0LXFvbXC4jiUK4Vpw/large.jpg', 
#                              'year': 'ca. 1560',
#                              'artist_id': '000000', # Jacopo Bassano
#                              -- 'gallery_id': '000000', # Robilant+Voena
#                              'price': 'Price on request',
#                              'gene_id': ['21603', ''],
#                              'category_id': ['03', '']}}

***One to one connections***

Let's create a dict from complete data.

In [None]:
one_to_one = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'artist', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index().drop('count', axis=1)

check_id_list = one_to_one['artsy_id'].sample(10).tolist()

print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one['artsy_id'].count())
print(one_to_one['artsy_id'].nunique())

115482
115482
115482


In [None]:
# Dict creation
artworks_dict = one_to_one.set_index('artsy_id').to_dict('index')
len(artworks_dict)

In [None]:
for id in check_id_list:
  print(artworks_dict[id])

#### expansion

In [None]:
#artworks_dict = open_dict(artwork_dict)

Columns with NaNs:
- 'name' (not always), 'year', 'gallery'
- ***one to one connection***
- info is in the **df**

In [None]:
# one to one connection from df
def artwork_dict_expan_oto(df, main_dict, param):
  group_df = df.groupby(['artsy_id', param])['category_id'].agg(['count']).reset_index().drop('count', axis=1)
  group_dict = group_df.set_index('artsy_id').to_dict('index')

  for k in main_dict:
    if k in group_dict.keys():
      main_dict[k].update(group_dict[k])
    else:
      main_dict[k].update({param: ' '}) # !!!!

# one to many connection from df
def artwork_dict_expan_otm(df, main_dict, param):
  group_df = df.groupby(['artsy_id'])[param].apply(list).apply(set).apply(list).reset_index(name=param)
  group_dict = group_df.set_index('artsy_id').to_dict('index')

  for k in main_dict:
    if k in group_dict.keys():
      main_dict[k].update(group_dict[k])
    else:
      main_dict[k].update({param: [' ']}) # !!!!

In [None]:
# name
#artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'name')

In [None]:
# year
artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'year')

In [None]:
# gallery
artwork_dict_expan_oto(df_cat_dat_total, artworks_dict, 'gallery')

In [None]:
for id in check_id_list:
  print(artworks_dict[id])

Columns with repeating values:
- 'gene_id', 'category_id'
- list type parametr - ***one to many connection***
- id info is in the **df**

In [None]:
# category_id
cat = df_cat_dat_total['category_id'].unique().tolist()

#artworks_dict = {k: v.update({'category_id': [cat[0]]}) for k, v in artworks_dict.items()}

for k in artworks_dict:
  artworks_dict[k].update({'category_id': [cat[0]]})

In [None]:
# gene_id
#artwork_dict_extent_otm('gene_id')
group_df = df_cat_dat_total.groupby(['artsy_id'])['gene_id'].apply(list).apply(set).apply(list).reset_index(name='gene_id')
group_dict = group_df.set_index('artsy_id').to_dict('index')

for k in artworks_dict:
  if k in group_dict.keys():
    artworks_dict[k].update(group_dict[k])
  else:
    artworks_dict[k].update({'gene_id': [' ']}) # !!!!

for id in check_id_list:
  print(group_dict[id])

In [None]:
for id in check_id_list:
  print(artworks_dict[id])

Replace/Add 'artist' on 'artist_id'  in artworks_dict
- ***one to one connection***

In [None]:
group_df = df_cat_dat_total.groupby(['artsy_id', 'artist'])['artist'].agg(['count']).reset_index().drop(['count'], axis=1)

keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]
group_df['artist_id'] = group_df['artist'].dropna().apply(lambda x: keys_list[values_list.index(x)])
group_df = group_df.drop(['artist'], axis=1)

group_dict = group_df.set_index('artsy_id').to_dict('index')

for k in artworks_dict:
  if k in group_dict.keys():
    artworks_dict[k].update(group_dict[k])
  else:
    artworks_dict[k].update({'artist_id': ' '})

In [None]:
group_dict['623f201d30b258000cc3053a']

In [None]:
artworks_dict['623f201d30b258000cc3053a']

In [None]:
# Replacing
#artworks_dict = {k: artworks_dict[k] for k in artworks_dict.keys() - {'artist'}}
#artworks_dict['623f201d30b258000cc3053a']

In [None]:
#save_dict(artworks_dict, artwork_dict) # 1. Save json dict

#### addition

In [None]:
artworks_dict = open_dict('/artsy_genom_artwork_dict_2022-04-04.csv', 'backup')
len(artworks_dict)

168753

In [None]:
artworks_dict['623f201d30b258000cc3053a']

{'artist': 'Tuukka Tammisaari',
 'artist_id': '19134',
 'category_id': ['03'],
 'gallery': 'Kristof De Clercq',
 'gene_id': ['03056', '03013', '03090'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/hrEVTH4ekjqm_iKE1HStzg/large.jpg',
 'name': 'Untitled',
 'parsing_date': '2022-04-01',
 'price': '€6,600 ',
 'updating_date': '2022-04-01',
 'year': '2021'}

In [None]:
one_to_one_df = df_cat_dat_total.groupby(['artsy_id', 'name', 'link', 'artist', 'price', 'parsing_date', 'updating_date'])['category_id'].agg(['count']).reset_index().drop('count', axis=1)

check_id_list = one_to_one_df['artsy_id'].sample(10).tolist()

print(df_cat_dat_total['artsy_id'].nunique())
print(one_to_one_df['artsy_id'].count())
print(one_to_one_df['artsy_id'].nunique())

115482
115482
115482


In [None]:
# Dict updating

# Old dict data
# {'artsy_id': {'artist': ' ', 'artist_id': ' ', 'category_id': [''], 'gallery': ' ', 'gene_id': [''], 
#               'link': ' ', 'name': ' ', 'parsing_date': '', 'price': ' ', 'updating_date': '', 'year': ' '}}

old_id_list = list(artworks_dict.keys()) # list of str

artist_list = [v['artist'] for v in artworks_dict.values()] # list of str
#artist_id_list = [v['artist_id'] for v in artworks_dict.values()] # list of str ????????????
gallery_list = [v['gallery'] for v in artworks_dict.values()] # list of str
link_list = [v['link'] for v in artworks_dict.values()] # list of str
name_list = [v['name'] for v in artworks_dict.values()] # list of str
#price_list = [v['price'] for v in artworks_dict.values()] # list of str
year_list = [v['year'] for v in artworks_dict.values()] # list of str

cat_list = [v['category_id'] for v in artworks_dict.values()] # list of lists with str
gene_list = [v['gene_id'] for v in artworks_dict.values()] # list of lists with str

# New df data
new_id_list = df_cat_dat_total['artsy_id'].unique()
cat = df_cat_dat_total['category_id'].unique().tolist() # category_id
print('New category:', cat[0])

new_artwork_id = list(set(new_id_list) - set(old_id_list))
cross_artwork_id = list(set(old_id_list) & set(new_id_list))
print('Check the division:', len(new_id_list)==len(new_artwork_id)+len(cross_artwork_id))

one_to_one_new_df = one_to_one_df.loc[one_to_one_df['artsy_id'].isin(new_artwork_id)]
one_to_one_cross_df = one_to_one_df.loc[one_to_one_df['artsy_id'].isin(cross_artwork_id)]
print('New id count:', one_to_one_new_df['artsy_id'].nunique())
print('Cross id count:', one_to_one_cross_df['artsy_id'].nunique())

expaned_new_df = df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(new_artwork_id)]
expaned_cross_df = df_cat_dat_total.loc[df_cat_dat_total['artsy_id'].isin(cross_artwork_id)]
print('New id count:', expaned_new_df['artsy_id'].nunique())
print('Cross id count:', expaned_cross_df['artsy_id'].nunique())

artworks_dict_total_len = len(old_id_list) + len(new_artwork_id)
print('New dict length must be:', artworks_dict_total_len)

New category: 02
Check the division: True
New id count: 46583
Cross id count: 68899
New id count: 46583
Cross id count: 68899
New dict length must be: 215336


##### Old part dict updating

In [None]:
artworks_dict_cross = one_to_one_cross_df.set_index('artsy_id').to_dict('index') # with 'name'

dif_name = []
dif_link = []
dif_artist = []

for id in cross_artwork_id:
  if artworks_dict_cross[id]['name'] != artworks_dict[id]['name']:
    dif_name.append(id)
  if artworks_dict_cross[id]['link'] != artworks_dict[id]['link']:
    dif_link.append(id)
  if artworks_dict_cross[id]['artist'] != artworks_dict[id]['artist']:
    dif_artist.append(id)

print('Cross id count:', len(artworks_dict_cross))
print(len(dif_name), 'ids have different name')
print(len(dif_link), 'ids have different link')
print(len(dif_artist), 'ids have different artist')

Cross id count: 68899
3 ids have different name
15 ids have different link
0 ids have different artist


In [None]:
par = 'link'
dif_list = dif_link

artworks_dict_cross_diff = {k: v for k, v in artworks_dict_cross.items() if k in dif_list}
artworks_dict_diff = {k: v for k, v in artworks_dict.items() if k in dif_list}

for k, v in artworks_dict_cross_diff.items():
  print(k, '->', v[par])
print('')
for k, v in artworks_dict_diff.items():
  print(k, v[par])

518d1edf7a6d642fd300008d -> https://d32dm0rphc51dk.cloudfront.net/kANxTalIbPKpIOxcVucMNA/large.jpg
58302f678b3b8135c3001024 -> https://d32dm0rphc51dk.cloudfront.net/y61wp-fWJUgCHt2UIXlP0A/large.jpg
5d09ef962949c800125b37ea -> https://d32dm0rphc51dk.cloudfront.net/9PTqe26j1gjtrD0nDPkyxg/large.jpg
61e56803386822000b3568a6 -> https://d32dm0rphc51dk.cloudfront.net/x18s13ls7Pqy5BbOYkxNcg/large.jpg
61ec0e914aa734000b7af6ca -> https://d32dm0rphc51dk.cloudfront.net/aQYqaAX97xigvJnPNDFmew/large.jpg
61fc752dd29b4d000dc1bbdf -> https://d32dm0rphc51dk.cloudfront.net/NOc7r-JZqUFl-q8QFJh1XQ/large.jpg
623b2c2ad7998a000dfda713 -> https://d32dm0rphc51dk.cloudfront.net/W5-UW_M44ipohPAnFCzMOQ/large.jpg
623b99e48cfd7b000d8dfd31 -> https://d32dm0rphc51dk.cloudfront.net/gVQuIJ18rJZ2cYio0q342w/large.jpg
623b9a8dda5caf000bc535ad -> https://d32dm0rphc51dk.cloudfront.net/Q_GcVrkSImhYzIi6agi_1Q/large.jpg
62410aee4df752000cce42e0 -> https://d32dm0rphc51dk.cloudfront.net/8KG7evNuY5iTzmedBzEefw/large.jpg
624190aacb

In [None]:
# name
#62418ef1292e30000baee482 -> C.H.R.I.S.T Sur Fiction
#6241a12649cd3b000c305ee5 -> Airbattle (Delaunay - Klee - Mondrian)
#624427ecdfca54000cc66f35 -> Sunset

#62418ef1292e30000baee482 Crist-sur-Fiction
#6241a12649cd3b000c305ee5 Airbattle 
#624427ecdfca54000cc66f35 The Bather

!!! Names and images can be different -> won't change them

In [None]:
one_to_one_columns = one_to_one_df.columns[1:-2].tolist()
print(one_to_one_columns)
expaned_columns = list(set(df_cat_dat_total.columns[1:-3].tolist()) - set(one_to_one_columns))
print(expaned_columns)

['name', 'link', 'artist', 'price']
['gallery', 'gene_id', 'year']


In [None]:
# Updated dict
# for gene_id
gene_df = expaned_cross_df.groupby(['artsy_id'])['gene_id'].apply(list).apply(set).apply(list).reset_index(name='gene_id')
gene_dict = gene_df.set_index('artsy_id').to_dict('index')

for id in cross_artwork_id:
  # category_id
  new_cat = list(set(cat_list[old_id_list.index(id)] + cat))
  artworks_dict[id].update({'category_id': new_cat})
  # gene_id
  new_gene = list(set(gene_list[old_id_list.index(id)] + gene_dict[id]['gene_id']))
  artworks_dict[id].update({'gene_id': new_gene})

len(artworks_dict)

168753

In [None]:
artworks_dict[cross_artwork_id[0]]

{'artist': 'Cathy Daley',
 'artist_id': '03126',
 'category_id': ['03', '02'],
 'gallery': 'Newzones',
 'gene_id': ['03059', '02173'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/_zFjgf3AD2c-e98K2M3wNg/large.jpg',
 'name': 'Untitled 1153',
 'parsing_date': '2022-04-01',
 'price': 'C$11,900 ',
 'updating_date': '2022-04-01',
 'year': '2019'}

##### New part dict creation

In [None]:
artworks_dict_new = one_to_one_new_df.set_index('artsy_id').to_dict('index') # with 'name'

# name
#artwork_dict_expan_oto(expaned_new_df, artworks_dict_new, 'name')
# year
artwork_dict_expan_oto(expaned_new_df, artworks_dict_new, 'year')
# gallery
artwork_dict_expan_oto(expaned_new_df, artworks_dict_new, 'gallery')
# gene_id
artwork_dict_expan_otm(expaned_new_df, artworks_dict_new, 'gene_id')
# category_id
for k in artworks_dict_new:
  artworks_dict_new[k].update({'category_id': [cat[0]]})

len(artworks_dict_new)

46583

In [None]:
# artist_id
group_df = expaned_new_df.groupby(['artsy_id', 'artist'])['artist'].agg(['count']).reset_index().drop(['count'], axis=1)

artists_dict = open_dict(artist_dict, 'live')
keys_list = list(artists_dict.keys())
values_list = [v['name'] for v in artists_dict.values()]
group_df['artist_id'] = group_df['artist'].dropna().apply(lambda x: keys_list[values_list.index(x)])
group_df = group_df.drop(['artist'], axis=1)

group_dict = group_df.set_index('artsy_id').to_dict('index')

for k in artworks_dict_new:
  if k in group_dict.keys():
    artworks_dict_new[k].update(group_dict[k])
  else:
    artworks_dict_new[k].update({'artist_id': ' '})

len(artworks_dict_new)

46583

In [None]:
artworks_dict_new[new_artwork_id[0]]

{'artist': 'Ralph Allen Massey',
 'artist_id': '25288',
 'category_id': ['02'],
 'gallery': 'bG Gallery',
 'gene_id': ['02161'],
 'link': 'https://d32dm0rphc51dk.cloudfront.net/G4Rr6jBiC_PGEDOAAZ9vLQ/large.jpg',
 'name': 'Information Vortex',
 'parsing_date': '2022-04-02',
 'price': 'US$4,400 ',
 'updating_date': '2022-04-02',
 'year': '2020'}

In [None]:
artworks_dict.update(artworks_dict_new)
print('New dict length must be:', artworks_dict_total_len)
print('New dict length is:', len(artworks_dict))

New dict length must be: 215336
New dict length is: 215336


In [None]:
save_dict(artworks_dict, artwork_dict, 'live') # 1. Save json dict

## Backup saving

In [None]:
save_dict(artworks_dict, artwork_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict
save_dict(artists_dict, artist_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict
save_dict(genes_dict, gen_dict.replace('.txt', '') + '_' + str(datetime.now().date()) + '.txt', 'backup') # 1. Save json dict

# Images

In [None]:
# Sample from API tables

# 'image': {'href': 'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/{image_version}.jpg',
#                      'templated': True}
#i = 'https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa-3r51Cg9qXKbfA/normalized.jpg'
#for p in i.split('.jpg')[0].split('/'):
#  print(len(p), p)

6 https:
0 
29 d32dm0rphc51dk.cloudfront.net
22 NOpIAwQa-3r51Cg9qXKbfA
10 normalized


In [None]:
#! pip install pillow --user

import PIL
from PIL import Image
import os
import cv2

%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

Populating the interactive namespace from numpy and matplotlib


In [None]:
image = requests.get(img, headers={'User-Agent': 'Chrome/97.0.4692.71'})
name = art_project_path + '/' + 'portrait_of_a_man_in_armour' + '.jpg'

img = open(name, 'wb')
img.write(image.content)
img.close()

img = Image.open(name)
plt.imshow(img)
plt.axis('off')
plt.show()

img.close()