# Art Institute of Chicago web scraper

In [None]:
# Terms of use: include an AIC-User-Agent header that identifies the
# application and gives a contact address.
headers = {
    "AIC-User-Agent": "artworks-semantic-search-application (annabozhenko2003@gmail.com)",
}

In [1]:
from pathlib import Path 
import os

import json

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Explore the artwork JSON structure

In [2]:
# Local copy of the artwork JSON records: the zipped folder was downloaded
# from the official source and unpacked into this directory.
chicago_collection_dir = Path(
    r"D:\artworks_search_engine\data_retrieval\artic-api-data\json\artworks"
)


In [None]:
# Which kinds of works are in the Chicago collection, aside
# from prints and drawings, and paintings?
# 
# 

In [1]:
# Keys read from every artwork JSON record. The order of this list is the
# order in which the per-field columns are created.
important_fields = [
    "title",
    "place_of_origin",
    "medium_display",
    "classification_titles",
    "subject_titles",
    "style_title",  # can be null in the records -- handle null
    "technique_titles",
    "term_titles",
    "material_titles",
    "description",
    "category_titles",
    "artist_title",
    "id",
    "date_start",
    "date_end",
    "department_title",
    "image_id",
]


In [2]:
# Show the selected field names as a single list for quick reference.
print(important_fields)

['title', 'place_of_origin', 'medium_display', 'classification_titles', 'subject_titles', 'style_title', 'technique_titles', 'term_titles', 'material_titles', 'description', 'category_titles', 'artist_title', 'id', 'date_start', 'date_end', 'department_title', 'image_id']


In [6]:
# Directory of artwork JSON files (each file is parsed as one artwork record).
chicago_collection_dir = Path(r"D:\artworks_search_engine\data_retrieval\artic-api-data\json\artworks")

# Column-oriented accumulator: one list per selected field, so the result can
# be passed directly to Dataset.from_dict.
artworks = {field: [] for field in important_fields}

for file_name in os.listdir(chicago_collection_dir):
    # Read as UTF-8 explicitly: without `encoding`, open() falls back to the
    # platform's locale encoding, which is not UTF-8 on most Windows setups
    # and can corrupt or reject non-ASCII text in the JSON.
    with open(Path(chicago_collection_dir, file_name), encoding="utf-8") as json_file:
        artwork = json.load(json_file)
    # A record missing one of the fields raises KeyError here rather than
    # silently producing columns of different lengths.
    for field in important_fields:
        artworks[field].append(artwork[field])
        

In [7]:
from datasets import Dataset

In [8]:
# Build a Dataset from the dict of per-field lists (one column per field).
ds = Dataset.from_dict(artworks)

In [9]:
# Display the dataset summary (column names and row count).
ds

Dataset({
    features: ['title', 'place_of_origin', 'medium_display', 'classification_titles', 'subject_titles', 'style_title', 'technique_titles', 'term_titles', 'material_titles', 'description', 'category_titles', 'artist_title', 'id', 'date_start', 'date_end', 'department_title', 'image_id'],
    num_rows: 134078
})

In [72]:
# list_ts = [
#  'classification_titles',
#  'subject_titles',
#  'technique_titles',
#  'material_titles',
#  'category_titles',
#  'term_titles']

# ds = ds.map(lambda x: {f: ", ".join(x[f]) for f in list_ts})


Map: 100%|██████████| 134078/134078 [00:30<00:00, 4329.00 examples/s]


In [10]:
# Shorter column names: drop the "_title(s)" suffixes and call the
# term_titles column "tags".
column_renames = {
    "classification_titles": "classification",
    "subject_titles": "subjects",
    "technique_titles": "techniques",
    "material_titles": "materials",
    "category_titles": "categories",
    "term_titles": "tags",
    "artist_title": "artist",
}
ds = ds.rename_columns(column_renames)

In [11]:
# Every row gets the same museum metadata, so each new column is a single
# constant repeated once per row.
N = len(ds)

museum_constants = {
    "museum": "The Art Institute of Chicago",
    "museum_region": "Chicago, Illinois",
    "museum_country": "USA",
}
for column_name, constant_value in museum_constants.items():
    ds = ds.add_column(column_name, [constant_value] * N)


In [12]:
def _iiif_image_urls(batch):
    """Return an "image_url" column for a batch, built from each row's image_id."""
    # NOTE(review): a row whose image_id is None gets a URL containing the
    # text "None" -- confirm whether such rows exist and need handling.
    return {
        "image_url": [
            f"https://www.artic.edu/iiif/2/{image_id}/full/843,/0/default.jpg"
            for image_id in batch['image_id']
        ]
    }


# Batched map: the function receives up to 1000 rows at a time.
ds = ds.map(_iiif_image_urls, batched=True, batch_size=1000)


Map: 100%|██████████| 134078/134078 [00:36<00:00, 3656.81 examples/s]


In [1]:
from datasets import load_from_disk


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset stored in the "chicago_ds" directory (path is relative to
# the current working directory).
chicago_ds = load_from_disk("chicago_ds")


In [6]:
# Attach constant museum metadata columns, one value repeated per row.
# NOTE(review): museum_region is "Chicago" here, while the equivalent cell
# that operates on `ds` uses "Chicago, Illinois" -- confirm which is intended.
N = len(chicago_ds)

for column_name, constant_value in (
    ("museum", "The Art Institute of Chicago"),
    ("museum_region", "Chicago"),
    ("museum_country", "USA"),
):
    chicago_ds = chicago_ds.add_column(column_name, [constant_value] * N)


In [15]:
# Inspect the first record to check the renamed and added columns.
ds[0]

{'title': 'Simon Vouet',
 'place_of_origin': 'Flanders',
 'medium_display': 'Engraving in black on cream laid paper',
 'classification': ['engraving', 'print', 'prints and drawing'],
 'subjects': [],
 'style_title': None,
 'techniques': [],
 'tags': ['engraving', 'paper (fiber product)', 'print', 'prints and drawing'],
 'materials': ['paper (fiber product)'],
 'description': None,
 'categories': ['Prints and Drawings'],
 'artist': 'Robert van Voerst',
 'id': 100,
 'date_start': 1630,
 'date_end': 1636,
 'department_title': 'Prints and Drawings',
 'image_id': '03c0fd45-3690-5fae-8023-4a3c16d3bfad',
 'museum': 'The Art Institute of Chicago',
 'museum_region': 'Chicago, Illinois',
 'museum_country': 'USA',
 'image_url': 'https://www.artic.edu/iiif/2/03c0fd45-3690-5fae-8023-4a3c16d3bfad/full/843,/0/default.jpg'}

In [3]:
# Columns that are concatenated, in this order, into the "full_info" text.
full_info_fields = [
    "title",
    "classification",
    "subjects",
    "techniques",
    "materials",
    "description",
    "categories",
    "artist",
    "date_start",
    "date_end",
]

In [16]:
def _artwork_page_urls(batch):
    """Return a "url" column for a batch, built from each row's id."""
    return {
        "url": [
            f"https://www.artic.edu/artworks/{artwork_id}"
            for artwork_id in batch["id"]
        ]
    }


# Batched map: the function receives up to 1000 rows at a time.
ds = ds.map(_artwork_page_urls, batched=True, batch_size=1000)

Map: 100%|██████████| 134078/134078 [00:00<00:00, 234776.89 examples/s]


In [52]:
# Persist the dataset to ./chicago_ds (relative to the working directory).
ds.save_to_disk("./chicago_ds")


Saving the dataset (1/1 shards): 100%|██████████| 134078/134078 [00:00<00:00, 296462.20 examples/s]


In [54]:
from datasets import load_from_disk

In [55]:
# Read the dataset back from ./chicago_ds.
ds = load_from_disk("./chicago_ds")

In [56]:
# Display the reloaded dataset's column names and row count.
ds

Dataset({
    features: ['title', 'place_of_origin', 'medium_display', 'classification', 'subjects', 'style_title', 'techniques', 'tags', 'materials', 'description', 'categories', 'artist', 'id', 'date_start', 'date_end', 'department_title', 'image_id', 'museum', 'museum_region', 'museum_country', 'image_url', 'url'],
    num_rows: 134078
})

# Redundant code (very specific) start

In [7]:
 
def safe_string(x):
    """Return '' when x is None; otherwise return x unchanged."""
    if x is None:
        return ''
    return x

def _full_info(row):
    """Return the "full_info" text for one row: "field: value" pairs joined by ". "."""
    # Values are formatted with an f-string, so list-valued fields appear in
    # their Python list form; None is replaced by '' via safe_string.
    labelled = [f"{f}: {safe_string(row[f])}" for f in full_info_fields]
    return {
        "full_info": ". ".join(labelled)
        + ". museum: The Art Institute of Chicago, location: Chicago, USA"
    }


ds = ds.map(_full_info)


Map: 100%|██████████| 134078/134078 [00:40<00:00, 3283.28 examples/s]


In [None]:
 
def safe_string(x):
    """Substitute an empty string for None; pass any other value through as is."""
    if x is not None:
        return x
    return ''

def _row_to_full_info(row):
    """Build the "full_info" text for one row from the full_info_fields columns."""
    parts = []
    for f in full_info_fields:
        # None values are rendered as '' rather than the text "None".
        parts.append(f"{f}: {safe_string(row[f])}")
    text = ". ".join(parts) + ". museum: The Art Institute of Chicago, location: Chicago, USA"
    return {"full_info": text}


ds = ds.map(_row_to_full_info)


In [None]:
# Add a "url" column built from each row's id, processing up to 1000 rows
# per call.
ds = ds.map(
    lambda rows: {
        "url": [f"https://www.artic.edu/artworks/{artwork_id}" for artwork_id in rows["id"]]
    },
    batched=True,
    batch_size=1000,
)


In [20]:
from datasets import Features, Value

In [24]:
# Inspect the column types of the dataset.
ds.features

{'title': Value(dtype='string', id=None),
 'place_of_origin': Value(dtype='string', id=None),
 'medium_display': Value(dtype='string', id=None),
 'classification': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'subjects': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'style_title': Value(dtype='string', id=None),
 'techniques': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'materials': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'description': Value(dtype='string', id=None),
 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'artist': Value(dtype='string', id=None),
 'id': Value(dtype='int64', id=None),
 'date_start': Value(dtype='int64', id=None),
 'date_end': Value(dtype='int64', id=None),
 'department_title': Value(dtype='string', id=None),
 'image_id': Value(dtype='stri

In [None]:
# Convert the id and date fields to strings.
# Dataset.cast expects a schema covering every current column, so start from
# a copy of the existing features and override only the three target columns
# (passing a Features object with just those three names does not match the
# dataset's columns).
string_features = ds.features.copy()
for feature_name in ["id", "date_start", "date_end"]:
    string_features[feature_name] = Value("string")
ds = ds.cast(string_features)

In [46]:
def _lacks_description(batch):
    """Return one boolean per row: True when description is None or ''."""
    return [text is None or text == "" for text in batch["description"]]


# Keep only the rows without a description, checking up to 1000 rows per call.
nodescription = ds.filter(_lacks_description, batched=True, batch_size=1000)
nodescription

Filter: 100%|██████████| 134078/134078 [00:00<00:00, 227453.37 examples/s]


Dataset({
    features: ['title', 'place_of_origin', 'medium_display', 'classification', 'subjects', 'style_title', 'techniques', 'tags', 'materials', 'description', 'categories', 'artist', 'id', 'date_start', 'date_end', 'department_title', 'image_id', 'museum', 'museum_region', 'museum_country', 'image_url', 'url'],
    num_rows: 121037
})

In [51]:
# Look at one example record from the rows that have no description.
nodescription[10000]

{'title': "Rhapsodent II - My Dentist's Dream",
 'place_of_origin': 'United States',
 'medium_display': 'Collage composed of cut and pasted papers, graphite and colored pencils, on off-white cardboard',
 'classification': ['collage',
  'drawings (visual works)',
  'prints and drawing'],
 'subjects': [],
 'style_title': None,
 'techniques': ['collage (technique)'],
 'tags': ['collage',
  'graphite',
  'collage (technique)',
  'colored pencil',
  'pencil',
  'paper (fiber product)',
  'drawings (visual works)',
  'prints and drawing'],
 'materials': ['graphite',
  'colored pencil',
  'pencil',
  'paper (fiber product)'],
 'description': None,
 'categories': ['Prints and Drawings',
  'Women artists',
  'Chicago Artists',
  'SAIC Alumni and Faculty'],
 'artist': 'Barbara Rossi',
 'id': 116577,
 'date_start': 1974,
 'date_end': 1984,
 'department_title': 'Prints and Drawings',
 'image_id': '440b0a36-2473-b808-43c3-499f7eaded88',
 'museum': 'The Art Institute of Chicago',
 'museum_region': '

In [41]:
# Write the dataset to ./chicago_ds (relative to the working directory).
ds.save_to_disk("./chicago_ds")


Saving the dataset (1/1 shards): 100%|██████████| 134078/134078 [00:00<00:00, 395489.48 examples/s]


In [None]:
# Write the dataset to ./chicago_ds (same target as the preceding save cell).
ds.save_to_disk("./chicago_ds")


# Redundant code (very specific) end