In [1]:
import polars as pl

In [2]:
# Load the objects table from the local "MetObjects.txt" file.
# ignore_errors=True asks polars to keep reading instead of aborting when a
# value cannot be parsed (see the polars read_csv documentation).
df = pl.read_csv(
    source="MetObjects.txt",
    ignore_errors=True,
)
# Print a compact per-column preview (name, dtype, first values).
df.glimpse()

Rows: 484956
Columns: 54
$ Object Number            <str> '1979.486.1', '1980.264.5', '67.265.9', '67.265.10', '67.265.11', '67.265.12', '67.265.13', '67.265.14', '67.265.15', '1979.486.3'
$ Is Highlight            <bool> False, False, False, False, False, False, False, False, False, False
$ Is Timeline Work        <bool> False, False, False, False, False, False, False, False, False, False
$ Is Public Domain        <bool> False, False, False, False, False, False, False, False, False, False
$ Object ID                <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Gallery Number           <i64> None, None, None, None, None, None, None, None, None, None
$ Department               <str> 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing', 'The American Wing'
$ AccessionYear            <i64> 1979, 1980, 1967, 1967, 1967, 1967, 1967, 1967, 1967, 1979
$ Object Name   

In [3]:
# Load the list of image blob paths from the local CSV file.
available_images = pl.read_csv(source="available_images.csv")
# Preview the first two rows.
available_images.head(n=2)

blob_path
str
"""100/0.jpg"""
"""1000/0.jpg"""


In [4]:
# Derive an integer `object_id` from the part of `blob_path` before the first
# "/" (e.g. "100/0.jpg" -> 100).
available_images = available_images.with_columns(
    pl.col("blob_path")
    .str.split("/")
    .list.first()
    .cast(int)
    .alias("object_id")
)
available_images.head()

blob_path,object_id
str,i64
"""100/0.jpg""",100
"""1000/0.jpg""",1000
"""10000/0.jpg""",10000
"""10001/0.jpg""",10001
"""100015/0.jpg""",100015


In [5]:
# Attach each object's image path by matching "Object ID" to `object_id`,
# then discard the rows for which no image path was found.
df = df.join(
    available_images,
    left_on="Object ID",
    right_on="object_id",
    how="left",
).drop_nulls(subset="blob_path")
df.head(2)

Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL,blob_path
str,bool,bool,bool,i64,i64,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""64.62""",False,False,False,33,774,"""The American Wing""",1964,"""Bust""","""Bust of Abraham Lincoln""","""American""",,,,,16459,"""Maker""",""" ""","""James Gillinder and Sons""","""American, 1861–ca. 1930""",""" ""","""Gillinder and Sons, James""","""American""","""1861 ""","""1930 """,,"""http://vocab.getty.edu/page/ul…","""https://www.wikidata.org/wiki/…","""1876""",1876,1876,"""Pressed glass""","""2 3/4 x 3 1/2 x 2 3/4 in. (7 x…","""Gift of Teunis G. B. Cortelyou…","""Made in""","""Philadelphia""",,,"""United States""",,,,,,,,,"""http://www.metmuseum.org/art/c…","""https://www.wikidata.org/wiki/…",,"""Metropolitan Museum of Art, Ne…","""Men|Abraham Lincoln|Portraits""","""http://vocab.getty.edu/page/aa…","""https://www.wikidata.org/wiki/…","""33/0.jpg"""
"""1970.289.6""",False,False,True,34,774,"""The American Wing""",1970,"""Clock""","""Acorn Clock""","""American""",,,,,108,"""Maker""",""" ""","""Forestville Manufacturing Comp…","""1835–1853""",""" ""","""Forestville Manufacturing Comp…","""American""","""1835 ""","""1853 """,,,,"""1847–50""",1847,1850,"""Mahogany, laminated""","""24 3/8 x 14 5/8 x 5 1/8 in. (6…","""Gift of Mrs. Paul Moore, 1970""","""Made in""","""Bristol""",,,"""United States""",,,,,,,,,"""http://www.metmuseum.org/art/c…","""https://www.wikidata.org/wiki/…",,"""Metropolitan Museum of Art, Ne…","""Landscapes|Boats""","""http://vocab.getty.edu/page/aa…","""https://www.wikidata.org/wiki/…","""34/0.jpg"""


In [6]:
# Sanity check: download a sample image to confirm that images correspond to their descriptions.
from google.cloud import storage
def download_public_file(bucket_name, source_blob_name, destination_file_name):
    """Download a single blob from a bucket to a local file and report it.

    The storage client is created anonymously, so no credentials are passed.

    Args:
        bucket_name: Name of the bucket that holds the blob.
        source_blob_name: Path of the blob inside the bucket.
        destination_file_name: Local path the blob contents are written to.
    """
    anonymous_client = storage.Client.create_anonymous_client()
    public_bucket = anonymous_client.bucket(bucket_name)
    public_bucket.blob(source_blob_name).download_to_filename(destination_file_name)

    # The message reports the bucket object's own name, not the raw argument.
    message = "Downloaded public blob {} from bucket {} to {}.".format(
        source_blob_name, public_bucket.name, destination_file_name
    )
    print(message)
# Fetch one sample image so it can be compared with its catalogue entry.
download_public_file(
    bucket_name="gcs-public-data--met",
    source_blob_name="34/0.jpg",
    destination_file_name="test.jpg",
)

Downloaded public blob 34/0.jpg from bucket gcs-public-data--met to test.jpg.


In [7]:
# Preview the distinct values of the "Department" column.
df.select("Department").unique().glimpse()

Rows: 19
Columns: 1
$ Department <str> 'Greek and Roman Art', 'European Sculpture and Decorative Arts', 'Egyptian Art', 'Islamic Art', 'European Paintings', 'The American Wing', 'The Cloisters', 'Modern and Contemporary Art', 'Asian Art', 'Drawings and Prints'



In [8]:
# Let polars render up to 20 rows per table from here on.
pl.Config.set_tbl_rows(20)
# Count objects per department, most frequent first.
df.get_column("Department").value_counts(sort=True)

Department,count
str,u32
"""Asian Art""",67439
"""Drawings and Prints""",60438
"""European Sculpture and Decorat…",57327
"""The American Wing""",27299
"""Islamic Art""",25862
"""Egyptian Art""",23793
"""Costume Institute""",20547
"""Medieval Art""",19506
"""Greek and Roman Art""",18308
"""Photographs""",13624


In [9]:
# Optional: filter out departments that do not hold paintings.
# Decided not to filter anything for now, so the filter below stays disabled.
# df = df.filter(
#     ~pl.col("Department").is_in(["European Sculpture and Decorative Arts", "Costume Institute", "Photographs", "Arms and Armor", "The Cloisters", "Musical Instruments", "The Libraries"])
# )

In [10]:
# (rows, columns) of the current frame.
(df.height, df.width)

(377784, 55)

In [13]:
# Map every column name to its lower-cased form with spaces turned into
# underscores (e.g. "Object ID" -> "object_id"), then apply the mapping.
to_rename = {name: "_".join(name.lower().split(" ")) for name in df.columns}
df = df.rename(to_rename)
df.head(1)

object_number,is_highlight,is_timeline_work,is_public_domain,object_id,gallery_number,department,accessionyear,object_name,title,culture,period,dynasty,reign,portfolio,constituent_id,artist_role,artist_prefix,artist_display_name,artist_display_bio,artist_suffix,artist_alpha_sort,artist_nationality,artist_begin_date,artist_end_date,artist_gender,artist_ulan_url,artist_wikidata_url,object_date,object_begin_date,object_end_date,medium,dimensions,credit_line,geography_type,city,state,county,country,region,subregion,locale,locus,excavation,river,classification,rights_and_reproduction,link_resource,object_wikidata_url,metadata_date,repository,tags,tags_aat_url,tags_wikidata_url,blob_path
str,bool,bool,bool,i64,i64,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""64.62""",False,False,False,33,774,"""The American Wing""",1964,"""Bust""","""Bust of Abraham Lincoln""","""American""",,,,,16459,"""Maker""",""" ""","""James Gillinder and Sons""","""American, 1861–ca. 1930""",""" ""","""Gillinder and Sons, James""","""American""","""1861 ""","""1930 """,,"""http://vocab.getty.edu/page/ul…","""https://www.wikidata.org/wiki/…","""1876""",1876,1876,"""Pressed glass""","""2 3/4 x 3 1/2 x 2 3/4 in. (7 x…","""Gift of Teunis G. B. Cortelyou…","""Made in""","""Philadelphia""",,,"""United States""",,,,,,,,,"""http://www.metmuseum.org/art/c…","""https://www.wikidata.org/wiki/…",,"""Metropolitan Museum of Art, Ne…","""Men|Abraham Lincoln|Portraits""","""http://vocab.getty.edu/page/aa…","""https://www.wikidata.org/wiki/…","""33/0.jpg"""


In [16]:
# Narrow the frame to the identifier, descriptive, artist, date, material,
# geography and image-path columns, in this order.
df = df.select(
    "object_id",
    "department",
    "object_name",
    "title",
    "culture",
    "period",
    "dynasty",
    "reign",
    "portfolio",
    "artist_role",
    "artist_display_name",
    "artist_display_bio",
    "artist_nationality",
    "artist_begin_date",
    "artist_end_date",
    "artist_gender",
    "object_date",
    "medium",
    "dimensions",
    "geography_type",
    "city",
    "state",
    "country",
    "region",
    "blob_path",
)

In [17]:
# Persist the frame as newline-delimited JSON (note: the file keeps a ".json" suffix).
df.write_ndjson(file="met_objects.json")