In [None]:
from datasets import load_from_disk, concatenate_datasets
from datasets import Sequence, Features, Value
from huggingface_hub import notebook_login

import re

In [None]:
# Locations of the two previously saved museum datasets.
LOUVRE_DS_PATH = "/content/drive/MyDrive/artistic_styles/paintings/louvre_ds_22_13"
CHICAGO_DS_PATH = "/content/drive/MyDrive/artistic_styles/paintings/chicago_ds"

# Load both collections from disk.
louvre_ds = load_from_disk(LOUVRE_DS_PATH)
chicago_ds = load_from_disk(CHICAGO_DS_PATH)


### Clean Louvre's dataset

In [None]:
# Rename the Louvre columns to the names selected when both museums are merged.
louvre_column_renames = {"image": "image_url", "arkId": "id"}
louvre_ds = louvre_ds.rename_columns(louvre_column_renames)


In [None]:
# Attach constant museum metadata to every Louvre row.
N = len(louvre_ds)
louvre_museum_info = {
    "museum": "Musée du Louvre",
    "museum_region": "Paris",
    "museum_country": "France",
}
for column_name, constant_value in louvre_museum_info.items():
  louvre_ds = louvre_ds.add_column(column_name, [constant_value] * N)



In [None]:
def _louvre_year_ranges(batch):
  """Build a "<date_start>-<date_end>" string for every row of the batch."""
  starts, ends = batch['date_start'], batch['date_end']
  return {"years of creation": [f"{first}-{last}" for first, last in zip(starts, ends)]}


louvre_ds = louvre_ds.map(_louvre_year_ranges, batched=True, batch_size=1000)


In [None]:
# Drop these text columns; their `translated_*` counterparts are renamed to
# the same names in the following cells.
columns_to_drop = ["title", "attribution", "index", "inscriptions", "description", "materialsAndTechniques"]
louvre_ds = louvre_ds.remove_columns(columns_to_drop)

In [None]:
# Map every "translated_<name>" column to its plain "<name>".
renamings = dict(
    ("translated_" + base_name, base_name)
    for base_name in ("title", "attribution", "index", "inscriptions", "description", "materialsAndTechniques")
)

In [None]:
# Apply the "translated_*" -> plain-name mapping held in `renamings`.
louvre_ds = louvre_ds.rename_columns(column_mapping=renamings)

In [None]:
def _has_content(value):
  """Return True when `value` should appear in the info string.

  `None` and empty sized values (e.g. "" or []) count as missing; values
  without a length (e.g. numbers) count as present.
  """
  if value is None:
    return False
  try:
    return len(value) > 0
  except TypeError:
    return True


def form_fullinfo(batch, fields, connecting_char,
                  suffix=". museum: Musée du Louvre. location: Paris, France"):
  """Build one "field: value" description string per row of a batch.

  Args:
    batch: mapping of column name -> list of values (one entry per row).
    fields: column names to include, in output order. Must be non-empty,
      because the row count is taken from the first field.
    connecting_char: separator placed between the "field: value" parts.
    suffix: text appended to every description; defaults to the Louvre
      museum/location sentence.

  Returns:
    A list with one description string per row. Missing values (`None` or
    empty) are skipped instead of raising.
  """
  row_count = len(batch[fields[0]])
  batch_infos = []
  for i in range(row_count):
    parts = [f"{field}: {batch[field][i]}"
             for field in fields
             if _has_content(batch[field][i])]
    batch_infos.append(connecting_char.join(parts) + suffix)
  return batch_infos


In [None]:
fields = ['title', 'artist', 'attribution', 'years of creation', 'description', 'index', 'inscriptions']

# Build `full_info` with the batched `form_fullinfo` helper so that Louvre
# descriptions end with the museum/location sentence, consistent with the
# `full_info` text built for the Chicago dataset.
louvre_ds = louvre_ds.map(
    lambda batch: {"full_info": form_fullinfo(batch, fields, ". ")},
    batched=True,
    batch_size=1000
)


In [None]:
# Integer identifier assigned to each source museum.
museum_ids = dict(louvre=0, AIC=1)

In [None]:
# Tag every Louvre row with the Louvre entry of `museum_ids`.
louvre_ds = louvre_ds.map(
    lambda batch: {"museum_id": [museum_ids["louvre"]] * len(batch["id"])},
    batched=True,
    batch_size=1000,
)

### Clean AIC's dataset

In [None]:
# Names of all Chicago columns whose feature type is a `Sequence`.
fields_to_concat = [
    name
    for name in chicago_ds.features
    if isinstance(chicago_ds.features[name], Sequence)
]

In [None]:
concat_char = ", "


def _join_sequence_fields(batch):
  """Collapse every field in `fields_to_concat` into one `concat_char`-joined string per row."""
  return {
      field: [concat_char.join(items) for items in batch[field]]
      for field in fields_to_concat
  }


chicago_ds = chicago_ds.map(_join_sequence_fields, batched=True, batch_size=1000)


In [None]:
# Cast every Chicago column to the string type.
string_schema = {column: Value("string") for column in chicago_ds.features}
chicago_ds = chicago_ds.cast(Features(string_schema))


In [None]:
int_fields = ['id', 'date_start', 'date_end']


def _stringify_keep_missing(values):
  """Convert each value to `str`, leaving `None` untouched."""
  return [None if value is None else str(value) for value in values]


# Missing values must stay `None` here: `str(None)` would store the literal
# text "None", which the null-detection / null-filling cells further down
# would no longer recognise as missing.
chicago_ds = chicago_ds.map(
    lambda b: {f: _stringify_keep_missing(b[f]) for f in int_fields},
    batched=True,
    batch_size=1000
)


In [None]:
def _chicago_year_ranges(batch):
  """Build a "<date_start>-<date_end>" string for every row of the batch."""
  starts, ends = batch['date_start'], batch['date_end']
  return {"years of creation": [f"{first}-{last}" for first, last in zip(starts, ends)]}


chicago_ds = chicago_ds.map(_chicago_year_ranges, batched=True, batch_size=1000)

In [None]:
# Pull the whole Chicago dataset in pandas format and keep the names of the
# columns that contain at least one missing value.
df = chicago_ds.with_format("pandas")[:]
null_counts = df.isna().sum()
none_fields = null_counts.loc[null_counts.gt(0)].index


In [None]:
def _fill_missing_with_empty(batch):
  """Replace `None` entries with '' in every column listed in `none_fields`."""
  return {
      field: ['' if value is None else value for value in batch[field]]
      for field in none_fields
  }


chicago_ds = chicago_ds.map(_fill_missing_with_empty, batched=True, batch_size=1000)


In [None]:
def remove_html_tags(text):
  """Return `text` with every `<...>` span deleted."""
  tag_pattern = re.compile("<[^>]*>")
  return tag_pattern.sub("", text)

def _strip_description_html(batch):
  """Apply `remove_html_tags` to every description in the batch."""
  return {"description": list(map(remove_html_tags, batch["description"]))}


chicago_ds = chicago_ds.map(
    _strip_description_html,
    batched=True,
    batch_size=1000,
    num_proc=1,
)


In [None]:
# form `full_info`
full_info_fields = ['title', 'artist', 'years of creation', 'tags', 'style_title', 'subjects', 'classification', 'techniques', 'materials', 'description', 'categories', 'place_of_origin']
concat_char = ". "


def _chicago_full_info(row):
  """Join the non-empty `full_info_fields` of one row into a single description.

  Underscores in field names are shown as spaces, and the museum/location
  sentence is appended at the end.
  """
  labelled_parts = []
  for field in full_info_fields:
    if len(row[field]) > 0:
      labelled_parts.append(f"{field.replace('_', ' ')}: {row[field]}")
  return {"full_info": concat_char.join(labelled_parts) + ". museum: The Art Institute of Chicago, location: Chicago, USA"}


chicago_ds = chicago_ds.map(_chicago_full_info)


In [None]:
# Tag every Chicago row with the AIC entry of `museum_ids`.
chicago_ds = chicago_ds.map(
    lambda batch: {"museum_id": [museum_ids["AIC"]] * len(batch["id"])},
    batched=True,
    batch_size=1000,
)


In [None]:
# Columns kept from both museums, in the order used for the merged dataset.
shared_columns = ['id', 'url', 'image_url', 'title', 'artist', 'date_start', 'date_end', 'full_info', 'museum', 'museum_region', 'museum_country', 'museum_id']

# NOTE(review): no visible cell adds 'museum', 'museum_region' or
# 'museum_country' to `chicago_ds`; presumably the saved Chicago dataset
# already contains them — confirm.
artworks = concatenate_datasets(
    [museum_ds.select_columns(shared_columns) for museum_ds in (louvre_ds, chicago_ds)]
)


In [None]:
# Persist the merged Louvre + Chicago dataset to disk.
artworks.save_to_disk("/content/drive/MyDrive/artistic_styles/paintings/artworks_ds")

In [None]:
# Optional git identity setup (disabled):
# !git config --global user.email "ms.anna.bozhenko.03@gmail.com"
# !git config --global user.name "Anna Bozhenko"

# Log in through the `huggingface_hub` notebook prompt, then upload the merged
# dataset to the "anna-bozhenko/artworks" repository with the given commit message.
notebook_login()
artworks.push_to_hub("anna-bozhenko/artworks", commit_message="Created dataset of shallow discriptions from online collections of the Louvre Museum and The Art Institute of Chicago.")