In [None]:
from datasets import load_from_disk
from transformers import MarianMTModel, MarianTokenizer



In [None]:
# Load the dataset previously saved with `save_to_disk`
# (the path presumably points into a Colab-mounted Google Drive — confirm).
louvre_ds = load_from_disk("/content/drive/MyDrive/artistic_styles/paintings/louvre_ds_22_13")

In [None]:
# Language pair of the MarianMT checkpoint: French source, English target.
src, dest = "fr", "en"
model_name = f"Helsinki-NLP/opus-mt-{src}-{dest}"

# Tokenizer and model come from the same checkpoint; the model is placed on the GPU.
tokenizer = MarianTokenizer.from_pretrained(model_name)
translator = MarianMTModel.from_pretrained(model_name)
translator = translator.cuda()

In [None]:
# Placeholder text substituted for inputs that cannot be fed to the model:
# empty strings, or strings whose tokenized length exceeds the model input limit (> max_length).
dummy_str = "i"

In [None]:
def translate(data, device="cuda"):
  """Translate texts with the module-level `tokenizer` and `translator`.

  Args:
    data: a string or a list of strings to translate.
    device: device the tokenized tensors are moved to before generation;
      it has to be the device `translator` was placed on.

  Returns:
    A list of decoded translations (special tokens stripped), one per input.
  """
  # Nothing to translate: do not hand an empty batch to the tokenizer.
  if isinstance(data, (list, tuple)) and len(data) == 0:
    return []
  # `truncation=True` clips every input to the tokenizer's maximum model length,
  # so an over-long text is translated partially instead of breaking the model.
  batch = tokenizer(data, return_tensors="pt", padding=True, truncation=True)
  tokenized_batch = {k: v.to(device) for k, v in batch.items()}
  gen = translator.generate(**tokenized_batch)
  return tokenizer.batch_decode(gen, skip_special_tokens=True)


# Translate `materialsAndTechniques`



In [None]:
# Indices of rows whose text is too long to translate.
# Translating over-long texts is not solved yet: a solution exists, but it was too time-costly.
outlier_idxs = [13438]

In [None]:
# Working copy of the column: rows listed in `outlier_idxs` get the placeholder
# `dummy_str`, every other row keeps its original text.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "materialsAndTechniques_copy": dummy_str if idx in outlier_idxs else row["materialsAndTechniques"]
    },
    with_indices=True,
)

In [None]:
# Translate the working copy, 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_materialsAndTechniques": translate(rows['materialsAndTechniques_copy'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Rows that were fed the placeholder get an empty translation instead of
# whatever the model produced for it.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "translated_materialsAndTechniques": "" if idx in outlier_idxs else row["translated_materialsAndTechniques"]
    },
    with_indices=True,
)


In [None]:
# Rows whose source `materialsAndTechniques` is empty must end up with an empty
# `translated_materialsAndTechniques`, regardless of what was generated for them.
# The indices are kept in a set so the per-row membership test below is O(1);
# with a list the whole map is quadratic when many rows are empty.
outliers_idxs = {i for i, x in enumerate(louvre_ds['materialsAndTechniques'][:]) if len(x) == 0}

louvre_ds = louvre_ds.map(
    lambda x, i: {"translated_materialsAndTechniques": "" if i in outliers_idxs else x["translated_materialsAndTechniques"]},
    with_indices=True,
)

# Translate `description`

In [1]:
# Row indices whose `description` is swapped for `dummy_str` before translation and
# whose translation is blanked afterwards
# (presumably texts too long for the model — TODO confirm).
outlier_idxs = [1690, 1774, 3798, 4241, 4866, 4867, 4868, 4869, 4870, 4871, 4872, 4878, 4879, 4881, 4882, 4883, 4884, 4885, 4886, 4887, 4888, 4900, 6632, 6831, 8095, 8096, 8159, 8250, 8315, 8450, 8456, 8459, 8495, 8496, 8597, 9174, 9305, 9678, 10418, 10752, 10766, 10787, 10788, 10798, 10832, 15053, 15054, 15055, 15773]


In [None]:
# Working copy of `description`: rows listed in `outlier_idxs` get the
# placeholder `dummy_str`, every other row keeps its original text.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "description_copy": dummy_str if idx in outlier_idxs else row["description"]
    },
    with_indices=True,
)

In [None]:
# Translate the working copy of `description`, 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_description": translate(rows['description_copy'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Rows that were fed the placeholder get an empty translation instead of
# whatever the model produced for it.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "translated_description": "" if idx in outlier_idxs else row["translated_description"]
    },
    with_indices=True,
)

In [None]:
# Rows whose source `description` is empty must end up with an empty
# `translated_description`, regardless of what was generated for them.
# The indices are kept in a set so the per-row membership test below is O(1);
# with a list the whole map is quadratic when many rows are empty.
outliers_idxs = {i for i, x in enumerate(louvre_ds['description'][:]) if len(x) == 0}

louvre_ds = louvre_ds.map(
    lambda x, i: {"translated_description": "" if i in outliers_idxs else x["translated_description"]},
    with_indices=True,
)

# Translate `inscriptions`

In [None]:
# Row indices whose tokenized text exceeds the maximum input size (`max_size` = 512 tokens).
outlier_idxs = [7575]

In [None]:
# Working copy of `inscriptions`: rows listed in `outlier_idxs` get the
# placeholder `dummy_str`, every other row keeps its original text.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "inscriptions_copy": dummy_str if idx in outlier_idxs else row["inscriptions"]
    },
    with_indices=True,
)

In [None]:
# Translate the working copy of `inscriptions`, 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_inscriptions": translate(rows['inscriptions_copy'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Rows that were fed the placeholder get an empty translation instead of
# whatever the model produced for it.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "translated_inscriptions": "" if idx in outlier_idxs else row["translated_inscriptions"]
    },
    with_indices=True,
)

In [None]:
# Rows whose source `inscriptions` is empty must end up with an empty
# `translated_inscriptions`, regardless of what was generated for them.
# The indices are kept in a set so the per-row membership test below is O(1);
# with a list the whole map is quadratic when many rows are empty.
outliers_idxs = {i for i, x in enumerate(louvre_ds['inscriptions'][:]) if len(x) == 0}

louvre_ds = louvre_ds.map(
    lambda x, i: {"translated_inscriptions": "" if i in outliers_idxs else x["translated_inscriptions"]},
    with_indices=True,
)

# Translate `title`

In [None]:
# There are untitled ("no-name") artworks: collect the row indices whose `title` is empty.
# A set keeps the later per-row membership tests O(1) instead of scanning a list.
# NOTE(review): the name is misspelled (`ouliers_idxs`); it is kept because the cell
# that blanks `translated_title` reads it under this exact name — rename both together.
ouliers_idxs = {i for i, x in enumerate(louvre_ds['title'][:]) if len(x) == 0}

In [None]:
# Working copy of `title`: empty titles get the placeholder `dummy_str` before translation.
# The indices must come from `ouliers_idxs` (the empty-title rows computed for this column).
# The previous code read `outlier_idxs`, which still held the row indices of the
# `inscriptions` section, so an unrelated title was overwritten with the placeholder
# and its bogus translation was never blanked afterwards.
louvre_ds = louvre_ds.map(
    lambda x, i: {"title_copy": dummy_str if i in ouliers_idxs else x["title"]},
    with_indices=True,
)

In [None]:
# Translate the working copy of `title`, 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_title": translate(rows['title_copy'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Untitled artworks (rows listed in `ouliers_idxs`) get an empty translated title.
louvre_ds = louvre_ds.map(
    lambda row, idx: {
        "translated_title": "" if idx in ouliers_idxs else row["translated_title"]
    },
    with_indices=True,
)

# Translate `index`


In [None]:
# Translate the `index` column directly (no working copy), 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_index": translate(rows['index'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Rows whose source `index` is empty must end up with an empty
# `translated_index`, regardless of what was generated for them.
# The indices are kept in a set so the per-row membership test below is O(1);
# with a list the whole map is quadratic when many rows are empty.
outliers_idxs = {i for i, x in enumerate(louvre_ds['index'][:]) if len(x) == 0}

louvre_ds = louvre_ds.map(
    lambda x, i: {"translated_index": "" if i in outliers_idxs else x["translated_index"]},
    with_indices=True,
)


# Translate `attribution`

In [None]:
# Translate the `attribution` column directly (no working copy), 10 rows per call to `translate`.
louvre_ds = louvre_ds.map(
    lambda rows: {
        "translated_attribution": translate(rows['attribution'], "cuda")
    },
    batched=True,
    batch_size=10,
)


In [None]:
# Rows whose source `attribution` is empty must end up with an empty
# `translated_attribution`, regardless of what was generated for them.
# The indices are kept in a set so the per-row membership test below is O(1);
# with a list the whole map is quadratic when many rows are empty.
outliers_idxs = {i for i, x in enumerate(louvre_ds['attribution'][:]) if len(x) == 0}

louvre_ds = louvre_ds.map(
    lambda x, i: {"translated_attribution": "" if i in outliers_idxs else x["translated_attribution"]},
    with_indices=True,
)


In [None]:
# Drop the intermediate `*_copy` working columns; only the source and
# translated columns are kept.
louvre_ds = louvre_ds.remove_columns(
    [
        "materialsAndTechniques_copy",
        "description_copy",
        "inscriptions_copy",
        "title_copy",
    ]
)

In [None]:
# Persist the dataset, now including the translated columns.
# NOTE(review): this is the same directory the dataset was loaded from at the top of the
# notebook; `datasets` may refuse to overwrite a dataset in place (PermissionError) —
# confirm, and save to a new directory if so.
louvre_ds.save_to_disk("/content/drive/MyDrive/artistic_styles/paintings/louvre_ds_22_13")
