# Preprocess data
This notebook normalizes the names of painting types and styles, removes hard-to-comprehend artworks (based on style and type), filters out artworks that are duplicated or are not paintings, and places the filtered dataset in one directory. 

### 0. Import libraries and load data

In [None]:
import os
import json
import shutil

import polars as pl
from tqdm import tqdm

from preprocess_data_utils import *

# Palette of five hex colours. Not referenced in the visible part of this notebook
# (presumably meant for plots — confirm before removing).
COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]

In [None]:
# Root folders of the raw and intermediate data, relative to the notebook's working directory.
# Both end with "/" because the cells below build paths by plain string concatenation.
RAW_DATA_PATH = "../../data/raw/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"

In [None]:
# Create the output folders of the filtered dataset (metadata under the intermediate root,
# images under the raw root). `exist_ok=True` makes re-running the cell harmless, and
# `makedirs` also creates any missing parent folder instead of failing on it.
for base_path in (INTERMEDIATE_DATA_PATH, RAW_DATA_PATH):
    os.makedirs(base_path + "filtered_paintings/", exist_ok=True)

In [None]:
# Read the enhanced metadata of each source collection and tag every row with the
# collection it came from, then stack the three frames into one.
datasets_names = ["met", "wikiart", "wga"]
datasets = [
    pl.read_json(
        f"{INTERMEDIATE_DATA_PATH}{dataset_name}_paintings/{dataset_name}_paintings_enhanced_data.json"
    ).with_columns(pl.lit(dataset_name).alias("source"))
    for dataset_name in datasets_names
]

data = pl.concat(datasets)
data

### 1. Normalize styles and types of paintings

In [None]:
fine_type_col = pl.col("fine_grained_type")

# Keep rows without a fine-grained type, and rows whose type contains none of the
# filtered-out types.
data = data.filter(
    fine_type_col.is_null() | ~fine_type_col.str.contains_any(FILTERED_OUT_TYPES)
)

# Split "a, b" into a two-field struct and hand it to `sort_elements` (imported helper);
# its result is read back as a list and spread over two dedicated columns.
ordered_fine_types = (
    fine_type_col.str.split_exact(", ", 1)
    .struct.rename_fields(["first_type", "second_type"])
    .map_elements(lambda pair: sort_elements(pair, "first_type", "second_type"))
)
data = (
    data.with_columns(ordered_fine_types)
    .with_columns(
        fine_type_col.list.get(0).alias("first_fine_grained_type"),
        fine_type_col.list.get(1).alias("second_fine_grained_type"),
    )
    .drop("fine_grained_type")
)

first_fine_type = pl.col("first_fine_grained_type")
second_fine_type = pl.col("second_fine_grained_type")

# Map both columns through the shared mapping, then blank out the second type when the
# mapping made it identical to the first one.
data = data.with_columns(
    first_fine_type.replace(FINE_GRAINED_TYPES_MAPPING),
    second_fine_type.replace(FINE_GRAINED_TYPES_MAPPING),
).with_columns(
    pl.when(first_fine_type == second_fine_type)
    .then(pl.lit(None))
    .otherwise(second_fine_type)
    .alias("second_fine_grained_type")
)

In [None]:
# Keep rows without a coarse type, and rows whose coarse type contains none of the
# filtered-out coarse types.
coarse_type_col = pl.col("coarse_type")
data = data.filter(
    coarse_type_col.is_null() | ~coarse_type_col.str.contains_any(FILTERED_OUT_COARSE_TYPES)
)
data

In [None]:
style_col = pl.col("style")

# Keep rows without a style, and rows whose style contains none of the filtered-out styles.
data = data.filter(style_col.is_null() | ~style_col.str.contains_any(FILTERED_OUT_STYLES))

# Split "a, b" into a two-field struct and hand it to `sort_elements` (imported helper);
# its result is read back as a list and spread over two dedicated columns.
ordered_styles = (
    style_col.str.split_exact(", ", 1)
    .struct.rename_fields(["first_style", "second_style"])
    .map_elements(lambda pair: sort_elements(pair, "first_style", "second_style"))
)
data = (
    data.with_columns(ordered_styles)
    .with_columns(
        style_col.list.get(0).alias("first_style"),
        style_col.list.get(1).alias("second_style"),
    )
    .drop("style")
)

first_style_col = pl.col("first_style")
second_style_col = pl.col("second_style")

# Map both columns through the shared mapping, then blank out the second style when the
# mapping made it identical to the first one.
data = data.with_columns(
    first_style_col.replace(STYLES_MAPPING),
    second_style_col.replace(STYLES_MAPPING),
).with_columns(
    pl.when(first_style_col == second_style_col)
    .then(pl.lit(None))
    .otherwise(second_style_col)
    .alias("second_style")
)

### 2. Filter MET paintings based on their medium type
For example, avoid including miniature paintings

In [None]:
# Separate the MET rows from the rest so that only they go through the object-type filter.
is_met_row = pl.col("source") == "met"
met_paintings = data.filter(is_met_row)
non_met_paintings = data.filter(~is_met_row)

In [None]:
# Lookup table from "<title> <artist>" to the MET "Object Name", built from the raw MET export.
met_raw_catalogue = pl.read_csv(RAW_DATA_PATH + "met_paintings/met_data.csv", ignore_errors=True)
met_object_types = met_raw_catalogue.select(
    (pl.col("Title") + " " + pl.col("Artist Display Name")).alias("Painting Identifier"),
    "Object Name",
).unique()
met_object_types

In [None]:
# Attach the object type to every MET painting through the "<title> <artist>" key.
met_identifier = (pl.col("title") + " " + pl.col("artist")).alias("Painting Identifier")
met_with_object_types = (
    met_paintings.filter(pl.col("source") == "met")
    .with_columns(met_identifier)
    .join(met_object_types, on="Painting Identifier")
    .unique()
)

# Keep only the accepted object types, then drop the helper columns; the second `unique`
# collapses rows that only differed by those helper columns.
filtered_met_paintings = (
    met_with_object_types.filter(pl.col("Object Name").is_in(MET_KEPT_OBJECT_TYPES))
    .drop("Object Name", "Painting Identifier")
    .unique()
)

data = pl.concat([non_met_paintings, filtered_met_paintings]).sort("id")
data

### 3. Filter WGA paintings based on their medium type
For example, avoid including frescoes, murals, etc.

In [None]:
# Separate the WGA rows from the rest so that only they go through the technique filter.
is_wga_row = pl.col("source") == "wga"
wga_paintings = data.filter(is_wga_row)
non_wga_paintings = data.filter(~is_wga_row)

In [None]:
wgq_techniques = (
    pl.read_csv(INTERMEDIATE_DATA_PATH + "wga_paintings/wga_processed.csv")
    .with_columns(
        (pl.col("raw_title") + " " + pl.col("raw_artist") + " " + str(pl.col("year"))).alias(
            "Painting Identifier"
        )
    )
    .select("Painting Identifier", "technique")
    .unique()
)
wgq_techniques

In [None]:
wga_with_techniques = wga_paintings.with_columns(
    (pl.col("title") + " " + pl.col("artist") + " " + str(pl.col("year"))).alias(
        "Painting Identifier"
    )
).join(wgq_techniques, on="Painting Identifier")

WGA_ALL_KEPT_TECHNIQUES = set(
    [
        x[0]
        for x in wga_with_techniques["technique"]
        .value_counts()
        .sort("count", descending=True)
        .to_numpy()
        if ("canvas" in x[0] or "paper" in x[0] or "pastel" in x[0] or "pencil" in x[0])
        and "fresco" not in x[0]
        and "mural" not in x[0]
        and "panel" not in x[0]
    ]
    + WGA_KEPT_TECHNIQUES
)

wga_filtered_paintings = (
    wga_with_techniques.filter(pl.col("technique").is_in(list(WGA_ALL_KEPT_TECHNIQUES)))
    .drop("technique", "Painting Identifier")
    .unique()
)
wga_filtered_paintings

data = pl.concat([non_wga_paintings, wga_filtered_paintings]).sort("id")
data

### 4. Remove WGA duplicate paintings and detailed views of paintings

In [None]:
# Identifier used for de-duplication: title, artist, year and description glued together
# without separators. Plain `+` concatenation yields null as soon as one part is null.
year_as_text = pl.col("year").cast(pl.String)
painting_identifier = pl.col("title") + pl.col("artist") + year_as_text + pl.col("description")

data = data.with_columns(painting_identifier.alias("Painting Identifier"))
data

In [None]:
# Drop WGA rows whose title carries the "(detail)" marker (close-up views of another painting).
# `literal=True` is required: `str.contains` interprets its pattern as a regular expression
# by default, where "(detail)" is a capture group that matches the bare word "detail"
# anywhere in the title.
is_wga_detail = pl.col("title").str.contains("(detail)", literal=True) & (
    pl.col("source") == "wga"
)
data_wo_detailed_view = data.filter(~is_wga_detail)
data_wo_detailed_view

In [None]:
# Identifiers that occur more than once.
identifier_counts = data_wo_detailed_view["Painting Identifier"].value_counts().sort("count")
paintings_identifiers = identifier_counts.filter(pl.col("count") > 1)[
    "Painting Identifier"
].to_list()

is_repeated = pl.col("Painting Identifier").is_in(paintings_identifiers)

# Repeated identifiers: keep the first row of each group. All other rows pass through as is.
data_wo_duplicates_one = (
    data_wo_detailed_view.filter(is_repeated).group_by("Painting Identifier").first()
)
data_wo_detailed_view_two = data_wo_detailed_view.filter(~is_repeated)

# `group_by` moves the key to the front, so align the column order before stacking.
aligned_unique_rows = data_wo_detailed_view_two.select(data_wo_duplicates_one.columns)
data = pl.concat([data_wo_duplicates_one, aligned_unique_rows]).drop("Painting Identifier")
data

### 5. Remove paintings earlier than the 14th century
This is done to avoid having paintings from under-represented centuries.

In [None]:
# Keep only rows whose year is strictly greater than MIN_YEAR_FILTERING (imported constant).
data = data.filter(pl.col("year").gt(MIN_YEAR_FILTERING))

### 6. Store the pre-processed dataset in a new directory

In [None]:
# Final column set and order of the stored dataset.
kept_columns = [
    "id",
    "title",
    "artist",
    "year",
    "coarse_type",
    "first_fine_grained_type",
    "second_fine_grained_type",
    "first_style",
    "second_style",
    "description",
    "source",
]
data = data.select(kept_columns)

# Renumber the rows 0..n-1 in ascending order of the previous id; the previous id is kept
# in "old_id" because the image files are still named after it.
data_updated_id = data.sort("id").with_columns(
    pl.col("id").alias("old_id"),
    pl.arange(0, len(data)).alias("id"),
)
id_mapping = dict(data_updated_id.select("old_id", "id").iter_rows())

In [None]:
# Copy every kept image from its per-source folder ("<source>_paintings/<old id>.png") into
# the shared "filtered_paintings/" folder, renamed after the new id.
# The new id is taken from the row itself rather than from a lookup keyed by the old id, so
# two sources that happen to share an old id can never be sent to the same destination.
id_rows = data_updated_id.select("source", "old_id", "id")

for source_name, old_id, new_id in tqdm(id_rows.iter_rows(), total=id_rows.height):
    shutil.copy2(
        f"{RAW_DATA_PATH}{source_name}_paintings/{old_id}.png",
        f"{RAW_DATA_PATH}filtered_paintings/{new_id}.png",
    )

# The old id was only needed to locate the source images.
data_to_store = data_updated_id.drop("old_id")

In [None]:
# Dump the filtered metadata (descriptions not cleaned yet) as a list of row dictionaries.
raw_output_path = (
    INTERMEDIATE_DATA_PATH + "filtered_paintings/filtered_paintings_enhanced_data_raw.json"
)
rows_to_store = data_to_store.to_dicts()

with open(raw_output_path, "w") as raw_output_file:
    json.dump(rows_to_store, raw_output_file, indent=4)

In [None]:
import re  # imported here explicitly: the import cell at the top of the notebook never imports it

# Patterns are compiled once instead of being looked up on every call.
_URL_TAG_WITH_TEXT = re.compile(r'\[url href=.*?\](.*?)\[/url\]')
_INLINE_STYLE_TAG = re.compile(r'\[/?[ibu]\]')
_RAW_URL = re.compile(r'http[s]?://\S+|www\.\S+')
_CLOSING_URL_TAG = re.compile(r'\[/url\]')
_OPENING_URL_TAG = re.compile(r'\[url=?')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_description(text):
    """Strip BBCode-style markup and URLs from a painting description.

    Args:
        text: The raw description, or None when the painting has none.

    Returns:
        The description without url/i/b/u tags and raw URLs, with every run of
        whitespace collapsed to a single space and both ends trimmed.
        None is returned unchanged.
    """
    if text is None:
        return None

    # [url href=...]label[/url] -> label
    text = _URL_TAG_WITH_TEXT.sub(r'\1', text)

    # drop [i], [/i], [b], [/b], [u], [/u]
    text = _INLINE_STYLE_TAG.sub('', text)

    # drop raw URLs
    text = _RAW_URL.sub('', text)

    # drop whatever is left of unmatched url tags
    text = _CLOSING_URL_TAG.sub('', text)
    text = _OPENING_URL_TAG.sub('', text)

    # collapse whitespace runs (including newlines and tabs) and trim
    return _WHITESPACE_RUN.sub(' ', text).strip()

In [None]:
# Reload the raw export. It was written inside the "filtered_paintings/" folder of the
# intermediate data, so the path has to include that folder (it used to point one level up,
# where this notebook never writes the file).
with open(
    INTERMEDIATE_DATA_PATH + "filtered_paintings/filtered_paintings_enhanced_data_raw.json"
) as f:
    entire_dataset = json.load(f)

# Sample description before cleaning.
entire_dataset[2000]["description"]

In [None]:
# Clean every description in place, with a progress bar.
for record in tqdm(entire_dataset):
    record.update(description=clean_description(record["description"]))

In [None]:
# Same sample record as before the cleaning loop, shown again to compare the result.
entire_dataset[2000]["description"]

In [None]:
# Store the dataset with cleaned descriptions.
# NOTE(review): this file lands directly in the intermediate folder, not inside
# "filtered_paintings/" like the raw export — confirm that this is intended.
cleaned_output_path = f"{INTERMEDIATE_DATA_PATH}filtered_paintings_enhanced_data.json"

with open(cleaned_output_path, "w") as cleaned_output_file:
    json.dump(entire_dataset, cleaned_output_file, indent=4)