# Create ground truth miniset
This notebook selects the paintings that will be manually annotated. They will be used to optimize and measure the performance of the prompts needed to automate the dataset annotation process.

### 0. Import libraries

In [None]:
import os
import polars as pl
from PIL import Image

In [None]:
# All data folders live under one shared root, relative to this notebook.
_DATA_ROOT = "../../data/"

RAW_DATA_PATH = f"{_DATA_ROOT}raw/"  # raw inputs (painting images are read from here)
INTERMEDIATE_DATA_PATH = f"{_DATA_ROOT}intermediate/"  # pre-processed painting metadata
ANNOTATIONS_DATA_PATH = f"{_DATA_ROOT}annotations/"  # output of this notebook

### 1. Load data

In [None]:
# Create the annotations folder if it does not exist yet. `makedirs` also
# creates missing parent folders (plain `os.mkdir` raises FileNotFoundError in
# that case), and `exist_ok=True` makes re-running this cell a no-op.
os.makedirs(ANNOTATIONS_DATA_PATH, exist_ok=True)

In [None]:
# Fixed seed so that the shuffled row order of `data` is reproducible across
# runs; the selections below pick "the first matching row" and therefore depend
# on that order.
SHUFFLE_SEED = 42

data = (
    pl.read_json(
        INTERMEDIATE_DATA_PATH + "filtered_paintings/filtered_paintings_enhanced_data.json"
    )
    .with_columns(
        # Number of space-separated tokens in the description, computed with
        # native polars expressions instead of a per-row Python lambda.
        pl.col("description").str.split(" ").list.len().cast(pl.Int64).alias("words_no")
    )
    .select("source", "coarse_type", "first_style", "words_no", "id", "description")
    # fraction=1.0 keeps every row; the call is only used to shuffle them.
    .sample(fraction=1.0, shuffle=True, seed=SHUFFLE_SEED)
)
data


### 2. Select paintings for the train and val ground truth sets 
Paintings are selected so that every combination of data source (3) and type (8) is covered. Additionally, the stratified sample includes artworks from all styles.

In [None]:
# Count the paintings available for every (source, coarse_type) combination.
pair_counts = data.sort("words_no").group_by("source", "coarse_type").len()

# Keep only the pair columns, ordered from the rarest pair to the most common
# one, as a 2-column array of [source, coarse_type] rows.
source_type_pairs = pair_counts.sort("len").select("source", "coarse_type").to_numpy()
source_type_pairs


In [None]:
def _first_row(frame, predicate):
    """Return (id, first_style, words_no) of the first row of `frame` matching
    `predicate`, or None when no row matches."""
    matches = frame.filter(predicate).select("id", "first_style", "words_no")
    return matches.row(0) if matches.height > 0 else None


# One painting is picked for every (source, coarse_type) pair. Columns of the
# resulting table: id, source, type, style, words_no.
selected_paintings = {"id": [], "source": [], "type": [], "style": [], "words_no": []}

for source, coarse_type in source_type_pairs:
    # A missing type must be matched with `is_null()`: comparing a column with
    # None using `==` yields null, which `filter` treats as "no match".
    if coarse_type is None:
        type_matches = pl.col("coarse_type").is_null()
    else:
        type_matches = pl.col("coarse_type") == coarse_type
    pair_matches = (pl.col("source") == source) & type_matches

    # First choice: a painting with a known style that has not been picked yet,
    # so that the selection covers as many styles as possible.
    used_styles = [picked_style for picked_style in selected_paintings["style"] if picked_style is not None]
    unseen_style = pair_matches & pl.col("first_style").is_not_null()
    if used_styles:
        unseen_style = unseen_style & ~pl.col("first_style").is_in(used_styles)

    picked = _first_row(data, unseen_style)
    if picked is None:
        # Fallback: any painting of this pair, even if its style is missing or
        # already covered. Every pair comes from `data`, so a row always exists.
        picked = _first_row(data, pair_matches)

    # NOTE: `id` shadows the builtin; the name is kept because loop variables of
    # a notebook cell are globals that other cells may read.
    id, style, words_no = picked

    selected_paintings["id"].append(id)
    selected_paintings["source"].append(source)
    selected_paintings["type"].append(coarse_type)
    selected_paintings["style"].append(style)
    selected_paintings["words_no"].append(words_no)

selected_paintings_df = pl.from_dict(selected_paintings).sort("words_no")
selected_paintings_df


In [None]:
# Styles that appear in the data but are not covered by the paintings selected
# so far. Null styles are dropped here (there is nothing to look up for them)
# and the result is sorted so the iteration order is deterministic.
left_out_styles = sorted(
    style
    for style in set(data["first_style"].to_list())
    - set(selected_paintings_df["style"].to_list())
    if style is not None
)

# Add the first painting found for every style that is still missing.
for style in left_out_styles:
    painting_id, source, coarse_type, words_no = (
        data.filter(pl.col("first_style") == style)
        .select("id", "source", "coarse_type", "words_no")
        .row(0)
    )

    selected_paintings["id"].append(painting_id)
    selected_paintings["source"].append(source)
    selected_paintings["type"].append(coarse_type)
    selected_paintings["style"].append(style)
    selected_paintings["words_no"].append(words_no)

# `selected_paintings` is extended in place, so re-running this cell appends the
# same paintings again; `unique` removes those repeats. `maintain_order=True`
# keeps the `words_no` ordering produced by the preceding sort.
selected_paintings_df = (
    pl.from_dict(selected_paintings)
    .sort("words_no")
    .unique("id", keep="first", maintain_order=True)
)
selected_paintings_df


In [None]:
# Print type, style and description of every selected painting and show a
# preview scaled down to one third of the original width and height.
for painting_data in selected_paintings_df.to_dicts():
    painting_id = painting_data["id"]

    print(f"Type: {painting_data['type']}\nStyle: {painting_data['style']}")
    description = data.filter(pl.col("id") == painting_id)["description"][0]
    print(description.replace("\n", " ").strip())

    # The context manager closes the image file as soon as the resized copy has
    # been created, instead of leaving one open file handle per painting.
    with Image.open(f"{RAW_DATA_PATH}filtered_paintings/{painting_id}.png") as painting:
        width, height = painting.size
        resized_painting = painting.resize((width // 3, height // 3))

    display(resized_painting)


### 3. Select paintings that will be used as examples for the LLM that will annotate the data

In [None]:
# Paintings already chosen for the train/val set must not be reused as examples.
already_selected_ids = selected_paintings_df["id"].to_list()
remaining_paintings = data.filter(~pl.col("id").is_in(already_selected_ids))

# For every coarse type, take the element at index 8 of each remaining column
# within that group, then rename the columns to match `selected_paintings_df`.
select_paintings_for_prompt = (
    remaining_paintings.group_by("coarse_type")
    .agg(pl.col("*").get(8))
    .rename({"coarse_type": "type", "first_style": "style"})
)
select_paintings_for_prompt


In [None]:
# Print type, style and description of every prompt-example painting and show a
# preview scaled down to one third of the original width and height.
for painting_data in select_paintings_for_prompt.to_dicts():
    painting_id = painting_data["id"]

    print(f"Type: {painting_data['type']}\nStyle: {painting_data['style']}")
    description = data.filter(pl.col("id") == painting_id)["description"][0]
    # Stripped like the preview of the train/val paintings, so both listings
    # are formatted the same way.
    print(description.replace("\n", " ").strip())

    # The context manager closes the image file as soon as the resized copy has
    # been created, instead of leaving one open file handle per painting.
    with Image.open(f"{RAW_DATA_PATH}filtered_paintings/{painting_id}.png") as painting:
        width, height = painting.size
        resized_painting = painting.resize((width // 3, height // 3))

    display(resized_painting)


### 4. Store the selected paintings and their attributes

In [None]:
# Columns (and their order) shared by both parts of the output table.
output_columns = ["id", "source", "type", "style", "words_no", "set"]

# Tag every painting with the set it belongs to.
train_val_paintings = selected_paintings_df.with_columns(
    pl.lit("train_val").alias("set")
).select(output_columns)
prompt_paintings = select_paintings_for_prompt.with_columns(
    pl.lit("prompt").alias("set")
).select(output_columns)

# Stack both sets and store the list of paintings to annotate by hand.
paintings_to_manually_annotate = pl.concat([train_val_paintings, prompt_paintings])
paintings_to_manually_annotate.write_csv(
    f"{ANNOTATIONS_DATA_PATH}paintings_to_manually_annotate.csv"
)
paintings_to_manually_annotate
