# Analyze the filtered dataset
This notebook analyzes each feature of the dataset in order to understand which types of paintings are under-represented.

### 0. Import libraries and load data

In [None]:
import numpy as np
import polars as pl
from PIL import Image
from tqdm import tqdm
import plotly.express as px
from collections import Counter

# Data directories, given relative to this notebook.
RAW_DATA_PATH = "../../data/raw/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"
PROCESSED_DATA_PATH = "../../data/processed/"

# Colour palette shared by the plots below (sliced per figure).
COLORS = [
    "#cd968e",
    "#acb0e0",
    "#aecbdc",
    "#bcd5c3",
    "#bfbfbf",
]

In [None]:
# Selects which dataset variant is loaded into `data` for the rest of the notebook.
# NOTE(review): no loader exists for "all_paintings_dataset" in this cell; add a
# branch for it or switch to one of the supported values below.
DATASET_TYPE = "all_paintings_dataset"

if DATASET_TYPE == "final_annotated_dataset":
    # The "objects" column is dropped right after loading; it is not analyzed here.
    data = pl.read_json(
        f"{PROCESSED_DATA_PATH}paintings_with_filtered_objects.json",
        infer_schema_length=10000,
    ).drop("objects")
elif DATASET_TYPE == "filtered_paintings_dataset":
    data = pl.read_json(
        f"{INTERMEDIATE_DATA_PATH}filtered_paintings/filtered_paintings_enhanced_data.json"
    )
else:
    # Fail here with a clear message instead of a NameError on `data` below.
    raise ValueError(
        f"Unsupported DATASET_TYPE {DATASET_TYPE!r}; expected "
        "'final_annotated_dataset' or 'filtered_paintings_dataset'."
    )

data

In [None]:
# Paintings for which the first style, the coarse type and the first
# fine-grained type are all present.
data.filter(
    pl.col("first_style").is_not_null()
    & pl.col("coarse_type").is_not_null()
    & pl.col("first_fine_grained_type").is_not_null()
)

In [None]:
# Number of paintings per source, smallest source first.
data.get_column("source").value_counts().sort("count")

### 1. Artists

In [None]:
# One row per artist with the number of their paintings ("frequency").
artist_column = data.get_column("artist")
artist_frequency = (
    artist_column.value_counts()
    .sort("count")
    .rename({"count": "frequency"})
)
print(f"Number of artists: {artist_column.n_unique()}")

# Histogram of how many artists have a given number of paintings.
fig = px.histogram(
    artist_frequency,
    x="frequency",
    title="Artist Frequency",
    color_discrete_sequence=COLORS[2:3],
)
fig.show()

In [None]:
# Number of paintings per artist, least represented artists first.
data.get_column("artist").value_counts().sort("count")

In [None]:
# Keep paintings where at least one of the type or style columns is filled in.
type_or_style_columns = [
    "coarse_type",
    "first_fine_grained_type",
    "second_fine_grained_type",
    "first_style",
    "second_style",
]
paintings_with_type_or_style = data.filter(
    pl.any_horizontal(pl.col(type_or_style_columns).is_not_null())
)

artists_with_type_or_style = paintings_with_type_or_style.get_column("artist").n_unique()
print(f"Artists that have paintings with associated style or type: {artists_with_type_or_style}")

### 2. Year of creation

In [None]:
# Century bucket: years X00-X99 are assigned to century X + 1 (e.g. 1500-1599 -> 16).
century = (pl.col("year") // 100 + 1).alias("century")
total_paintings = data.shape[0]

# Count paintings per (century, source) pair.
counts_per_century = (
    data.with_columns(century)
    .group_by("century", "source")
    .len()
    .sort("century")
)
# Convert the counts into a percentage of the whole dataset.
paintings_per_century = counts_per_century.with_columns(
    pl.col("len") / total_paintings * 100
).rename({"len": "percentage"})

years = data["year"]
print(f"Covered period: {years.min()} - {years.max()}")

fig = px.bar(
    paintings_per_century.sort("source"),
    x="century",
    y="percentage",
    color="source",
    title="Distribution of Paintings Across Centuries",
    color_discrete_sequence=COLORS[:4],
)
# One tick per century on the x axis.
fig.update_layout(xaxis=dict(dtick=1))
fig.show()

In [None]:
# Absolute number of paintings per (century, source) pair.
# Century bucket: years X00-X99 are assigned to century X + 1.
paintings_with_century = data.with_columns(
    (pl.col("year") // 100 + 1).alias("century")
)
paintings_per_century_no = (
    paintings_with_century.group_by("century", "source")
    .len()
    .sort("century")
    .rename({"len": "paintings number"})
)

# Raise the row limit so up to 30 rows of the table are printed.
with pl.Config(tbl_rows=30):
    display(paintings_per_century_no)

### 3. Type and style

In [None]:
def plot_treemap(data, measurement_name, min_val):
    """Show a treemap of category counts, relabelling rare categories as "other".

    Args:
        data: Mapping from category label to its count (e.g. a ``Counter``).
        measurement_name: Name of the label column; capitalized, it is also
            used as the label of the treemap root.
        min_val: Categories whose count is below this value are relabelled
            "other".
    """
    counts_table = pl.from_dict(
        {measurement_name: data.keys(), "count": data.values()}
    ).sort("count")

    # Replace the label of every category below the threshold with "other".
    is_rare = pl.col("count") < min_val
    bucketed_label = (
        pl.when(is_rare).then(pl.lit("other")).otherwise(pl.col(measurement_name))
    )
    counts_table = counts_table.with_columns(bucketed_label.alias(measurement_name))

    root = px.Constant(measurement_name.capitalize())
    fig = px.treemap(counts_table, path=[root, measurement_name], values="count")
    fig.update_traces(
        root_color="lightgrey",
        marker={"cornerradius": 5},
        textinfo="label+percent root",
    )
    fig.update_layout(
        treemapcolorway=COLORS[:4],
        margin={"t": 5, "l": 5, "r": 5, "b": 5},
    )
    fig.show()

In [None]:
# A painting "has a type" if any type column is filled in, and "has a style"
# if any style column is filled in.
has_any_type = pl.any_horizontal(
    pl.col("coarse_type", "first_fine_grained_type", "second_fine_grained_type").is_not_null()
)
has_any_style = pl.any_horizontal(
    pl.col("first_style", "second_style").is_not_null()
)

paintings_with_type_and_style = data.filter(has_any_type & has_any_style)

print(f"Paintings with style and type: {paintings_with_type_and_style.height}")
print(f"Paintings with style or type: {paintings_with_type_or_style.height}")

In [None]:
coarse_type_column = data.get_column("coarse_type")

# Paintings with a coarse type = rows whose coarse_type is not null.
paintings_with_coarse_type_no = coarse_type_column.len() - coarse_type_column.null_count()

# Occurrences of each coarse type, ignoring missing values.
coarse_grained_types = Counter(coarse_type_column.drop_nulls().to_list())

print(f"Number of paintings with coarse type: {paintings_with_coarse_type_no}")

print(f"The {len(coarse_grained_types)} coarse types are:")
plot_treemap(coarse_grained_types, "coarse grained types", min_val=0)


In [None]:
# Paintings that have at least one of the two fine-grained type columns filled in.
has_fine_grained_type = pl.any_horizontal(
    pl.col("first_fine_grained_type", "second_fine_grained_type").is_not_null()
)
paintings_with_fine_grained_type_no = data.filter(has_fine_grained_type).height

# Occurrences of each fine-grained type across both columns, ignoring missing values.
fine_grained_types = Counter(
    data["first_fine_grained_type"].drop_nulls().to_list()
    + data["second_fine_grained_type"].drop_nulls().to_list()
)

print(f"Number of paintings with fine-grained type: {paintings_with_fine_grained_type_no}")
print(f"The {len(fine_grained_types)} fine-grained types are:")
plot_treemap(fine_grained_types, "fine grained types", min_val=25)


In [None]:
# Paintings that have at least one of the two style columns filled in.
has_style = pl.any_horizontal(pl.col("first_style", "second_style").is_not_null())
paintings_with_style_no = data.filter(has_style).height

# Occurrences of each style across both columns, ignoring missing values.
styles = Counter(
    data["first_style"].drop_nulls().to_list()
    + data["second_style"].drop_nulls().to_list()
)

print(f"Number of paintings with style: {paintings_with_style_no}")
print(f"The {len(styles)} styles are:")
plot_treemap(styles, "style types", min_val=40)

### 4. Description length

In [None]:
# Add the number of words in each description.
# Words are counted as runs of non-whitespace characters, so repeated spaces do
# not produce empty "words" and newlines/tabs also separate words. The count is
# computed natively by polars rather than through a per-row Python lambda, and
# cast to Int64 to keep the column dtype unchanged.
data_description_word_count = data.with_columns(
    pl.col("description")
    .str.count_matches(r"\S+")
    .cast(pl.Int64)
    .alias("description word count")
)

# One box per source, showing the distribution of description lengths.
fig = px.box(
    data_description_word_count,
    x="description word count",
    title="Description Word Count",
    color_discrete_sequence=COLORS[0:3],
    color="source",
)
fig.update_xaxes(title_text="number of words")
fig.show()

In [None]:
# Print sample descriptions picked at fixed positions of the length-sorted data.
description_lengths = ["shortest:\n", "\nmedium:\n", "\nlong:\n"]
# Positions in the sorted descriptions; negative values count from the longest.
description_positions = [0, 5000, -100]

# Sort once: the ordering is the same for every sample.
sorted_descriptions = data_description_word_count.sort("description word count")[
    "description"
]
description_count = len(sorted_descriptions)

for label, position in zip(description_lengths, description_positions):
    if description_count == 0:
        # Nothing to show for an empty dataset.
        break
    # Clamp the position so smaller datasets do not raise an out-of-bounds error.
    position = max(-description_count, min(position, description_count - 1))
    description = sorted_descriptions[position]

    print(f"{label}{description}")