# Analyze the created dataset before pre-processing
This notebook analyzes each feature of the dataset and is used to understand which types of paintings are under-represented.

### 0. Import libraries and load data

In [None]:
import polars as pl
import plotly.express as px

# Directory prefix under which each source's "<name>_paintings" folder is read.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/"
# Hex colour palette; the plots in this notebook pass slices of it as
# `color_discrete_sequence`.
COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]

In [None]:
# Read each source's enhanced JSON export and tag its rows with the source name.
datasets_names = ["met", "wikiart", "wga"]
datasets = [
    pl.read_json(
        f"{INTERMEDIATE_DATA_PATH}{dataset_name}_paintings/{dataset_name}_paintings_enhanced_data.json"
    ).with_columns(source=pl.lit(dataset_name))
    for dataset_name in datasets_names
]

# Stack the per-source frames into one dataset; the bare name renders it as the cell output.
data = pl.concat(datasets)
data

In [None]:
# Number of paintings contributed by each source, smallest source first.
source_column = data["source"]
source_column.value_counts().sort(by="count")

### 1. Artists

In [None]:
# One row per distinct artist value, with how many paintings carry it.
artist_counts = data["artist"].value_counts()
artist_frequency = artist_counts.sort("count").rename({"count": "frequency"})

number_of_artists = data["artist"].n_unique()
print(f"Number of artists: {number_of_artists}")

# Histogram of how many artists have a given number of paintings.
fig = px.histogram(
    artist_frequency,
    x="frequency",
    title="Artist Frequency",
    color_discrete_sequence=COLORS[2:3],
)
fig.show()

In [None]:
# Paintings per artist as a table, least represented artists first.
artist_column = data["artist"]
artist_column.value_counts().sort(by="count")

In [None]:
# A painting is kept when at least one of the three labels is present,
# i.e. when it is NOT the case that all three are missing.
lacks_every_label = (
    pl.col("coarse_type").is_null()
    & pl.col("fine_grained_type").is_null()
    & pl.col("style").is_null()
)
paintings_with_type_or_style = data.filter(~lacks_every_label)

# Distinct artist values among the labelled paintings.
artists_with_type_or_style = paintings_with_type_or_style["artist"].n_unique()
print(f"Artists that have paintings with associated style or type: {artists_with_type_or_style}")

### 2. Year of creation

In [None]:
# Century bucket derived from the year by floor division.
# NOTE(review): with this formula a year ending in 00 lands in the following
# century (e.g. 1900 -> 20) - confirm this is the intended convention.
century_of_year = (pl.col("year") // 100 + 1).alias("century")
total_paintings = data.shape[0]

# Share (in percent of ALL paintings) of every (century, source) pair.
paintings_per_century = (
    data.with_columns(century_of_year)
    .group_by("century", "source")
    .len()
    .sort("century")
    .select(
        "century",
        "source",
        (pl.col("len") / total_paintings * 100).alias("percentage"),
    )
)

first_year = data["year"].min()
last_year = data["year"].max()
print(f"Covered period: {first_year} - {last_year}")

# Bar chart coloured by source, one tick per century.
fig = px.bar(
    paintings_per_century,
    x="century",
    y="percentage",
    color="source",
    title="Distribution of Paintings Across Centuries",
    color_discrete_sequence=COLORS[:4],
)
fig.update_layout(xaxis=dict(dtick=1))
fig.show()

In [None]:
# Same (century, source) grouping as the percentage chart, but with absolute counts.
century_bucket = (pl.col("year") // 100 + 1).alias("century")
counts_by_century_and_source = (
    data.with_columns(century_bucket).group_by("century", "source").len()
)
paintings_per_century_no = counts_by_century_and_source.sort("century").rename(
    {"len": "paintings number"}
)

# Raise the row limit for this display only, so up to 30 rows are shown.
with pl.Config(tbl_rows=30):
    display(paintings_per_century_no)

### 3. Type and style

In [None]:
# Keep only paintings where none of the three labels is missing.
misses_some_label = (
    pl.col("coarse_type").is_null()
    | pl.col("fine_grained_type").is_null()
    | pl.col("style").is_null()
)
paintings_with_type_and_style = data.filter(~misses_some_label)

# Compare against the looser "at least one label" selection computed earlier.
fully_labelled_count = len(paintings_with_type_and_style)
partially_labelled_count = len(paintings_with_type_or_style)
print(f"Paintings with style and type: {fully_labelled_count}")
print(f"Paintings with style or type: {partially_labelled_count}")

In [None]:
# Work on the non-null values only: the value-counts table would otherwise
# contain a row for missing values, which would be counted and listed as if
# it were a coarse type.
coarse_type_values = data["coarse_type"].drop_nulls()
print(f"Number of paintings with coarse type: {len(coarse_type_values)}")

# (type, count) pairs, rarest type first.
coarse_grained_types = coarse_type_values.value_counts().sort("count").to_numpy()
print(f"The {len(coarse_grained_types)} coarse types are:\n{coarse_grained_types}")

In [None]:
# Work on the non-null values only: the value-counts table would otherwise
# contain a row for missing values, which would be counted and listed as if
# it were a fine-grained type.
fine_grained_type_values = data["fine_grained_type"].drop_nulls()
print(f"Number of paintings with fine-grained type: {len(fine_grained_type_values)}")

# (type, count) pairs, rarest type first.
fine_grained_types = fine_grained_type_values.value_counts().sort("count").to_numpy()
print(f"The {len(fine_grained_types)} fine-grained types are:\n{fine_grained_types}")

In [None]:
# Work on the non-null values only: the value-counts table would otherwise
# contain a row for missing values, which would be counted and listed as if
# it were a style.
style_values = data["style"].drop_nulls()
print(f"Number of paintings with style: {len(style_values)}")

# (style, count) pairs, rarest style first.
styles = style_values.value_counts().sort("count").to_numpy()
print(f"The {len(styles)} styles are:\n{styles}")

### 4. Description length

In [None]:
# Count words as maximal runs of non-whitespace characters. Splitting on a
# single space reported 1 word for an empty description, added phantom words
# for repeated spaces and did not treat newlines/tabs as separators.
# The count is cast to Int64 to keep the column dtype used previously;
# null descriptions stay null.
data_description_word_count = data.with_columns(
    pl.col("description")
    .str.count_matches(r"\S+")
    .cast(pl.Int64)
    .alias("description word count")
)

# One box per source showing the spread of description lengths.
fig = px.box(
    data_description_word_count,
    x="description word count",
    title="Description Word Count",
    color_discrete_sequence=COLORS[0:3],
    color="source",
)
fig.update_xaxes(title_text="number of words")
fig.show()

In [None]:
description_lengths = ["shortest:\n", "\nmedium:\n", "\nlong:\n"]

# Sort once, outside the loop: the ordering does not depend on the sample
# being printed, so re-sorting the whole frame per iteration was wasted work.
descriptions_by_word_count = data_description_word_count.sort("description word count")[
    "description"
]

# Print one sample description at each fixed position of the sorted column.
# NOTE(review): positions 5000 and 10000 assume the dataset has more than
# 10000 rows - confirm, otherwise indexing raises.
for label, description_index in zip(description_lengths, [0, 5000, 10000]):
    description = descriptions_by_word_count[description_index]
    print(f"{label}{description}")