# Analyze the created dataset before pre-processing
This notebook analyzes each feature of the dataset and is leveraged to understand which type of paintings are under-represented.

### 0. Import libraries and load data

In [1]:
import polars as pl
import plotly.express as px

INTERMEDIATE_DATA_PATH = "../../data/intermediate/"
COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]

In [2]:
datasets_names = ["met", "wikiart", "wga"]
datasets = []

for dataset_name in datasets_names:
    datasets.append(
        pl.read_json(
            f"{INTERMEDIATE_DATA_PATH}{dataset_name}_paintings/{dataset_name}_paintings_enhanced_data.json"
        ).with_columns(pl.lit(dataset_name).alias("source"))
    )

data = pl.concat(datasets)
data

id,title,artist,year,coarse_type,fine_grained_type,style,description,source
i64,str,str,i64,str,str,str,str,str
0,"""A Ship in a Stormy Sea""","""Ivan Konstantinovich Aivazovsk…",1900,"""landscape""","""marina""","""romanticism""","""Aivazovsky was a celebrated pa…","""met"""
1,"""Saint Giles with Christ Triump…","""Miguel Alcañiz (or Miquel Alca…",1413,,,,"""These panels, from an altarpie…","""met"""
2,"""Flora and Zephyr""","""Jacopo Amigoni""",1739,"""mythological""",,,"""The composition celebrates the…","""met"""
3,"""Jérôme Bonaparte (1784–1860), …","""Giacomo Andreoli""",1813,,,,"""The following miniature is cle…","""met"""
4,"""Saint Alexander""","""Fra Angelico (Guido di Pietro)""",1430,,,,"""This early work by Fra Angelic…","""met"""
…,…,…,…,…,…,…,…,…
18485,"""Creation of Adam""","""MICHELANGELO Buonarroti""",1510,"""religious""","""religious painting""","""high renaissance""","""The fourth scene in the chrono…","""wga"""
18486,"""Herdsmen and Herds at a Waterf…","""BERCHEM, Nicolaes""",1665,"""landscape""",,,"""The landscape Herdsmen and Her…","""wga"""
18487,"""The Madonna of Foligno""","""RAFFAELLO Sanzio""",1511,"""religious""",,,"""The painting was executed for …","""wga"""
18488,"""The Second of May, 1808: The C…","""GOYA Y LUCIENTES, Francisco de""",1814,"""history""","""battle painting""","""romanticism""","""After the expulsion of the Nap…","""wga"""


In [3]:
data["source"].value_counts().sort("count")

source,count
str,u32
"""wikiart""",1304
"""met""",2011
"""wga""",15175


### 1. Artists

In [4]:
artist_frequency = data["artist"].value_counts().sort("count").rename({"count": "frequency"})
print(f"Number of artists: {len(set(data['artist'].to_list()))}")

fig = px.histogram(
    artist_frequency, x="frequency", title="Artist Frequency", color_discrete_sequence=COLORS[2:3]
)
fig.show()

Number of artists: 3799


In [5]:
data["artist"].value_counts().sort("count")

artist,count
str,u32
"""Barent Fabritius""",1
"""KNEBEL, Franz""",1
"""Berlinghiero""",1
"""VOSMAER, Jacob Woutersz""",1
"""COUSIN, Jean the Younger""",1
…,…
"""VERONESE, Paolo""",170
"""TIZIANO Vecellio""",172
"""TINTORETTO""",172
"""MICHELANGELO Buonarroti""",175


In [6]:
paintings_with_type_or_style = data.filter(
    pl.col("coarse_type").is_not_null()
    | pl.col("fine_grained_type").is_not_null()
    | pl.col("style").is_not_null()
)

artists_with_type_or_style = len(set(paintings_with_type_or_style["artist"].to_list()))
print(f"Artists that have paintings with associated style or type: {artists_with_type_or_style}")

Artists that have paintings with associated style or type: 3202


### 2. Year of creation

In [7]:
paintings_per_century = (
    data.with_columns((pl.col("year") // 100 + 1).alias("century"))
    .group_by("century", "source")
    .len()
    .sort("century")
    .with_columns(pl.col("len") / data.shape[0] * 100)
    .rename({"len": "percentage"})
)
print(f"Covered period: {data['year'].min()} - {data['year'].max()}")

fig = px.bar(
    paintings_per_century,
    x="century",
    y="percentage",
    color="source",
    title="Distribution of Paintings Across Centuries",
    color_discrete_sequence=COLORS[:4],
)
fig.update_layout(xaxis={"dtick": 1})
fig.show()

Covered period: 1007 - 2022


In [8]:
paintings_per_century_no = (
    data.with_columns((pl.col("year") // 100 + 1).alias("century"))
    .group_by("century", "source")
    .len()
    .sort("century")
    .rename({"len": "paintings number"})
)

with pl.Config(tbl_rows=30):
    display(paintings_per_century_no)

century,source,paintings number
i64,str,u32
11,"""wikiart""",2
11,"""wga""",14
12,"""wikiart""",1
12,"""wga""",44
13,"""wga""",115
13,"""wikiart""",21
13,"""met""",4
14,"""wga""",970
14,"""met""",55
14,"""wikiart""",6


### 3. Type and style

In [9]:
paintings_with_type_and_style = data.filter(
    pl.col("coarse_type").is_not_null()
    & pl.col("fine_grained_type").is_not_null()
    & pl.col("style").is_not_null()
)
print(f"Paintings with style and type: {paintings_with_type_and_style.shape[0]}")
print(f"Paintings with style or type: {paintings_with_type_or_style.shape[0]}")

Paintings with style and type: 5307
Paintings with style or type: 17234


In [10]:
print(
    f"Number of painintgs with coarse type: {data.filter(pl.col('coarse_type').is_not_null()).shape[0]}"
)
coarse_grained_types = data["coarse_type"].value_counts().sort("count").to_numpy()
print(f"The {len(coarse_grained_types)} coarse types are:\n{coarse_grained_types}")

Number of painintgs with coarse type: 15842
The 10 coarse types are:
[['sketch and study' 40]
 ['interior' 401]
 ['still life' 514]
 ['history' 571]
 ['genre' 1301]
 ['landscape' 1684]
 ['mythological' 1776]
 [None 2648]
 ['portrait' 2668]
 ['religious' 6887]]


In [11]:
print(
    f"Number of paintings with fine-grained type: {data.filter(pl.col('fine_grained_type').is_not_null()).shape[0]}"
)
fine_grained_types = data["fine_grained_type"].value_counts().sort("count").to_numpy()
print(f"The {len(fine_grained_types)} fine-grained types are:\n{fine_grained_types}")

Number of paintings with fine-grained type: 6711
The 146 fine-grained types are:
[['veduta, genre painting' 1]
 ['religious painting, flower painting' 1]
 ['literary painting, portrait' 1]
 ['genre painting, history painting' 1]
 ['cloudscape' 1]
 ['literary painting, battle painting' 1]
 ['landscape, history painting' 1]
 ['mythological painting, literary painting' 1]
 ['allegorical painting, symbolic painting' 1]
 ['animal painting, still life' 1]
 ['portrait, nude painting (nu)' 1]
 ['genre painting, portrait' 1]
 ['self-portrait, religious painting' 1]
 ['cloudscape, flower painting' 1]
 ['nude painting (nu), sketch and study' 1]
 ['symbolic painting, figurative' 1]
 ['landscape, nude painting (nu)' 1]
 ['religious painting, symbolic painting' 1]
 ['figurative, nude painting (nu)' 1]
 ['landscape, pastorale' 1]
 ['vanitas' 1]
 ['genre painting, wildlife painting' 1]
 ['battle painting, landscape' 1]
 ['cloudscape, sketch and study' 1]
 ['genre painting, religious painting' 1]
 ['sk

In [12]:
print(f"Number of painintgs with style: {data.filter(pl.col('style').is_not_null()).shape[0]}")
styles = data["style"].value_counts().sort("count").to_numpy()
print(f"The {len(styles)} styles are:\n{styles}")

Number of painintgs with style: 6722
The 156 styles are:
[['baroque, romanticism' 1]
 ['impressionism, japonism' 1]
 ['art nouveau (modern), romanticism' 1]
 ['international gothic, northern renaissance' 1]
 ['surrealism, feminist art' 1]
 ['hyper-realism' 1]
 ['intimism' 1]
 ['expressionism, social realism' 1]
 ['impressionism, figurative expressionism' 1]
 ['symbolism, orientalism' 1]
 ['romanticism, naïve art (primitivism)' 1]
 ['expressionism, muralism' 1]
 ['regionalism, social realism' 1]
 ['romanticism, symbolism' 1]
 ['baroque, classicism' 1]
 ['late byzantine/palaeologan renaissance (c. 1261–1453), macedonian renaissance (867–1056)'
  1]
 ['realism, surrealism' 1]
 ['op art' 1]
 ['realism, symbolism' 1]
 ['luminism, romanticism' 1]
 ['biedermeier, romanticism' 1]
 ['purism' 1]
 ['romanticism, naturalism' 1]
 ['post-impressionism, symbolism' 1]
 ['mannerism (late renaissance), high renaissance' 1]
 ['art nouveau (modern), impressionism' 1]
 ['romanticism, neoclassicism' 1]
 ['n

### 4. Description length

In [13]:
data_description_word_count = data.with_columns(
    pl.col("description")
    .map_elements(lambda x: len(x.split(" ")), return_dtype=pl.Int64)
    .alias("description word count")
)

fig = px.box(
    data_description_word_count,
    x="description word count",
    title="Description Word Count",
    color_discrete_sequence=COLORS[0:3],
    color="source"
)
fig.update_xaxes(title_text="number of words")
fig.show()

In [14]:
description_lengths = ["shortest:\n", "\nmedium:\n", "\nlong:\n"]

for index, description_index in enumerate([0, 5000, 10000]):
    description = data_description_word_count.sort("description word count")["description"][
        description_index
    ]

    print(f"{description_lengths[index]}{description}")

shortest:
Mastelletta's grand, open-air fêtes champêtres, combining landscape and slender, silhouette-like figures, depicted through a simplified, expressionistic design, lie at the root of his fame and originality.

medium:
The only work by Antonello still in Venice, where he resided in 1475 and 1476, it was probably painted at the beginning of his stay in the city. The splendid background scene, one of the best preserved parts of this very badly damaged painting, with the apse of the church of San Francesco d'Assisi in Messina, is the artist's tribute to the city of his birth.

long:
In addition to those in the central section of the ceiling of the main hall, four other virtues appear inside stucco frames on the walls of the room: Marital Harmony, Humility Casting Out Pride, Liberality Dispensing Gifts, and Virtue Crowning Honour.
In the Marital Harmony, the beautiful young woman is comfortably seated on a chair, the man is standing, leaning toward her and holding a heart on a chain.