# Analyze the filtered dataset
This notebook analyzes each feature of the dataset and is leveraged to understand which type of paintings are under-represented.

### 0. Import libraries and load data

In [1]:
import polars as pl
import plotly.express as px
from collections import Counter

INTERMEDIATE_DATA_PATH = "../../data/intermediate/"
COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]

In [None]:
data = pl.read_json(
    f"{INTERMEDIATE_DATA_PATH}filtered_paintings/filtered_paintings_enhanced_data.json"
)

data

id,title,artist,year,coarse_type,first_fine_grained_type,second_fine_grained_type,first_style,second_style,description,source
i64,str,str,i64,str,str,str,str,str,str,str
0,"""Flora and Zephyr""","""Jacopo Amigoni""",1739,"""mythological""",,,,,"""The composition celebrates the…","""met"""
1,"""Saint Alexander""","""Fra Angelico (Guido di Pietro)""",1430,,,,,,"""This early work by Fra Angelic…","""met"""
2,"""The Crucifixion""","""Fra Angelico (Guido di Pietro)""",1445,"""religious""","""religious""",,"""early renaissance""",,"""To the left of the Crucifixion…","""met"""
3,"""The Nativity""","""Zanobi Strozzi""",1434,,,,,,"""Although long ascribed to Fra …","""met"""
4,"""Christ Crowned with Thorns""","""Antonello da Messina (Antonell…",1479,,"""religious""",,"""early renaissance""",,"""An artist of astonishing origi…","""met"""
…,…,…,…,…,…,…,…,…,…,…
12073,"""The Giudecca, Venice""","""ROBERTS, David""",1854,"""landscape""",,,,,"""David Roberts was a great trav…","""wga"""
12074,"""Herdsmen and Herds at a Waterf…","""BERCHEM, Nicolaes""",1665,"""landscape""",,,,,"""The landscape Herdsmen and Her…","""wga"""
12075,"""The Madonna of Foligno""","""RAFFAELLO Sanzio""",1511,"""religious""",,,,,"""The painting was executed for …","""wga"""
12076,"""The Second of May, 1808: The C…","""GOYA Y LUCIENTES, Francisco de""",1814,"""history""","""battle""",,"""romanticism""",,"""After the expulsion of the Nap…","""wga"""


### 1. Artists

In [3]:
artist_frequency = data["artist"].value_counts().sort("count").rename({"count": "frequency"})
print(f"Number of artists: {len(set(data['artist'].to_list()))}")

fig = px.histogram(
    artist_frequency, x="frequency", title="Artist Frequency", color_discrete_sequence=COLORS[2:3]
)
fig.show()

Number of artists: 3112


In [4]:
data["artist"].value_counts().sort("count")

artist,count
str,u32
"""KYHN, Vilhelm""",1
"""Cornelis de Vos""",1
"""MÉNAGEOT, François-Guillaume""",1
"""POTTER, Pieter Symonsz.""",1
"""LOCATELLI, Andrea""",1
…,…
"""TINTORETTO""",127
"""GRECO, El""",137
"""GOGH, Vincent van""",139
"""REMBRANDT Harmenszoon van Rijn""",139


In [5]:
paintings_with_type_or_style = data.filter(
    pl.col("coarse_type").is_not_null()
    | pl.col("first_fine_grained_type").is_not_null()
    | pl.col("second_fine_grained_type").is_not_null()
    | pl.col("first_style").is_not_null()
    | pl.col("second_style").is_not_null()
)

artists_with_type_or_style = len(set(paintings_with_type_or_style["artist"].to_list()))
print(f"Artists that have paintings with associated style or type: {artists_with_type_or_style}")

Artists that have paintings with associated style or type: 2696


### 2. Year of creation

In [6]:
paintings_per_century = (
    data.with_columns((pl.col("year") // 100 + 1).alias("century"))
    .group_by("century", "source")
    .len()
    .sort("century")
    .with_columns(pl.col("len") / data.shape[0] * 100)
    .rename({"len": "percentage"})
)
print(f"Covered period: {data['year'].min()} - {data['year'].max()}")

fig = px.bar(
    paintings_per_century.sort("source"),
    x="century",
    y="percentage",
    color="source",
    title="Distribution of Paintings Across Centuries",
    color_discrete_sequence=COLORS[:4],
)
fig.update_layout(xaxis={"dtick": 1})
fig.show()

Covered period: 1305 - 2022


In [7]:
paintings_per_century_no = (
    data.with_columns((pl.col("year") // 100 + 1).alias("century"))
    .group_by("century", "source")
    .len()
    .sort("century")
    .rename({"len": "paintings number"})
)

with pl.Config(tbl_rows=30):
    display(paintings_per_century_no)

century,source,paintings number
i64,str,u32
14,"""wga""",13
14,"""met""",31
14,"""wikiart""",5
15,"""wikiart""",28
15,"""wga""",630
15,"""met""",116
16,"""wikiart""",104
16,"""met""",215
16,"""wga""",2106
17,"""wikiart""",85


### 3. Type and style

In [8]:
paintings_with_type_and_style = data.filter(
    (
        pl.col("coarse_type").is_not_null()
        | pl.col("first_fine_grained_type").is_not_null()
        | pl.col("second_fine_grained_type").is_not_null()
    )
    & (pl.col("first_style").is_not_null() | pl.col("second_style").is_not_null())
)
print(f"Paintings with style and type: {paintings_with_type_and_style.shape[0]}")
print(f"Paintings with style or type: {paintings_with_type_or_style.shape[0]}")

Paintings with style and type: 4730
Paintings with style or type: 11191


In [None]:
paintings_with_coarse_type_no = data.filter(pl.col("coarse_type").is_not_null()).shape[0]

coarse_grained_types = Counter(
    [type_ for type_ in data["coarse_type"].to_list() if type_ is not None]
)

print(f"Number of paintings with coarse type: {paintings_with_coarse_type_no}")

print(f"The {len(coarse_grained_types)} coarse types are:")
display(coarse_grained_types)

Number of paintings with coarse type: 9873
The 7 coarse types are:


Counter({'religious': 2877,
         'portrait': 2316,
         'landscape': 1575,
         'genre': 1136,
         'mythological': 1103,
         'still life': 482,
         'history': 384})

In [None]:
paintings_with_fine_grained_type_no = data.filter(
    pl.col("first_fine_grained_type").is_not_null()
    | pl.col("second_fine_grained_type").is_not_null()
).shape[0]

fine_grained_types = Counter(
    type_
    for type_ in data["first_fine_grained_type"].to_list()
    + data["second_fine_grained_type"].to_list()
    if type_ is not None
)

print(f"Number of paintings with fine-grained type: {paintings_with_fine_grained_type_no}")

print(f"The {len(fine_grained_types)} fine-grained types are:")
display(fine_grained_types)

Number of paintings with fine-grained type: 4813
The 25 fine-grained types are:


Counter({'religious': 1243,
         'portrait': 948,
         'genre': 798,
         'landscape': 474,
         'mythological': 262,
         'cityscape': 173,
         'allegorical': 161,
         'self-portrait': 118,
         'history': 104,
         'still life': 98,
         'nude': 76,
         'sketch and study': 72,
         'symbolic': 71,
         'veduta': 64,
         'marina': 58,
         'animal': 46,
         'flower': 46,
         'literary': 43,
         'battle': 33,
         'tronie': 32,
         'interior': 25,
         'capriccio': 16,
         'pastorale': 14,
         'cloudscape': 14,
         'figurative': 11})

In [None]:
paintings_with_style_no = data.filter(
    pl.col("first_style").is_not_null() | pl.col("second_style").is_not_null()
).shape[0]

styles = Counter(
    style_
    for style_ in data["first_style"].to_list() + data["second_style"].to_list()
    if style_ is not None
)

print(f"Number of paintings with style: {paintings_with_style_no}")
print(f"The {len(styles)} styles are:")
display(styles)

Number of paintings with style: 4734
The 32 styles are:


Counter({'baroque': 1153,
         'mannerism (late renaissance)': 533,
         'romanticism': 362,
         'impressionism': 351,
         'northern renaissance': 348,
         'realism': 321,
         'rococo': 288,
         'high renaissance': 258,
         'post-impressionism': 198,
         'tenebrism': 148,
         'expressionism': 139,
         'neoclassicism': 136,
         'early renaissance': 131,
         'classicism': 105,
         'surrealism': 54,
         'symbolism': 53,
         'naïve art (primitivism)': 50,
         'art nouveau': 40,
         'magic realism': 38,
         'academicism': 37,
         'neo-impressionism': 35,
         'orientalism': 24,
         'cloisonnism': 24,
         'pop art': 22,
         'neo-figurative art': 20,
         'pointillism': 19,
         'figurative expressionism': 18,
         'regionalism': 17,
         'fauvism': 13,
         'divisionism': 12,
         'neo-romanticism': 12,
         'renaissance': 12})

### 4. Description length

In [12]:
data_description_word_count = data.with_columns(
    pl.col("description")
    .map_elements(lambda x: len(x.split(" ")), return_dtype=pl.Int64)
    .alias("description word count")
)

fig = px.box(
    x=data_description_word_count["description word count"],
    title="Description Word Count",
    color_discrete_sequence=COLORS[0:1],
)
fig.update_xaxes(title_text="number of words")
fig.show()

In [13]:
description_lengths = ["shortest:\n", "\nmedium:\n", "\nlong:\n"]

for index, description_index in enumerate([0, 5000, 10000]):
    description = data_description_word_count.sort("description word count")["description"][
        description_index
    ]

    print(f"{description_lengths[index]}{description}")

shortest:
Mastelletta's grand, open-air fêtes champêtres, combining landscape and slender, silhouette-like figures, depicted through a simplified, expressionistic design, lie at the root of his fame and originality.

medium:
In the 17th century the painting was attributed to Giorgione, and later for a long time to Palma Vecchio. However, recently it was given to Tiziano.
The Bravo, so called because of the mysterious armed man with his back to us, falls into Titian's Giorgionesque phase and demonstrates an undercurrent of cruelty. The subject has been convincingly identified as the arrest of Bacchus by Pentheus, King of Thebes, who opposed the Bacchic cult. Bacchus' revenge was dire, and Pentheus was torn to pieces by his mother and sisters.

long:
Considered a rare oddity by contemporaries, this painting by "three hands" (Morazzone, Giovan Battista Crespi and Giulio Cesare Procaccini) recalls the competitions that were common in the fourteenth and fifteenth centuries, in which painter