In [1]:
import polars as pl
import hvplot

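The data is available at https://www.kaggle.com/datasets/rtatman/lego-database?resource=download. The cells below expect the CSV files to be unpacked into an `archive/` directory next to this notebook.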

In [2]:
# Read the four LEGO database tables this analysis uses. The paths are relative,
# so the CSVs must live in an `archive/` directory under the working directory.
sets = pl.read_csv("archive/sets.csv")
inventories = pl.read_csv("archive/inventories.csv")            # joined to sets on set_num
inventory_parts = pl.read_csv("archive/inventory_parts.csv")    # joined to inventories on inventory_id
colors = pl.read_csv("archive/colors.csv")                      # joined to part colors on id

In [3]:
# Attach every inventory part to its set in two inner joins:
#   sets -> inventories        (shared key: set_num)
#   ...  -> inventory_parts    (inventories.id == inventory_parts.inventory_id)
# The result can be read as the pile of parts you would own after acquiring one
# of each set.
# NOTE(review): a set with more than one inventory row (see `version`) would
# have its parts counted once per inventory — confirm that is intended.
set_inventories = sets.join(inventories, on="set_num", how="inner")
parts_by_set = set_inventories.join(
    inventory_parts,
    left_on="id",
    right_on="inventory_id",
    how="inner",
)

parts_by_set.head(10)

set_num,name,year,theme_id,num_parts,id,version,part_num,color_id,quantity,is_spare
str,str,i64,i64,i64,i64,i64,str,i64,i64,str
"""7922-1""","""McDonald's Sports Set Number 6…",2004,460,4,1,1,"""48379c01""",72,1,"""f"""
"""7922-1""","""McDonald's Sports Set Number 6…",2004,460,4,1,1,"""48395""",7,1,"""f"""
"""7922-1""","""McDonald's Sports Set Number 6…",2004,460,4,1,1,"""mcsport6""",25,1,"""f"""
"""7922-1""","""McDonald's Sports Set Number 6…",2004,460,4,1,1,"""paddle""",0,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""11816pr0005""",78,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""2343""",47,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""3003""",29,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""30176""",2,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""3020""",15,1,"""f"""
"""3931-1""","""Emma's Splash Pool""",2012,494,43,3,1,"""3022""",15,2,"""f"""


In [4]:
# Total number of pieces ("pcs") of each color within each set. `year` is part
# of the grouping key so it stays available to the later per-year steps.
color_counts_per_set = parts_by_set.group_by(
    ["year", "set_num", "color_id"]
).agg(pcs=pl.col("quantity").sum())

color_counts_per_set.head(10)

year,set_num,color_id,pcs
i64,str,i64,i64
2007,"""7665-1""",42,5
2017,"""60139-1""",1,43
1998,"""9719-1""",22,4
2004,"""4856-1""",379,1
2000,"""1278-1""",1,2
2014,"""60064-1""",47,7
1995,"""1786-1""",33,5
2001,"""5833-1""",114,1
2010,"""9334-1""",28,1
2000,"""5825-1""",18,2


In [5]:
# Step 1: collapse sets away, leaving total pieces per (year, color).
pcs_per_year_color = color_counts_per_set.group_by(
    ["year", "color_id"]
).agg(pl.sum("pcs"))

# Step 2: express each color as a percentage of that year's pieces, then attach
# the color metadata (name, rgb, is_trans) from the colors table.
color_percs_per_year = (
    pcs_per_year_color
    .with_columns(
        perc=(100 * pl.col("pcs") / pl.col("pcs").sum()).over("year"),
    )
    .join(colors, left_on="color_id", right_on="id", how="left")
    # rgb values appear to be six hex digits, so left-padding to width 7 with
    # "#" simply prepends the "#" needed for a CSS-style color string.
    .with_columns(pl.col("rgb").str.pad_start(7, "#"))
)

color_percs_per_year.head(10)

year,color_id,pcs,perc,name,rgb,is_trans
i64,i64,i64,f64,str,str,str
2005,28,49,0.074057,"""Dark Tan""","""#958A73""","""f"""
2002,13,16,0.03072,"""Pink""","""#FC97AC""","""f"""
1997,15,2623,10.36595,"""White""","""#FFFFFF""","""f"""
2017,1003,14,0.017794,"""Glitter Trans-Light Blue""","""#68BCC5""","""t"""
2014,1000,79,0.063195,"""Glow in Dark White""","""#D9D9D9""","""f"""
2001,135,6,0.016816,"""Pearl Light Gray""","""#9CA3A8""","""f"""
2009,134,11,0.013599,"""Copper""","""#AE7A59""","""f"""
2011,191,80,0.097316,"""Bright Light Orange""","""#F8BB3D""","""f"""
2016,322,852,0.549107,"""Medium Azure""","""#36AEBF""","""f"""
2002,366,8,0.01536,"""Earth Orange""","""#FA9C1C""","""f"""


In [6]:
# Names of the colors whose share of a year's pieces exceeded 10% at least once.
popular_colors = (
    color_percs_per_year
    .filter(pl.col("perc") > 10)
    .select(pl.col("name").unique())
)

# A semi join keeps the rows of color_percs_per_year whose name is in
# popular_colors without adding any columns.
pop_percs_per_year = color_percs_per_year.join(
    popular_colors,
    on="name",
    how="semi",
)
pop_percs_per_year.head(10)

year,color_id,pcs,perc,name,rgb,is_trans
i64,i64,i64,f64,str,str,str
1997,15,2623,10.36595,"""White""","""#FFFFFF""","""f"""
1995,0,5735,24.756108,"""Black""","""#05131D""","""f"""
1993,47,180,1.085973,"""Trans-Clear""","""#FCFCFC""","""t"""
1998,7,5158,11.171999,"""Light Gray""","""#9BA19D""","""f"""
1958,15,711,37.980769,"""White""","""#FFFFFF""","""f"""
1968,47,131,4.120793,"""Trans-Clear""","""#FCFCFC""","""t"""
1959,4,27,41.538462,"""Red""","""#C91A09""","""f"""
2009,0,15811,19.547264,"""Black""","""#05131D""","""f"""
2009,2,1452,1.795119,"""Green""","""#237841""","""f"""
1965,14,107,9.990663,"""Yellow""","""#F2CD37""","""f"""


In [7]:
# Every color that never exceeded 10% in any year is lumped into one "Other"
# series: the anti join keeps the non-popular colors, whose yearly percentages
# are then summed.
other_perc_per_year = (
    color_percs_per_year
    .join(popular_colors, on="name", how="anti")
    .group_by("year")
    .agg(pl.sum("perc"))
    .with_columns(
        name=pl.lit("Other"),
        rgb=pl.lit("#39ff14"),  # hex color assigned to the "Other" series
    )
)

other_perc_per_year.head()

year,perc,name,rgb
i64,f64,str,str
1995,6.090823,"""Other""","""#39ff14"""
1959,1.538462,"""Other""","""#39ff14"""
2013,29.086115,"""Other""","""#39ff14"""
2000,25.686212,"""Other""","""#39ff14"""
1955,0.096339,"""Other""","""#39ff14"""


In [8]:
# Replacement for the `rgb` column: Trans-Clear is recolored to a light blue,
# every other row keeps the rgb it already has.
new_trans_clear = (
    pl.when(pl.col("name").eq("Trans-Clear"))
    .then(pl.lit("#CDE1FE"))
    .otherwise(pl.col("rgb"))
    .alias("rgb")
)

# Stack the popular colors on top of the "Other" rows. A diagonal concat is
# needed because the "Other" frame lacks color_id, pcs and is_trans; those
# cells become null.
stacked_percs = pl.concat(
    items=[pop_percs_per_year, other_perc_per_year],
    how="diagonal",
)
full_percs_per_year = stacked_percs.sort("year").with_columns(new_trans_clear)

full_percs_per_year.head(10)

year,color_id,pcs,perc,name,rgb,is_trans
i64,i64,i64,f64,str,str,str
1950,14.0,12.0,16.901408,"""Yellow""","""#F2CD37""","""f"""
1950,47.0,2.0,2.816901,"""Trans-Clear""","""#CDE1FE""","""t"""
1950,15.0,23.0,32.394366,"""White""","""#FFFFFF""","""f"""
1950,2.0,6.0,8.450704,"""Green""","""#237841""","""f"""
1950,4.0,12.0,16.901408,"""Red""","""#C91A09""","""f"""
1950,1.0,6.0,8.450704,"""Blue""","""#0055BF""","""f"""
1950,,,14.084507,"""Other""","""#39ff14""",
1953,4.0,16.0,24.242424,"""Red""","""#C91A09""","""f"""
1953,14.0,13.0,19.69697,"""Yellow""","""#F2CD37""","""f"""
1953,15.0,23.0,34.848485,"""White""","""#FFFFFF""","""f"""


In [9]:
# Plot the yearly share of every popular color plus the aggregated "Other" series.
#
# One line color per series, taken from the first `rgb` seen for each `name`.
# Deduplicating by name (rather than by distinct rgb) keeps the list the same
# length and order as the pivoted columns below, which are created in order of
# first appearance, even if two series happened to share a hex value.
#
# The list is deliberately not called `colors`: that name holds the colors
# table loaded from CSV, and rebinding it would break the color_percs_per_year
# cell whenever it is re-run after this one.
line_colors = (full_percs_per_year
    .select("name", "rgb")
    .unique(subset="name", keep="first", maintain_order=True))["rgb"].to_list()

plot = (full_percs_per_year
    .pivot(values="perc", index="year", columns="name")
    .fill_null(0)  # a color absent in a year contributes 0%
    .plot.line(
        x="year",
        color=line_colors,
        bgcolor="#EEEEEE", # between white and light gray
        width=1120,
        height=480,
        title="Change in Color Prevalence Over Time",
        xlabel="Year",
        ylabel="Percent of Bricks in All Unique Sets",
        group_label="Color"))

hvplot.save(plot, "figs/percent.png")
plot

As might be expected, the four main colors (red, green, blue, and yellow — RGBY), white, black, and shades of grey were once very common. Each of these colors is plotted individually because it accounted for more than 10% of the combined bricks across all Lego sets released in at least one year. Every other color is aggregated into the "Other" category. As such, the "Other" category includes navy blue, sky blue, etc. — just not the original blue.

While white, black, and greys have remained reasonably popular, the RGBY colors have become progressively less popular over time. Correspondingly, the percentage of bricks whose colors are not in these common groups has increased over time. A clear example of this shift to fine-grained colors is the replacement of light gray by light bluish gray during the early 2000s and, at the same time, the introduction of dark bluish gray. This raises the question: is the increase in special colors due to a greater diversity of brick colors within sets, or a greater diversity in dominant colors among sets?

In [10]:
# Keep only sets with 30+ pieces: the window sum gives every row its set's
# total piece count (across all of its colors), and rows of smaller sets are
# dropped.
color_counts_by_set = color_counts_per_set.filter(
    pl.col("pcs").sum().over("set_num").ge(30)
)

color_counts_by_set.head(10)

year,set_num,color_id,pcs
i64,str,i64,i64
2007,"""7665-1""",42,5
2017,"""60139-1""",1,43
1998,"""9719-1""",22,4
2004,"""4856-1""",379,1
2014,"""60064-1""",47,7
1995,"""1786-1""",33,5
2001,"""5833-1""",114,1
2010,"""9334-1""",28,1
2000,"""5825-1""",18,2
2006,"""8674-1""",36,1


Answer the question of how color diversity within sets has changed.

In [11]:
# Effective number of colors per set, measured with the inverse Simpson index:
#   isi = 1 / sum(freq ** 2)
# where freq is a color's fraction of the pieces in its set.
color_freqs_by_set = color_counts_by_set.with_columns(
    freq=(pl.col("pcs") / pl.col("pcs").sum()).over("set_num"),
)

eff_colors_per_set = (
    color_freqs_by_set
    .group_by("set_num")
    .agg(isi=pl.lit(1.) / (pl.col("freq") ** 2).sum())
    # the group_by dropped everything but set_num; re-attach the set metadata
    # (name, year, theme_id, num_parts)
    .join(sets, on="set_num")
)

eff_colors_per_set.head(10)

set_num,isi,name,year,theme_id,num_parts
str,f64,str,i64,i64,i64
"""00-1""",3.313384,"""Weetabix Castle""",1970,414,471
"""00-2""",2.89161,"""Weetabix Promotional House 1""",1976,413,147
"""00-3""",3.484696,"""Weetabix Promotional House 2""",1976,413,149
"""00-4""",3.321339,"""Weetabix Promotional Windmill""",1976,413,126
"""005-1""",2.115717,"""Basic Building Set in Cardboar…",1965,366,35
"""010-1""",2.660934,"""Basic Building Set in Cardboar…",1965,366,57
"""010-3""",2.676749,"""Basic Building Set""",1968,366,77
"""011-1""",2.460503,"""Basic Building Set""",1968,366,145
"""022-1""",2.643076,"""Basic Building Set""",1968,366,110
"""03093-1""",7.401813,"""The Race to Build It Board Gam…",1999,502,70


Answer the question of how the diversity of dominant colors has changed.

In [12]:
# A set's dominant color is the one with the largest piece count in that set.
# Ties are not broken: if two colors share the maximum, both rows are kept.
dom_color_by_set = color_counts_by_set.filter(
    pl.col("pcs") == pl.col("pcs").max().over("set_num")
)

dom_color_by_set.sort("set_num").head(10)

year,set_num,color_id,pcs
i64,str,i64,i64
1970,"""00-1""",15,222
1976,"""00-2""",0,71
1976,"""00-3""",4,65
1976,"""00-4""",4,48
1965,"""005-1""",4,17
1965,"""005-1""",15,17
1965,"""010-1""",4,25
1968,"""010-3""",4,34
1968,"""011-1""",4,68
1968,"""022-1""",4,52


In [13]:
# Number of distinct dominant colors among the sets released in each year.
# dom_color_by_set already carries each set's `year` (propagated from the sets
# table through color_counts_per_set), so no join back to `sets` is needed; the
# previous join only added unused columns and a duplicate `year_right`.
dom_colors_by_year = (dom_color_by_set
    .group_by("year")
        .agg(pl.col("color_id").n_unique().alias("colors"))
    .sort("year"))

dom_colors_by_year.head(10)

year,colors
i64,u32
1953,4
1954,1
1955,5
1956,1
1957,5
1958,6
1960,1
1961,3
1962,8
1963,6


In [14]:
# Layer 1: yearly mean of the per-set inverse Simpson index, drawn as a line.
# This layer also carries the figure-level options (size, title, axis labels).
mean_isi_line = (
    eff_colors_per_set
    .group_by("year")
    .agg(pl.mean("isi"))
    .sort("year")
    .plot.line(
        x="year",
        y="isi",
        label="(Mean) Effective Number of Colors per Set",
        legend="top",
        title="Change in Color Diversity Over Time (of Sets With 30+ Pieces)",
        xlabel="Year",
        ylabel="Number of Colors",
        width=1120,
        height=480,
    )
)

# Layer 2: one faint point per set, showing the spread behind the yearly mean.
# It intentionally shares the line's label.
set_isi_points = eff_colors_per_set.plot.scatter(
    x="year",
    y="isi",
    alpha=0.1,
    label="(Mean) Effective Number of Colors per Set",
    legend="top",
)

# Layer 3: count of distinct dominant colors per year, in red.
dom_colors_line = dom_colors_by_year.plot.line(
    x="year",
    y="colors",
    c="red",
    label="Unique Dominant Colors by Year",
    legend="top",
)

# Overlay the layers in the same order as before: line, points, red line.
plot = mean_isi_line * set_isi_points * dom_colors_line

hvplot.save(plot, "figs/diversity.png")
plot

In this second visual, we can see an increase in the diversity of colors that dominate sets, and a lesser increase in the effective number of colors. This suggests that sets still have a few influential colors, but that there is greater diversity in the particular colors being used. This makes sense, as too many colors would tend to result in a "loud" and potentially unattractive set.

We eliminated sets with fewer than 30 parts, a category that includes minifigures, key chains, and microscale builds, as they all tend to be monochromatic.

The unique dominant colors measure is simply the number of distinct colors that are the most numerous color in at least one set. So if we had two very red sets and one very blue set, the number of unique dominant colors would be 2. The effective number of colors metric is best explained with an example. Imagine you have built a Lego set and decide to include a minifigure whose top is a new color. While the total number of colors in your build has increased, if you take a step back, the most prevalent colors are still the same. Our effective number of colors metric reflects the number of important colors.