# Transforming Rebrickable Star Wars Dataset for D3 Network Graph

In [1]:
import pandas as pd
import itertools
import json
import re


In [2]:
# Load source CSVs (adjust path if needed)
inventory_minifigs = pd.read_csv("data/inventory_minifigs.csv")
inventories = pd.read_csv("data/inventories.csv")
sets = pd.read_csv("data/sets.csv")
minifigs = pd.read_csv("data/minifigs.csv")
themes = pd.read_csv("data/themes.csv")
corrections = pd.read_csv("data/minifigcorrections.csv")

# Validate
print("inventory_minifigs:", inventory_minifigs.shape)
print(inventory_minifigs.head(), "\n")

print("inventories:", inventories.shape)
print(inventories.head(), "\n")

print("sets:", sets.shape)
print(sets.head(), "\n")

print("minifigs:", minifigs.shape)
print(minifigs.head(), "\n")

print("themes:", themes.shape)
print(themes.head())


inventory_minifigs: (24199, 3)
   inventory_id     fig_num  quantity
0             3  fig-001549         1
1             4  fig-000764         1
2            19  fig-000555         1
3            25  fig-000574         1
4            26  fig-000842         1 

inventories: (43457, 3)
   id  version set_num
0   1        1  7922-1
1   3        1  3931-1
2   4        1  6942-1
3  15        1  5158-1
4  16        1   903-1 

sets: (25546, 6)
        set_num                         name  year  theme_id  num_parts  \
0  0003977811-1  Ninjago: Book of Adventures  2022       761          1   
1         001-1                        Gears  1965       756         43   
2        0011-2            Town Mini-Figures  1979        67         12   
3        0011-3   Castle 2 for 1 Bonus Offer  1987       199          0   
4        0012-1           Space Mini-Figures  1979       143         12   

                                             img_url  
0  https://cdn.rebrickable.com/media/sets/0003977...

## Clean Minifig Names

In [3]:
minifigs["name"] = minifigs["name"].str.replace("Astromech Droid, ", "", regex = False)

def clean_name(name: str) -> str:
    return re.split(r"- |,", name)[0].strip()

minifigs["clean_name"] = minifigs["name"].apply(clean_name)


      
correction_map = dict(zip(corrections["Minifig"], corrections["Corrected"]))

minifigs["clean_name"] = minifigs["clean_name"].map(correction_map).fillna(minifigs["clean_name"])

print(minifigs.head())

# Validate
#print(minifigs[["fig_num", "name", "clean_name"]].head())


      fig_num                                      name  num_parts  \
0  fig-000001                        Toy Store Employee          4   
1  fig-000002                              Customer Kid          4   
2  fig-000003                     Assassin Droid, White          8   
3  fig-000004  Man, White Torso, Black Legs, Brown Hair          4   
4  fig-000005           Captain America with Short Legs          3   

                                             img_url  \
0  https://cdn.rebrickable.com/media/sets/fig-000...   
1  https://cdn.rebrickable.com/media/sets/fig-000...   
2  https://cdn.rebrickable.com/media/sets/fig-000...   
3  https://cdn.rebrickable.com/media/sets/fig-000...   
4  https://cdn.rebrickable.com/media/sets/fig-000...   

                        clean_name  
0               Toy Store Employee  
1                     Customer Kid  
2                   Assassin Droid  
3                              Man  
4  Captain America with Short Legs  


## Join sets to themes and filter to Star Wars

In [4]:
sets_themes = sets.merge(themes, left_on="theme_id", right_on="id", suffixes=("_set", "_theme"))
print("Joined sets+themes:", sets_themes.shape)
print(sets_themes[["set_num", "name_set", "name_theme"]].head())

sw_sets = sets_themes[sets_themes["name_theme"] == "Star Wars"]
print("Star Wars sets:", sw_sets.shape)
print(sw_sets.head())


Joined sets+themes: (25546, 9)
        set_num                     name_set                      name_theme
0  0003977811-1  Ninjago: Book of Adventures  Activity Books with LEGO Parts
1         001-1                        Gears                       Samsonite
2        0011-2            Town Mini-Figures                    Classic Town
3        0011-3   Castle 2 for 1 Bonus Offer                    Lion Knights
4        0012-1           Space Mini-Figures                    Supplemental
Star Wars sets: (1019, 9)
      set_num                 name_set  year  theme_id  num_parts  \
2445  20006-1         Clone Turbo Tank  2008       158         64   
2456  20007-1  Republic Attack Cruiser  2009       158         84   
2469  20009-1             AT-TE Walker  2009       158         94   
2473  20010-1         Republic Gunship  2009       158         94   
2497  20016-1         Imperial Shuttle  2010       158         70   

                                                img_url   id name_

## Joining inv and minifigs

In [5]:
inventories = inventories[inventories["version"] == 1]

inv_sets = inventories.merge(sw_sets, on="set_num", how="inner")
print("Inventories joined with SW sets:", inv_sets.shape)

set_minifigs = inventory_minifigs.merge(inv_sets, left_on="inventory_id", right_on="id_x", how="inner")
set_minifigs = set_minifigs.merge(minifigs, on="fig_num", how="inner")

print("Final set_minifigs:", set_minifigs.shape)
print(set_minifigs.head())


Inventories joined with SW sets: (1019, 11)
Final set_minifigs: (1944, 18)
   inventory_id     fig_num  quantity  id_x  version  set_num  \
0            89  fig-001714         1    89        1  75094-1   
1            89  fig-004139         1    89        1  75094-1   
2            89  fig-004140         1    89        1  75094-1   
3            89  fig-004141         1    89        1  75094-1   
4            89  fig-004142         1    89        1  75094-1   

                    name_set  year  theme_id  num_parts_x  \
0  Imperial Shuttle Tydirium  2015       158          937   
1  Imperial Shuttle Tydirium  2015       158          937   
2  Imperial Shuttle Tydirium  2015       158          937   
3  Imperial Shuttle Tydirium  2015       158          937   
4  Imperial Shuttle Tydirium  2015       158          937   

                                           img_url_x  id_y name_theme  \
0  https://cdn.rebrickable.com/media/sets/75094-1...   158  Star Wars   
1  https://cdn.rebric

In [12]:
minifigNetwork = set_minifigs.rename(columns={
    "clean_name": "Minifig",
    "img_url_y": "Minifig_Image",
    "img_url_x": "Set_Image",
    "year": "Set_Year",
    "name_set": "Set_Name",
    "set_num": "Set_Number"
})

minifigNetwork = minifigNetwork[[
    "Minifig", "Minifig_Image", "Set_Image", "Set_Year", "Set_Name", "Set_Number"
]]

minifigTable = (
    minifigNetwork.groupby("Minifig")
      .agg(
          Appearances=("Set_Number", "nunique"),
          Sets=("Set_Name", list),
          SetNumbers=("Set_Number", list),
          FirstYear=("Set_Year", "min"),
          LastYear=("Set_Year", "max")
      )
      .reset_index()
      .sort_values("Appearances", ascending=False)
)

# save to file
minifigTable.to_json("data/lego_starwars_table.json", orient="records", indent=2)


print("wrote data/lego_starwars_table.json")



wrote data/lego_starwars_table.json


## Build Nodes

In [13]:
# --- Build nodes using count of unique sets ---
node_sizes = (
    minifigNetwork.groupby(["Minifig", "Minifig_Image"])["Set_Number"]
    .nunique()
    .reset_index()
    .rename(columns={"Set_Number": "size"})
)

print("Node summary:", node_sizes.shape)
print(node_sizes.head())

nodes = []
for minifig, group in minifigNetwork.groupby("Minifig"):
    node = {
        "id": minifig,
        "label": minifig,
        "size": group["Set_Number"].nunique(),
        "sets": group["Set_Number"].unique().tolist(),
        "set_images": group["Set_Image"].unique().tolist(),
        "years": sorted(group["Set_Year"].unique().tolist()),  # include all years for reference
        "first_year": group["Set_Year"].min(),
        "last_year": group["Set_Year"].max(),
        "img_url": group.sort_values("Set_Year", ascending=False)["Minifig_Image"].iloc[0]
    }
    nodes.append(node)


print("Sample nodes:", nodes[:5])



Node summary: (1338, 3)
                        Minifig  \
0  187th Legion Clone Commander   
1    187th Legion Clone Trooper   
2            2-1B Medical Droid   
3            2-1B Medical Droid   
4            2-1B Medical Droid   

                                       Minifig_Image  size  
0  https://cdn.rebrickable.com/media/sets/fig-012...     1  
1  https://cdn.rebrickable.com/media/sets/fig-012...     1  
2  https://cdn.rebrickable.com/media/sets/fig-003...     1  
3  https://cdn.rebrickable.com/media/sets/fig-003...     1  
4  https://cdn.rebrickable.com/media/sets/fig-004...     1  
Sample nodes: [{'id': '187th Legion Clone Commander', 'label': '187th Legion Clone Commander', 'size': 1, 'sets': ['75342-1'], 'set_images': ['https://cdn.rebrickable.com/media/sets/75342-1.jpg'], 'years': [2022], 'first_year': np.int64(2022), 'last_year': np.int64(2022), 'img_url': 'https://cdn.rebrickable.com/media/sets/fig-012579.jpg'}, {'id': '187th Legion Clone Trooper', 'label': '187th Legi

In [14]:
minifigNetwork.to_csv("data/minifigNetwork.csv", index=False)


## Build Edges

In [15]:
# --- Build edges: one co-occurrence per pair per set ---
# --- Build edges with set info ---
edges = []
for set_id, group in minifigNetwork.groupby("Set_Number"):
    figs = group["Minifig"].tolist()
    set_name = group["Set_Name"].iloc[0]
    set_img = group["Set_Image"].iloc[0]
    set_year = group["Set_Year"].iloc[0]

    for a, b in itertools.combinations(sorted(figs), 2):
        edges.append({
            "source": a,
            "target": b,
            "set_number": set_id,
            "set_name": set_name,
            "set_img": set_img,
            "set_year": set_year  # reference year here
        })

# group edges by (a, b) and collect sets
from collections import defaultdict
edge_map = defaultdict(lambda: {"sets": []})
for e in edges:
    key = tuple(sorted([e["source"], e["target"]]))
    edge_map[key]["sets"].append({
        "set_number": e["set_number"],
        "set_name": e["set_name"],
        "set_img": e["set_img"],
        "set_year": e["set_year"]
    })

links = [
    {
        "source": a,
        "target": b,
        "value": len(data["sets"]),
        "sets": data["sets"]
    }
    for (a, b), data in edge_map.items()
]

print("Number of edges:", len(links))
print("Sample edges:", links[:5])


Number of edges: 2303
Sample edges: [{'source': 'Darth Maul', 'target': 'Darth Vader', 'value': 1, 'sets': [{'set_number': '3340-1', 'set_name': 'Star Wars #1 - Sith Minifig Pack', 'set_img': 'https://cdn.rebrickable.com/media/sets/3340-1.jpg', 'set_year': np.int64(2000)}]}, {'source': 'Darth Maul', 'target': 'Emperor Palpatine', 'value': 1, 'sets': [{'set_number': '3340-1', 'set_name': 'Star Wars #1 - Sith Minifig Pack', 'set_img': 'https://cdn.rebrickable.com/media/sets/3340-1.jpg', 'set_year': np.int64(2000)}]}, {'source': 'Darth Vader', 'target': 'Emperor Palpatine', 'value': 7, 'sets': [{'set_number': '3340-1', 'set_name': 'Star Wars #1 - Sith Minifig Pack', 'set_img': 'https://cdn.rebrickable.com/media/sets/3340-1.jpg', 'set_year': np.int64(2000)}, {'set_number': '7200-1', 'set_name': 'Final Duel I', 'set_img': 'https://cdn.rebrickable.com/media/sets/7200-1.jpg', 'set_year': np.int64(2002)}, {'set_number': '7264-1', 'set_name': 'Imperial Inspection', 'set_img': 'https://cdn.rebri

In [18]:
graph_data = {
    "nodes": nodes,
    "links": links
}
import numpy as np

def default_converter(o):
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")

with open("data/lego_starwars_graph_v2.json", "w") as f:
    json.dump({"nodes": nodes, "links": links}, f, indent=2, default=default_converter)

print("Wrote data/lego_starwars_graph_v2.json")


Wrote data/lego_starwars_graph_v2.json


In [11]:
edges = []
for set_id, group in minifigNetwork.groupby("Set_Number"):
    figs = group["Minifig"].tolist()
    set_name = group["Set_Name"].iloc[0]
    set_img = group["Set_Image"].iloc[0]

    for a, b in itertools.combinations(sorted(figs), 2):
        edges.append({
            "source": a,
            "target": b,
            "set_number": set_id,
            "set_name": set_name,
            "set_img": set_img
        })

# group edges by (a, b) and collect sets
from collections import defaultdict
edge_map = defaultdict(lambda: {"sets": []})

for e in edges:
    key = tuple(sorted([e["source"], e["target"]]))
    edge_map[key]["sets"].append({
        "set_number": e["set_number"],
        "set_name": e["set_name"],
        "set_img": e["set_img"]
    })

links = [
    {
        "source": a,
        "target": b,
        "value": len(data["sets"]),
        "sets": data["sets"]
    }
    for (a, b), data in edge_map.items()
]

graph_data = {
    "nodes": nodes,
    "links": links
}