In [None]:
# Mount Google Drive inside the Colab runtime so the dataset folder can be
# reached through a regular filesystem path.
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the project folder
# (the relative paths "./" and "../" used by later cells are resolved from here)
%cd "/content/drive/MyDrive/directory/dataset"

In [None]:
# Install rdflib (imported by the next code cell); --quiet reduces pip's output.
!pip install rdflib --quiet

# Property and class mappings

In [None]:
import os
import csv
import json
import rdflib
from rdflib import Graph, Namespace, RDF, URIRef
from collections import defaultdict

# Feature flag read by the processing cells: when True, native labels and
# given/family names are written as per-document "<doc>-<value>" strings and the
# standalone given-name / family-name items are not generated.
extension = False

# RDF namespaces used to build the class and predicate URIs of the mappings.
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
# NOTE(review): WESO_S is not referenced in the visible cells.
WESO_S = Namespace("http://weso.es/shapes/")
EX = Namespace("http://example.org/")
# NOTE(review): RDF_NS is not referenced in the visible cells; rdflib's RDF is used instead.
RDF_NS = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# RDF class URI -> {"label": human-readable class name used in item descriptions,
#                   "wikibase_id": value written to the "instance of" column}.
# The processing cells iterate this dict, so its order is the order in which the
# remaining classes are exported.
class_mappings = {
        str(CRM.E53_Place): {"label": "Place", "wikibase_id": "Q3"},
        str(EX.Occupation): {"label": "Occupation", "wikibase_id": "Q39"},
        str(CRM.E31_Document): {"label": "Document", "wikibase_id": "Q30"},
        str(CRM.E74_Group): {"label": "Group", "wikibase_id": "Q17"},
        str(CRM.E55_Type): {"label": "Type", "wikibase_id": "Q31"},
        str(CRM.E98_Currency): {"label": "Currency", "wikibase_id": "Q38"},
        str(CRM.E97_MonetaryAmount): {"label": "Monetary Amount", "wikibase_id": "Q37"},
        str(EX.PublicOffice): {"label": "Public Office", "wikibase_id": "Q56"},
        str(CRM.E27_Site): {"label": "Site", "wikibase_id": "Q27"},
        # Spelled as a literal URI: the local name contains "-", so it cannot be
        # written as an attribute access on the namespace.
        "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span": {"label": "TimeSpan", "wikibase_id": "Q2"},
        str(CRM.E21_Person): {"label": "Person", "wikibase_id": "Q16"},
        str(CRM.E96_Purchase): {"label": "Purchase", "wikibase_id": "Q14"},
        str(CRM.E8_Acquisition): {"label": "Acquisition", "wikibase_id": "Q9"},
        str(CRM.E7_Activity): {"label": "Activity", "wikibase_id": "Q8"},
        str(CRM.E12_Production): {"label": "Production", "wikibase_id": "Q13"},
        str(CRM.E69_Death): {"label": "Death", "wikibase_id": "Q35"},
    }

# RDF predicate URI -> CSV column header used for that predicate.
# Headers have the form "<P-id>:<type>", optionally followed by "PQ<n>-<text>"
# (presumably a Wikibase datatype and a qualifier property/value -- confirm
# against the import tool that consumes the CSV).
# The insertion order of this dict defines the column order of the exported CSV,
# and every header must be unique because it becomes a CSV field name.
property_mappings = {
    str(CRM.P1_is_identified_by): "P1:string",
    str(EX.nativeLabel): "P69:string",
    str(CRM.P180_has_currency): "P44:wikibase-item",
    str(CRM.P90_has_value): "P33:quantity",
    str(EX.applies_to_jurisdiction): "P64:wikibase-item",
    str(CRM.P53_has_former_or_current_location): "P27:wikibase-item",
    str(CRM.P82_at_some_time_within): "P32:time",
    str(EX.givenName): "P49:wikibase-item",
    str(EX.familyName): "P50:wikibase-item",
    str(EX.occupation): "P52:wikibase-item",
    str(EX.spouse): "P56:wikibase-item",
    str(EX.residence): "P55:wikibase-item",
    str(EX.child): "P59:wikibase-item",
    str(EX.father): "P57:wikibase-item",
    str(EX.position_held): "P65:wikibase-item",
    str(EX.mother): "P58:wikibase-item",
    str(EX.sibling): "P60:wikibase-item",
    str(EX.employer): "P54:wikibase-item",
    str(EX.work_location): "P53:wikibase-item",
    str(EX.nickname): "P51:string",
    # Kinship predicates: all use P61 and differ only in the text after "PQ62-".
    str(EX.relative_cousin): "P61:wikibase-itemPQ62-Cousin",
    str(EX.relative_grandchild): "P61:wikibase-itemPQ62-Grandchild",
    str(EX.relative_nephew): "P61:wikibase-itemPQ62-Nephew",
    str(EX.relative_uncle): "P61:wikibase-itemPQ62-Uncle",
    str(EX.relative_grandfather): "P61:wikibase-itemPQ62-Grandfather",
    str(EX.relative_grandmother): "P61:wikibase-itemPQ62-Grandmother",
    # Literal URIs: the local names contain "-" and cannot be attribute accesses.
    "http://example.org/relative_sibling-in-law": "P61:wikibase-itemPQ62-Sibling-in-law",
    "http://example.org/relative_father-in-law": "P61:wikibase-itemPQ62-Father-in-law",
    "http://example.org/relative_son-in-law": "P61:wikibase-itemPQ62-Son-in-law",
    str(EX.regnalOrdinal): "P66:string",
    str(EX.title): "P67:string",
    str(CRM.P70i_is_documented_in): "P28:wikibase-item",
    "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span": "P6:wikibase-item",
    str(CRM.P2_has_type): "P5:wikibase-item",
    str(CRM.P179_had_sales_price): "P43:wikibase-item",
    str(CRM.P24_transferred_title_of): "P19:wikibase-item",
    str(CRM.P23_transferred_title_from): "P18:wikibase-item",
    str(CRM.P22_transferred_title_to): "P17:wikibase-item",
    # Participation predicates: all use P12 and differ only in the text after
    # "PQ13-". The plain P14 predicate is exported with the "Witness" suffix; the
    # other variants are matched on predicate URIs that carry a "PQ14.1-<role>" suffix.
    str(CRM.P14_carried_out_by): "P12:wikibase-itemPQ13-Witness",
    "http://www.cidoc-crm.org/cidoc-crm/P14_carried_out_byPQ14.1-Notary": "P12:wikibase-itemPQ13-Notary",
    "http://www.cidoc-crm.org/cidoc-crm/P14_carried_out_byPQ14.1-Judge": "P12:wikibase-itemPQ13-Judge",
    "http://www.cidoc-crm.org/cidoc-crm/P14_carried_out_byPQ14.1-PartyA": "P12:wikibase-itemPQ13-PartyA",
    "http://www.cidoc-crm.org/cidoc-crm/P14_carried_out_byPQ14.1-PartyB": "P12:wikibase-itemPQ13-PartyB",
    str(CRM.P17_was_motivated_by): "P15:wikibase-item",
    str(CRM.P108_has_produced): "P37:wikibase-item",
    str(CRM.P100_died_in): "P35:wikibase-item",
    str(CRM.P183_ends_before_the_start_of): "P46:wikibase-item",
}

# RDF dataset parsing and graph creation

In [None]:
# Input folder (the working directory set by the first cell) and output files.
directory_path = "./"
output_csv_path = "../wikibase_import_dataset1.csv"
mapping_output_path = "../id_mapping.json"

# Merge every Turtle file of the dataset folder into a single graph.
all_data = Graph()

# os.listdir() returns names in arbitrary order; sorting them means the graph
# is populated in the same order on every run.
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith('.ttl'):
        all_data.parse(os.path.join(directory_path, filename), format='turtle')

# Occupation / Public Office records, grouped by lower-cased first P1 identifier
entities_by_p1 = defaultdict(list)
# Place records, keyed by the last path segment of the place URI
places_by_id = {}
# Group records, grouped by lower-cased first P1 identifier
groups_by_p1 = defaultdict(list)

# Dictionary to map original IDs to consolidated entity info
id_mapping = {}


# Preprocessing
Change the document-level IDs of Occupations and Public Offices to a consolidated, identifiable value. Places and Groups are collected in the same pass so that they can be exported later.

In [None]:
# Bucket every Occupation / Public Office, Place and Group found in the graph.
#
# Graph.subjects() yields a subject once per triple, so the subjects are
# de-duplicated first (dict.fromkeys keeps first-seen order). Without this each
# entity would be appended once per triple, inflating the groups and the
# "Consolidated N entities" figure reported by the export cell.
occupation_like_types = (str(EX.Occupation), str(EX.PublicOffice))
place_type = str(CRM.E53_Place)
group_type = str(CRM.E74_Group)

for s in dict.fromkeys(all_data.subjects()):
    # Local identifier = last path segment of the subject URI
    entity_id = str(s).split('/')[-1]

    for entity_type in list(all_data.objects(s, RDF.type)):
        type_uri = str(entity_type)
        if type_uri not in occupation_like_types and type_uri not in (place_type, group_type):
            continue

        # P1 values (identifiers) attached to this subject
        p1_values = [str(o) for o in all_data.objects(s, CRM.P1_is_identified_by)]

        if type_uri in occupation_like_types:
            # Entities without any P1 cannot be consolidated and are skipped
            if p1_values:
                # The first P1 value, lower-cased, is the consolidation key
                p1_key_original = p1_values[0]
                entities_by_p1[p1_key_original.lower()].append({
                    "original_id": entity_id,
                    "type": type_uri,
                    "p1_values": p1_values,
                    "original_p1": p1_key_original
                })

        elif type_uri == place_type:
            # Places are not consolidated: one record per entity ID
            places_by_id[entity_id] = {
                "type": type_uri,
                "p1_values": p1_values,
                "original_id": entity_id
            }

        else:  # CRM.E74_Group
            if p1_values:
                # The first P1 value, lower-cased, is the consolidation key
                p1_key_original = p1_values[0]
                groups_by_p1[p1_key_original.lower()].append({
                    "original_id": entity_id,
                    "type": type_uri,
                    "p1_values": p1_values,
                    "entity_uri": s,
                    "original_p1": p1_key_original
                })
            else:
                # No P1 available: fall back to the original ID as both the
                # bucket key and the label. The record is appended (not
                # assigned) so every bucket stays a list of records, which is
                # the shape the group-processing cell indexes into.
                groups_by_p1[entity_id].append({
                    "original_id": entity_id,
                    "type": type_uri,
                    "p1_values": [],
                    "entity_uri": s,
                    "original_p1": entity_id
                })


Consolidation of document-level persons.

In [None]:
# person_id -> {"documents": set of document IDs,
#               "properties": predicate URI -> set of values,
#               "original_ids": set of document-level entity IDs}
persons_by_id = defaultdict(dict)

person_type = str(CRM.E21_Person)

# Group Person entities by their person ID (Pxxx).
# Graph.subjects() yields a subject once per triple, so de-duplicate first
# (keeping first-seen order) instead of re-processing each person per triple.
for s in dict.fromkeys(all_data.subjects()):
    if not any(str(t) == person_type for t in all_data.objects(s, RDF.type)):
        continue

    full_entity_id = str(s).split('/')[-1]

    # IDs without a 'P' marker cannot be split into document / person parts
    # and are left out of the consolidation.
    if 'P' not in full_entity_id:
        continue

    id_parts = full_entity_id.split('P')
    doc_id = id_parts[0]
    if 'PB' in full_entity_id:
        # Special case DxPBx: treated as a separate entity (no consolidation)
        person_id = full_entity_id
    else:
        # Regular case DxxxPyyy: consolidate on the Pyyy part
        person_id = 'P' + id_parts[1]

    # Initialize the person record on first sight
    if person_id not in persons_by_id:
        persons_by_id[person_id] = {
            "documents": set(),
            "properties": defaultdict(set),
            "original_ids": set()
        }
    person_record = persons_by_id[person_id]

    # Add document reference and original ID
    person_record["documents"].add(doc_id)
    person_record["original_ids"].add(full_entity_id)

    # Collect every mapped property of this document-level person
    for p, o in all_data.predicate_objects(s):
        if str(p) not in property_mappings:
            continue
        # URI objects are reduced to their last path segment; literals are kept as text
        o_val = str(o).split('/')[-1] if isinstance(o, URIRef) else str(o)
        person_record["properties"][str(p)].add(o_val)

# Processing
## Occupations and Public Offices

In [None]:
# Rows of the export CSV; every processing cell appends to this list.
csv_rows = []

# Process consolidated Occupations / Public Offices: one row per P1 group
for p1_key_lower, entity_group in entities_by_p1.items():
    # All entities in this group share the same (case-insensitive) P1
    entity_type = entity_group[0]["type"]  # type of the first entity of the group
    class_info = class_mappings[entity_type]
    p1_display = entity_group[0]["original_p1"]

    # Unique document-level IDs of the group, sorted so the output is reproducible
    original_ids = sorted({info["original_id"] for info in entity_group})

    native_label = p1_display
    if extension:
        # Store the documents of the native label: D1-Oriz|D2-Oriz|...
        native_label = "|".join(sorted(
            {f"{oid.split('O')[0]}-{p1_display}" for oid in original_ids}
        ))

    # Create a consolidated entity row
    entity_row = {
        "Item Label": p1_display,  # Use P1 value as label
        "Description": f"Instance of {class_info['label']}",
        "instance of": class_info["wikibase_id"],
        "P1:string": "|".join(f"AMSPO-{oid}" for oid in original_ids),
        "P69:string": native_label
    }

    for prop_uri, prop_id in property_mappings.items():
        entity_row.setdefault(prop_id, "")

        # P1 already holds the consolidated original IDs
        if prop_id == "P1:string":
            continue

        # Gather the values of this property from every member of the group
        predicate = URIRef(prop_uri)
        found = set()
        for original_id in original_ids:
            # NOTE(review): the subject URI is rebuilt under http://example.org/;
            # entities published under another base would not be matched.
            entity_uri = URIRef(f"http://example.org/{original_id}")
            for o in all_data.objects(entity_uri, predicate):
                if isinstance(o, URIRef):
                    ref_id = str(o).split('/')[-1]
                    # References to already consolidated entities use their label
                    if ref_id in id_mapping:
                        ref_id = id_mapping[ref_id]["wikibase_label"]
                    found.add(ref_id)
                else:
                    # This is a literal value
                    found.add(str(o))

        # Merge with what the row already holds (never overwrite existing values)
        if found:
            found.update(entity_row[prop_id].split("|"))
            found.discard("")
            entity_row[prop_id] = "|".join(sorted(found))

    csv_rows.append(entity_row)

    # Update the mapping for all original IDs in this group
    for entity_info in entity_group:
        id_mapping[entity_info["original_id"]] = {
            "wikibase_label": p1_display,
            "p1_value": p1_display,
            "entity_type": entity_info["type"]
        }

## Places

In [None]:
# Export one row per Place. For places the roles are swapped compared with the
# consolidated classes: the entity ID becomes the P1 identifier, and the first
# P1 value (when there is one) becomes the label.
for entity_id, place_info in places_by_id.items():
    place_type = place_info["type"]
    class_info = class_mappings[place_type]

    identifiers = place_info["p1_values"]
    label = identifiers[0] if identifiers else entity_id

    csv_rows.append({
        "Item Label": label,
        "Description": f"Instance of {class_info['label']}",
        "instance of": class_info["wikibase_id"],
        "P1:string": entity_id,
        "P69:string": label,
    })

    # Register the place so later cells can resolve references to it
    id_mapping[entity_id] = {
        "wikibase_label": label,
        "p1_value": entity_id,
        "entity_type": place_type,
    }

In [None]:
# Process groups by P1: one consolidated row per bucket of groups_by_p1
for p1_key, entities in groups_by_p1.items():
    # A bucket may hold a single record dict instead of a list of records;
    # normalise so both shapes are handled instead of failing on entities[0].
    if isinstance(entities, dict):
        entities = [entities]
    if not entities:
        continue

    # Drop repeated records of the same subject, keeping first-seen order
    entities = list({info["entity_uri"]: info for info in entities}.values())

    first_entity = entities[0]
    # Records without a P1 carry no "original_p1"; fall back to their ID
    group_label = first_entity.get("original_p1", first_entity["original_id"])

    # Create a new consolidated entity
    group_row = {
        "Item Label": group_label,
        "Description": "Group",
        "instance of": class_mappings[str(CRM.E74_Group)]["wikibase_id"]
    }

    # P1 is the combination of the original IDs (only if P1 is an exported column)
    if "P1:string" in property_mappings.values():
        group_row["P1:string"] = "|".join(sorted({e["original_id"] for e in entities}))

    # Collect all properties from all entities with this P1
    for entity_info in entities:
        entity_id = entity_info["original_id"]
        entity_uri = entity_info["entity_uri"]

        # Map this entity ID to the consolidated label
        id_mapping[entity_id] = {
            "wikibase_label": entity_info.get("original_p1", entity_id),
            "entity_type": entity_info["type"]
        }

        # Add property values
        for prop_uri, prop_id in property_mappings.items():
            # Skip P1: it has already been set above
            if prop_id == "P1:string":
                continue

            # URI objects are reduced to their last path segment; literals kept as text
            found = [
                str(o).split('/')[-1] if isinstance(o, URIRef) else str(o)
                for o in all_data.objects(entity_uri, URIRef(prop_uri))
            ]
            if not found:
                continue

            # Merge with the values already collected for this column;
            # sorted so the cell content is the same on every run.
            merged = set(group_row[prop_id].split("|")) if group_row.get(prop_id) else set()
            merged.update(found)
            group_row[prop_id] = "|".join(sorted(merged))

    # Add to CSV rows
    csv_rows.append(group_row)

## Rest of the classes

In [None]:
# Classes exported by their own dedicated cells; skipped here
classes_handled_elsewhere = {
    str(EX.Occupation),
    str(EX.PublicOffice),
    str(CRM.E53_Place),
    str(CRM.E74_Group),
    str(CRM.E21_Person),
}

# Classes whose label is the entity ID prefixed with "AMSPO-"
amspo_prefixed_classes = {
    str(CRM.E27_Site),
    str(CRM.E97_MonetaryAmount),
    str(CRM.E96_Purchase),
    str(CRM.E8_Acquisition),
    str(CRM.E7_Activity),
    str(CRM.E12_Production),
}


def collect_mapped_values(subject):
    """Return {column header: [values]} for the mapped predicates set on ``subject``.

    Literal objects are kept as text. URI objects are reduced to their last path
    segment and, except for the P27 column, replaced by the consolidated label
    when ``id_mapping`` already knows the referenced ID. Columns without any
    value are absent from the result.
    """
    collected = {}
    for prop_uri, prop_id in property_mappings.items():
        literals = []
        references = []
        for o in all_data.objects(subject, URIRef(prop_uri)):
            if isinstance(o, URIRef):
                ref_id = str(o).split('/')[-1]
                if ref_id in id_mapping and prop_id != "P27:wikibase-item":
                    ref_id = id_mapping[ref_id]["wikibase_label"]
                references.append(ref_id)
            else:
                literals.append(str(o))
        if literals or references:
            collected[prop_id] = literals + references
    return collected


def death_item_label(label):
    """Build the item label of a Death entity from its ID.

    Uses the text before "Death" and the segment after the first "_". IDs
    without "_" keep the plain ID as label instead of raising IndexError.
    """
    parts = label.split('_')
    if len(parts) < 2:
        return label
    return f"Death before the start of AMSPO-{label.split('Death')[0]} ({parts[1]})"


for clase in class_mappings:
    if str(clase) in classes_handled_elsewhere:
        continue

    class_info = class_mappings[clase]
    is_death = str(clase) == str(CRM.E69_Death)

    # Rows of this class, keyed by label so same-label entities are merged
    entities_by_label = {}

    # Ask the graph for the subjects of this class only. Each (subject, type)
    # triple is unique, so every entity is visited exactly once -- iterating
    # Graph.subjects() without arguments would repeat it once per triple.
    for s in all_data.subjects(RDF.type, URIRef(clase)):
        entity_id = str(s).split('/')[-1]
        label = f"AMSPO-{entity_id}" if str(clase) in amspo_prefixed_classes else entity_id

        # References are resolved against id_mapping as it stands right now
        found = collect_mapped_values(s)

        existing_row = entities_by_label.get(label)
        if existing_row is None:
            entity_row = {
                "Item Label": death_item_label(label) if is_death else label,
                "Description": f"Instance of {class_info['label']}",
                "instance of": class_info["wikibase_id"]
            }
            if is_death:
                # Deaths use their own ID as identifier unless P1 values exist
                entity_row["P1:string"] = label

            for prop_id in property_mappings.values():
                if prop_id in found:
                    entity_row[prop_id] = "|".join(sorted(set(found[prop_id])))
                elif not (is_death and prop_id == "P1:string"):
                    entity_row[prop_id] = ""

            entities_by_label[label] = entity_row
        else:
            # Another entity already produced this label: merge the values
            for prop_id, new_values in found.items():
                merged = set(existing_row[prop_id].split("|")) if existing_row.get(prop_id) else set()
                merged.update(new_values)
                existing_row[prop_id] = "|".join(sorted(merged))

        # Register the entity so later references resolve to its label
        id_mapping[entity_id] = {
            "wikibase_label": label,
            "entity_type": str(clase)
        }

    # Sort the rows of this class alphabetically (case-insensitive) and export them
    if entities_by_label:
        csv_rows.extend(
            sorted(entities_by_label.values(), key=lambda row: row["Item Label"].lower())
        )


## Given name and family name

In [None]:
# Create one item per distinct given name and per distinct family name.
# Skipped in extension mode, where names are written per document on the person.
if not extension:
    given_names = set()
    family_names = set()

    # Collect all unique given names and family names from persons
    for person_id, person_data in persons_by_id.items():
        given_names.update(person_data["properties"].get(str(EX.givenName), ()))
        family_names.update(person_data["properties"].get(str(EX.familyName), ()))

    # (names, description, "instance of" item, prefix of the id_mapping key)
    name_kinds = (
        (given_names, "Given name", "Q53", "GivenName"),
        (family_names, "Family name", "Q54", "FamilyName"),
    )

    for names, description, wikibase_class, mapping_prefix in name_kinds:
        # Sorted so the rows come out in the same order on every run
        for name in sorted(names):
            if not name:  # Skip empty names
                continue

            csv_rows.append({
                "Item Label": name,
                "Description": description,
                "instance of": wikibase_class,
                "P69:string": name,
                # The alias is the string person rows use to reference this item
                "alias": f"{name} ({description.lower()})"
            })

            # Add to mapping in case we need to reference it
            id_mapping[f"{mapping_prefix}_{name}"] = {
                "wikibase_label": name,
                "p1_value": name,
                "entity_type": mapping_prefix
            }

## Person

In [None]:
def pick_name(values):
    """Return the alphabetically first non-empty value, or "" when there is none.

    Used instead of taking an arbitrary element of the set so that the label of
    a person with several recorded names is the same on every run.
    """
    return min((v for v in values if v), default="")


given_name_uri = str(EX.givenName)
family_name_uri = str(EX.familyName)
occupation_uris = (str(EX.occupation), str(EX.position_held))
per_document_uris = (str(EX.residence), str(EX.work_location), str(EX.nickname))

for person_id, person_data in persons_by_id.items():
    properties = person_data["properties"]

    # Create label from given name and family name if available
    given_name = pick_name(properties.get(given_name_uri, ()))
    family_name = pick_name(properties.get(family_name_uri, ()))
    title = pick_name(properties.get(str(EX.title), ()))

    if given_name and family_name:
        label = f"{given_name} {family_name}"
    elif given_name and title:
        label = f"{title} {given_name}"
    elif given_name:
        label = given_name
    else:
        label = person_id  # Fallback to person ID

    # Create a consolidated entity row
    entity_row = {
        "Item Label": label,
        "Description": f"{label} ({person_id} in AMSPO FSV)",
        "instance of": class_mappings[str(CRM.E21_Person)]["wikibase_id"],
        "P1:string": f"AMSPO-{person_id}",
        "alias": f"AMSPO-{person_id}"
    }

    # doc_id -> predicate URI -> values recorded for this person in that document
    refs_by_doc = defaultdict(lambda: defaultdict(set))

    # (predicate, keep URI objects only) pairs collected per document
    tracked = [(EX.residence, True), (EX.work_location, True), (EX.nickname, False)]
    if extension:
        tracked += [(EX.givenName, False), (EX.familyName, False)]

    for original_id in person_data["original_ids"]:
        # NOTE(review): the subject URI is rebuilt under http://example.org/;
        # persons published under another base would not be matched.
        entity_uri = URIRef(f"http://example.org/{original_id}")

        # Extract document ID from original entity ID
        doc_id = original_id.split('P')[0] if 'P' in original_id else ""

        for predicate, uri_only in tracked:
            for o in all_data.objects(entity_uri, URIRef(str(predicate))):
                if uri_only:
                    # Residence / work location: only references are kept
                    if isinstance(o, URIRef):
                        refs_by_doc[doc_id][str(predicate)].add(str(o).split('/')[-1])
                else:
                    refs_by_doc[doc_id][str(predicate)].add(str(o))

    # Add all collected properties
    for prop_uri, values in properties.items():
        if prop_uri not in property_mappings:
            continue
        prop_id = property_mappings[prop_uri]

        if prop_uri in occupation_uris:
            # Values shaped like D125Oc3 are rewritten to "<doc>-<consolidated P1>"
            cell_values = set()
            for value in values:
                entry = id_mapping.get(value) if 'Oc' in value else None
                if entry is None:
                    cell_values.add(value)
                else:
                    # Not every id_mapping entry carries "p1_value"; fall back
                    # to the label instead of raising KeyError.
                    entity_value = entry.get("p1_value", entry["wikibase_label"])
                    cell_values.add(f"{value.split('Oc')[0]}-{entity_value}")

        elif prop_uri in per_document_uris:
            # Residence, work location and nickname are written per document
            cell_values = {
                f"{doc_id}-{value}"
                for doc_id, props in refs_by_doc.items()
                for value in props.get(prop_uri, ())
            }

        elif prop_uri in (given_name_uri, family_name_uri):
            if extension:
                # Extension mode: names are written per document
                cell_values = {
                    f"{doc_id}-{value}"
                    for doc_id, props in refs_by_doc.items()
                    for value in props.get(prop_uri, ())
                }
            else:
                # Reference the name items through their alias
                suffix = "given name" if prop_uri == given_name_uri else "family name"
                cell_values = {f"{value} ({suffix})" for value in values}

        else:
            # Handle all other properties normally.
            # NOTE(review): this includes P1 -- a person with its own P1 values
            # replaces the "AMSPO-<person_id>" identifier set above; confirm intended.
            cell_values = values

        # Sorted so the cell content is the same on every run
        entity_row[prop_id] = "|".join(sorted(cell_values))

    # Add document references using P28 (replaces any value collected above)
    if person_data["documents"]:
        entity_row["P28:wikibase-item"] = "|".join(sorted(person_data["documents"]))

    csv_rows.append(entity_row)

    # Update the mapping for all original IDs
    for original_id in person_data["original_ids"]:
        id_mapping[original_id] = {
            "wikibase_label": label,
            "person_id": person_id,
            "entity_type": str(CRM.E21_Person)
        }

# Write to file

In [None]:
# Export the collected rows as a ';'-separated CSV.
# NOTE(review): latin-1 raises UnicodeEncodeError for any character outside
# Latin-1 -- confirm the import tool really expects this encoding.
with open(output_csv_path, 'w', newline='', encoding='latin-1') as csvfile:
    # Fixed descriptive columns first, then one column per mapped property
    fieldnames = ["Item Label", "Description", "alias", "instance of"]
    fieldnames.extend(property_mappings.values())

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
    writer.writeheader()
    writer.writerows(csv_rows)

# Persist the original-ID -> consolidated-entity mapping next to the CSV
with open(mapping_output_path, 'w', encoding='latin-1') as mapping_file:
    json.dump(id_mapping, mapping_file, indent=2)

# Short run summary
consolidated_total = sum(map(len, entities_by_p1.values()))
print(f"CSV file created at {output_csv_path}")
print(f"ID mapping created at {mapping_output_path}")
print(f"Consolidated {consolidated_total} Occupation entities into {len(entities_by_p1)} unique entities")
print(f"Processed {len(csv_rows)} total entities")