## Partie 1

In [None]:
import json

def load_schema(path):
    """
    Load a JSON Schema file and reduce it to a simplified field description.

    Parameters
    ----------
    path : str
        Path to the .json file containing the JSON Schema.

    Returns
    -------
    dict
        ``{"title", "type", "properties", "required"}``. ``title`` defaults
        to ``"Unknown"``, ``type`` to ``"object"`` and ``required`` to ``[]``
        when absent from the schema. ``properties`` maps each field name to
        one of:

        - ``{"kind": "primitive", "type": ...}`` for string / number /
          integer / boolean fields;
        - ``{"kind": "object", "properties": {...}}`` for nested objects
          (also used when a field declares no ``type``);
        - ``{"kind": "array", "item_type": ..., "items_properties": ...}``
          for arrays, where ``items_properties`` is ``None`` unless the
          items are objects;
        - ``{"kind": "unknown", "raw": <original definition>}`` for any
          other declared type.

    Raises
    ------
    ValueError
        If the top-level schema has no ``properties`` key.
    """
    # Tuple (not set) so an unhashable ``type`` such as a list of types
    # simply fails the membership test and ends up as "unknown".
    primitive_types = ("string", "number", "integer", "boolean")

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        schema = json.load(f)

    # Minimal sanity check
    if "properties" not in schema:
        raise ValueError("Le JSON Schema ne contient pas de clé 'properties' (schema invalide).")

    def parse_properties(properties):
        """Recursively classify every property of an object definition."""
        parsed = {}

        for key, info in properties.items():
            # A definition without an explicit type is treated as an object
            field_type = info.get("type", "object")

            # Case 1: simple field
            if field_type in primitive_types:
                parsed[key] = {
                    "kind": "primitive",
                    "type": field_type,
                }

            # Case 2: nested object
            elif field_type == "object":
                parsed[key] = {
                    "kind": "object",
                    "properties": parse_properties(info.get("properties", {})),
                }

            # Case 3: array
            elif field_type == "array":
                items = info.get("items", {})
                item_type = items.get("type", "object")

                parsed[key] = {
                    "kind": "array",
                    "item_type": item_type,
                    # Only object items have sub-fields to describe
                    "items_properties": (
                        parse_properties(items.get("properties", {}))
                        if item_type == "object"
                        else None
                    ),
                }

            # Anything else is kept verbatim so the caller can inspect it
            else:
                parsed[key] = {"kind": "unknown", "raw": info}

        return parsed

    return {
        "title": schema.get("title", "Unknown"),
        "type": schema.get("type", "object"),
        "properties": parse_properties(schema["properties"]),
        "required": schema.get("required", []),
    }


In [5]:
# Parse the Product schema and pretty-print the simplified description.
# (The bare `schema` expression was dropped: only the last expression of a
# cell is displayed, so it had no effect before the print call.)
schema = load_schema("../product.json")
print(json.dumps(schema, indent=2))

{
  "title": "Product",
  "type": "object",
  "properties": {
    "IDP": {
      "kind": "primitive",
      "type": "string"
    },
    "name": {
      "kind": "primitive",
      "type": "string"
    },
    "brand": {
      "kind": "primitive",
      "type": "string"
    },
    "description": {
      "kind": "primitive",
      "type": "string"
    },
    "image_url": {
      "kind": "primitive",
      "type": "string"
    },
    "price": {
      "kind": "object",
      "properties": {
        "amount": {
          "kind": "primitive",
          "type": "number"
        },
        "currency": {
          "kind": "primitive",
          "type": "string"
        },
        "vat_rate": {
          "kind": "primitive",
          "type": "number"
        }
      }
    },
    "categories": {
      "kind": "array",
      "item_type": "object",
      "items_properties": {
        "title": {
          "kind": "primitive",
          "type": "string"
        }
      }
    },
    "supplier": {
     

## Partie 2

In [None]:
def build_structure(schema, stats=None):
    """
    Convert a parsed schema into the internal structure used for sizing.

    Parameters
    ----------
    schema : dict
        Parsed schema with a "title" and a "properties" mapping whose
        entries carry a "kind" of "primitive", "object" or "array".
    stats : dict, optional
        ``{collection_title: {array_field_name: cardinality}}``. An array
        field with no entry gets a cardinality of 1.

    Returns
    -------
    dict
        ``{"title": ..., "fields": {...}}`` where every field is described
        by its "type" plus "fields" (objects) or "cardinality" / "items"
        (arrays). Entries of any other kind are left out of the result.
    """
    if stats is None:
        stats = {}

    title = schema["title"]

    def convert(props):
        converted = {}

        for name, spec in props.items():
            kind = spec["kind"]

            # Primitive: only the type is kept
            if kind == "primitive":
                converted[name] = {"type": spec["type"]}
                continue

            # Nested object: recurse into its own properties
            if kind == "object":
                converted[name] = {
                    "type": "object",
                    "fields": convert(spec["properties"]),
                }
                continue

            # Every remaining kind except arrays is skipped
            if kind != "array":
                continue

            # Array: cardinality is looked up by field name under the
            # root collection's stats, whatever the nesting depth
            how_many = stats.get(title, {}).get(name, 1)
            element_type = spec["item_type"]
            if element_type == "object":
                element_fields = convert(spec["items_properties"])
            else:
                element_fields = None

            converted[name] = {
                "type": "array",
                "cardinality": how_many,
                "items": {"type": element_type, "fields": element_fields},
            }

        return converted

    return {"title": title, "fields": convert(schema["properties"])}


In [12]:
# build_structure looks up each array field's cardinality by name under the
# collection title; here every product carries 2 categories.
stats = {
    "Product": { "categories": 2 }
}
structure = build_structure(schema, stats)
# (The bare `structure` expression was dropped: it was not the last
# expression of the cell, so nothing was displayed for it.)
print(json.dumps(structure, indent=2))

{
  "title": "Product",
  "fields": {
    "IDP": {
      "type": "string"
    },
    "name": {
      "type": "string"
    },
    "brand": {
      "type": "string"
    },
    "description": {
      "type": "string"
    },
    "image_url": {
      "type": "string"
    },
    "price": {
      "type": "object",
      "fields": {
        "amount": {
          "type": "number"
        },
        "currency": {
          "type": "string"
        },
        "vat_rate": {
          "type": "number"
        }
      }
    },
    "categories": {
      "type": "array",
      "cardinality": 2,
      "items": {
        "type": "object",
        "fields": {
          "title": {
            "type": "string"
          }
        }
      }
    },
    "supplier": {
      "type": "object",
      "fields": {
        "IDS": {
          "type": "string"
        },
        "name": {
          "type": "string"
        },
        "SIRET": {
          "type": "string"
        },
        "headOffice": {
          "t

## Partie 3

In [13]:
def compute_document_size(structure):
    """
    Compute the size (in bytes) of one document from its internal structure.

    Every field costs a fixed key overhead plus the size of its value:
    a fixed size per primitive type, the sum of the sub-fields for an
    object, and ``cardinality * item_size`` for an array. A primitive array
    item is charged the key overhead plus its type size; an object item is
    charged the sum of its sub-fields.

    Parameters
    ----------
    structure : dict
        Structure produced by build_structure().

    Returns
    -------
    int
        Document size in bytes.

    Raises
    ------
    ValueError
        If a field, or the items of an array, use a type that is neither in
        the size table nor "object" / "array". Note that "boolean" has no
        entry in the size table and is therefore rejected as well.
    """

    TYPE_SIZES = {
        "string": 80,
        "number": 8,
        "integer": 8,
        "date": 20,
        "longstring": 200
    }

    KEY_OVERHEAD = 12  # size of the JSON key

    def size_of_fields(fields):
        total = 0

        for info in fields.values():
            field_type = info["type"]

            # ---- PRIMITIVE ----
            if field_type in TYPE_SIZES:
                total += KEY_OVERHEAD + TYPE_SIZES[field_type]

            # ---- OBJECT ----
            elif field_type == "object":
                total += KEY_OVERHEAD + size_of_fields(info["fields"])

            # ---- ARRAY ----
            elif field_type == "array":
                item = info["items"]
                item_type = item["type"]

                # size of a single array element
                if item_type in TYPE_SIZES:
                    item_size = KEY_OVERHEAD + TYPE_SIZES[item_type]
                elif item_type == "object":
                    item_size = size_of_fields(item["fields"])
                else:
                    # Previously any non-sized item type was treated as an
                    # object and crashed on its missing sub-fields.
                    raise ValueError(f"Type inconnu : {item_type}")

                total += KEY_OVERHEAD + info["cardinality"] * item_size

            else:
                raise ValueError(f"Type inconnu : {field_type}")

        return total

    return size_of_fields(structure["fields"])


In [14]:
# Rebuild the structure from the current schema and stats, then size it.
# (The assignment target was misspelled `tructure`, so the size was
# computed on whatever `structure` an earlier cell had left behind.)
structure = build_structure(schema, stats)

size = compute_document_size(structure)
size

1200

## Partie 4

In [15]:
def compute_collection_size(structure, stats):
    """
    Compute the total size of one collection.

    Parameters
    ----------
    structure : dict
        Internal structure produced by build_structure(); its "title" is
        used as the collection name.
    stats : dict
        Statistics holding at least ``{collection_name: {"count": N}}``.

    Returns
    -------
    dict
        ``document_size`` (bytes per document), ``count`` (number of
        documents), ``total_bytes``, and the same total expressed as
        ``total_MB`` and ``total_GB`` using powers of 1024.

    Raises
    ------
    ValueError
        If ``stats`` has no "count" for the collection.
    """
    collection_name = structure["title"]

    has_count = collection_name in stats and "count" in stats[collection_name]
    if not has_count:
        raise ValueError(f"Missing 'count' for collection '{collection_name}' in stats.")

    # Bytes per document, then number of documents
    per_document = compute_document_size(structure)
    n_documents = stats[collection_name]["count"]

    overall = per_document * n_documents
    mebibyte = 1024**2
    gibibyte = 1024**3

    summary = {
        "document_size": per_document,
        "count": n_documents,
        "total_bytes": overall,
    }
    summary["total_MB"] = overall / mebibyte
    summary["total_GB"] = overall / gibibyte
    return summary


In [17]:
# Product stats: 2 categories per product and 10**5 documents in the
# collection ("count" is what compute_collection_size requires).
stats = {
    "Product": {"categories": 2, "count": 10**5}
}
structure = build_structure(schema, stats)
result = compute_collection_size(structure, stats)
# Trailing expression: displayed as the cell output
result

{'document_size': 1200,
 'count': 100000,
 'total_bytes': 120000000,
 'total_MB': 114.44091796875,
 'total_GB': 0.11175870895385742}

## Partie 5

In [18]:
def compute_database_size(db_structures, stats):
    """
    Compute the total size of a database (DB1-DB5) from its collections.

    Parameters
    ----------
    db_structures : dict
        Maps a collection label to its internal structure, e.g.
        ``{"Product": structure_product, "Stock": structure_stock, ...}``.
    stats : dict
        Statistics holding at least ``{collection_name: {"count": N}}``;
        passed unchanged to compute_collection_size() for every structure.

    Returns
    -------
    dict
        ``collection_sizes`` (per-label result of
        compute_collection_size()), ``total_bytes`` (sum over all
        collections) and ``total_GB`` (that sum divided by 1024**3).
    """
    per_collection = {
        label: compute_collection_size(layout, stats)
        for label, layout in db_structures.items()
    }
    grand_total = sum(entry["total_bytes"] for entry in per_collection.values())

    return {
        "collection_sizes": per_collection,
        "total_bytes": grand_total,
        "total_GB": grand_total / (1024**3),
    }


DB1 testing

In [47]:
# Statistics for DB1: document counts per collection, plus per-field array
# cardinalities for Product.
stats_DB1 = {
    "Product": {
        "categories": 2,
        "suppliers": 1,  # 1 supplier per product
        "count": 10**5,  # 100k products
    },
    "Stock": {
        # 105 products are stored in each warehouse
        # 105 x 200 = 21,000 stock entries
        # NOTE(review): Product counts 10**5 documents; "105" may be a
        # copy-paste of 10^5 from the assignment - confirm.
        "count": 105 * 200,
    },
    "Warehouse": {"count": 200},
    "OrderLine": {
        # 4 billion orderlines total (given in the PDF)
        "count": 4 * 10**9,
    },
    "Client": {
        # 10 million clients
        "count": 10**7,
    },
}


In [48]:
# Parse the five DB1 schemas (paths are relative to the notebook's folder)
schema_prod = load_schema("../DB1/product.json")
schema_stock = load_schema("../DB1/stock.json")
schema_warehouse = load_schema("../DB1/warehouse.json")
schema_ol = load_schema("../DB1/orderline.json")
schema_client = load_schema("../DB1/client.json")

In [49]:
# Build the internal structure of each DB1 collection; stats_DB1 supplies
# the array cardinalities (arrays without an entry default to 1)
structure_prod = build_structure(schema_prod, stats_DB1)
structure_stock = build_structure(schema_stock, stats_DB1)
structure_warehouse = build_structure(schema_warehouse, stats_DB1)
structure_ol = build_structure(schema_ol, stats_DB1)
structure_client = build_structure(schema_client, stats_DB1)

In [50]:
# Collections that make up DB1. The keys label the results; the document
# count is looked up in the stats by each structure's own "title".
DB1_structures = {
    "Product": structure_prod,
    "Stock": structure_stock,
    "Warehouse": structure_warehouse,
    "OrderLine": structure_ol,
    "Client": structure_client
}

In [51]:
# Per-collection and total size of DB1
result_DB1 = compute_database_size(DB1_structures, stats_DB1)
# Trailing expression: displayed as the cell output
result_DB1

{'collection_sizes': {'Product': {'document_size': 1200,
   'count': 100000,
   'total_bytes': 120000000,
   'total_MB': 114.44091796875,
   'total_GB': 0.11175870895385742},
  'Stock': {'document_size': 204,
   'count': 21000,
   'total_bytes': 4284000,
   'total_MB': 4.085540771484375,
   'total_GB': 0.00398978590965271},
  'Warehouse': {'document_size': 276,
   'count': 200,
   'total_bytes': 55200,
   'total_MB': 0.052642822265625,
   'total_GB': 5.1409006118774414e-05},
  'OrderLine': {'document_size': 388,
   'count': 4000000000,
   'total_bytes': 1552000000000,
   'total_MB': 1480102.5390625,
   'total_GB': 1445.4126358032227},
  'Client': {'document_size': 276,
   'count': 10000000,
   'total_bytes': 2760000000,
   'total_MB': 2632.14111328125,
   'total_GB': 2.5704503059387207}},
 'total_bytes': 1554884339200,
 'total_GB': 1448.098886013031}

DB5 testing

In [62]:
# Statistics for DB5 (no OrderLine collection entry).
stats_DB5 = {
    "Product": {
        "categories": 2,
        "suppliers": 1,
        # NOTE(review): build_structure() matches stats keys against array
        # field names. The Product document_size reported for DB5
        # (1600 = 1200 + 12 + 388) suggests the embedded order-line array
        # was counted once, i.e. this key probably does not match the
        # schema's field name - confirm against ../DB5/product.json.
        "orderlines_per_product": 4 * 10**4,
        "count": 10**5,
    },
    "Stock": {"count": 105 * 200},
    "Warehouse": {"count": 200},
    "Client": {"count": 10**7},
}


In [63]:
# Parse the DB5 schemas; these rebind the schema_* names used for DB1
schema_prod = load_schema("../DB5/product.json")
schema_stock = load_schema("../DB5/stock.json")
schema_warehouse = load_schema("../DB5/warehouse.json")
# The order-line schema is not loaded for DB5
# (presumably order lines are embedded in Product here - TODO confirm)
#schema_ol = load_schema("../DB5/orderline.json")
schema_client = load_schema("../DB5/client.json")

In [64]:
# Build the internal structure of each DB5 collection; stats_DB5 supplies
# the array cardinalities (arrays without an entry default to 1)
structure_prod = build_structure(schema_prod, stats_DB5)
structure_stock = build_structure(schema_stock, stats_DB5)
structure_warehouse = build_structure(schema_warehouse, stats_DB5)
# No standalone OrderLine structure for DB5 (its schema is not loaded)
#structure_ol = build_structure(schema_ol, stats_DB5)
structure_client = build_structure(schema_client, stats_DB5)

In [65]:
# Collections that make up DB5; OrderLine is intentionally left out
DB5_structures = {
    "Product": structure_prod,
    "Stock": structure_stock,
    "Warehouse": structure_warehouse,
    #"OrderLine": structure_ol,
    "Client": structure_client
}

In [66]:
# Per-collection and total size of DB5
result_DB5 = compute_database_size(DB5_structures, stats_DB5)
# Trailing expression: displayed as the cell output
result_DB5

{'collection_sizes': {'Product': {'document_size': 1600,
   'count': 100000,
   'total_bytes': 160000000,
   'total_MB': 152.587890625,
   'total_GB': 0.14901161193847656},
  'Stock': {'document_size': 204,
   'count': 21000,
   'total_bytes': 4284000,
   'total_MB': 4.085540771484375,
   'total_GB': 0.00398978590965271},
  'Warehouse': {'document_size': 276,
   'count': 200,
   'total_bytes': 55200,
   'total_MB': 0.052642822265625,
   'total_GB': 5.1409006118774414e-05},
  'Client': {'document_size': 276,
   'count': 10000000,
   'total_bytes': 2760000000,
   'total_MB': 2632.14111328125,
   'total_GB': 2.5704503059387207}},
 'total_bytes': 2924339200,
 'total_GB': 2.7235031127929688}

## Partie 6

In [67]:
def compute_sharding_stats(collection_name, sharding_key, stats, nb_servers=1000):
    """
    Compute how a collection spreads over a cluster for a given sharding key.

    Parameters
    ----------
    collection_name : str
        Name of the collection ("Stock", "OrderLine", "Product", ...).
    sharding_key : str
        Name of the sharding key (e.g. "IDP", "IDW", "IDC", "brand").
    stats : dict
        Must provide ``stats[collection_name]["count"]`` and
        ``stats[collection_name]["distinct"][sharding_key]``.
    nb_servers : int
        Number of servers in the cluster (1000 by default).

    Returns
    -------
    dict
        ``collection``, ``sharding_key``, ``total_docs``,
        ``docs_per_server``, ``distinct_values`` and
        ``distinct_values_per_server``; the two per-server figures are
        plain divisions by ``nb_servers``.

    Raises
    ------
    ValueError
        If the collection, its "count", or the distinct-value figure for
        the sharding key is missing from ``stats``.
    """
    if collection_name not in stats:
        raise ValueError(f"No stats for collection '{collection_name}'.")

    entry = stats[collection_name]

    if "count" not in entry:
        raise ValueError(f"'count' missing for collection '{collection_name}' in stats.")

    # With no "distinct" section at all, every key counts as missing
    known_keys = entry["distinct"] if "distinct" in entry else ()
    if sharding_key not in known_keys:
        raise ValueError(
            f"Distinct values for key '{sharding_key}' "
            f"missing in stats for collection '{collection_name}'."
        )

    n_docs = entry["count"]
    n_distinct = entry["distinct"][sharding_key]

    return {
        "collection": collection_name,
        "sharding_key": sharding_key,
        "total_docs": n_docs,
        "docs_per_server": n_docs / nb_servers,
        "distinct_values": n_distinct,
        "distinct_values_per_server": n_distinct / nb_servers,
    }


In [None]:
# Sharding statistics: document count per collection and number of distinct
# values for each candidate sharding key.
stats_sharding = {
    "Product": {
        "count": 10**5,  # 10^5 products
        "distinct": {"IDP": 10**5, "brand": 5 * 10**3},
    },
    "Stock": {
        "count": 105 * 200,
        "distinct": {"IDP": 105, "IDW": 200},
    },
    "OrderLine": {
        "count": 4 * 10**9,
        "distinct": {"IDC": 10**7, "IDP": 10**5},
    },
}


In [None]:
# (collection, sharding key) pairs to report on
cases = [
    ("Stock", "IDP"),
    ("Stock", "IDW"),
    ("OrderLine", "IDC"),
    ("OrderLine", "IDP"),
    ("Product", "IDP"),
    ("Product", "brand"),
]

for coll, key in cases:
    res = compute_sharding_stats(coll, key, stats_sharding)
    # One report per case; the empty last entry yields the blank separator
    report = (
        f"{coll} - #{key}",
        f"  total docs                 : {res['total_docs']}",
        f"  docs per server            : {res['docs_per_server']}",
        f"  distinct values            : {res['distinct_values']}",
        f"  distinct values per server : {res['distinct_values_per_server']}",
        "",
    )
    print("\n".join(report))


Stock - #IDP
  total docs                 : 21000
  docs per server            : 21.0
  distinct values            : 105
  distinct values per server : 0.105

Stock - #IDW
  total docs                 : 21000
  docs per server            : 21.0
  distinct values            : 200
  distinct values per server : 0.2

OrderLine - #IDC
  total docs                 : 4000000000
  docs per server            : 4000000.0
  distinct values            : 10000000
  distinct values per server : 10000.0

OrderLine - #IDP
  total docs                 : 4000000000
  docs per server            : 4000000.0
  distinct values            : 100000
  distinct values per server : 100.0

Product - #IDP
  total docs                 : 100000
  docs per server            : 100.0
  distinct values            : 100000
  distinct values per server : 100.0

Product - #brand
  total docs                 : 100000
  docs per server            : 100.0
  distinct values            : 5000
  distinct values per server : 5.0