# TD1 - Big Data Structure - Test Notebook

This notebook validates the implementation of the size and sharding calculators against the expected results from TP1_correction.pdf.

## Setup

In [1]:
import sys
sys.path.append('..')

from models.schema import Schema, Field, Database
from parsers.schema_parser import SchemaParser
from models.collection import Collection
from models.statistics import Statistics
from calculators.size_calculator import SizeCalculator
from calculators.shard_calculator import ShardCalculator
from config.constants import *

## Initialize Statistics

In [2]:
stats = Statistics()
size_calc = SizeCalculator(stats)
shard_calc = ShardCalculator(stats)

print(f"Number of clients: {stats.num_clients:,}")
print(f"Number of products: {stats.num_products:,}")
print(f"Number of order lines: {stats.num_order_lines:,}")
print(f"Number of warehouses: {stats.num_warehouses:,}")
print(f"Number of stock entries: {stats.num_stock_entries:,}")
print(f"Number of servers: {stats.num_servers:,}")

Number of clients: 10,000,000
Number of products: 100,000
Number of order lines: 4,000,000,000
Number of warehouses: 200
Number of stock entries: 20,000,000
Number of servers: 1,000


## Part 1: Document Size Calculations

### 1.1 Product{[Cat],Supp} Schema

Expected: 980 B

In [None]:
product_schema = Schema(name="Product")
product_schema.add_field(Field(name="idp", field_type="integer"))
product_schema.add_field(Field(name="name", field_type="string"))
product_schema.add_field(Field(name="price_amount", field_type="number"))
product_schema.add_field(Field(name="price_vatRate", field_type="number"))
product_schema.add_field(Field(name="brand", field_type="string"))
product_schema.add_field(Field(name="description", field_type="longstring"))
product_schema.add_field(Field(name="image_url", field_type="string"))
product_schema.add_field(Field(name="ids", field_type="integer"))
product_schema.add_field(Field(name="supplier_revenue", field_type="number"))
product_schema.add_field(Field(name="supplier_name", field_type="string"))
product_schema.add_field(Field(name="supplier_headOffice", field_type="string"))

category_schema = Schema(name="Category")
category_schema.add_field(Field(name="title", field_type="string"))

product_schema.add_field(Field(
    name="categories",
    field_type="array",
    array_item_schema=category_schema
))

prod_doc_size = size_calc.calculate_document_size(
    product_schema,
    array_sizes={"categories": 2}
)
print(f"Product{{[Cat],Supp}} document size: {prod_doc_size} B")
print("Expected: 980 B")
print(f"Match: {prod_doc_size == 980}")


Product{[Cat],Supp} document size: 980 B
Expected: 980 B
Match: True


In [None]:
prod_collection = Collection(
    name="Prod{[Cat],Supp}",
    schema=product_schema,
    document_count=10**5
)

prod_coll_size = size_calc.calculate_collection_size(
    prod_collection,
    array_sizes={"categories": 2}
)

print(f"Collection size (bytes): {prod_coll_size}")
print(f"Collection size (GB): {(prod_coll_size / 1_000_000_000):.3f} GB")
print(f"Expected: 0.098 GB")
print(f"Match: {abs((prod_coll_size / 1_000_000_000) - 0.098) < 0.001}")


Collection size (bytes): 98000000
Collection size (GB): 0.098 GB
Expected: 0.098 GB
Match: True


### 1.2 Stock (St) Schema

Expected: 152 B

In [None]:
stock_schema = Schema(name="Stock")
stock_schema.add_field(Field(name="idp", field_type="integer"))
stock_schema.add_field(Field(name="idw", field_type="integer"))
stock_schema.add_field(Field(name="quantity", field_type="integer"))
stock_schema.add_field(Field(name="warehouse_name", field_type="string"))

stock_doc_size = size_calc.calculate_document_size(stock_schema)
print(f"Stock document size: {stock_doc_size} B")
print(f"Expected: 152 B")
print(f"Match: {stock_doc_size == 152}")

Stock document size: 152 B
Expected: 152 B
Match: True


In [43]:
stock_collection = Collection(
    name="St",
    schema=stock_schema,
    document_count=2 * 10**7
)

stock_coll_size = size_calc.calculate_collection_size(
    stock_collection, 
    array_sizes={}
)

print(f"Stock collection size (bytes): {stock_coll_size}")
print(f"Stock collection size (GB): {(stock_coll_size / 1_000_000_000):.3f} GB")
print("Expected: 3.040 GB")
print(f"Match: {abs((stock_coll_size / 1_000_000_000) - 3.04) < 0.001}")

Stock collection size (bytes): 3040000000
Stock collection size (GB): 3.040 GB
Expected: 3.040 GB
Match: True


### 1.3 Warehouse (Wa) Schema

Expected: 132 B

In [None]:
warehouse_schema = Schema(name="Warehouse")
warehouse_schema.add_field(Field(name="idw", field_type="integer"))
warehouse_schema.add_field(Field(name="location", field_type="string"))
warehouse_schema.add_field(Field(name="capacity", field_type="integer"))

warehouse_doc_size = size_calc.calculate_document_size(warehouse_schema)
print(f"Warehouse document size: {warehouse_doc_size} B")
print(f"Expected: 132 B")
print(f"Match: {warehouse_doc_size == 132}")

Warehouse document size: 132 B
Expected: 132 B
Match: True


In [45]:
warehouse_collection = Collection(
    name="Wa",
    schema=warehouse_schema,
    document_count=200
)

warehouse_coll_size = size_calc.calculate_collection_size(
    warehouse_collection,
    array_sizes={}
)

print(f"Warehouse collection size (bytes): {warehouse_coll_size}")
print("Expected: 26400 B")
print(f"Match: {abs((warehouse_coll_size) - 26400) < 0.001}")


Warehouse collection size (bytes): 26400
Expected: 26400 B
Match: True


### 1.4 Order Line (OL) Schema

Expected: 356 B

In [46]:
orderline_schema = Schema(name="OrderLine")
orderline_schema.add_field(Field(name="idc", field_type="integer"))
orderline_schema.add_field(Field(name="idp", field_type="integer"))
orderline_schema.add_field(Field(name="date", field_type="date"))
orderline_schema.add_field(Field(name="quantity", field_type="integer"))
orderline_schema.add_field(Field(name="delivery_date", field_type="date"))
orderline_schema.add_field(Field(name="comment", field_type="longstring"))
orderline_schema.add_field(Field(name="grade", field_type="integer"))

orderline_doc_size = size_calc.calculate_document_size(orderline_schema)
print(f"Order Line document size: {orderline_doc_size} B")
print(f"Expected: 356 B")
print(f"Match: {orderline_doc_size == 356}")

Order Line document size: 356 B
Expected: 356 B
Match: True


In [None]:
orderline_collection = Collection(
    name="OL",
    schema=orderline_schema,
    document_count=4 * 10**9
)

orderline_coll_size = size_calc.calculate_collection_size(
    orderline_collection,
    array_sizes={}
)

print(f"OrderLine collection size (bytes): {orderline_coll_size}")
print(f"OrderLine collection size (GB): {(orderline_coll_size / 1_000_000_000):.3f} GB")
print("Expected: 1424 GB")
print(f"Match: {abs((orderline_coll_size / 1_000_000_000) - 1424) < 0.001}")

OrderLine collection size (bytes): 1424000000000
OrderLine collection size (GB): 1424.000 GB
Expected: 8.900 GB
Match: True


### 1.5 Client (Cl) Schema

Expected: 512 B

In [31]:
client_schema = Schema(name="Client")
client_schema.add_field(Field(name="idc", field_type="integer"))
client_schema.add_field(Field(name="name", field_type="string"))
client_schema.add_field(Field(name="email", field_type="string"))
client_schema.add_field(Field(name="address", field_type="string"))
client_schema.add_field(Field(name="city", field_type="string"))
client_schema.add_field(Field(name="country", field_type="string"))
client_schema.add_field(Field(name="registration_date", field_type="date"))

client_doc_size = size_calc.calculate_document_size(client_schema)
print(f"Client document size: {client_doc_size} B")
print(f"Expected: 512 B")
print(f"Match: {client_doc_size == 512}")

Client document size: 512 B
Expected: 512 B
Match: True


In [49]:
client_collection = Collection(
    name="Cl",
    schema=client_schema,
    document_count=10**7
)

# Calcul de la taille de la collection
client_coll_size = size_calc.calculate_collection_size(
    client_collection,
    array_sizes={}
)

# Affichage + vÃ©rification
print(f"Client collection size (bytes): {client_coll_size}")
print(f"Client collection size (GB): {(client_coll_size / 1_000_000_000):.3f} GB")
print("Expected: 5.12 GB")
print(f"Match: {abs((client_coll_size / 1_000_000_000) - 5.12) < 0.001}")


Client collection size (bytes): 5120000000
Client collection size (GB): 5.120 GB
Expected: 5.12 GB
Match: True


## Part 2: Sharding

## Parser

In [None]:
product_schema_db2 = SchemaParser.parse_from_file("../schemas/product_db2.json")

product_collection_db2 = Collection(
    name="Prod_DB2",
    schema=product_schema_db2,
    document_count=100_000
)

array_sizes_db2 = {
    "categories": 2,
    "stocks": 200
}

doc_size_db2 = size_calc.calculate_document_size(
    product_schema_db2,
    array_sizes=array_sizes_db2
)

coll_size_db2 = size_calc.calculate_collection_size(
    product_collection_db2,
    array_sizes=array_sizes_db2
)

print("Doc size Product DB2 (bytes):", doc_size_db2)
print("Collection size Product DB2 (GB):", SizeCalculator.bytes_to_gb(coll_size_db2))


Doc size Product DB2 (bytes): 27612
Collection size Product DB2 (GB): 2.5715678930282593


In [None]:
stock_schema_db3 = SchemaParser.parse_from_file("../schemas/stock_db3.json")

stock_collection_db3 = Collection(
    name="St_DB3",
    schema=stock_schema_db3,
    document_count=2 * 10**7
)

array_sizes_db3 = {
    "categories": 2
}

doc_size_db3 = size_calc.calculate_document_size(
    stock_schema_db3,
    array_sizes=array_sizes_db3
)

coll_size_db3 = size_calc.calculate_collection_size(
    stock_collection_db3,
    array_sizes=array_sizes_db3
)

print("Doc size Stock DB3 (bytes):", doc_size_db3)
print("Collection size Stock DB3 (GB):", SizeCalculator.bytes_to_gb(coll_size_db3))

Doc size St{Prod{[Cat],Supp}} (bytes): 1352
Collection size St_DB3 (GB): 25.18296241760254


In [None]:
ol_schema_db4 = SchemaParser.parse_from_file("../schemas/orderline_db4.json")

ol_collection_db4 = Collection(
    name="OL_DB4",
    schema=ol_schema_db4,
    document_count=4 * 10**9
)

array_sizes_db4 = {
    "categories": 2
}

doc_size_db4 = size_calc.calculate_document_size(
    ol_schema_db4,
    array_sizes=array_sizes_db4
)

coll_size_db4 = size_calc.calculate_collection_size(
    ol_collection_db4,
    array_sizes=array_sizes_db4
)

print("Doc size OrderLine DB4 (bytes):", doc_size_db4)
print("Collection size OrderLine DB4 (GB):", SizeCalculator.bytes_to_gb(coll_size_db4))

Doc size OL{Prod{[Cat],Supp}} (bytes): 1344
Collection size OL_DB4 (GB): 5006.7901611328125


In [20]:
product_schema_db5 = SchemaParser.parse_from_file("../schemas/product_db5.json")

product_collection_db5 = Collection(
    name="Prod_DB5",
    schema=product_schema_db5,
    document_count=100_000
)

array_sizes_db5 = {
    "categories": 2,
    "orderlines": 4 * 10**4
}

doc_size_db5 = size_calc.calculate_document_size(product_schema_db5, array_sizes=array_sizes_db5)
coll_size_db5 = size_calc.calculate_collection_size(product_collection_db5, array_sizes=array_sizes_db5)

print("Doc size Product DB5 (bytes):", doc_size_db4)
print("Collection size Product DB5 (GB):", SizeCalculator.bytes_to_gb(coll_size_db4))


Doc size Product DB5 (bytes): 1344
Collection size Product DB5 (GB): 5006.7901611328125
