# Startup ERP Dummy Catalog

This notebook simulates a lightweight ERP dataset for a fictional B2B SaaS startup. It builds dimension tables, transactional facts, and a derived metrics catalog capturing essential growth KPIs.


In [None]:
from datetime import datetime, timedelta

import numpy as np
import polars as pl

# Seed NumPy's global RNG so every np.random draw in the cells below is
# reproducible when the notebook is run top to bottom.
np.random.seed(42)


In [None]:
# Cohort size and the categorical vocabularies customers are sampled from.
num_customers = 50
subscription_plans = ["Starter", "Growth", "Scale"]
industries = ["FinTech", "HealthTech", "Retail", "Logistics", "EdTech"]
customer_segments = ["SMB", "Mid-Market", "Enterprise"]

# Each customer goes live 0-539 days after 2023-01-01.
# NOTE: the order of the np.random calls in this cell determines the
# generated data for a given seed, so it is kept fixed:
# day offsets -> industry -> segment -> plan.
random_days = np.random.randint(0, 540, num_customers)
cohort_epoch = datetime(2023, 1, 1)
go_live_dates = [cohort_epoch + timedelta(days=int(days)) for days in random_days]

customer_ids = list(range(1, num_customers + 1))
company_names = [f"Acme {customer_id:03d}" for customer_id in customer_ids]
industry_draws = np.random.choice(industries, num_customers).tolist()
segment_draws = np.random.choice(
    customer_segments, num_customers, p=[0.55, 0.35, 0.10]
).tolist()
plan_draws = np.random.choice(
    subscription_plans, num_customers, p=[0.45, 0.40, 0.15]
).tolist()

# Customer dimension table: one row per customer.
customers = pl.DataFrame({
    "customer_id": customer_ids,
    "company_name": company_names,
    "industry": industry_draws,
    "segment": segment_draws,
    "plan": plan_draws,
    "go_live_date": go_live_dates,
})

customers.head()


In [None]:
# Plan dimension: one record per entry of `subscription_plans`, in the same
# order. Each tuple is
# (monthly_recurring_revenue, seats_included, overage_per_seat, target_segment).
plan_fields = [
    "plan",
    "monthly_recurring_revenue",
    "seats_included",
    "overage_per_seat",
    "target_segment",
]
plan_terms = [
    (299, 10, 29, "SMB"),
    (899, 50, 19, "Mid-Market"),
    (1999, 200, 9, "Enterprise"),
]
plans = pl.DataFrame([
    dict(zip(plan_fields, (plan_name, *terms)))
    for plan_name, terms in zip(subscription_plans, plan_terms)
])

# Product dimension: (product_id, category, base_price, is_add_on).
product_fields = ["product_id", "category", "base_price", "is_add_on"]
product_records = [
    ("ERP-Core", "Core", 299, False),
    ("ERP-Analytics", "Analytics", 249, True),
    ("ERP-Integrations", "Integrations", 199, True),
    ("ERP-Mobile", "Add-on", 99, True),
]
products = pl.DataFrame([
    dict(zip(product_fields, record)) for record in product_records
])

plans, products


In [None]:
def month_sequence(start: datetime, periods: int) -> list[datetime]:
    """Return `periods` consecutive month-start datetimes.

    The first element is the first day of `start`'s month (the day and time
    of `start` are ignored); each following element is the first day of the
    next calendar month. A non-positive `periods` yields an empty list.
    """
    # Flatten (year, month) into a single zero-based month counter so that
    # year rollover falls out of integer division.
    first_index = start.year * 12 + (start.month - 1)
    sequence = []
    for month_index in range(first_index, first_index + periods):
        year, zero_based_month = divmod(month_index, 12)
        sequence.append(datetime(year, zero_based_month + 1, 1))
    return sequence


def add_months(dt: datetime, offset: int) -> datetime:
    """Return the date `offset` calendar months away from `dt`, at midnight.

    Args:
        dt: Reference datetime. Only its year, month and day are used; any
            time-of-day component is dropped from the result.
        offset: Number of months to move; may be negative.

    Returns:
        A datetime in the target month with the same day-of-month as `dt`.
        When the target month is shorter (e.g. Jan 31 + 1 month), the day is
        clamped to the last day of that month instead of raising ValueError.
    """
    from calendar import monthrange

    years_carried, zero_based_month = divmod(dt.month - 1 + offset, 12)
    year = dt.year + years_carried
    month = zero_based_month + 1
    # monthrange()[1] is the number of days in the target month.
    day = min(dt.day, monthrange(year, month)[1])
    return datetime(year, month, day)


# Billing calendar: 24 month-start timestamps beginning January 2023.
billing_months = month_sequence(datetime(2023, 1, 1), 24)
last_billing_month = billing_months[-1]

# Plan attributes keyed by plan name for the per-customer lookup below.
plan_lookup = {row["plan"]: row for row in plans.to_dicts()}

invoices = []
customer_success_cost = []

# NOTE: the order of np.random calls inside this loop determines the
# generated data for a given seed; keep it stable when editing.
for customer in customers.iter_rows(named=True):
    plan_row = plan_lookup[customer["plan"]]
    base_mrr = plan_row["monthly_recurring_revenue"]
    # Billing starts on the first day of the go-live month.
    go_live = customer["go_live_date"]
    start_month = datetime(go_live.year, go_live.month, 1)

    # 18% of customers are assigned a churn month 6-19 months after start.
    churn_month = None
    if np.random.rand() < 0.18:
        churn_offset = int(np.random.randint(6, 20))
        churn_month = add_months(start_month, churn_offset)

    # Add-on revenue (possibly 0) kicks in 3-11 months after start.
    expansion_month = add_months(start_month, int(np.random.randint(3, 12)))
    add_on_revenue = int(np.random.choice([0, 99, 199, 249], p=[0.55, 0.25, 0.15, 0.05]))

    for month in billing_months:
        if month < start_month:
            continue
        if churn_month is not None and month >= churn_month:
            break

        mrr = base_mrr
        expansion_flag = False

        if month >= expansion_month:
            mrr += add_on_revenue
            expansion_flag = add_on_revenue > 0

        # 15% ramp discount while the invoice month is < 90 days after start.
        ramp_discount = 0.15 if (month - start_month).days < 90 else 0.0
        net_mrr = mrr * (1 - ramp_discount)

        # Monetary and percentage fields are always floats so every row
        # (including the 0.0-valued churn rows) has the same type, no matter
        # which rows polars samples when inferring the schema.
        invoices.append({
            "customer_id": customer["customer_id"],
            "invoice_month": month,
            "plan": customer["plan"],
            "gross_mrr": float(mrr),
            "net_mrr": float(net_mrr),
            "discount_pct": ramp_discount,
            "expansion": expansion_flag,
            "churned": False,
        })

    # Record a zero-revenue churn marker only when the churn happens inside
    # the billing window; otherwise the customer is still active at the end
    # of the simulation and no month beyond the window should appear.
    if churn_month is not None and churn_month <= last_billing_month:
        invoices.append({
            "customer_id": customer["customer_id"],
            "invoice_month": churn_month,
            "plan": customer["plan"],
            "gross_mrr": 0.0,
            "net_mrr": 0.0,
            "discount_pct": 0.0,
            "expansion": False,
            "churned": True,
        })

    # One-off acquisition cost and recurring customer-success cost per customer.
    cac = int(np.random.choice([1200, 2500, 4000], p=[0.5, 0.35, 0.15]))
    cs_monthly = int(np.random.choice([150, 250, 400], p=[0.45, 0.4, 0.15]))
    customer_success_cost.append({
        "customer_id": customer["customer_id"],
        "cac": cac,
        "cs_monthly_cost": cs_monthly,
    })

invoices = pl.DataFrame(invoices)
customer_success_cost = pl.DataFrame(customer_success_cost)

invoices.head()


In [None]:
# Order rows chronologically within each customer so shift(1) below yields
# that customer's previous invoice.
invoices = invoices.sort(["customer_id", "invoice_month"])

# Previous invoice's net MRR per customer; 0 for a customer's first invoice.
invoices = invoices.with_columns(
    pl.col("net_mrr").shift(1).fill_null(0).over("customer_id").alias("prev_net_mrr")
)

# `mrr_change` must exist before the movement columns are derived from it:
# all expressions in one with_columns call are evaluated against the input
# frame, so a column created in the same call cannot be referenced there.
invoices = invoices.with_columns(
    (pl.col("net_mrr") - pl.col("prev_net_mrr")).alias("mrr_change")
)

# Classify each invoice's MRR movement into new / expansion / contraction / churn.
invoices = invoices.with_columns([
    # First positive invoice after a zero (or absent) previous invoice.
    pl.when((pl.col("prev_net_mrr") == 0) & (pl.col("net_mrr") > 0))
    .then(pl.col("net_mrr"))
    .otherwise(0.0)
    .alias("new_mrr"),
    # Increase on an already-paying customer.
    pl.when((pl.col("prev_net_mrr") > 0) & (pl.col("mrr_change") > 0))
    .then(pl.col("mrr_change"))
    .otherwise(0.0)
    .alias("expansion_mrr"),
    # Decrease on an already-paying customer that is not a churn row,
    # reported as a positive amount.
    pl.when(
        (pl.col("prev_net_mrr") > 0)
        & (pl.col("mrr_change") < 0)
        & (~pl.col("churned"))
    )
    .then(-pl.col("mrr_change"))
    .otherwise(0.0)
    .alias("contraction_mrr"),
    # On a churn row, the whole previous MRR is lost.
    pl.when(pl.col("churned"))
    .then(pl.col("prev_net_mrr"))
    .otherwise(0.0)
    .alias("churn_mrr"),
])

invoices.head(10)


In [None]:
# Distinct customers with positive net MRR in each invoice month.
# `group_by` replaces `groupby`, which was deprecated in polars 0.19 and
# removed in polars 1.0.
monthly_active = (
    invoices.filter(pl.col("net_mrr") > 0)
    .group_by("invoice_month")
    .agg(pl.col("customer_id").n_unique().alias("active_customers"))
)

# Month-level totals of net MRR and each MRR movement category.
monthly_summary = invoices.group_by("invoice_month").agg([
    pl.col("net_mrr").sum().alias("net_mrr"),
    pl.col("new_mrr").sum().alias("new_mrr"),
    pl.col("expansion_mrr").sum().alias("expansion_mrr"),
    pl.col("contraction_mrr").sum().alias("contraction_mrr"),
    pl.col("churn_mrr").sum().alias("churn_mrr"),
])

# Months with no paying customer get active_customers = 0. Sorting happens
# after the join so the shift(1) below does not depend on the join keeping
# the row order.
monthly_summary = (
    monthly_summary.join(monthly_active, on="invoice_month", how="left")
    .with_columns(pl.col("active_customers").fill_null(0).cast(pl.Int64))
    .sort("invoice_month")
)

# `previous_mrr` is the preceding row's net MRR, i.e. the prior month as
# long as the summary has one row per consecutive month.
monthly_summary = monthly_summary.with_columns([
    (pl.col("net_mrr") * 12).alias("arr"),
    pl.col("net_mrr").shift(1).alias("previous_mrr"),
])

has_previous_mrr = pl.col("previous_mrr") > 0

monthly_summary = monthly_summary.with_columns([
    # Month-over-month change in net MRR; 0 when there is no positive prior MRR.
    pl.when(has_previous_mrr)
    .then((pl.col("net_mrr") - pl.col("previous_mrr")) / pl.col("previous_mrr"))
    .otherwise(0)
    .alias("mrr_growth_rate"),
    # Current MRR excluding new business, relative to prior MRR.
    pl.when(has_previous_mrr)
    .then((pl.col("net_mrr") - pl.col("new_mrr")) / pl.col("previous_mrr"))
    .otherwise(None)
    .alias("net_dollar_retention"),
    # Share of prior MRR not lost to churn or contraction.
    pl.when(has_previous_mrr)
    .then(
        1
        - (pl.col("churn_mrr") + pl.col("contraction_mrr")) / pl.col("previous_mrr")
    )
    .otherwise(None)
    .alias("gross_revenue_retention"),
])

monthly_summary.tail()


In [None]:
# Invoices that start a paying relationship (previous net MRR of 0, current
# net MRR positive), enriched with that customer's acquisition cost.
new_customer_events = (
    invoices.filter((pl.col("prev_net_mrr") == 0) & (pl.col("net_mrr") > 0))
    .select(["customer_id", "invoice_month", "net_mrr"])
    .join(customer_success_cost, on="customer_id", how="left")
)

# CAC attributed to the month in which the customer was acquired.
# `group_by` replaces `groupby`, which was deprecated in polars 0.19 and
# removed in polars 1.0.
monthly_cac = (
    new_customer_events.group_by("invoice_month")
    .agg(pl.col("cac").sum().alias("cac_spend"))
)

# Customer-success cost summed over customers with positive net MRR per month.
active_customer_cost = (
    invoices.filter(pl.col("net_mrr") > 0)
    .select(["customer_id", "invoice_month"])
    .join(customer_success_cost, on="customer_id", how="left")
    .group_by("invoice_month")
    .agg(pl.col("cs_monthly_cost").sum().alias("customer_success_spend"))
)

# Attach both spend series; months without a match get 0 spend.
monthly_summary = monthly_summary.join(monthly_cac, on="invoice_month", how="left")
monthly_summary = monthly_summary.join(active_customer_cost, on="invoice_month", how="left")
monthly_summary = monthly_summary.with_columns([
    pl.col("cac_spend").fill_null(0),
    pl.col("customer_success_spend").fill_null(0),
])

monthly_summary = monthly_summary.with_columns([
    # CAC divided by new MRR; null when no new MRR was booked that month.
    pl.when(pl.col("new_mrr") > 0)
    .then(pl.col("cac_spend") / pl.col("new_mrr"))
    .otherwise(None)
    .alias("cac_payback_months"),
    # Annualized net MRR divided by CAC spend; null when nothing was spent.
    pl.when(pl.col("cac_spend") > 0)
    .then((pl.col("net_mrr") * 12) / pl.col("cac_spend"))
    .otherwise(None)
    .alias("ltv_to_cac"),
])

monthly_summary.tail()


## Metrics Catalog


In [None]:
def _metric(metric_name: str, definition: str, field: str, owner: str) -> dict[str, str]:
    """Build one catalog row.

    Every metric in this catalog is a column of `monthly_summary` reported at
    monthly cadence, so only the name, definition, field and owner vary.
    """
    return {
        "metric_name": metric_name,
        "definition": definition,
        "table": "monthly_summary",
        "field": field,
        "owner": owner,
        "cadence": "Monthly",
    }


# One entry per KPI column of `monthly_summary`. "Contraction MRR" is listed
# because the column exists in the summary, and the Net Dollar Retention
# definition mentions contraction because the computed value
# ((net MRR - new MRR) / prior MRR) nets it out as well.
metrics_definitions = [
    _metric(
        "Active Customers",
        "Count of customers with positive billed MRR in the given month.",
        "active_customers",
        "RevOps",
    ),
    _metric(
        "Net MRR",
        "Total monthly recurring revenue after discounts and including expansions/contractions.",
        "net_mrr",
        "Finance",
    ),
    _metric(
        "New MRR",
        "Recurring revenue from brand new customers in the period.",
        "new_mrr",
        "Sales",
    ),
    _metric(
        "Expansion MRR",
        "Positive recurring revenue deltas from existing customers (upsell & cross-sell).",
        "expansion_mrr",
        "Customer Success",
    ),
    _metric(
        "Contraction MRR",
        "Negative recurring revenue deltas from existing customers that have not churned (downgrades).",
        "contraction_mrr",
        "Customer Success",
    ),
    _metric(
        "Churn MRR",
        "Recurring revenue lost due to churned customers in the period.",
        "churn_mrr",
        "Customer Success",
    ),
    _metric(
        "Net Dollar Retention",
        "(Prior month MRR - churn - contraction + expansion) divided by prior month MRR, excluding new business.",
        "net_dollar_retention",
        "Finance",
    ),
    _metric(
        "Gross Revenue Retention",
        "Percentage of recurring revenue retained from existing customers, excluding expansions.",
        "gross_revenue_retention",
        "Finance",
    ),
    _metric(
        "ARR",
        "Annualized recurring revenue (Net MRR * 12).",
        "arr",
        "Finance",
    ),
    _metric(
        "CAC Spend",
        "Customer acquisition cost dollars attributed to customers acquired in-period.",
        "cac_spend",
        "Marketing",
    ),
    _metric(
        "Customer Success Spend",
        "Allocated customer success operating expenses for active customers in the period.",
        "customer_success_spend",
        "Customer Success",
    ),
    _metric(
        "CAC Payback Period",
        "Months required to recoup CAC from new MRR in the period (CAC / New MRR).",
        "cac_payback_months",
        "Finance",
    ),
    _metric(
        "LTV to CAC",
        "Ratio of annualized recurring revenue to CAC spend in the period.",
        "ltv_to_cac",
        "Finance",
    ),
    _metric(
        "MRR Growth Rate",
        "Percent change in net MRR versus the prior month.",
        "mrr_growth_rate",
        "Finance",
    ),
]

metrics_catalog = pl.DataFrame(metrics_definitions)
metrics_catalog


## Persist to Databricks (optional)

The following cell materializes the synthetic dataset and metrics catalog into Unity Catalog tables when executed inside a Databricks workspace.


In [None]:
# Persist the tables only when a `spark` session name is defined in this
# environment; referencing the bare name raises NameError otherwise.
try:
    spark
except NameError:
    print("Spark session not available in this environment. Skipping table creation.")
else:
    # Convert each polars frame to a Spark DataFrame via its list-of-dicts
    # form. The monthly summary is sorted by month before conversion.
    (
        spark_df_customers,
        spark_df_plans,
        spark_df_invoices,
        spark_df_monthly_summary,
        spark_df_metrics_catalog,
    ) = [
        spark.createDataFrame(frame.to_dicts())
        for frame in (
            customers,
            plans,
            invoices,
            monthly_summary.sort("invoice_month"),
            metrics_catalog,
        )
    ]

    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")

    # Each table is fully replaced on every run (mode "overwrite").
    write_targets = [
        (spark_df_customers, "erp_demo.customers"),
        (spark_df_plans, "erp_demo.subscription_plans"),
        (spark_df_invoices, "erp_demo.invoices"),
        (spark_df_monthly_summary, "erp_demo.monthly_growth_metrics"),
        (spark_df_metrics_catalog, "erp_demo.metrics_catalog"),
    ]
    for spark_frame, table_name in write_targets:
        spark_frame.write.mode("overwrite").saveAsTable(table_name)

    print("Synthetic ERP catalog and growth metrics saved to Unity Catalog schema 'erp_demo'.")
