# Startup ERP Dummy Catalog

This notebook simulates a lightweight ERP dataset for a fictional B2B SaaS startup. It builds dimension tables, transactional facts, and a derived metrics catalog capturing essential growth KPIs.


In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the np.random draws in the cells below
# (customer attributes, churn, add-ons, cost allocation) are reproducible.
np.random.seed(42)


In [None]:
# Size of the customer dimension and the categorical vocabularies it samples from.
num_customers = 50
subscription_plans = ["Starter", "Growth", "Scale"]
industries = ["FinTech", "HealthTech", "Retail", "Logistics", "EdTech"]
customer_segments = ["SMB", "Mid-Market", "Enterprise"]

# Draw the random attributes one at a time, in the order
# industry -> segment -> plan -> go-live offset, so the random stream is
# consumed in a fixed, explicit sequence.
customer_ids = range(1, num_customers + 1)
industry_draws = np.random.choice(industries, size=num_customers)
segment_draws = np.random.choice(customer_segments, size=num_customers, p=[0.55, 0.35, 0.10])
plan_draws = np.random.choice(subscription_plans, size=num_customers, p=[0.45, 0.40, 0.15])
go_live_offsets_days = np.random.randint(0, 540, size=num_customers)

# Customer dimension: one row per customer, go-live dates spread over the
# 540 days starting 2023-01-01.
customers = pd.DataFrame({
    "customer_id": customer_ids,
    "company_name": [f"Acme {cid:03d}" for cid in customer_ids],
    "industry": industry_draws,
    "segment": segment_draws,
    "plan": plan_draws,
    "go_live_date": pd.to_datetime("2023-01-01") + pd.to_timedelta(go_live_offsets_days, unit="D"),
})

customers.head()


In [None]:
# Plan dimension, written one record per tier; the plan names themselves come
# from `subscription_plans` so the two stay in the same order.
plan_rows = [
    {"monthly_recurring_revenue": 299, "seats_included": 10, "overage_per_seat": 29, "target_segment": "SMB"},
    {"monthly_recurring_revenue": 899, "seats_included": 50, "overage_per_seat": 19, "target_segment": "Mid-Market"},
    {"monthly_recurring_revenue": 1999, "seats_included": 200, "overage_per_seat": 9, "target_segment": "Enterprise"},
]
plans = pd.DataFrame(plan_rows)
plans.insert(0, "plan", subscription_plans)

# Product dimension: the core module plus three add-ons.
product_rows = [
    {"product_id": "ERP-Core", "category": "Core", "base_price": 299, "is_add_on": False},
    {"product_id": "ERP-Analytics", "category": "Analytics", "base_price": 249, "is_add_on": True},
    {"product_id": "ERP-Integrations", "category": "Integrations", "base_price": 199, "is_add_on": True},
    {"product_id": "ERP-Mobile", "category": "Add-on", "base_price": 99, "is_add_on": True},
]
products = pd.DataFrame(product_rows)

plans, products


In [None]:
# Billing calendar: 24 month-start timestamps beginning 2023-01-01.
billing_months = pd.date_range("2023-01-01", periods=24, freq="MS")
last_billing_month = billing_months[-1]

# List price per plan, resolved once instead of filtering `plans` per customer.
plan_list_price = dict(zip(plans["plan"], plans["monthly_recurring_revenue"]))

invoice_rows = []
cost_rows = []

for customer in customers.itertuples(index=False):
    base_mrr = plan_list_price[customer.plan]
    # Billing starts on the first day of the go-live month.
    start_month = customer.go_live_date.to_period("M").to_timestamp()

    # 18% of customers churn, 6-19 months after their first billed month.
    churn_month = None
    if np.random.rand() < 0.18:
        churn_offset = np.random.randint(6, 20)
        churn_month = start_month + pd.DateOffset(months=churn_offset)

    # Add-on purchase kicks in 3-11 months after start; 55% of customers draw
    # a zero add-on and therefore never expand.
    expansion_month = start_month + pd.DateOffset(months=np.random.randint(3, 12))
    add_on_revenue = np.random.choice([0, 99, 199, 249], p=[0.55, 0.25, 0.15, 0.05])

    for month in billing_months:
        if month < start_month:
            continue
        if churn_month is not None and month >= churn_month:
            break

        expanded = month >= expansion_month
        mrr = base_mrr + add_on_revenue if expanded else base_mrr
        expansion_flag = bool(expanded and add_on_revenue > 0)

        # 15% ramp discount for invoices dated fewer than 90 days after start.
        # NOTE: the window is day-based, so it covers the first three invoices,
        # or four when the first three months total fewer than 90 days
        # (e.g. a February start in a non-leap year).
        ramp_discount = 0.15 if (month - start_month).days < 90 else 0.0
        net_mrr = mrr * (1 - ramp_discount)

        invoice_rows.append({
            "customer_id": customer.customer_id,
            "invoice_month": month,
            "plan": customer.plan,
            # Always float so the column dtype does not depend on whether any
            # zero-valued churn row (0.0) happens to be present.
            "gross_mrr": float(mrr),
            "net_mrr": net_mrr,
            "discount_pct": ramp_discount,
            "expansion": expansion_flag,
            "churned": False,
        })

    # Emit the zero-revenue churn marker only when the churn actually happens
    # inside the billing calendar. A churn date past the last billing month is
    # not observed in this dataset; writing it anyway would create invoice rows
    # (and monthly summary rows) for months that were never billed.
    if churn_month is not None and churn_month <= last_billing_month:
        invoice_rows.append({
            "customer_id": customer.customer_id,
            "invoice_month": churn_month,
            "plan": customer.plan,
            "gross_mrr": 0.0,
            "net_mrr": 0.0,
            "discount_pct": 0.0,
            "expansion": False,
            "churned": True,
        })

    # Estimate customer success & acquisition spend allocation
    cac = np.random.choice([1200, 2500, 4000], p=[0.5, 0.35, 0.15])
    cs_monthly = np.random.choice([150, 250, 400], p=[0.45, 0.4, 0.15])
    cost_rows.append({
        "customer_id": customer.customer_id,
        "cac": cac,
        "cs_monthly_cost": cs_monthly,
    })

# Explicit column lists keep the schema intact even if no rows were generated.
invoices = pd.DataFrame(
    invoice_rows,
    columns=[
        "customer_id",
        "invoice_month",
        "plan",
        "gross_mrr",
        "net_mrr",
        "discount_pct",
        "expansion",
        "churned",
    ],
)
customer_success_cost = pd.DataFrame(
    cost_rows, columns=["customer_id", "cac", "cs_monthly_cost"]
)

invoices.head()


In [None]:
# Order each customer's invoices chronologically so "previous row" means
# "previous invoice of the same customer".
invoices = invoices.sort_values(by=["customer_id", "invoice_month"], ignore_index=True)

# Prior invoice's net MRR per customer (0 for a customer's first invoice) and
# the change relative to it.
prior_net = invoices.groupby("customer_id")["net_mrr"].shift(1).fillna(0)
net_delta = invoices["net_mrr"] - prior_net
invoices["prev_net_mrr"] = prior_net
invoices["mrr_change"] = net_delta

# Masks shared by the movement buckets below.
had_prior_revenue = prior_net > 0
is_churn_row = invoices["churned"]

# New: first positive invoice after zero prior revenue.
invoices["new_mrr"] = invoices["net_mrr"].where(
    (prior_net == 0) & (invoices["net_mrr"] > 0), 0
)
# Expansion: positive change on an already-paying customer.
invoices["expansion_mrr"] = net_delta.where(had_prior_revenue & (net_delta > 0), 0)
# Contraction: negative change on a paying customer that is not a churn row,
# stored as a positive amount.
invoices["contraction_mrr"] = (-net_delta).where(
    had_prior_revenue & (net_delta < 0) & ~is_churn_row, 0
)
# Churn: the full prior net MRR on rows flagged as churned.
invoices["churn_mrr"] = prior_net.where(is_churn_row, 0)

invoices.head(10)


In [None]:
# Distinct customers with positive net MRR in each invoice month.
monthly_active = (
    invoices.loc[invoices["net_mrr"] > 0]
    .groupby("invoice_month")["customer_id"]
    .nunique()
)

# Monthly totals of net MRR and each MRR movement bucket.
monthly_summary = invoices.groupby("invoice_month")[
    ["net_mrr", "new_mrr", "expansion_mrr", "contraction_mrr", "churn_mrr"]
].sum()

# The prior-month comparisons below use shift(1), i.e. "previous row". Put the
# summary on a contiguous month-start calendar (zero-filled) so the previous
# row is always the previous calendar month, even if some month has no
# invoice rows at all.
if not monthly_summary.empty:
    contiguous_months = pd.date_range(
        monthly_summary.index.min(),
        monthly_summary.index.max(),
        freq="MS",
        name=monthly_summary.index.name,
    )
    monthly_summary = monthly_summary.reindex(index=contiguous_months, fill_value=0.0)

# Months without any paying customer get 0 rather than NaN.
monthly_summary["active_customers"] = (
    monthly_active.reindex(monthly_summary.index).fillna(0).astype(int)
)

monthly_summary["arr"] = monthly_summary["net_mrr"] * 12
monthly_summary["previous_mrr"] = monthly_summary["net_mrr"].shift(1)

# Month-over-month growth. The first month (no prior value) and months that
# follow a zero-MRR month (division by zero) are reported as 0.
monthly_summary["mrr_growth_rate"] = (
    (
        (monthly_summary["net_mrr"] - monthly_summary["previous_mrr"])
        / monthly_summary["previous_mrr"]
    )
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

# Retention ratios are only defined when the prior month had revenue.
has_prior_revenue = monthly_summary["previous_mrr"] > 0

# NDR: this month's MRR excluding new business, over last month's MRR.
monthly_summary["net_dollar_retention"] = (
    (monthly_summary["net_mrr"] - monthly_summary["new_mrr"])
    / monthly_summary["previous_mrr"]
).where(has_prior_revenue)

# GRR: share of last month's MRR not lost to churn or contraction.
monthly_summary["gross_revenue_retention"] = (
    1
    - (monthly_summary["churn_mrr"] + monthly_summary["contraction_mrr"])
    / monthly_summary["previous_mrr"]
).where(has_prior_revenue)

monthly_summary.tail()


In [None]:
# A customer is "acquired" on the invoice where net MRR turns positive after
# zero prior revenue; attach that customer's cost allocation to the event.
is_acquisition_row = (invoices["prev_net_mrr"] == 0) & (invoices["net_mrr"] > 0)
new_customer_events = invoices.loc[
    is_acquisition_row, ["customer_id", "invoice_month", "net_mrr"]
].merge(customer_success_cost, on="customer_id", how="left")

# CAC is booked in full in the acquisition month.
monthly_cac = new_customer_events.groupby("invoice_month")["cac"].sum()

# Customer-success cost accrues for every month a customer has positive net MRR.
paying_rows = invoices.loc[invoices["net_mrr"] > 0, ["customer_id", "invoice_month"]]
active_customer_cost = (
    paying_rows.merge(customer_success_cost, on="customer_id", how="left")
    .groupby("invoice_month")["cs_monthly_cost"]
    .sum()
)

# Align both spend series to the summary's months; months with no spend get 0.
monthly_summary["cac_spend"] = monthly_cac.reindex(monthly_summary.index).fillna(0)
monthly_summary["customer_success_spend"] = (
    active_customer_cost.reindex(monthly_summary.index).fillna(0)
)

# Payback = CAC / new MRR, left as NaN for months without new MRR.
monthly_summary["cac_payback_months"] = (
    monthly_summary["cac_spend"] / monthly_summary["new_mrr"]
).where(monthly_summary["new_mrr"] > 0)

# Annualized net MRR over CAC spend, left as NaN for months without CAC spend.
monthly_summary["ltv_to_cac"] = (
    (monthly_summary["net_mrr"] * 12) / monthly_summary["cac_spend"]
).where(monthly_summary["cac_spend"] > 0)

monthly_summary.tail()


## Metrics Catalog


In [None]:
# Catalog entries as (metric_name, definition, field, owner). Every metric is a
# column of `monthly_summary` with a monthly cadence, so those two attributes
# are filled in once when the records are built below.
metric_specs = [
    (
        "Active Customers",
        "Count of customers with positive billed MRR in the given month.",
        "active_customers",
        "RevOps",
    ),
    (
        "Net MRR",
        "Total monthly recurring revenue after discounts and including expansions/contractions.",
        "net_mrr",
        "Finance",
    ),
    (
        "New MRR",
        "Recurring revenue from brand new customers in the period.",
        "new_mrr",
        "Sales",
    ),
    (
        "Expansion MRR",
        "Positive recurring revenue deltas from existing customers (upsell & cross-sell).",
        "expansion_mrr",
        "Customer Success",
    ),
    # Contraction MRR is computed in monthly_summary and feeds Gross Revenue
    # Retention, so it needs its own catalog entry.
    (
        "Contraction MRR",
        "Negative recurring revenue deltas from existing, non-churned customers, reported as a positive amount.",
        "contraction_mrr",
        "Customer Success",
    ),
    (
        "Churn MRR",
        "Recurring revenue lost due to churned customers in the period.",
        "churn_mrr",
        "Customer Success",
    ),
    # The column is (net MRR - new MRR) / prior month MRR, which nets out
    # contraction as well as churn; the definition states that explicitly.
    (
        "Net Dollar Retention",
        "(Prior month MRR + expansion - contraction - churn) divided by prior month MRR, excluding new business.",
        "net_dollar_retention",
        "Finance",
    ),
    (
        "Gross Revenue Retention",
        "Percentage of recurring revenue retained from existing customers, excluding expansions.",
        "gross_revenue_retention",
        "Finance",
    ),
    (
        "ARR",
        "Annualized recurring revenue (Net MRR * 12).",
        "arr",
        "Finance",
    ),
    (
        "CAC Spend",
        "Customer acquisition cost dollars attributed to customers acquired in-period.",
        "cac_spend",
        "Marketing",
    ),
    (
        "Customer Success Spend",
        "Allocated customer success operating expenses for active customers in the period.",
        "customer_success_spend",
        "Customer Success",
    ),
    (
        "CAC Payback Period",
        "Months required to recoup CAC from new MRR in the period (CAC / New MRR).",
        "cac_payback_months",
        "Finance",
    ),
    (
        "LTV to CAC",
        "Ratio of annualized recurring revenue to CAC spend in the period.",
        "ltv_to_cac",
        "Finance",
    ),
    (
        "MRR Growth Rate",
        "Percent change in net MRR versus the prior month.",
        "mrr_growth_rate",
        "Finance",
    ),
]

# Expand each spec into the full catalog record (same keys and key order for
# every metric).
metrics_definitions = [
    {
        "metric_name": metric_name,
        "definition": definition,
        "table": "monthly_summary",
        "field": field,
        "owner": owner,
        "cadence": "Monthly",
    }
    for metric_name, definition, field, owner in metric_specs
]

metrics_catalog = pd.DataFrame(metrics_definitions)
metrics_catalog


## Persist to Databricks (optional)

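The following cell materializes the synthetic dataset (customers, subscription plans, invoices, and monthly growth metrics) and the metrics catalog into Unity Catalog tables under the `erp_demo` schema when executed inside a Databricks workspace. If no `spark` session is defined, the cell prints a notice and skips table creation.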


In [None]:
# Probe for the notebook-provided `spark` session; outside such an environment
# the name is undefined and the cell degrades to a printed notice.
try:
    spark
except NameError:
    print("Spark session not available in this environment. Skipping table creation.")
else:
    # Convert every pandas frame first (monthly_summary's month index becomes
    # a regular column), in the same order the tables are written below.
    (
        spark_df_customers,
        spark_df_plans,
        spark_df_invoices,
        spark_df_monthly_summary,
        spark_df_metrics_catalog,
    ) = [
        spark.createDataFrame(pandas_frame)
        for pandas_frame in (
            customers,
            plans,
            invoices,
            monthly_summary.reset_index(),
            metrics_catalog,
        )
    ]

    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")

    # Each table is fully replaced on every run.
    for table_name, spark_frame in (
        ("erp_demo.customers", spark_df_customers),
        ("erp_demo.subscription_plans", spark_df_plans),
        ("erp_demo.invoices", spark_df_invoices),
        ("erp_demo.monthly_growth_metrics", spark_df_monthly_summary),
        ("erp_demo.metrics_catalog", spark_df_metrics_catalog),
    ):
        spark_frame.write.mode("overwrite").saveAsTable(table_name)

    print("Synthetic ERP catalog and growth metrics saved to Unity Catalog schema 'erp_demo'.")
