# Startup ERP Growth Catalog

This notebook generates synthetic ERP data as Spark DataFrames for a fictional B2B SaaS startup, derives monthly growth and profitability metrics from it, and can optionally persist the results to Unity Catalog tables.


In [None]:
from datetime import datetime, timedelta
import numpy as np

from pyspark.sql import functions as F, Window

# Seed NumPy's global RNG so the np.random draws in the following cells repeat
# across runs.
np.random.seed(42)

# Reuse a `spark` session when the environment already defines one. The bare
# name lookup raises NameError otherwise, in which case a session is created.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("erp_profitability_demo").getOrCreate()


In [None]:
# Size of the synthetic customer base and the categorical vocabularies drawn from.
num_customers = 50
subscription_plans = ["Starter", "Growth", "Scale"]
industries = ["FinTech", "HealthTech", "Retail", "Logistics", "EdTech"]
customer_segments = ["SMB", "Mid-Market", "Enterprise"]

customer_rows = []
for cid in range(1, num_customers + 1):
    # Go-live dates are spread over the 540 days starting 2023-01-01.
    go_live_offset = int(np.random.randint(0, 540))
    customer_rows.append({
        "customer_id": cid,
        "company_name": f"Acme {cid:03d}",
        # np.random.choice returns numpy.str_ scalars. Cast them to built-in str
        # (as randint is cast to int above) so Spark schema inference only ever
        # sees plain Python types; some PySpark versions reject numpy scalars.
        "industry": str(np.random.choice(industries)),
        "segment": str(np.random.choice(customer_segments, p=[0.55, 0.35, 0.10])),
        "plan": str(np.random.choice(subscription_plans, p=[0.45, 0.40, 0.15])),
        "go_live_date": datetime(2023, 1, 1) + timedelta(days=go_live_offset),
    })

customers_df = spark.createDataFrame(customer_rows)
customers_df.show(5, truncate=False)


In [None]:
# Subscription price list: one row per plan.
plan_columns = [
    "plan",
    "monthly_recurring_revenue",
    "seats_included",
    "overage_per_seat",
    "target_segment",
]
plan_rows = [
    ("Starter", 299.0, 10, 29.0, "SMB"),
    ("Growth", 899.0, 50, 19.0, "Mid-Market"),
    ("Scale", 1999.0, 200, 9.0, "Enterprise"),
]
plans_df = spark.createDataFrame(plan_rows, plan_columns)

# Product catalogue: the core module plus the add-ons (is_add_on = True).
product_columns = ["product_id", "category", "base_price", "is_add_on"]
product_rows = [
    ("ERP-Core", "Core", 299.0, False),
    ("ERP-Analytics", "Analytics", 249.0, True),
    ("ERP-Integrations", "Integrations", 199.0, True),
    ("ERP-Mobile", "Add-on", 99.0, True),
]
products_df = spark.createDataFrame(product_rows, product_columns)

# Preview both reference tables, plans first.
for reference_df in (plans_df, products_df):
    reference_df.show()


In [None]:
def month_sequence(start: datetime, periods: int) -> list[datetime]:
    """Return `periods` consecutive month starts beginning with the month of `start`.

    Only the year and month of `start` are used; every returned value is
    midnight on the first day of its month. A non-positive `periods` yields
    an empty list.
    """
    # Number the months linearly (year * 12 + zero-based month) so rolling over
    # December needs no special case.
    first_index = start.year * 12 + (start.month - 1)
    return [
        datetime(index // 12, index % 12 + 1, 1)
        for index in range(first_index, first_index + periods)
    ]


def add_months(dt: datetime, offset: int) -> datetime:
    """Return `dt` shifted by `offset` calendar months (negative offsets go back).

    The day of month is kept when it exists in the target month and is
    otherwise clamped to that month's last day (e.g. Jan 31 + 1 month gives
    Feb 28/29), so the call never fails on short months. The result is at
    midnight: any time-of-day component of `dt` is not carried over.
    """
    import calendar

    months_total = dt.month - 1 + offset
    # Floor division and modulo keep year/month consistent for negative offsets.
    year = dt.year + months_total // 12
    month = months_total % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return datetime(year, month, min(dt.day, last_day))


# Billing horizon: 24 monthly invoice dates starting January 2023.
billing_months = month_sequence(datetime(2023, 1, 1), 24)
last_billing_month = billing_months[-1]
plans_lookup = {row.plan: row for row in plans_df.collect()}

invoice_rows = []
customer_cost_rows = []

# Iterate in customer_id order so the seeded random draws are assigned to the
# same customers on every run, whatever order collect() returns the rows in.
for customer in sorted(customers_df.collect(), key=lambda row: row.customer_id):
    plan_row = plans_lookup[customer.plan]
    base_mrr = plan_row.monthly_recurring_revenue
    # Billing starts on the first day of the go-live month.
    start_month = datetime(customer.go_live_date.year, customer.go_live_date.month, 1)

    # 18% of customers churn, 6 to 19 months after their first billing month.
    churn_month = None
    if np.random.rand() < 0.18:
        churn_offset = int(np.random.randint(6, 20))
        churn_month = add_months(start_month, churn_offset)

    # Every customer gets an expansion date; it only changes revenue when the
    # add-on amount drawn for the customer is non-zero.
    expansion_month = add_months(start_month, int(np.random.randint(3, 12)))
    add_on_revenue = float(np.random.choice([0, 99, 199, 249], p=[0.55, 0.25, 0.15, 0.05]))

    for bill_month in billing_months:
        if bill_month < start_month:
            continue
        if churn_month is not None and bill_month >= churn_month:
            break

        mrr = float(base_mrr)
        expansion_flag = False

        if bill_month >= expansion_month:
            mrr += add_on_revenue
            expansion_flag = add_on_revenue > 0

        # Ramp discount on the first three invoices. Whole months are counted
        # instead of days: a 90-day cutoff gives some cohorts a fourth discounted
        # invoice (Feb 1 to May 1 in a non-leap year is only 89 days).
        months_since_start = (
            (bill_month.year - start_month.year) * 12
            + (bill_month.month - start_month.month)
        )
        ramp_discount = 0.15 if months_since_start < 3 else 0.0
        net_mrr = mrr * (1 - ramp_discount)
        # Delivery cost is drawn as 26-34% of net revenue.
        service_cogs_eur = float(net_mrr * np.random.uniform(0.26, 0.34))

        invoice_rows.append({
            "customer_id": customer.customer_id,
            "invoice_month": bill_month,
            "plan": customer.plan,
            "gross_mrr": mrr,
            "net_mrr": net_mrr,
            "service_cogs_eur": service_cogs_eur,
            "discount_pct": ramp_discount,
            "expansion": expansion_flag,
            "churned": False,
        })

    # Emit the zero-revenue churn marker only when the churn date lies inside
    # the billing horizon. A churn scheduled after the last billing month has
    # not happened yet and would otherwise create rows for future months.
    if churn_month is not None and churn_month <= last_billing_month:
        invoice_rows.append({
            "customer_id": customer.customer_id,
            "invoice_month": churn_month,
            "plan": customer.plan,
            "gross_mrr": 0.0,
            "net_mrr": 0.0,
            "service_cogs_eur": 0.0,
            "discount_pct": 0.0,
            "expansion": False,
            "churned": True,
        })

    # Per-customer acquisition cost and recurring customer-success cost.
    cac = float(np.random.choice([1200, 2500, 4000], p=[0.5, 0.35, 0.15]))
    cs_monthly = float(np.random.choice([150, 250, 400], p=[0.45, 0.4, 0.15]))
    customer_cost_rows.append({
        "customer_id": customer.customer_id,
        "cac": cac,
        "cs_monthly_cost": cs_monthly,
    })

invoices_df = spark.createDataFrame(invoice_rows)
customer_success_cost_df = spark.createDataFrame(customer_cost_rows)

invoices_df.show(5, truncate=False)


In [None]:
# Each customer's invoices in chronological order, used to look back one invoice.
window_customer = Window.partitionBy("customer_id").orderBy("invoice_month")

# Reusable column expressions for the movement classification below.
previous_mrr = F.col("prev_net_mrr")
mrr_delta = F.col("mrr_change")
is_churn_row = F.col("churned")
had_revenue_before = previous_mrr > 0

# Previous invoice's net MRR; 0.0 for a customer's first invoice.
invoices_enriched_df = invoices_df.withColumn(
    "prev_net_mrr",
    F.coalesce(F.lag("net_mrr").over(window_customer), F.lit(0.0)),
)
invoices_enriched_df = invoices_enriched_df.withColumn(
    "mrr_change", F.col("net_mrr") - previous_mrr
)

# New: revenue appears where the previous invoice had none.
invoices_enriched_df = invoices_enriched_df.withColumn(
    "new_mrr",
    F.when((previous_mrr == 0) & (F.col("net_mrr") > 0), F.col("net_mrr")).otherwise(F.lit(0.0)),
)
# Expansion: an already-paying customer pays more than on the previous invoice.
invoices_enriched_df = invoices_enriched_df.withColumn(
    "expansion_mrr",
    F.when(had_revenue_before & (mrr_delta > 0), mrr_delta).otherwise(F.lit(0.0)),
)
# Contraction: a paying customer pays less; churn rows are excluded.
invoices_enriched_df = invoices_enriched_df.withColumn(
    "contraction_mrr",
    F.when(had_revenue_before & (mrr_delta < 0) & (~is_churn_row), -mrr_delta).otherwise(F.lit(0.0)),
)
# Churn: the whole previous MRR is lost on the churn marker row.
invoices_enriched_df = invoices_enriched_df.withColumn(
    "churn_mrr",
    F.when(is_churn_row, previous_mrr).otherwise(F.lit(0.0)),
)

invoices_enriched_df.orderBy("customer_id", "invoice_month").show(10, truncate=False)


In [None]:
# Customers with a revenue-bearing invoice in each month.
paying_invoices_df = invoices_enriched_df.filter(F.col("net_mrr") > 0)
monthly_active_df = paying_invoices_df.groupBy("invoice_month").agg(
    F.countDistinct("customer_id").alias("active_customers")
)

# Measures that are summed per month and keep their own name.
summed_measures = [
    "service_cogs_eur",
    "new_mrr",
    "expansion_mrr",
    "contraction_mrr",
    "churn_mrr",
]
monthly_totals_df = invoices_enriched_df.groupBy("invoice_month").agg(
    F.sum("net_mrr").alias("revenue_eur"),
    *[F.sum(measure).alias(measure) for measure in summed_measures],
)

# Months with no paying customer have no match in the left join; report 0 there.
monthly_summary_df = (
    monthly_totals_df.join(monthly_active_df, "invoice_month", "left")
    .orderBy("invoice_month")
    .fillna({"active_customers": 0})
)
monthly_summary_df.show(5, truncate=False)


In [None]:
# A customer's first revenue-bearing invoice marks the month its CAC is booked.
new_customer_events_df = (
    invoices_enriched_df
    .filter((F.col("prev_net_mrr") == 0) & (F.col("net_mrr") > 0))
    .select("customer_id", "invoice_month")
    .join(customer_success_cost_df, "customer_id", "left")
)

monthly_cac_df = (
    new_customer_events_df.groupBy("invoice_month")
    .agg(F.sum("cac").alias("cac_spend"))
)

# Customer-success cost is charged for every month a customer is paying.
active_customer_cost_df = (
    invoices_enriched_df
    .filter(F.col("net_mrr") > 0)
    .select("customer_id", "invoice_month")
    .join(customer_success_cost_df, "customer_id", "left")
    .groupBy("invoice_month")
    .agg(F.sum("cs_monthly_cost").alias("customer_success_spend"))
)

monthly_summary_df = monthly_summary_df.join(monthly_cac_df, "invoice_month", "left")
monthly_summary_df = monthly_summary_df.join(active_customer_cost_df, "invoice_month", "left")

# Sort the month keys before drawing overhead: distinct().collect() returns rows
# in no guaranteed order, so unsorted keys would pair the seeded draws with
# different months from run to run.
month_keys = sorted(
    row.invoice_month
    for row in monthly_summary_df.select("invoice_month").distinct().collect()
)
overhead_rows = [(month, float(np.random.randint(14000, 19000))) for month in month_keys]
overhead_df = spark.createDataFrame(overhead_rows, ["invoice_month", "fixed_overhead_eur"])
monthly_summary_df = monthly_summary_df.join(overhead_df, "invoice_month", "left")

# Months without a match in the left joins carry no cost of that kind.
monthly_summary_df = monthly_summary_df.fillna({
    "cac_spend": 0.0,
    "customer_success_spend": 0.0,
    "fixed_overhead_eur": 0.0,
})

# Net new ARR for the month: annualised net MRR movement. This is the burn
# multiple denominator, matching the catalog definition "net burn divided by
# net new ARR" rather than total revenue.
net_new_arr_eur = (
    F.col("new_mrr")
    + F.col("expansion_mrr")
    - F.col("contraction_mrr")
    - F.col("churn_mrr")
) * F.lit(12.0)

monthly_summary_df = (
    monthly_summary_df
    .withColumn("gross_profit_eur", F.col("revenue_eur") - F.col("service_cogs_eur"))
    .withColumn(
        "operating_income_eur",
        F.col("revenue_eur")
        - F.col("service_cogs_eur")
        - F.col("customer_success_spend")
        - F.col("fixed_overhead_eur")
        - F.col("cac_spend"),
    )
    # Ratios are null, not 0, when their denominator is not positive.
    .withColumn(
        "gross_margin_pct",
        F.when(F.col("revenue_eur") > 0, F.col("gross_profit_eur") / F.col("revenue_eur")).otherwise(F.lit(None)),
    )
    .withColumn(
        "operating_margin_pct",
        F.when(F.col("revenue_eur") > 0, F.col("operating_income_eur") / F.col("revenue_eur")).otherwise(F.lit(None)),
    )
    # Burn is the operating loss as a positive number; 0 in profitable months.
    .withColumn(
        "cash_burn_eur",
        F.when(F.col("operating_income_eur") < 0, -F.col("operating_income_eur")).otherwise(F.lit(0.0)),
    )
    .withColumn(
        "net_cash_flow_eur",
        F.col("revenue_eur")
        - (F.col("service_cogs_eur") + F.col("customer_success_spend") + F.col("fixed_overhead_eur") + F.col("cac_spend")),
    )
    # Undefined (null) when no net new ARR was added in the month, so a
    # shrinking month is not reported as a perfectly efficient 0.0.
    .withColumn(
        "burn_multiple",
        F.when(net_new_arr_eur > 0, F.col("cash_burn_eur") / net_new_arr_eur).otherwise(F.lit(None)),
    )
    .withColumn(
        "gross_profit_to_cac",
        F.when(F.col("cac_spend") > 0, F.col("gross_profit_eur") / F.col("cac_spend")).otherwise(F.lit(None)),
    )
)

monthly_summary_df.orderBy("invoice_month").show(5, truncate=False)


## Metrics Catalog


In [None]:
# One (metric_name, definition, owner, efficient_if) tuple per catalog entry.
# The table, field and cadence values are the same for every metric and are
# filled in when the rows are built below.
metric_specs = [
    (
        "Monthly Recurring Revenue (MRR)",
        "Total recurring revenue recognised in the month across all active subscriptions.",
        "Finance",
        "MoM revenue growth > 0%",
    ),
    (
        "Annual Recurring Revenue (ARR)",
        "Annualised value of current recurring contracts (MRR × 12).",
        "Finance",
        "MoM revenue growth > 0%",
    ),
    (
        "Net Revenue Retention (NRR)",
        "Retention of existing customers' revenue, including expansion and contraction.",
        "Finance",
        "NRR ≥ 100%",
    ),
    (
        "Customer Acquisition Cost (CAC)",
        "Total sales and marketing spend allocated to new customers in-period.",
        "Marketing",
        "CAC ≤ €3k",
    ),
    (
        "Customer Lifetime Value (LTV)",
        "Present value of expected gross profit from a customer, based on ARPA, margin and churn.",
        "Finance",
        "LTV / CAC ≥ 3x",
    ),
    (
        "Gross Margin",
        "Share of revenue retained after direct service delivery costs.",
        "Finance",
        "Gross margin ≥ 75%",
    ),
    (
        "CAC Payback Period",
        "Months required for net new recurring revenue to repay customer acquisition cost.",
        "Finance",
        "Payback ≤ 12 months",
    ),
    (
        "Revenue Growth Rate (MoM)",
        "Month-over-month change in recurring revenue.",
        "Finance",
        "Growth > 0%",
    ),
    (
        "Revenue Growth Rate (YoY)",
        "Year-over-year change in recurring revenue for the same month.",
        "Finance",
        "Growth > 0%",
    ),
    (
        "Burn Multiple",
        "Capital efficiency indicator: net burn divided by net new ARR.",
        "Finance",
        "Burn multiple ≤ 1.5x",
    ),
    (
        "Rule of 40",
        "Combined score of revenue growth and operating margin (growth % + margin %).",
        "Finance",
        "Rule of 40 ≥ 40",
    ),
    (
        "Logo Retention Rate",
        "Percentage of customers retained period over period.",
        "Customer Success",
        "Retention ≥ 90%",
    ),
    (
        "Logo Churn Rate",
        "Share of customers lost relative to the prior period customer base.",
        "Customer Success",
        "Churn ≤ 10%",
    ),
    (
        "Active Users (DAU/MAU ratio)",
        "Stickiness indicator comparing daily active users to monthly active users.",
        "Product",
        "DAU/MAU ≥ 0.30",
    ),
    (
        "Expansion Revenue / Upsell Rate",
        "Proportion of revenue coming from expansion MRR within the month.",
        "Customer Success",
        "Expansion ≥ 10% of ending MRR",
    ),
]

# Expand each spec into the full catalog row.
metrics_definitions = [
    {
        "metric_name": metric_name,
        "definition": definition,
        "table": "erp_demo.gold_profitability_metrics_view",
        "field": "metric_value",
        "owner": owner,
        "cadence": "Monthly",
        "efficient_if": efficient_if,
    }
    for metric_name, definition, owner, efficient_if in metric_specs
]

metrics_catalog_df = spark.createDataFrame(metrics_definitions)
metrics_catalog_df.show(truncate=False)


## Persist to Databricks (optional)

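The following cell writes the synthetic dataset and the metrics catalog to tables in the `erp_demo` schema, creating the schema if needed and overwriting any existing tables of the same name. It is intended to be run inside a Databricks workspace with Unity Catalog.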


In [None]:
# Persist only when a `spark` session is defined; the bare name lookup raises
# NameError otherwise.
try:
    spark
except NameError:
    print("Spark session not available in this environment. Skipping table creation.")
else:
    spark.sql("CREATE DATABASE IF NOT EXISTS erp_demo")

    # (DataFrame, target table) pairs, written in the listed order. Each write
    # replaces the existing table contents.
    tables_to_write = [
        (customers_df, "erp_demo.customers"),
        (plans_df, "erp_demo.subscription_plans"),
        (products_df, "erp_demo.products"),
        (invoices_enriched_df, "erp_demo.invoices_enriched"),
        (customer_success_cost_df, "erp_demo.customer_success_costs"),
        (monthly_summary_df, "erp_demo.monthly_growth_metrics"),
        (metrics_catalog_df, "erp_demo.growth_metrics_catalog"),
    ]
    for frame, table_name in tables_to_write:
        frame.write.mode("overwrite").saveAsTable(table_name)

    print("Synthetic ERP growth catalog saved to Unity Catalog schema 'erp_demo'.")
