In [0]:
import os
import pandas as pd

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Destination directory for every exported file (downloadable).
# The trailing slash is intentional: it keeps paths built as
# target_dir + file_name valid.
target_dir = "/dbfs/FileStore/mock_data/"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(target_dir, exist_ok=True)

# Define sources and how to read/export each
def export_csv_from_dbfs(dbfs_path, file_name, *, dbfs_root="/dbfs", dest_dir=None):
    """Export a CSV found under the DBFS mount to the export directory, values intact.

    Args:
        dbfs_path: Source path relative to ``dbfs_root``, with a leading slash
            (e.g. "/FileStore/some_dir/Inventory.csv").
        file_name: Name of the file to create inside the destination directory.
        dbfs_root: Local prefix prepended to ``dbfs_path``. Defaults to "/dbfs".
        dest_dir: Directory to write into. Defaults to the module-level
            ``target_dir``.

    Returns:
        None. A status line is printed; a missing source is reported and
        skipped instead of raising.
    """
    if dest_dir is None:
        dest_dir = target_dir

    full_path = f"{dbfs_root}{dbfs_path}"
    if not os.path.exists(full_path):
        print(f"⚠️  Not found: {dbfs_path}")
        return

    # Read every cell as plain text and disable NA detection. With pandas'
    # default type inference the round trip rewrites data: "007" becomes 7,
    # an integer column containing blanks becomes "1.0", and literal "NA"
    # strings are written back as empty fields.
    df = pd.read_csv(full_path, dtype=str, keep_default_na=False)
    df.to_csv(os.path.join(dest_dir, file_name), index=False)
    print(f"✅ Exported: {file_name}")

def export_json_from_dbfs(dbfs_path, file_name, *, dbfs_root="/dbfs", dest_dir=None):
    """Export a JSON file found under the DBFS mount as an indented list of records.

    Args:
        dbfs_path: Source path relative to ``dbfs_root``, with a leading slash.
        file_name: Name of the file to create inside the destination directory.
        dbfs_root: Local prefix prepended to ``dbfs_path``. Defaults to "/dbfs".
        dest_dir: Directory to write into. Defaults to the module-level
            ``target_dir``.

    Returns:
        None. A status line is printed; a missing source is reported and
        skipped instead of raising.
    """
    if dest_dir is None:
        dest_dir = target_dir

    full_path = f"{dbfs_root}{dbfs_path}"
    if not os.path.exists(full_path):
        print(f"⚠️  Not found: {dbfs_path}")
        return

    # Turn off pandas' automatic conversions. By default read_json coerces
    # numeric-looking strings ("00123" -> 123) and parses date-like keys
    # (e.g. names ending in "_at"/"_time") into datetimes, which to_json then
    # emits as epoch milliseconds instead of the original text.
    df = pd.read_json(full_path, dtype=False, convert_dates=False)
    df.to_json(os.path.join(dest_dir, file_name), orient="records", indent=2)
    print(f"✅ Exported: {file_name}")

def export_from_delta(delta_path, file_name, *, max_rows=100, dest_dir=None):
    """Export a sample of a Delta table to a CSV file in the export directory.

    Args:
        delta_path: Location of the Delta table, passed unchanged to Spark.
        file_name: Name of the CSV file to create inside the destination
            directory.
        max_rows: Maximum number of rows to export (default 100). Pass None to
            export the whole table; the rows are collected to the driver via
            ``toPandas()``, so only do that for small tables.
        dest_dir: Directory to write into. Defaults to the module-level
            ``target_dir``.

    Returns:
        None. A status line is printed; a table that cannot be resolved is
        reported and skipped, matching the CSV/JSON export helpers.
    """
    # Local import so this helper brings its own dependency with it.
    from pyspark.sql.utils import AnalysisException

    if dest_dir is None:
        dest_dir = target_dir

    try:
        df = spark.read.format("delta").load(delta_path)
    except AnalysisException:
        # NOTE(review): expected to cover both a missing path and a path that
        # is not a Delta table; confirm against the cluster's Delta version.
        print(f"⚠️  Not found or not a Delta table: {delta_path}")
        return

    # limit() without an ordering does not guarantee which rows are returned.
    sample = df if max_rows is None else df.limit(max_rows)
    sample.toPandas().to_csv(os.path.join(dest_dir, file_name), index=False)
    print(f"✅ Exported from Delta: {file_name}")


# Export all mock files
# (source path on DBFS, exported file name) pairs for the CSV sources.
csv_exports = [
    ("/FileStore/pipeline1_batch_delta/moc_source_a/Inventory.csv", "inventory.csv"),
    ("/FileStore/pipeline1_batch_delta/moc_source_b/Shipments.csv", "shipments.csv"),
    ("/FileStore/pipeline1_batch_delta/moc_source_c/Vendors.csv", "vendors.csv"),
    ("/mnt/raw-ingest/finance_invoice_data.csv", "finance_invoice_data.csv"),
]
for source_path, export_name in csv_exports:
    export_csv_from_dbfs(source_path, export_name)

export_json_from_dbfs("/mnt/external-ingest/web_form_submissions.json", "web_forms.json")

export_from_delta("/mnt/delta/bronze/vendor_compliance", "vendor_compliance.csv")

# Report the configured destination rather than a second hard-coded copy of it.
# NOTE: the CSV/JSON helpers only print a warning for a missing source, so
# this line does not guarantee that every file above was actually written.
print(f"\n📁 All mock data files saved to: {target_dir}")

In [0]:
# Not Python: a bare shell command in a code cell is a SyntaxError on Run All.
# To download an exported file, run the Databricks CLI from a terminal on your
# own machine (the destination is a path relative to where you run it):
#
#   databricks fs cp dbfs:/FileStore/mock_data/finance_invoice_data.csv ./mock_data/finance_invoice_data.csv
