## Configuration

## Cleanup Existing Catalogs (Optional)

Run this section **before** creating new environments if you need to remove existing ecommerce catalogs.

In [0]:
%skip
# =============================================================================
# CLEANUP EXISTING ECOMMERCE CATALOGS
# =============================================================================
# Lists every catalog whose name starts with 'ecommerce_platform_' so the list
# can be reviewed before the next cell drops them.
# WARNING: The follow-up cell is a destructive operation!

CATALOG_PREFIX_TO_CLEANUP = "ecommerce_platform"

# Match on the prefix plus the "_" separator. Matching the bare prefix would also
# select look-alike catalogs (e.g. "ecommerce_platform2") and queue them for
# deletion, which is not what the description above promises.
cleanup_match_prefix = f"{CATALOG_PREFIX_TO_CLEANUP}_"

# Find all catalogs matching the prefix
catalogs_df = spark.sql("SHOW CATALOGS")
ecommerce_catalogs = [
    row.catalog
    for row in catalogs_df.collect()
    if row.catalog.startswith(cleanup_match_prefix)
]

print(f"Found {len(ecommerce_catalogs)} catalogs to remove:")
for cat in ecommerce_catalogs:
    print(f"  - {cat}")

print("Run the next cell to DROP these catalogs!")

In [0]:
%skip
# =============================================================================
# DROP ALL FOUND ECOMMERCE CATALOGS
# =============================================================================
# Drops every catalog collected in `ecommerce_catalogs` by the previous cell.
# NOTE(review): this cell is presumably kept from running by the %skip magic on
# its first line; remove that magic (nothing below is commented out) to execute.

dropped = []
failed = []

for catalog_name in ecommerce_catalogs:
    # Backtick-quote the identifier (doubling any embedded backtick) so names
    # containing characters such as "-" still form a valid DROP statement.
    quoted_name = "`" + catalog_name.replace("`", "``") + "`"
    try:
        spark.sql(f"DROP CATALOG IF EXISTS {quoted_name} CASCADE")
        dropped.append(catalog_name)
        print(f"Dropped: {catalog_name}")
    except Exception as e:
        # Best effort: remember the failure and continue with the remaining catalogs.
        failed.append((catalog_name, str(e)))
        print(f"Failed to drop {catalog_name}: {e}")


print(f"Cleanup complete: {len(dropped)} dropped, {len(failed)} failed")


In [0]:
%sql
-- Drops the trainer catalog and everything inside it (CASCADE).
-- NOTE(review): unlike the two cleanup cells above, this cell carries no %skip
-- guard, so it presumably executes on "Run All" - confirm that is intended.
DROP CATALOG IF EXISTS ecommerce_platform_trainer CASCADE

In [0]:
# =============================================================================
# CONFIGURATION - Adjust these values
# =============================================================================

# Databricks group whose members receive a training environment
TRAINING_GROUP = "admins"
# Catalog naming: ecommerce_platform_{username}
CATALOG_PREFIX = "ecommerce_platform"
# Storage base URL for the catalogs -> replace with the ADLS URL of your storage account
STORAGE_LOCATION = "abfss://unity-catalog-storage@dbstoragelpmsv3hzon5ee.dfs.core.windows.net/7405610202063360"
# GitHub raw content base URL
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/Bureyz/DatabricksDataEngineerOne/main/dataset"

# Files to download, as (remote_path, local_path in Volume) pairs.
# Structure: demo/main (exploration), demo/ingestion (streaming), workshop (AWLite)
DATASET_FILES = [
    # === DEMO - MAIN (exploration, basic demo) ===
    # Remote path is the Volume path prefixed with "demo/main/".
    *(
        (f"demo/main/{path}", path)
        for path in (
            "customers/customers.csv",
            "customers/customers_extended.xlsx",
            "orders/orders_batch.json",
            "orders/stream/orders_stream_001.json",
            "orders/stream/orders_stream_002.json",
            "orders/stream/orders_stream_003.json",
            "products/products.csv",
        )
    ),
    # === DEMO - INGESTION (incremental load, streaming demo) ===
    # Remote path and Volume path are identical.
    *(
        (path, path)
        for path in (
            "demo/ingestion/customers/customers_extended.csv",
            "demo/ingestion/customers/customers_new.csv",
            "demo/ingestion/orders/stream/orders_stream_004.json",
            "demo/ingestion/orders/stream/orders_stream_005.json",
            "demo/ingestion/orders/stream/orders_stream_006.json",
        )
    ),
    # === WORKSHOP - AdventureWorks Lite (star schema exercises) ===
    # Every table is listed twice: flat under workshop/, then once more in its
    # own folder under workshop/Lakeflow/.
    *(
        (f"workshop/main/{table}.csv", layout.format(table=table))
        for layout in ("workshop/{table}.csv", "workshop/Lakeflow/{table}/{table}.csv")
        for table in (
            "Customers",
            "Product",
            "ProductCategory",
            "SalesOrderDetail",
            "SalesOrderHeader",
        )
    ),
]

# Schema names (Medallion architecture)
BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA = "bronze", "silver", "gold"
DEFAULT_SCHEMA = "default"

# Volume name for datasets
VOLUME_NAME = "datasets"

# Echo the effective settings so the operator can double-check them.
for summary_line in (
    f"Training group: {TRAINING_GROUP}",
    f"Catalog prefix: {CATALOG_PREFIX}",
    f"GitHub base: {GITHUB_RAW_BASE}",
    f"Files to download: {len(DATASET_FILES)}",
):
    print(summary_line)
del summary_line

## Step 1: Get Users from Training Group

We use the Databricks SCIM REST API to look up the members of the training group.

In [0]:
import requests
import re

def get_group_members(group_name, timeout=30):
    """
    Get all user members of a Databricks group using the SCIM REST API.

    The API host and token are read from the current notebook context, so this
    only works when executed inside a Databricks notebook.

    Args:
        group_name: Display name of the group to look up.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the cell forever.

    Returns:
        List of email addresses, one per user member that has a non-empty
        address. The entry flagged as primary is preferred; otherwise the
        first listed address is used. Non-user members are skipped.

    Raises:
        ValueError: If no group with the given display name exists.
        requests.HTTPError: If any API call returns an error status.
    """
    context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    host = context.apiUrl().get()
    token = context.apiToken().get()

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    scim_base = f"{host}/api/2.0/preview/scim/v2"

    # Resolve the group's display name to its ID first
    response = requests.get(
        f"{scim_base}/Groups",
        headers=headers,
        params={"filter": f'displayName eq "{group_name}"'},
        timeout=timeout,
    )
    response.raise_for_status()

    groups = response.json().get("Resources") or []
    if not groups:
        raise ValueError(f"Group '{group_name}' not found")

    group_id = groups[0]["id"]

    # Fetch the group itself to obtain its member references
    response = requests.get(f"{scim_base}/Groups/{group_id}", headers=headers, timeout=timeout)
    response.raise_for_status()

    members = response.json().get("members") or []

    # Resolve each user member to an email address
    user_emails = []
    for member in members:
        # Only "Users/..." references are users; anything else is skipped.
        if not member.get("$ref", "").startswith("Users/"):
            continue

        user_response = requests.get(
            f"{scim_base}/Users/{member['value']}", headers=headers, timeout=timeout
        )
        user_response.raise_for_status()
        emails = user_response.json().get("emails") or []
        if not emails:
            continue

        # Prefer the address marked primary over whatever happens to be listed first.
        chosen = next((entry for entry in emails if entry.get("primary")), emails[0])
        address = chosen.get("value", "")
        if address:
            # Blank addresses are dropped; they cannot be mapped to a catalog or granted to.
            user_emails.append(address)

    return user_emails

def sanitize_username(email):
    """
    Derive a catalog-safe name suffix from an email address (or bare username).

    Only the part before the first '@' is used. Each run of characters other
    than ASCII letters and digits becomes a single underscore, the result is
    lowercased, and leading/trailing digits and underscores are trimmed.
    Falls back to "user" when nothing is left.

    Example: jan.kowalski@company.com -> jan_kowalski
    Example: krzysztof_burejza_3 -> krzysztof_burejza
    """
    local_part = email.partition('@')[0]

    # One pass: a whole run of disallowed characters collapses into one "_".
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', local_part).lower()

    # Trim digits and underscores from both ends.
    trimmed = collapsed.strip('0123456789_')

    return trimmed if trimmed else "user"

In [0]:
# Get users from the training group and map each one to a catalog name
try:
    training_users = get_group_members(TRAINING_GROUP)
    print(f"Found {len(training_users)} users in group '{TRAINING_GROUP}':")
    print("=" * 60)

    user_catalog_map = {}
    for email in training_users:
        safe_name = sanitize_username(email)

        # These sanitized names are routed to the shared "<prefix>_trainer" catalog.
        if safe_name in ["trainer", "krzysztof_burejza"]:
            catalog_name = f"{CATALOG_PREFIX}_trainer"
        else:
            catalog_name = f"{CATALOG_PREFIX}_{safe_name}"
        user_catalog_map[email] = catalog_name
        print(f"  {email} -> {catalog_name}")

    # Several emails can resolve to the same catalog (trainer aliases, or names
    # that differ only by a numeric suffix that sanitize_username strips).
    # Group by catalog so the total reflects distinct catalogs, not users.
    catalog_users = {}
    for email, catalog_name in user_catalog_map.items():
        catalog_users.setdefault(catalog_name, []).append(email)

    print("=" * 60)
    print(f"Total: {len(catalog_users)} catalogs to create")

    # Make shared catalogs visible instead of letting users silently share one.
    for catalog_name, emails in catalog_users.items():
        if len(emails) > 1:
            print(f"WARNING: {catalog_name} is shared by {len(emails)} users: {', '.join(emails)}")

except Exception as e:
    print(f"ERROR: Could not get group members: {e}")
    print("Possible issues:")
    print()
    print(f" 1. Group '{TRAINING_GROUP}' does not exist")
    print("  2. You don't have permission to read group members")
    print("  3. Group has no members")
    # Re-raise so the notebook run stops instead of continuing without a user map.
    raise

## Step 2: Create Catalogs and Schemas

For each user, we create:
- Catalog: `ecommerce_platform_{username}`
- Schemas: `bronze`, `silver`, `gold`, `default`
- Volume: `datasets` in `default` schema

In [0]:
def create_user_environment(email, catalog_name, storage_location):
    """
    Create catalog, schemas, and volume for a user, then grant access.

    Every statement uses IF NOT EXISTS, so the function can be re-run for a
    catalog that already exists.

    Args:
        email: Principal that receives ALL PRIVILEGES on the catalog.
        catalog_name: Name of the catalog to create.
        storage_location: Base storage URL; the catalog's managed location is
            "<storage_location>/<catalog_name>". A trailing "/" is tolerated.

    Returns:
        Dict with progress flags: "catalog" (bool), "schemas" (list of schema
        names created so far), "volume" (bool) and "owner" (bool, True once the
        grants succeeded). If any step fails, the exception text is stored
        under "error" and the flags show how far the setup got; the exception
        itself is not propagated.
    """
    def quote(identifier):
        # Backtick-delimit an identifier, doubling embedded backticks, so names
        # with special characters cannot break or alter the statement.
        return "`" + str(identifier).replace("`", "``") + "`"

    results = {"catalog": False, "schemas": [], "volume": False, "owner": False}

    try:
        # Get trainer (current user)
        trainer_email = spark.sql("SELECT current_user()").first()[0]

        catalog_ident = quote(catalog_name)
        # Strip a trailing slash so the location never contains "//".
        managed_location = f"{storage_location.rstrip('/')}/{catalog_name}"

        # Create catalog
        spark.sql(
            f"CREATE CATALOG IF NOT EXISTS {catalog_ident} "
            f"MANAGED LOCATION '{managed_location}'"
        )
        results["catalog"] = True

        # Create schemas
        for schema in [DEFAULT_SCHEMA, BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA]:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_ident}.{quote(schema)}")
            results["schemas"].append(schema)

        # Create volume for datasets
        spark.sql(f"""
            CREATE VOLUME IF NOT EXISTS {catalog_ident}.{quote(DEFAULT_SCHEMA)}.{quote(VOLUME_NAME)}
            COMMENT 'Training datasets volume'
        """)
        results["volume"] = True

        # Grant permissions to the user and, when different, to the trainer
        spark.sql(f"GRANT ALL PRIVILEGES ON CATALOG {catalog_ident} TO {quote(email)}")
        if trainer_email != email:
            spark.sql(f"GRANT ALL PRIVILEGES ON CATALOG {catalog_ident} TO {quote(trainer_email)}")

        results["owner"] = True

    except Exception as e:
        # Deliberate best effort: report the failure to the caller via the
        # result dict so one broken environment does not abort the whole batch.
        results["error"] = str(e)

    return results

In [0]:
# Provision an environment for every user in user_catalog_map and report each outcome
creation_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"Processing: {email}")
    result = create_user_environment(email, catalog_name, STORAGE_LOCATION)
    creation_results[email] = result

    # A failed setup carries an "error" entry; report it and move on to the next user.
    if "error" in result:
        print(f"  ERROR: {result['error']}")
        continue

    for status_line in (
        f"  Catalog: {catalog_name}",
        f"  Schemas: {', '.join(result['schemas'])}",
        f"  Volume: {result['volume']}",
        f"  PRIVILEGES set: {result['owner']}",
    ):
        print(status_line)

# An environment counts as successful when its result has no "error" entry.
successful = len([r for r in creation_results.values() if "error" not in r])
print(f"Successfully created: {successful}/{len(creation_results)} environments")

## Step 3: Copy Dataset from Repo
Copy files directly from the repository folder (`dataset/`) to each user's Volume.


In [0]:
import shutil
import os

def copy_dataset_to_volume(catalog_name):
    """
    Copy the repo's `dataset` folder into the user's Volume.

    The source is `<parent of the current working directory>/dataset`; the
    target is `/Volumes/<catalog_name>/<DEFAULT_SCHEMA>/<VOLUME_NAME>`.

    Returns a dict with two lists: "success" (one message when the copy
    completed) and "failed" (("dataset folder", error text) when the source is
    missing or the copy raised). Errors are reported, never raised.
    """
    outcome = {"success": [], "failed": []}

    # The repo root is assumed to sit one level above the working directory.
    source_dataset_path = os.path.join(os.path.abspath(".."), "dataset")
    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"

    print(f"Copying from: {source_dataset_path}")
    print(f"To: {volume_path}")

    try:
        if os.path.exists(source_dataset_path):
            # dirs_exist_ok=True lets the copy merge into an existing Volume directory.
            shutil.copytree(source_dataset_path, volume_path, dirs_exist_ok=True)
            outcome["success"].append("All files copied successfully")
        else:
            raise Exception(f"Source dataset not found at {source_dataset_path}")
    except Exception as exc:
        outcome["failed"].append(("dataset folder", str(exc)))

    return outcome

In [0]:
# Populate every user's Volume with the dataset and report the outcome per catalog
print("Copying dataset to user Volumes...")
print("=" * 60)

copy_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"Copying to: {catalog_name}")
    result = copy_dataset_to_volume(catalog_name)
    copy_results[email] = result

    # Nothing in "failed" means the whole copy went through.
    if not result["failed"]:
        print("  Success!")
        continue

    print(f"Failed: {len(result['failed'])} errors")
    for file, error in result["failed"]:
        print(f"    - {file}: {error}")


# A volume is fully completed when its result recorded no failures.
successful = len([r for r in copy_results.values() if not r["failed"]])
print(f"Fully completed: {successful}/{len(copy_results)} volumes")

## Step 4: Verify Setup

Verify all environments are ready for training.

In [0]:
# Print a per-user summary and spot-check the contents of each dataset Volume.
print("=" * 60)
print("TRAINING ENVIRONMENT SUMMARY")
print("=" * 60)
print()

for email, catalog_name in user_catalog_map.items():
    print(f"User: {email}")
    print(f"  Catalog: {catalog_name}")
    print(f"  Schemas: {BRONZE_SCHEMA}, {SILVER_SCHEMA}, {GOLD_SCHEMA}, {DEFAULT_SCHEMA}")

    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"
    try:
        # Check main folders
        folders = dbutils.fs.ls(volume_path)
        print(f"  Volume root: {[f.name for f in folders]}")

        # Check demo/main structure.
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel is not misreported as a missing folder.
        demo_main_path = f"{volume_path}/demo/main"
        try:
            demo_main = dbutils.fs.ls(demo_main_path)
            print(f"    demo/main: {[f.name for f in demo_main]}")
        except Exception:
            print("    demo/main: NOT FOUND")

        # Check workshop structure
        workshop_path = f"{volume_path}/workshop"
        try:
            workshop = dbutils.fs.ls(workshop_path)
            print(f"    workshop: {len(workshop)} files (AWLite)")
        except Exception:
            print("    workshop: NOT FOUND")

    except Exception as e:
        # The Volume root itself could not be listed.
        print(f"  Volume: ERROR - {e}")

    print()


print("Dataset structure in each Volume:")
print("  /demo/main/       - Basic exploration & demo")
print("  /demo/ingestion/  - Incremental load & streaming")
print("  /workshop/        - AdventureWorks Lite (star schema)")
print()
print("Participants can now run 00_setup.ipynb to validate their environment.")

## Cleanup (After Training)

Run this section to remove all training catalogs after the training is complete.

In [0]:
# =============================================================================
# CLEANUP - Remove all training catalogs
# =============================================================================
# WARNING: This will DELETE all training data!

# Uncomment to run (requires `user_catalog_map` from Step 1 to be defined in this session):
# for email, catalog_name in user_catalog_map.items():
#     try:
#         spark.sql(f"DROP CATALOG IF EXISTS {catalog_name} CASCADE")
#         print(f"Dropped: {catalog_name}")
#     except Exception as e:
#         print(f"Failed to drop {catalog_name}: {e}")
# 
# print("\nCleanup complete.")