## Configuration

In [0]:
# =============================================================================
# CONFIGURATION - Adjust these values
# =============================================================================

TRAINING_GROUP = "dp_trn_1"  # Databricks group with training participants
CATALOG_PREFIX = "ecommerce_platform"  # Catalog naming: ecommerce_platform_{username}
STORAGE_LOCATION = "abfss://unity-catalog-storage@dbstorage3laxlzjqlvw46.dfs.core.windows.net/938068554870483"
# GitHub raw content base URL
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/Bureyz/DataEngineeringOne/main/dataset"

# Files to download (remote_path, local_path in Volume)
# Structure: demo/main (exploration), demo/ingestion (streaming), workshop (AWLite)
DATASET_FILES = [
    # === DEMO - MAIN (exploration, basic demo) ===
    ("demo/main/customers/customers.csv", "demo/main/customers/customers.csv"),
    ("demo/main/orders/orders_batch.json", "demo/main/orders/orders_batch.json"),
    ("demo/main/orders/stream/orders_stream_001.json", "demo/main/orders/stream/orders_stream_001.json"),
    ("demo/main/orders/stream/orders_stream_002.json", "demo/main/orders/stream/orders_stream_002.json"),
    ("demo/main/orders/stream/orders_stream_003.json", "demo/main/orders/stream/orders_stream_003.json"),
    ("demo/main/products/products.csv", "demo/main/products/products.csv"),
    
    # === DEMO - INGESTION (incremental load, streaming demo) ===
    ("demo/ingestion/customers/customers_extended.csv", "demo/ingestion/customers/customers_extended.csv"),
    ("demo/ingestion/customers/customers_new.csv", "demo/ingestion/customers/customers_new.csv"),
    ("demo/ingestion/customers/customers2.csv", "demo/ingestion/customers/customers2.csv"),
    ("demo/ingestion/orders/stream/orders_stream_004.json", "demo/ingestion/orders/stream/orders_stream_004.json"),
    ("demo/ingestion/orders/stream/orders_stream_005.json", "demo/ingestion/orders/stream/orders_stream_005.json"),
    ("demo/ingestion/orders/stream/orders_stream_006.json", "demo/ingestion/orders/stream/orders_stream_006.json"),
    ("demo/ingestion/orders/stream/orders_stream_007.json", "demo/ingestion/orders/stream/orders_stream_007.json"),
    ("demo/ingestion/orders/stream/orders_stream_008.json", "demo/ingestion/orders/stream/orders_stream_008.json"),
    ("demo/ingestion/orders/stream/orders_stream_009.json", "demo/ingestion/orders/stream/orders_stream_009.json"),
    ("demo/ingestion/orders/stream/orders_stream_010.json", "demo/ingestion/orders/stream/orders_stream_010.json"),
    ("demo/ingestion/orders/stream/orders_stream_011.json", "demo/ingestion/orders/stream/orders_stream_011.json"),
    ("demo/ingestion/orders/stream/orders_stream_012.json", "demo/ingestion/orders/stream/orders_stream_012.json"),
    ("demo/ingestion/orders/stream/orders_stream_013.json", "demo/ingestion/orders/stream/orders_stream_013.json"),
    ("demo/ingestion/orders/stream/orders_stream_014.json", "demo/ingestion/orders/stream/orders_stream_014.json"),
    ("demo/ingestion/orders/stream/orders_stream_015.json", "demo/ingestion/orders/stream/orders_stream_015.json"),
    
    # === WORKSHOP - AdventureWorks Lite (star schema exercises) ===
    ("workshop/Address.csv", "workshop/Address.csv"),
    ("workshop/Customers.csv", "workshop/Customers.csv"),
    ("workshop/Product.csv", "workshop/Product.csv"),
    ("workshop/ProductCategory.csv", "workshop/ProductCategory.csv"),
    ("workshop/SalesOrderDetail.csv", "workshop/SalesOrderDetail.csv"),
    ("workshop/SalesOrderHeader.csv", "workshop/SalesOrderHeader.csv"),
]

# Schema names (Medallion architecture)
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"
DEFAULT_SCHEMA = "default"

# Volume name for datasets
VOLUME_NAME = "datasets"

print(f"Training group: {TRAINING_GROUP}")
print(f"Catalog prefix: {CATALOG_PREFIX}")
print(f"GitHub base: {GITHUB_RAW_BASE}")
print(f"Files to download: {len(DATASET_FILES)}")
print(f"  - Demo main: 6 files")
print(f"  - Demo ingestion: 15 files")
print(f"  - Workshop (AWLite): 6 files")

Training group: dp_trn_1
Catalog prefix: ecommerce_platform
GitHub base: https://raw.githubusercontent.com/Bureyz/DataEngineeringOne/main/dataset
Files to download: 27
  - Demo main: 6 files
  - Demo ingestion: 15 files
  - Workshop (AWLite): 6 files


## Step 1: Get Users from Training Group

We use Databricks REST API to get group members.

In [0]:
import requests
import re

def get_group_members(group_name):
    """
    Get all members of a Databricks group using REST API.
    Returns list of usernames (email addresses).
    """
    context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    host = context.apiUrl().get()
    token = context.apiToken().get()
    
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    
    # Get group ID first
    groups_url = f"{host}/api/2.0/preview/scim/v2/Groups"
    params = {"filter": f'displayName eq "{group_name}"'}
    
    response = requests.get(groups_url, headers=headers, params=params)
    response.raise_for_status()
    
    groups = response.json().get("Resources", [])
    if not groups:
        raise ValueError(f"Group '{group_name}' not found")
    
    group_id = groups[0]["id"]
    
    # Get group members
    group_url = f"{host}/api/2.0/preview/scim/v2/Groups/{group_id}"
    response = requests.get(group_url, headers=headers)
    response.raise_for_status()
    
    members = response.json().get("members", [])
    
    # Get user emails
    user_emails = []
    for member in members:
        if member.get("$ref", "").startswith("Users/"):
            user_id = member["value"]
            user_url = f"{host}/api/2.0/preview/scim/v2/Users/{user_id}"
            user_response = requests.get(user_url, headers=headers)
            user_response.raise_for_status()
            user_data = user_response.json()
            emails = user_data.get("emails", [])
            if emails:
                user_emails.append(emails[0].get("value", ""))
    
    return user_emails

def sanitize_username(email):
    """
    Convert email to safe catalog name suffix.
    Example: jan.kowalski@company.com -> jan_kowalski
    """
    username = email.split('@')[0]
    safe_name = re.sub(r'[^a-zA-Z0-9]', '_', username).lower()
    safe_name = re.sub(r'_+', '_', safe_name)
    safe_name = safe_name.strip('_')
    return safe_name

In [0]:
# Get users from training group
try:
    training_users = get_group_members(TRAINING_GROUP)
    print(f"Found {len(training_users)} users in group '{TRAINING_GROUP}':")
    print("=" * 60)
    
    user_catalog_map = {}
    for email in training_users:
        safe_name = sanitize_username(email)
        
        if safe_name in ["trainer", "krzysztof_burejza"]:
            catalog_name = f"{CATALOG_PREFIX}_training"
        else:
            catalog_name = f"{CATALOG_PREFIX}_{safe_name}"
        user_catalog_map[email] = catalog_name
        print(f"  {email} -> {catalog_name}")
    
    print("=" * 60)
    print(f"Total: {len(user_catalog_map)} catalogs to create")
    
except Exception as e:
    print(f"ERROR: Could not get group members: {e}")
    print(f"Possible issues:")
    print(f" 1. Group '{TRAINING_GROUP}' does not exist")
    print("  2. You don't have permission to read group members")
    print("  3. Group has no members")
    raise

Found 2 users in group 'dp_trn_1':
  bureyz@kzbdev.com -> ecommerce_platform_bureyz
  krzysztof.burejza@outlook.com -> ecommerce_platform_training
Total: 2 catalogs to create


## Step 2: Create Catalogs and Schemas

For each user, we create:
- Catalog: `ecommerce_platform_{username}`
- Schemas: `bronze`, `silver`, `gold`, `default`
- Volume: `datasets` in `default` schema

In [0]:
def create_user_environment(email, catalog_name,storage_location):
    """
    Create catalog, schemas, and volume for a user.
    """
    results = {"catalog": False, "schemas": [], "volume": False, "owner": False}
    
    try:
        # Create catalog
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name} MANAGED LOCATION '{storage_location}/{catalog_name}' ")
        results["catalog"] = True
        
        # Create schemas
        for schema in [DEFAULT_SCHEMA, BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA]:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}")
            results["schemas"].append(schema)
        
        # Create volume for datasets
        spark.sql(f"""
            CREATE VOLUME IF NOT EXISTS {catalog_name}.{DEFAULT_SCHEMA}.{VOLUME_NAME}
            COMMENT 'Training datasets volume'
        """)
        results["volume"] = True
        
        # Set owner to user
        spark.sql(f"ALTER CATALOG {catalog_name} SET OWNER TO `{email}`")
        results["owner"] = True
        
    except Exception as e:
        results["error"] = str(e)
    
    return results

In [0]:
# Create environments for all users
creation_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"Processing: {email}")
    result = create_user_environment(email, catalog_name,STORAGE_LOCATION)
    creation_results[email] = result
    
    if "error" in result:
        print(f"  ERROR: {result['error']}")
    else:
        print(f"  Catalog: {catalog_name}")
        print(f"  Schemas: {', '.join(result['schemas'])}")
        print(f"  Volume: {result['volume']}")
        print(f"  Owner set: {result['owner']}")

successful = sum(1 for r in creation_results.values() if "error" not in r)
print(f"Successfully created: {successful}/{len(creation_results)} environments")

Processing: bureyz@kzbdev.com
  Catalog: ecommerce_platform_bureyz
  Schemas: default, bronze, silver, gold
  Volume: True
  Owner set: True
Processing: krzysztof.burejza@outlook.com
  Catalog: ecommerce_platform_training
  Schemas: default, bronze, silver, gold
  Volume: True
  Owner set: True
Successfully created: 2/2 environments


## Step 3: Download Dataset from GitHub

Download files directly from GitHub raw content to each user's Volume.

In [0]:
import urllib.request
import tempfile
import os

def download_dataset_to_volume(catalog_name):
    """
    Download dataset files directly from GitHub to user's Volume.
    Creates subdirectory structure.
    - CSV/JSON files: saved as-is (text)
    - products.csv: converted to Parquet for demo purposes
    """
    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"
    results = {"success": [], "failed": []}
    
    # Track created directories to avoid redundant checks
    created_dirs = set()
    
    for remote_path, local_path in DATASET_FILES:
        url = f"{GITHUB_RAW_BASE}/{remote_path}"
        dest_path = f"{volume_path}/{local_path}"
        
        # Check if this is products.csv that should be converted to Parquet
        convert_to_parquet = "products/products.csv" in remote_path and "demo/main" in remote_path
        
        try:
            # Create parent directories if needed
            parent_dir = "/".join(dest_path.split("/")[:-1])
            if parent_dir not in created_dirs:
                try:
                    dbutils.fs.mkdirs(parent_dir)
                except:
                    pass  # Directory might already exist
                created_dirs.add(parent_dir)
            
            # Download file content
            response = urllib.request.urlopen(url)
            content = response.read().decode('utf-8')
            
            if convert_to_parquet:
                # Save CSV temporarily, read with Spark, write as Parquet
                dbutils.fs.put(dest_path, content, overwrite=True)
                
                # Read CSV and save as Parquet
                parquet_path = dest_path.replace(".csv", ".parquet")
                df = spark.read.option("header", "true").option("inferSchema", "true").csv(dest_path)
                df.write.mode("overwrite").parquet(parquet_path)
                
                # Keep both CSV and Parquet for flexibility
                results["success"].append(local_path)
                results["success"].append(local_path.replace(".csv", ".parquet"))
            else:
                # Save as text file (CSV, JSON)
                dbutils.fs.put(dest_path, content, overwrite=True)
                results["success"].append(local_path)
            
        except Exception as e:
            results["failed"].append((local_path, str(e)))
    
    return results

In [0]:
# Download dataset to each user's Volume
print("Downloading dataset to user Volumes...")
print("=" * 60)

download_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"\nDownloading to: {catalog_name}")
    result = download_dataset_to_volume(catalog_name)
    download_results[email] = result
    
    if result["success"]:
        print(f"  ✓ Downloaded: {len(result['success'])} files")
    if result["failed"]:
        print(f"  ✗ Failed: {len(result['failed'])} files")
        for file, error in result["failed"]:
            print(f"    - {file}: {error}")

print("\n" + "=" * 60)
successful = sum(1 for r in download_results.values() if not r["failed"])
print(f"Fully completed: {successful}/{len(download_results)} volumes")

Downloading dataset to user Volumes...

Downloading to: ecommerce_platform_bureyz
  ✗ Failed: 27 files
    - demo/main/customers/customers.csv: HTTP Error 404: Not Found
    - demo/main/orders/orders_batch.json: HTTP Error 404: Not Found
    - demo/main/orders/stream/orders_stream_001.json: HTTP Error 404: Not Found
    - demo/main/orders/stream/orders_stream_002.json: HTTP Error 404: Not Found
    - demo/main/orders/stream/orders_stream_003.json: HTTP Error 404: Not Found
    - demo/main/products/products.csv: HTTP Error 404: Not Found
    - demo/ingestion/customers/customers_extended.csv: HTTP Error 404: Not Found
    - demo/ingestion/customers/customers_new.csv: HTTP Error 404: Not Found
    - demo/ingestion/customers/customers2.csv: HTTP Error 404: Not Found
    - demo/ingestion/orders/stream/orders_stream_004.json: HTTP Error 404: Not Found
    - demo/ingestion/orders/stream/orders_stream_005.json: HTTP Error 404: Not Found
    - demo/ingestion/orders/stream/orders_stream_006.json

## Step 4: Verify Setup

Verify all environments are ready for training.

In [0]:
print("=" * 60)
print("TRAINING ENVIRONMENT SUMMARY")
print("=" * 60)
print()

for email, catalog_name in user_catalog_map.items():
    print(f"User: {email}")
    print(f"  Catalog: {catalog_name}")
    print(f"  Schemas: {BRONZE_SCHEMA}, {SILVER_SCHEMA}, {GOLD_SCHEMA}, {DEFAULT_SCHEMA}")
    
    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"
    try:
        # Check main folders
        folders = dbutils.fs.ls(volume_path)
        print(f"  Volume root: {[f.name for f in folders]}")
        
        # Check demo/main structure
        demo_main_path = f"{volume_path}/demo/main"
        try:
            demo_main = dbutils.fs.ls(demo_main_path)
            print(f"    demo/main: {[f.name for f in demo_main]}")
        except:
            print(f"    demo/main: NOT FOUND")
        
        # Check workshop structure
        workshop_path = f"{volume_path}/workshop"
        try:
            workshop = dbutils.fs.ls(workshop_path)
            print(f"    workshop: {len(workshop)} files (AWLite)")
        except:
            print(f"    workshop: NOT FOUND")
            
    except Exception as e:
        print(f"  Volume: ERROR - {e}")
    
    print()

print("=" * 60)
print("PRE-CONFIGURATION COMPLETE")
print("=" * 60)
print()
print("Dataset structure in each Volume:")
print("  /demo/main/       - Basic exploration & demo")
print("  /demo/ingestion/  - Incremental load & streaming")
print("  /workshop/        - AdventureWorks Lite (star schema)")
print()
print("Participants can now run 00_setup.ipynb to validate their environment.")

TRAINING ENVIRONMENT SUMMARY

User: bureyz@kzbdev.com
  Catalog: ecommerce_platform_bureyz
  Schemas: bronze, silver, gold, default
  Volume: ERROR - An error occurred while calling o513.ls.
: com.databricks.sql.managedcatalog.acl.UnauthorizedAccessException: PERMISSION_DENIED: User does not have USE CATALOG on Catalog 'ecommerce_platform_bureyz'.
	at com.databricks.sql.managedcatalog.client.ErrorDetailsHandlerImpl.wrapServiceException(ErrorDetailsHandler.scala:119)
	at com.databricks.sql.managedcatalog.client.ErrorDetailsHandlerImpl.wrapServiceException$(ErrorDetailsHandler.scala:88)
	at com.databricks.managedcatalog.ManagedCatalogClientImpl.wrapServiceException(ManagedCatalogClientImpl.scala:44)
	at com.databricks.sql.managedcatalog.client.ManagedCatalogClientImpl.recordAndWrapExceptionBase(ManagedCatalogClientImpl.scala:7912)
	at com.databricks.sql.managedcatalog.client.ManagedCatalogClientImpl.recordAndWrapException(ManagedCatalogClientImpl.scala:7896)
	at com.databricks.sql.manage

## Cleanup (After Training)

Run this section to remove all training catalogs after the training is complete.

In [0]:
# =============================================================================
# CLEANUP - Remove all training catalogs
# =============================================================================
# WARNING: This will DELETE all training data!

# Uncomment to run:
# for email, catalog_name in user_catalog_map.items():
#     try:
#         spark.sql(f"DROP CATALOG IF EXISTS {catalog_name} CASCADE")
#         print(f"Dropped: {catalog_name}")
#     except Exception as e:
#         print(f"Failed to drop {catalog_name}: {e}")
# 
# print("\nCleanup complete.")