## Configuration

In [None]:
# =============================================================================
# CONFIGURATION - Adjust these values
# =============================================================================

# Databricks group whose members are the training participants.
TRAINING_GROUP = "dp_trn_1"
# Each participant gets a catalog named "<CATALOG_PREFIX>_<username>".
CATALOG_PREFIX = "ecommerce_platform"

# Base URL of the raw dataset files on GitHub.
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/Bureyz/DataEngineeringOne/main/dataset"

# Files to download, as (remote_path, local_name) pairs.
# remote_path is relative to GITHUB_RAW_BASE; local_name is relative to the
# root of the user's volume.
DATASET_FILES = [
    # Customer CSVs keep the same relative path on both sides.
    *(
        (f"customers/{stem}.csv", f"customers/{stem}.csv")
        for stem in ("customers", "customers2", "customers_extended", "customers_new")
    ),
    # Product CSVs live under products/csv/ remotely but directly under
    # products/ in the volume.
    *(
        (f"products/csv/{stem}.csv", f"products/{stem}.csv")
        for stem in ("products", "products_new", "products_premium", "products_updated")
    ),
    ("orders/orders_batch.json", "orders/orders_batch.json"),
]

# Schema names (Medallion architecture) plus the default schema.
DEFAULT_SCHEMA = "default"
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# Name of the volume (created in DEFAULT_SCHEMA) that holds the datasets.
VOLUME_NAME = "datasets"

# Echo the effective configuration, one setting per line.
print(
    f"Training group: {TRAINING_GROUP}",
    f"Catalog prefix: {CATALOG_PREFIX}",
    f"GitHub base: {GITHUB_RAW_BASE}",
    f"Files to download: {len(DATASET_FILES)}",
    sep="\n",
)

## Step 1: Get Users from Training Group

We use the Databricks SCIM REST API to look up the training group and collect its members' email addresses.

In [None]:
import requests
import re

def get_group_members(group_name, timeout=30):
    """
    Get all user members of a Databricks group using the SCIM REST API.

    The workspace URL and API token are taken from the current notebook
    context. The group is looked up by display name, its member list is
    fetched, and each member whose "$ref" starts with "Users/" is resolved
    to an email address with one additional request per user.

    Args:
        group_name: Display name of the group to look up.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        List of email addresses (strings). For each user the email flagged
        as primary is used, falling back to the first listed email. Users
        without a non-empty email are skipped.

    Raises:
        ValueError: If no group with the given display name is found.
        requests.HTTPError: If any API call returns an error status.
    """
    context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    host = context.apiUrl().get()
    token = context.apiToken().get()

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    scim_base = f"{host}/api/2.0/preview/scim/v2"

    # Get group ID first
    response = requests.get(
        f"{scim_base}/Groups",
        headers=headers,
        params={"filter": f'displayName eq "{group_name}"'},
        timeout=timeout,
    )
    response.raise_for_status()

    groups = response.json().get("Resources", [])
    if not groups:
        raise ValueError(f"Group '{group_name}' not found")

    group_id = groups[0]["id"]

    # Get group members
    response = requests.get(
        f"{scim_base}/Groups/{group_id}",
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()

    members = response.json().get("members", [])

    # Get user emails
    user_emails = []
    for member in members:
        # Only user members are resolved; other member kinds are ignored.
        if not (member.get("$ref") or "").startswith("Users/"):
            continue

        user_response = requests.get(
            f"{scim_base}/Users/{member['value']}",
            headers=headers,
            timeout=timeout,
        )
        user_response.raise_for_status()

        emails = user_response.json().get("emails") or []
        if not emails:
            continue

        # Prefer the entry flagged as primary; otherwise use the first one.
        chosen = next((entry for entry in emails if entry.get("primary")), emails[0])
        address = chosen.get("value") or ""
        # An empty address would later produce a catalog name with no user part.
        if address:
            user_emails.append(address)

    return user_emails

def sanitize_username(email):
    """
    Turn an email address into a suffix that is safe to use in a catalog name.

    Only the part before the first '@' is used. Every run of characters
    other than ASCII letters and digits becomes a single underscore,
    leading and trailing underscores are dropped, and the result is
    lowercased.

    Example: jan.kowalski@company.com -> jan_kowalski
    """
    local_part, _, _ = email.partition('@')
    # One pass: a run of non-alphanumerics (underscores included) -> one "_".
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', local_part)
    return collapsed.strip('_').lower()

In [None]:
# Resolve the training group's members and derive one catalog name per user.
try:
    training_users = get_group_members(TRAINING_GROUP)
    print(f"Found {len(training_users)} users in group '{TRAINING_GROUP}':")
    print("=" * 60)

    # Maps each member's email to "<CATALOG_PREFIX>_<sanitized username>".
    user_catalog_map = {}
    for email in training_users:
        user_catalog_map[email] = f"{CATALOG_PREFIX}_{sanitize_username(email)}"
        print(f"  {email} -> {user_catalog_map[email]}")

    print("=" * 60)
    print(f"Total: {len(user_catalog_map)} catalogs to create")

except Exception as e:
    # Show the likely causes, then re-raise so the notebook run stops here
    # instead of continuing without a user list.
    print(f"ERROR: Could not get group members: {e}")
    print(
        "\nPossible issues:",
        f"  1. Group '{TRAINING_GROUP}' does not exist",
        "  2. You don't have permission to read group members",
        "  3. Group has no members",
        sep="\n",
    )
    raise

## Step 2: Create Catalogs and Schemas

For each user, we create:
- Catalog: `ecommerce_platform_{username}`
- Schemas: `bronze`, `silver`, `gold`, `default`
- Volume: `datasets` in `default` schema

Finally, ownership of the catalog is transferred to the user.

In [None]:
def create_user_environment(email, catalog_name):
    """
    Create catalog, schemas, and volume for a user, then make the user owner.

    Steps run in order: create the catalog, create the default/bronze/
    silver/gold schemas, create the datasets volume in the default schema,
    and set the catalog owner. The first failing step aborts the remaining
    ones; the exception is not re-raised but recorded in the result.

    Args:
        email: The user's email address; becomes the catalog owner.
        catalog_name: Name of the catalog to create for this user.

    Returns:
        Dict with keys:
          "catalog" (bool)  - catalog statement succeeded
          "schemas" (list)  - names of the schemas created so far
          "volume"  (bool)  - volume statement succeeded
          "owner"   (bool)  - ownership was transferred
          "error"   (str)   - present only if a step raised
    """
    results = {"catalog": False, "schemas": [], "volume": False, "owner": False}

    try:
        # Create catalog
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
        results["catalog"] = True

        # Create schemas
        for schema in [DEFAULT_SCHEMA, BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA]:
            spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}")
            results["schemas"].append(schema)

        # Create volume for datasets
        spark.sql(f"""
            CREATE VOLUME IF NOT EXISTS {catalog_name}.{DEFAULT_SCHEMA}.{VOLUME_NAME}
            COMMENT 'Training datasets volume'
        """)
        results["volume"] = True

        # Set owner to user. The principal is quoted with plain backticks
        # because an email contains characters ('@', '.') that are not valid
        # in a bare identifier. The backticks must not be backslash-escaped:
        # in a Python string literal "\`" keeps the backslash, which would
        # end up in the SQL text. An embedded backtick is doubled so it
        # cannot terminate the quoted name early.
        principal = email.replace("`", "``")
        spark.sql(f"ALTER CATALOG {catalog_name} SET OWNER TO `{principal}`")
        results["owner"] = True

    except Exception as e:
        # Best-effort per user: record the failure so the caller can report
        # it and continue with the next user.
        results["error"] = str(e)

    return results

In [None]:
# Provision a catalog, its schemas, and a volume for every training user,
# reporting the outcome per user and a final success count.
print("Creating user environments...")
print("=" * 60)

# email -> result dict returned by create_user_environment
creation_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"\nProcessing: {email}")
    outcome = creation_results[email] = create_user_environment(email, catalog_name)

    # A failed step is reported on its own; the details below only apply
    # when every step succeeded.
    if "error" in outcome:
        print(f"  ERROR: {outcome['error']}")
        continue

    print(f"  Catalog: {catalog_name}")
    print(f"  Schemas: {', '.join(outcome['schemas'])}")
    print(f"  Volume: {outcome['volume']}")
    print(f"  Owner set: {outcome['owner']}")

print("\n" + "=" * 60)
successful = len([r for r in creation_results.values() if "error" not in r])
print(f"Successfully created: {successful}/{len(creation_results)} environments")

## Step 3: Download Dataset from GitHub

Download the dataset files from GitHub (raw content) and write them directly into each user's Volume.

In [None]:
import urllib.request

def download_dataset_to_volume(catalog_name, timeout=60):
    """
    Download dataset files directly from GitHub to user's Volume.

    Each entry of DATASET_FILES is fetched from GITHUB_RAW_BASE with urllib
    and written with dbutils.fs.put, overwriting any existing file. Files
    are handled independently: a failure is recorded and the remaining
    files are still attempted.

    Args:
        catalog_name: Catalog whose datasets volume receives the files.
        timeout: Seconds to wait on each download before giving up, so a
            stalled connection cannot block the whole setup run.

    Returns:
        Dict with keys:
          "success" - list of local paths that were written
          "failed"  - list of (local_path, error_message) tuples
    """
    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"
    results = {"success": [], "failed": []}

    for remote_path, local_path in DATASET_FILES:
        url = f"{GITHUB_RAW_BASE}/{remote_path}"
        dest_path = f"{volume_path}/{local_path}"

        try:
            # Download file content; the context manager closes the HTTP
            # response even if reading fails.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                content = response.read()

            # Write to volume using dbutils (takes text, hence the decode)
            dbutils.fs.put(dest_path, content.decode('utf-8'), overwrite=True)
            results["success"].append(local_path)

        except Exception as e:
            # Deliberately broad: any download, decode, or write error is
            # reported per file instead of aborting the remaining files.
            results["failed"].append((local_path, str(e)))

    return results

In [None]:
# Copy the dataset into every user's volume, listing per-file failures and
# a final count of volumes that received every file.
print("Downloading dataset to user Volumes...")
print("=" * 60)

# email -> result dict returned by download_dataset_to_volume
download_results = {}

for email, catalog_name in user_catalog_map.items():
    print(f"\nDownloading to: {catalog_name}")
    outcome = download_results[email] = download_dataset_to_volume(catalog_name)
    downloaded, failures = outcome["success"], outcome["failed"]

    if downloaded:
        print(f"  ✓ Downloaded: {len(downloaded)} files")
    if failures:
        print(f"  ✗ Failed: {len(failures)} files")
        for failed_file, reason in failures:
            print(f"    - {failed_file}: {reason}")

print("\n" + "=" * 60)
# A volume counts as complete only if none of its files failed.
successful = len([r for r in download_results.values() if not r["failed"]])
print(f"Fully completed: {successful}/{len(download_results)} volumes")

## Step 4: Verify Setup

Verify all environments are ready for training.

In [None]:
# Print a per-user summary and check that each user's volume can be listed.
print("=" * 60)
print("TRAINING ENVIRONMENT SUMMARY")
print("=" * 60)
print()

for email, catalog_name in user_catalog_map.items():
    print(f"User: {email}")
    print(f"  Catalog: {catalog_name}")
    print(f"  Schemas: {BRONZE_SCHEMA}, {SILVER_SCHEMA}, {GOLD_SCHEMA}, {DEFAULT_SCHEMA}")

    volume_path = f"/Volumes/{catalog_name}/{DEFAULT_SCHEMA}/{VOLUME_NAME}"
    try:
        files = dbutils.fs.ls(volume_path)
        print(f"  Volume folders: {[f.name for f in files]}")
    except Exception as e:
        # Catch Exception rather than using a bare except, so that
        # interrupting the cell still works, and include the reason so a
        # missing volume can be told apart from a permission problem.
        print(f"  Volume: ERROR - cannot access ({e})")

    print()

print("=" * 60)
print("PRE-CONFIGURATION COMPLETE")
print("=" * 60)
print()
print("Participants can now run 00_setup.ipynb to validate their environment.")

## Cleanup (After Training)

Run this section to remove all training catalogs after the training is complete.

In [None]:
# =============================================================================
# CLEANUP - Remove all training catalogs
# =============================================================================
# WARNING: This will DELETE all training data!

# Uncomment to run:
# for email, catalog_name in user_catalog_map.items():
#     try:
#         spark.sql(f"DROP CATALOG IF EXISTS {catalog_name} CASCADE")
#         print(f"Dropped: {catalog_name}")
#     except Exception as e:
#         print(f"Failed to drop {catalog_name}: {e}")
# 
# print("\nCleanup complete.")