# Databricks notebook source
# MAGIC %md
# MAGIC # Create Medallion Architecture Structure
# MAGIC Creating folder structure for Bronze, Silver, Gold layers

# COMMAND ----------

import os
import pandas as pd

# -----------------------------------
# 1. BASE PATH
# -----------------------------------

# Workspace root under which every folder and file in this notebook is created.
BASE_PATH = "/Workspace/Users/yahyasanbati.mail@gmail.com/GREEN-IT-DATA-PLATFORM/X002_Databricks"

# Opening banner (rule, title, rule), then the base path being used.
print("=" * 70, "CREATING MEDALLION ARCHITECTURE STRUCTURE", "=" * 70, sep="\n")
print(f"\nBase Path: {BASE_PATH}")

# COMMAND ----------

# ===================================
# 2. CREATE FOLDER STRUCTURE
# ===================================

# Every directory the project layout needs, all rooted at BASE_PATH.
folders = [
    # Notebooks folder
    f"{BASE_PATH}/L_02_notebooks",

    # Medallion data folders
    f"{BASE_PATH}/L_02_Medallion/M_01_Bronze",
    f"{BASE_PATH}/L_02_Medallion/M_02_Silver",
    f"{BASE_PATH}/L_02_Medallion/M_03_Gold",
]

print("\nCreating folders...")

# Creation stays best-effort (one bad folder does not stop the others), but
# failures are recorded so the closing message reflects what really happened.
failed_folders = []

for folder in folders:
    try:
        # exist_ok=True makes re-running this cell harmless.
        os.makedirs(folder, exist_ok=True)
    except OSError as e:
        failed_folders.append(folder)
        print(f"Failed: {folder} - {e}")
    else:
        print(f"Created: {folder}")

# Only claim success when every folder is actually in place.
if failed_folders:
    print(f"\n{len(failed_folders)} of {len(folders)} folders could not be created.")
else:
    print("\nAll folders created successfully!")

# COMMAND ----------

# -----------------------------------
# 3. VERIFY STRUCTURE
# -----------------------------------

# Section heading followed by a 70-character rule.
print("\nVerifying structure...", "=" * 70, sep="\n")

def show_tree(path, prefix="", max_depth=3, current_depth=0):
    """Print the directory tree below *path*, limited to *max_depth* levels.

    Entries are printed in sorted order. Names starting with '.' and the
    name '__pycache__' are left out. Directories are printed with a
    trailing '/' and then descended into.

    Args:
        path: Directory whose contents are listed.
        prefix: Text printed in front of each entry's connector; one
            indent segment is appended per nesting level.
        max_depth: Number of levels to print, counting the direct contents
            of the starting directory as the first level.
        current_depth: Depth of *path* below the starting directory;
            external callers normally leave this at 0.

    Returns:
        None. Output goes to stdout. If *path* cannot be listed, a single
        "Error: ..." line is printed instead of raising.
    """
    if current_depth >= max_depth:
        return

    # Only the listing itself is guarded; that is the call that fails on a
    # missing, unreadable, or non-directory path.
    try:
        entries = sorted(os.listdir(path))
    except OSError as e:
        print(f"{prefix}Error: {e}")
        return

    # Drop skipped names BEFORE deciding which entry is last. Otherwise a
    # trailing skipped name (e.g. '__pycache__', which sorts after uppercase
    # names) leaves the final visible entry drawn with a mid-list connector.
    visible = [
        name for name in entries
        if not name.startswith('.') and name != '__pycache__'
    ]
    last_index = len(visible) - 1

    for index, name in enumerate(visible):
        is_last_item = index == last_index
        item_path = os.path.join(path, name)
        connector = "└── " if is_last_item else "├── "

        if os.path.isdir(item_path):
            print(f"{prefix}{connector}{name}/")

            # Below the last entry there is nothing left to connect to, so
            # children are indented with spaces instead of a vertical bar.
            extension = "    " if is_last_item else "│   "
            show_tree(item_path, prefix + extension, max_depth, current_depth + 1)
        else:
            print(f"{prefix}{connector}{name}")

# Show the structure. The root label is taken from BASE_PATH's final
# component so the heading always names the directory actually listed
# (normpath drops any trailing slash before basename is applied).
print(f"\n{os.path.basename(os.path.normpath(BASE_PATH))}/")
show_tree(BASE_PATH, "", max_depth=3, current_depth=0)

print("=" * 70)

# COMMAND ----------

# -----------------------------------
# 4. README FILES
# -----------------------------------

print("\nCreating README files...")

# Destination of the README for each medallion layer.
bronze_readme = f"{BASE_PATH}/L_02_Medallion/M_01_Bronze/README.md"
silver_readme = f"{BASE_PATH}/L_02_Medallion/M_02_Silver/README.md"
gold_readme = f"{BASE_PATH}/L_02_Medallion/M_03_Gold/README.md"

# (destination, markdown body) pairs, written in Bronze -> Silver -> Gold order.
readme_specs = [
    # Bronze README
    (bronze_readme, """# M_01_Bronze - Raw Data Layer

## Purpose
Storage for raw, unprocessed data from source systems.

## Characteristics
- Format: Parquet
- Compression: Snappy
- Schema: As-is from source (no modifications)
- Updates: Append-only

## Files
- green_workload_bronze.parquet: Main dataset from Oracle/Workspace

## Data Lineage
Source -> Bronze
- Oracle Database (stg_green_workload)
- Workspace Parquet file
"""),
    # Silver README
    (silver_readme, """# M_02_Silver - Cleaned Data Layer

## Purpose
Cleaned, validated, and enriched data ready for analytics.

## Transformations Applied
1. Data Cleaning
   - Text standardization (lowercase, trim)
   - Data type conversions
   - NULL handling

2. Data Enrichment
   - Carbon intensity calculation
   - Efficiency categorization
   - Renewable energy flags

3. Quality Checks
   - Duplicate removal
   - Validation rules
   - Metadata addition

## Files
- green_workload_silver.parquet: Cleaned and enriched dataset

## Data Lineage
Bronze -> Silver
- Cleaning + Validation + Enrichment
"""),
    # Gold README
    (gold_readme, """# M_03_Gold - Analytics-Ready Star Schema

## Purpose
Dimensional model optimized for BI tools and analytics.

## Star Schema Design

### Dimension Tables
- DIM_WORKLOAD.parquet: Workload types and characteristics
- DIM_ENERGY.parquet: Energy sources and renewable share
- DIM_SECURITY.parquet: Security levels and PQC status
- DIM_SCENARIO.parquet: Workload scenarios and strategies

### Fact Table
- FACT_GREEN_WORKLOAD.parquet: Main metrics table with Energy, Carbon, Cost, Performance

## Usage
Connect Power BI / Tableau to Gold layer for reporting and dashboards.

## Data Lineage
Silver -> Gold
- Dimensional modeling (Star Schema)
- Surrogate key generation
- Fact-Dimension relationships
"""),
]

# Write each README (overwriting any existing one) and report it.
for readme_path, readme_body in readme_specs:
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(readme_body)
    print(f"Created: {readme_path}")

# COMMAND ----------

# ===================================
# 5. CREATE CONFIGURATION FILE
# ===================================

print("\nCreating configuration file...")

# Source text of the generated config.py.
#
# This is an f-string evaluated NOW, so single-brace fields ({BASE_PATH!r},
# the timestamp) are filled in at generation time. Doubled braces
# ({{BASE_PATH}}, {{ ... }}) come out as single braces in the written file,
# where they belong to f-strings / dict literals that config.py evaluates
# itself on import.
#
# BASE_PATH is emitted with !r so it is always a valid Python string literal
# in the generated file, even if the path contains quotes or backslashes.
config_content = f'''"""
Medallion Architecture Configuration
Project: GREEN-IT-DATA-PLATFORM
Environment: Databricks
Generated: {pd.Timestamp.now()}
"""

# BASE PATHS
BASE_PATH = {BASE_PATH!r}
NOTEBOOKS_PATH = f"{{BASE_PATH}}/L_02_notebooks"
MEDALLION_PATH = f"{{BASE_PATH}}/L_02_Medallion"

# DATA LAYER PATHS
BRONZE_PATH = f"{{MEDALLION_PATH}}/M_01_Bronze"
SILVER_PATH = f"{{MEDALLION_PATH}}/M_02_Silver"
GOLD_PATH = f"{{MEDALLION_PATH}}/M_03_Gold"

# FILE PATHS

# Bronze
BRONZE_FILE = f"{{BRONZE_PATH}}/green_workload_bronze.parquet"

# Silver
SILVER_FILE = f"{{SILVER_PATH}}/green_workload_silver.parquet"

# Gold - Dimensions
DIM_WORKLOAD_FILE = f"{{GOLD_PATH}}/DIM_WORKLOAD.parquet"
DIM_ENERGY_FILE = f"{{GOLD_PATH}}/DIM_ENERGY.parquet"
DIM_SECURITY_FILE = f"{{GOLD_PATH}}/DIM_SECURITY.parquet"
DIM_SCENARIO_FILE = f"{{GOLD_PATH}}/DIM_SCENARIO.parquet"

# Gold - Fact
FACT_GREEN_WORKLOAD_FILE = f"{{GOLD_PATH}}/FACT_GREEN_WORKLOAD.parquet"

# SOURCE DATA
SOURCE_FILE = "/Workspace/Users/yahyasanbati.mail@gmail.com/GREEN-IT-DATA-PLATFORM/X001_Oracle/O_03_LOAD_CSV_DATA/L_02_ETL_Logic/E_03_Data_Output/green_it_metrics_2026-02-09 (1).parquet"

# GOLD FILES DICTIONARY
GOLD_FILES = {{
    "dim_workload": DIM_WORKLOAD_FILE,
    "dim_energy": DIM_ENERGY_FILE,
    "dim_security": DIM_SECURITY_FILE,
    "dim_scenario": DIM_SCENARIO_FILE,
    "fact_green_workload": FACT_GREEN_WORKLOAD_FILE
}}

# METADATA
PROJECT_NAME = "GREEN-IT-DATA-PLATFORM"
ENVIRONMENT = "Databricks"
OWNER = "yahyasanbati.mail@gmail.com"
VERSION = "1.0"
'''

# The config lives directly under BASE_PATH; any existing file is overwritten.
config_path = f"{BASE_PATH}/config.py"
with open(config_path, 'w', encoding='utf-8') as f:
    f.write(config_content)

print(f"Created: {config_path}")

# COMMAND ----------

# -----------------------------------
# 6. .gitkeep FILES
# -----------------------------------

print("\nCreating .gitkeep files...")

# The three medallion layer folders each receive an empty .gitkeep file.
gitkeep_folders = [
    f"{BASE_PATH}/L_02_Medallion/{layer_name}"
    for layer_name in ("M_01_Bronze", "M_02_Silver", "M_03_Gold")
]

for layer_dir in gitkeep_folders:
    gitkeep_file = f"{layer_dir}/.gitkeep"
    # Opening in 'w' mode creates the file (or truncates it); nothing is
    # written, so it ends up empty.
    with open(gitkeep_file, 'w') as f:
        pass
    print(f"Created: {gitkeep_file}")

# COMMAND ----------

# -----------------------------------
# 7. FINAL SUMMARY
# -----------------------------------

# 70-character rule reused for every section heading below.
rule = "=" * 70

print(f"\n{rule}")
print("STRUCTURE CREATED SUCCESSFULLY")
print(rule)

# Static picture of the intended layout (not read back from disk).
print("\nComplete Structure:")
print(f"""
{BASE_PATH}/
├── L_02_notebooks/
│   ├── 01_bronze_ingestion.py (to be created)
│   ├── 02_silver_transformation.py (to be created)
│   └── 03_gold_star_schema.py (to be created)
│
├── L_02_Medallion/
│   ├── M_01_Bronze/
│   │   ├── README.md
│   │   └── .gitkeep
│   │
│   ├── M_02_Silver/
│   │   ├── README.md
│   │   └── .gitkeep
│   │
│   └── M_03_Gold/
│       ├── README.md
│       └── .gitkeep
│
└── config.py
""")

print("\nNext Steps:")
print(rule)
for step in (
    "1. Navigate to: L_02_notebooks/",
    "2. Create 3 notebooks:",
    "   - 01_bronze_ingestion",
    "   - 02_silver_transformation",
    "   - 03_gold_star_schema",
    "3. Copy the notebook code",
    "4. Update all paths to use the new structure",
    "5. Run the pipeline: Bronze -> Silver -> Gold",
):
    print(step)

print("\nKey Paths:")
print(rule)
print(f"Config file: {config_path}")
print(f"Bronze:      {BASE_PATH}/L_02_Medallion/M_01_Bronze")
print(f"Silver:      {BASE_PATH}/L_02_Medallion/M_02_Silver")
print(f"Gold:        {BASE_PATH}/L_02_Medallion/M_03_Gold")

# Copy-paste snippet for importing the generated config from other notebooks.
print("\nTo use config in notebooks:")
print(rule)
print(f"""
import sys
sys.path.append("{BASE_PATH}")
from config import *

# Then use: BRONZE_FILE, SILVER_FILE, GOLD_PATH, etc.
""")

print(rule)
print("Ready to build your Medallion Architecture")
print(rule)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Structure Setup Complete
# MAGIC 
# MAGIC Your medallion architecture structure is now ready with:
# MAGIC - L_02_notebooks/ folder for pipeline notebooks
# MAGIC - L_02_Medallion/ with Bronze, Silver, Gold layers
# MAGIC - README files documenting each layer
# MAGIC - Configuration file with all paths
# MAGIC - .gitkeep files for Git tracking
# MAGIC 
# MAGIC Next: Create your pipeline notebooks