In [None]:
# Import required libraries
import sys
import json
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *
import notebookutils

# Add utils directory to path
sys.path.append('/lakehouse/default/Files/notebooks/utils')

from g4s_api_client import G4SApiClient
from keyvault_client import get_api_key_from_keyvault
from delta_manager import DeltaTableManager

In [None]:
# Configuration Parameters
# The values below are placeholders; replace them with environment-specific settings before running.
KEY_VAULT_URL = "https://your-keyvault.vault.azure.net/"  # Update with your Key Vault URL
SQL_ENDPOINT = "your-sql-endpoint"  # SQL endpoint for metadata tables
DATABASE_NAME = "your-database"  # Database containing AcademySecurity table

# Runtime parameters (can be passed from pipeline)
# sync_scope selects which ENDPOINTS groups process_sync runs: "FULL" runs every group,
# any other value must be exactly one of the group names listed below.
sync_scope = "FULL"  # Options: FULL, STUDENTS, TEACHING, ASSESSMENT, ATTAINMENT, ATTENDANCE, TIMETABLE, BEHAVIOUR, USERS

In [None]:
# Initialize Delta Table Manager
# `spark` is not created in this notebook; it is expected to be supplied by the runtime session.
delta_mgr = DeltaTableManager(spark)

# API Endpoint Configuration
# Maps each sync scope (group name) to the list of endpoints fetched for that scope.
# Every entry is a dict with:
#   "endpoint"      - URL path template containing {academicYear} (and {date} on some attendance paths);
#                     presumably the placeholders are filled in by G4SApiClient - verify against the client
#   "name"          - short identifier; the raw Delta table for the endpoint is named raw_g4s_<name>
#   "requires_date" - optional flag; when truthy, ingest_endpoint_data passes a date to the API client
ENDPOINTS = {
    "STUDENTS": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students", "name": "student_details"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students/education-details", "name": "education_details"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students/general-attributes", "name": "general_attributes"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students/demographic-attributes", "name": "demographic_attributes"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students/send-attributes", "name": "send_attributes"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/students/sensitive-attributes", "name": "sensitive_attributes"}
    ],
    "TEACHING": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/teaching/departments", "name": "departments"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/teaching/subjects", "name": "subjects"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/teaching/groups", "name": "groups"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/teaching/group-students", "name": "group_students"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/teaching/teachers", "name": "teachers"}
    ],
    "ASSESSMENT": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/assessment/markbooks", "name": "markbooks"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/assessment/marksheet-grades", "name": "marksheet_grades"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/assessment/markslot-marks", "name": "markslot_marks"}
    ],
    "ATTAINMENT": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attainment/prior-attainment", "name": "prior_attainment"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attainment/grade-names", "name": "grade_names"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attainment/grades", "name": "grades"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attainment/exam-results", "name": "exam_results"}
    ],
    "ATTENDANCE": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attendance/codes", "name": "attendance_codes"},
        # NOTE(review): the next entry is flagged requires_date but, unlike student_session_marks,
        # its path has no {date} placeholder - confirm how the API client applies the date here.
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attendance/student-lesson-marks", "name": "student_lesson_marks", "requires_date": True},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attendance/student-session-marks/date/{date}", "name": "student_session_marks", "requires_date": True},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/attendance/student-session-summaries", "name": "student_session_summaries"}
    ],
    "TIMETABLE": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/timetable/calendar", "name": "calendar"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/timetable/periods", "name": "periods"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/timetable/classes", "name": "timetable_classes"}
    ],
    "BEHAVIOUR": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/behaviour/classifications", "name": "behaviour_classifications"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/behaviour/event-types", "name": "behaviour_event_types"},
        {"endpoint": "/customer/v1/academic-years/{academicYear}/behaviour/events", "name": "behaviour_events"}
    ],
    "USERS": [
        {"endpoint": "/customer/v1/academic-years/{academicYear}/users/staff", "name": "staff"}
    ]
}

In [None]:
# Fetch academy configurations from metadata table
def get_active_academies():
    """Load the configuration rows of all active academies.

    Runs a SELECT against sec.AcademySecurity (rows with Active = 1) through a
    Spark JDBC read on SQL_ENDPOINT / DATABASE_NAME, using Active Directory
    integrated authentication.

    Returns:
        The collected result of the read, i.e. a list with one row per active
        academy carrying the columns named in the SELECT list.
    """
    academy_sql = """
    SELECT 
        AcademyCode,
        Name,
        CurrentAcademicYear,
        KeyVaultSecretName,
        LowestYear,
        HighestYear,
        GetLessonAttendance,
        GetSessionAttendance,
        AttendanceFrom,
        AttendanceTo,
        GetBehaviour,
        BehaviourFrom,
        BehaviourTo
    FROM sec.AcademySecurity
    WHERE Active = 1
    """

    # The query is wrapped as an aliased sub-select so it can be passed as "dbtable"
    jdbc_options = {
        "url": f"jdbc:sqlserver://{SQL_ENDPOINT}",
        "dbtable": f"({academy_sql}) AS academy_config",
        "database": DATABASE_NAME,
        "authentication": "ActiveDirectoryIntegrated",
    }

    academy_reader = spark.read.format("jdbc").options(**jdbc_options)
    return academy_reader.load().collect()

# For Fabric, you might use the SQL Analytics endpoint directly instead of JDBC:
# academies = spark.sql("SELECT * FROM sec.AcademySecurity WHERE Active = 1").collect()

# Load the academy list once; process_sync iterates over this module-level `academies` list.
print("Fetching active academies...")
academies = get_active_academies()
print(f"Found {len(academies)} active academies")

In [None]:
# Function to log sync results
def log_sync_result(academy_code, dataset, endpoint, success, record_count=None, exception=None, inner_exception=None):
    """Append one sync outcome row to the sec.SyncResults tracking table.

    Args:
        academy_code: Academy the sync ran for.
        dataset: Data set identifier for the row (callers in this notebook pass
            the academic year). Stored as text.
        endpoint: Endpoint path that was synced.
        success: Truthy when the sync succeeded.
        record_count: Number of records ingested, or None when unknown.
        exception: Error message, or None on success.
        inner_exception: Nested error message, or None.

    Raises:
        Whatever the Spark JDBC write raises when the row cannot be stored.
    """
    from pyspark.sql.types import (
        BooleanType,
        LongType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    def _as_text(value):
        # Preserve NULLs; everything else is stored as its string form
        return None if value is None else str(value)

    # An explicit schema is required: the optional fields are usually None, and
    # Spark cannot infer a column type from a single row whose value is None.
    result_schema = StructType([
        StructField("AcademyCode", StringType(), True),
        StructField("DataSet", StringType(), True),
        StructField("EndPoint", StringType(), True),
        StructField("LoggedAt", TimestampType(), True),
        StructField("Result", BooleanType(), True),
        StructField("RecordCount", LongType(), True),
        StructField("Exception", StringType(), True),
        StructField("InnerException", StringType(), True),
    ])

    # Plain tuple in schema order, so values are matched to columns by position
    result_row = (
        _as_text(academy_code),
        _as_text(dataset),
        _as_text(endpoint),
        datetime.utcnow(),
        bool(success),
        None if record_count is None else int(record_count),
        _as_text(exception),
        _as_text(inner_exception),
    )

    result_df = spark.createDataFrame([result_row], schema=result_schema)

    # Write to SyncResults table
    result_df.write \
        .format("jdbc") \
        .option("url", f"jdbc:sqlserver://{SQL_ENDPOINT}") \
        .option("dbtable", "sec.SyncResults") \
        .option("database", DATABASE_NAME) \
        .option("authentication", "ActiveDirectoryIntegrated") \
        .mode("append") \
        .save()

In [None]:
# Main ingestion function (preceded by its private date-range helper)
def _attendance_dates(date_from, date_to):
    """Return every calendar day from date_from to date_to, inclusive.

    datetime values are reduced to their date part so mixed date/datetime
    inputs compare cleanly. When date_to is missing or earlier than date_from,
    only date_from is returned.
    """
    from datetime import timedelta

    first_day = date_from.date() if isinstance(date_from, datetime) else date_from
    last_day = date_to.date() if isinstance(date_to, datetime) else date_to

    if last_day is None or last_day < first_day:
        return [first_day]

    day_count = (last_day - first_day).days + 1
    return [first_day + timedelta(days=offset) for offset in range(day_count)]


def ingest_endpoint_data(academy, endpoint_config, api_client):
    """Fetch data from a single endpoint and store it in the raw Delta table.

    Args:
        academy: Academy configuration row; AcademyCode and CurrentAcademicYear
            are always read, AttendanceFrom / AttendanceTo only for date-based
            endpoints.
        endpoint_config: Entry from ENDPOINTS with "endpoint", "name" and an
            optional "requires_date" flag.
        api_client: Object exposing fetch_all_data(endpoint, academic_year[, date=...]).

    Behaviour:
        * Date-based endpoints are fetched once per day of the academy's
          AttendanceFrom..AttendanceTo range (inclusive) and the results are
          combined; they are skipped without logging when AttendanceFrom is unset.
        * Every record is tagged with _academy_code, _academic_year, _endpoint
          and a single shared _ingested_at timestamp.
        * The existing academy/year slice of raw_g4s_<name> is deleted before
          the new records are appended.
        * The outcome is recorded through log_sync_result.

    Raises:
        Re-raises any exception from fetching, writing or logging after
        recording the failure through log_sync_result.
    """
    academy_code = academy.AcademyCode
    academic_year = academy.CurrentAcademicYear
    endpoint = endpoint_config["endpoint"]
    endpoint_name = endpoint_config["name"]

    try:
        print(f"Fetching {endpoint_name} for {academy_code} ({academic_year})...")

        # Handle date-based endpoints
        if endpoint_config.get("requires_date"):
            date_from = getattr(academy, "AttendanceFrom", None)
            date_to = getattr(academy, "AttendanceTo", None)

            if not date_from:
                print(f"Skipping {endpoint_name} - no date range configured")
                return

            # Fetch each day of the configured range, not just the first one
            data = []
            for day in _attendance_dates(date_from, date_to):
                day_data = api_client.fetch_all_data(
                    endpoint, academic_year, date=day.strftime("%Y-%m-%d")
                )
                if day_data:
                    data.extend(day_data)
        else:
            data = api_client.fetch_all_data(endpoint, academic_year)

        if not data:
            print(f"No data returned for {endpoint_name}")
            log_sync_result(academy_code, academic_year, endpoint, True, 0)
            return

        # One timestamp for the whole batch so all records of a run share it
        ingested_at = datetime.utcnow().isoformat()
        enriched_data = [
            {
                **record,
                "_academy_code": academy_code,
                "_academic_year": academic_year,
                "_endpoint": endpoint_name,
                "_ingested_at": ingested_at,
            }
            for record in data
        ]

        # Write to raw Delta table
        raw_table_name = f"raw_g4s_{endpoint_name}"

        # Delete the existing academy/year slice before writing. This stays
        # best-effort, but a failure is reported rather than silently ignored
        # because the append below would then add to the old rows.
        try:
            if spark.catalog.tableExists(raw_table_name):
                delta_mgr.delete_partition(
                    raw_table_name,
                    f"_academy_code = '{academy_code}' AND _academic_year = '{academic_year}'"
                )
        except Exception as delete_error:
            print(
                f"⚠ Could not clear existing data in {raw_table_name} "
                f"(rows may be duplicated): {delete_error}"
            )

        # Write data
        delta_mgr.write_raw_json(
            enriched_data,
            raw_table_name,
            partition_cols=["_academy_code", "_academic_year"],
            mode="append"
        )

        print(f"✓ Successfully ingested {len(data)} records for {endpoint_name}")
        log_sync_result(academy_code, academic_year, endpoint, True, len(data))

    except Exception as e:
        error_msg = str(e)
        print(f"✗ Error ingesting {endpoint_name}: {error_msg}")
        log_sync_result(academy_code, academic_year, endpoint, False, exception=error_msg)
        raise

In [None]:
# Process all academies and endpoints
def process_sync(sync_scope="FULL"):
    """Run the API sync for every academy in the module-level `academies` list.

    Args:
        sync_scope: "FULL" to process every group in ENDPOINTS, otherwise the
            name of a single ENDPOINTS group. Any other value prints an
            "Invalid sync scope" message and returns without doing any work.

    For each academy an API key is fetched from Key Vault, a G4SApiClient is
    built, and ingest_endpoint_data is called for every endpoint of every
    selected group. An exception raised while processing an academy is printed
    and the loop moves on to the next academy. The total duration is printed
    at the end.
    """
    # Resolve the requested scope into the list of group names to run
    if sync_scope == "FULL":
        selected_groups = list(ENDPOINTS)
    elif sync_scope in ENDPOINTS:
        selected_groups = [sync_scope]
    else:
        selected_groups = []

    if len(selected_groups) == 0:
        print(f"Invalid sync scope: {sync_scope}")
        return

    banner = "=" * 60
    started_at = datetime.now()

    for academy in academies:
        code = academy.AcademyCode
        vault_secret = academy.KeyVaultSecretName

        print(f"\n{banner}")
        print(f"Processing Academy: {code} - {academy.Name}")
        print(f"{banner}")

        try:
            # One client per academy, authenticated with that academy's key
            client = G4SApiClient(get_api_key_from_keyvault(KEY_VAULT_URL, vault_secret))

            for group in selected_groups:
                print(f"\n--- Processing {group} endpoints ---")

                for config in ENDPOINTS[group]:
                    ingest_endpoint_data(academy, config, client)

        except Exception as academy_error:
            # Report the failure; the loop then proceeds to the next academy
            print(f"Error processing academy {code}: {academy_error!s}")

    elapsed_seconds = (datetime.now() - started_at).total_seconds()

    print(f"\n{banner}")
    print(f"Sync completed in {elapsed_seconds:.2f} seconds")
    print(f"{banner}")

# Execute the sync for the scope chosen in the configuration cell (sync_scope)
process_sync(sync_scope)

In [None]:
# Compact every raw table that exists once ingestion has finished
print("\nOptimizing raw Delta tables...")

# NOTE(review): the z-order columns below are the same columns ingest_endpoint_data passes as
# partition_cols; Delta Lake is understood to reject Z-ordering on partition columns - confirm
# what delta_mgr.optimize_table does with them.
for raw_endpoint_name in (cfg["name"] for group_cfgs in ENDPOINTS.values() for cfg in group_cfgs):
    raw_table = f"raw_g4s_{raw_endpoint_name}"
    try:
        # Endpoints that have never been ingested have no table to optimize
        if not spark.catalog.tableExists(raw_table):
            continue
        delta_mgr.optimize_table(raw_table, z_order_cols=["_academy_code", "_academic_year"])
        print(f"✓ Optimized {raw_table}")
    except Exception as optimize_error:
        # A failure on one table is reported and does not stop the others
        print(f"✗ Failed to optimize {raw_table}: {str(optimize_error)}")

print("\nRaw layer ingestion complete!")