In [73]:
from notebookutils import mssparkutils
from datetime import datetime

In [75]:
# Wall-clock start of the whole run; passed as `start_time` to the pipeline-level log entry.
start_time = datetime.now()
# FIXME: 2024-03-18 Frequency Based ETL deprecated in v0.6
# The four frequency deltas all share one value and are forwarded to the
# stage notebooks through `input_params`.
highFrequentDelta = 0.00002
moderateFrequentDelta = 0.00002
lowFrequentDelta = 0.00002
descriptorsDelta = 0.00002

In [76]:
import random
import string

# Alphabet for the random part of the execution id: ASCII letters and digits.
characters = string.ascii_letters + string.digits
# Ten independent random picks from `characters`, concatenated.
ten_digit_alphanumeric = ''.join(map(random.choice, [characters] * 10))

# Both names carry the same "TEST_<random>" id.
executionId = f'TEST_{ten_digit_alphanumeric}'
pipelineExecutionId = executionId

In [77]:
# NOTE: DO NOT REMOVE THESE VARIABLES
# Both are forwarded to the stage notebooks as entries of `input_params`.
schoolYear_varParam, districtId_varParam = "SchoolYear", "DistrictId"

In [78]:
# Arguments handed to every stage notebook (passed as `nb_params` to execute_etl_step).
input_params = {
    # Per-stage run flags
    'create_s2r_descriptor_views': create_s2r_descriptor_views,
    'migrate_s2r_to_s3': migrate_s2r_to_s3,
    'create_s3_sql_db_views': create_s3_sql_db_views,
    'create_s3_semantic_views': create_s3_semantic_views,

    # Workspace and database names (base tables use the stage 3 database name)
    'workspace': workspace,
    'stage2_db_name': stage2_db_name,
    'stage3_db_name': stage3_db_name,
    'base_table_db_name': stage3_db_name,

    # School year / district values and the names of their parameters
    'schoolYear_varParam': schoolYear_varParam,
    'current_school_year': schoolYear,
    'schoolYear': schoolYear,
    'districtId_varParam': districtId_varParam,
    'districtId': districtId,

    # SQL DB connection settings under `sql_db_`-prefixed keys
    'sql_db_secret_name': secret_name,
    'sql_db_user_name': user_name,
    'sql_db_name': database_name,
    'sql_db_data_source': data_source,
    'sql_db_server_name': server_name,

    # The same connection values again under un-prefixed keys, plus schema and driver
    'secret_name': secret_name,
    'database_name': database_name,
    'user_name': user_name,
    'data_source': data_source,
    'semantic_schema_name': semantic_schema_name,
    'driver': driver,
    'server_name': server_name,

    'apiVersion': apiVersion,

    # Frequency deltas (see the FIXME where they are defined)
    'highFrequentDelta': highFrequentDelta,
    'moderateFrequentDelta': moderateFrequentDelta,
    'lowFrequentDelta': lowFrequentDelta,
    'descriptorsDelta': descriptorsDelta,

    'prepare_edgraph_metadata': prepare_edgraph_metadata,
}


In [79]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_edfi_py

In [80]:
from datetime import datetime
oea = EdFiOEAChild()   # not defined in this notebook; presumably provided by the %run'd utilities notebook (TODO confirm)
# `workspace` is not assigned in the visible cells; presumably a notebook parameter (TODO confirm).
oea.set_workspace(workspace)

In [81]:
def execute_etl_step(condition,
                     nb_path,
                     nb_timeout,
                     nb_params,
                     nb_error,
                     etl_stage):
    """Run one ETL stage notebook (unless gated off) and record a stage log.

    The child notebook is launched only when no earlier error was handed in
    (``nb_error is None``) and ``condition`` is truthy. In every case one
    'stage' log entry is consolidated through ``pipeline_error_logger``.

    Args:
        condition: Truthy to run the notebook, falsy to mark the stage SKIPPED.
        nb_path: Notebook path passed to ``mssparkutils.notebook.run``.
        nb_timeout: Timeout in seconds for the notebook run.
        nb_params: Arguments forwarded to the notebook.
        nb_error: Error carried over from a previous stage, or ``None``. When
            it is not ``None`` nothing is run and the module-level
            ``etl_status`` keeps its previous value.
        etl_stage: Stage label used in the status text and in the log entry.

    Returns:
        ``(etl_status, error)``. ``error`` is ``None`` when the status text
        contains ' - SUCCEEDED' or ' - SKIPPED'; otherwise it is the caught
        or carried-over error.

    Side effects:
        Rebinds the module-level ``etl_status``.
    """
    global etl_status
    # FIXME: Parameterize etl_status
    started_at = datetime.now()

    if nb_error is None:
        etl_status = f"{etl_stage} - RUNNING"
        try:
            if not condition:
                etl_status = f"{etl_stage} - SKIPPED"
            else:
                mssparkutils.notebook.run(path = nb_path,
                                          timeout_seconds = nb_timeout,
                                          arguments = nb_params)
                etl_status = f"{etl_stage} - SUCCEEDED"
        except Exception as exc:
            etl_status = f"{etl_stage} - FAILED"
            logger.info(f"{exc}")
            nb_error = exc

    finished_at = datetime.now()
    stage_outcome = 'FAILED' if nb_error is not None else 'SUCCEEDED'
    stage_log = pipeline_error_logger.create_log_dict(
        uniqueId = pipeline_error_logger.generate_random_alphanumeric(10),  # random 10-character id
        pipelineExecutionId = pipelineExecutionId,
        sparkSessionId = spark.sparkContext.applicationId,
        stageName = etl_stage,
        start_time = started_at,
        end_time = finished_at,
        run_status = stage_outcome,
        etl_status = etl_status,
        error_description = str(nb_error))
    pipeline_error_logger.consolidate_logs(stage_log, 'stage')

    # Only a status that is neither SUCCEEDED nor SKIPPED propagates the error.
    stage_ok = ' - SUCCEEDED' in etl_status or ' - SKIPPED' in etl_status
    return etl_status, (None if stage_ok else nb_error)

def get_entity_logs_status(df, executionId, stageName):
    """Classify the entity-level log rows of one stage of one pipeline run.

    Rows are selected by ``pipelineExecutionId`` and ``stageName``. For each
    row ``numRecordsFailed`` is ``totalNumOutputRows - numInputRows``; a row
    counts as clean when that difference is 0 and as mismatched when it is
    non-zero (a null difference is neither).

    All counts are gathered in a single aggregation, so only one Spark job
    runs instead of one per check.

    Args:
        df: Spark DataFrame with the columns ``pipelineExecutionId``,
            ``stageName``, ``totalNumOutputRows`` and ``numInputRows``.
        executionId: Value to match against ``pipelineExecutionId``.
        stageName: Value to match against ``stageName``.

    Returns:
        'success'         - every selected row is clean
        'failure'         - every selected row is mismatched
        'partial-success' - at least one, but not every, row is mismatched
        'unknown'         - no rows were selected, or the rows are only a
                            mix of clean and null differences
    """
    stage_logs = (
        df.filter((F.col('pipelineExecutionId') == executionId) & (F.col('stageName') == stageName))
          .withColumn('numRecordsFailed', F.col('totalNumOutputRows') - F.col('numInputRows'))
    )

    failed = F.col('numRecordsFailed')
    # F.when(...) without otherwise() yields null for non-matching rows and
    # F.count skips nulls, so each expression counts only the matching rows.
    counts = stage_logs.agg(
        F.count(F.lit(1)).alias('total'),
        F.count(F.when(failed == 0, 1)).alias('clean'),
        F.count(F.when(failed != 0, 1)).alias('mismatched'),
    ).collect()[0]
    total, clean, mismatched = counts['total'], counts['clean'], counts['mismatched']

    # No log rows for this stage/run: nothing to base a verdict on.
    if total == 0:
        return 'unknown'

    if clean == total:
        return 'success'

    if mismatched == total:
        return 'failure'

    # Decided on row counts rather than a sum, so positive and negative
    # differences cannot cancel each other out.
    if mismatched > 0:
        return 'partial-success'

    return 'unknown'

### Main Code

In [82]:
# Run-state trackers; reassigned while the stages execute.
nb_error = None
run_status = "INITIATED"
etl_status = ""

# Folder and file-name prefix used to build the stage notebook paths.
nb_version = "edgraph_dw_v0_6"
nb_root = "EdGraph/modules/EdGraph_DW/v0.6/src"

In [83]:
# Collects the stage and pipeline log entries that later cells persist.
pipeline_error_logger = ErrorLogging(spark = spark,
                                     oea = oea,
                                     logger = logger)

try:
    run_status = "RUNNING"
    # Ordered stage plan: (run flag, notebook name suffix, timeout in seconds, stage label).
    # Every stage has its own label so its stage-log entry can be told apart.
    etl_stages = [
        # NOTE: Prepare Meta Data
        (prepare_edgraph_metadata, "freq_etl_metadata", 1200,
         "edgraph-dwh: Metadata Preparation"),
        # NOTE: Create S2R Descriptor Views
        (create_s2r_descriptor_views, "s2r_descriptor_views", 1200,
         "edgraph-dwh: Create S2R Descriptor Views"),
        # NOTE: Migrate To Stage 3 (Edgraph DWH)
        (migrate_s2r_to_s3, "migrate_to_stage3", 3600,
         "edgraph-dwh: Migrate From S2R To Edgraph DWH"),
        # NOTE: Create SQL DB Views
        (create_s3_sql_db_views, "populate_sql_db", 1200,
         "edgraph-dwh: Create SQL DB Views"),
        # NOTE: Create Semantic Views
        (create_s3_semantic_views, "semantic_views", 1200,
         "edgraph-dwh: Create Semantic Views"),
    ]

    # The error returned by one stage is passed into the next call; once it is
    # set, execute_etl_step stops launching notebooks and only logs.
    for stage_enabled, nb_suffix, stage_timeout, stage_label in etl_stages:
        etl_status, nb_error = execute_etl_step(condition = stage_enabled,
                                                nb_path = f"{nb_root}/main/{nb_version}_{nb_suffix}",
                                                nb_timeout = stage_timeout,
                                                nb_params = input_params,
                                                nb_error = nb_error,
                                                etl_stage = stage_label)

    if nb_error is not None:
        raise Exception(f"{nb_error}")
    run_status = "SUCCEEDED"
except Exception as e:
    # Deliberately not re-raised, so the cells below still run and write the logs.
    run_status = "FAILED"
    print(f'Pipeline Executed with Error {e}')

### Pipeline Level Logs

In [70]:
# Build one pipeline-level log entry for the whole run and hand it to the logger.
log_type = 'pipeline'
end_time = datetime.now()

log_data = pipeline_error_logger.create_log_dict(
    uniqueId = pipeline_error_logger.generate_random_alphanumeric(10),  # random 10-character id
    pipelineExecutionId = pipelineExecutionId,
    sparkSessionId = spark.sparkContext.applicationId,
    start_time = start_time,
    end_time = end_time,
    run_status = run_status,
    etl_status = str(etl_status),
)
pipeline_error_logger.consolidate_logs(log_data, log_type)
df = pipeline_error_logger.create_spark_df(log_type)

# Persist the entry via write_logs_to_delta_lake, then add_etl_logs_to_lake_db.
pipeline_error_logger.write_logs_to_delta_lake(
    df = df,
    log_type = log_type,
    destination_url = pipeline_error_logger.to_logs_url(f'etl-logs/log_type={log_type}'),
)
pipeline_error_logger.add_etl_logs_to_lake_db(
    db_name = f'ldb_{workspace}_edgraph_etl_logs',
    logs_base_path = 'etl-logs',
    log_type = log_type,
    overwrite = False,
)

### Stage Level Logs

In [71]:
# log_type = 'entity'
# entity_logs_url = pipeline_error_logger.to_logs_url('etl-logs/log_type=entity')
# entity_logs_df = oea.load(entity_logs_url)

In [72]:
# Persist the stage-level entries collected by execute_etl_step.
log_type = 'stage'

# Every row gets a constant 'Unknown' stage_status column.
df = pipeline_error_logger.create_spark_df(log_type).withColumn('stage_status', F.lit('Unknown'))
destination_url = pipeline_error_logger.to_logs_url(f'etl-logs/log_type={log_type}')

pipeline_error_logger.write_logs_to_delta_lake(
    df = df,
    log_type = log_type,
    destination_url = destination_url,
)
pipeline_error_logger.add_etl_logs_to_lake_db(
    db_name = f'ldb_{workspace}_edgraph_etl_logs',
    logs_base_path = 'etl-logs',
    log_type = log_type,
    overwrite = False,
)