In [61]:
# NOTE: DO NOT REMOVE
# Default values for notebook-level settings. minChangeVer, maxChangeVer and
# parameterized are forwarded to the child notebooks through `input_params`
# later in this notebook.
minChangeVer = None
maxChangeVer = None
parameterized = False
# kvSecret_devMode = True 
# NOTE(review): kvSecret_devMode stays commented out here but is still read when
# `input_params` is built, so it has to be supplied from outside this cell
# (presumably as a pipeline parameter - confirm).

# FIXME: Deprecated Params (Under Review)
# Placeholder values; both names are still passed along in `input_params`.
metadataUrl = '<DEPRECATED PARAM>'
kvName = '<DEPRECATED PARAM>'

In [62]:
# Bridge the incoming parameter names to the aliases used by the rest of the run.
apiLimit = batchLimit  # value noted beside the original line: 200
prepareEdFiMetaData = prepareEdFiMetadata  # value noted beside the original line: False
client_id, client_secret = kvSecret_clientId, kvSecret_clientSecret

In [63]:
from notebookutils import mssparkutils
from datetime import datetime

In [64]:
# Timestamp taken before any ETL work; reported as `start_time` in the pipeline-level log entry.
start_time = datetime.now()

### URL Initializations

In [65]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [66]:
# Copy the incoming connection parameters to the snake_case names used below.
instance_id, school_year, api_url = instanceId, schoolYear, apiUrl

# Build the API manager, then let it refresh its URLs and remaining metadata.
edfi_api_manager = EdFiApiManager(api_url, instance_id, school_year)
edfi_api_manager.update_urls()
edfi_api_manager.set_other_metadata()

# Endpoints exposed as attributes of the manager.
dependenciesUrl = edfi_api_manager.dependencies_url
openApiMetadataUrl = edfi_api_manager.openapi_metadata_url
dataManagementUrl = edfi_api_manager.data_management_url
authUrl = edfi_api_manager.auth_url

# Change-Queries URL: drop the last 13 characters of the referenced URL and
# collapse the '/metadata/' path segment.
# NOTE(review): the 13-character slice presumably strips a fixed suffix such as
# '/swagger.json' - confirm against the API's metadata listing.
changeQueriesUrl = edfi_api_manager.get_referenced_url('Change-Queries')[:-13].replace('/metadata/', '/')

# The Resources reference is kept under both spellings.
swaggerUrl = edfi_api_manager.get_referenced_url('Resources')
swagger_url = swaggerUrl

# Use the API version without a leading 'v'.
apiVersion = edfi_api_manager.api_version
if apiVersion.startswith('v'):
    apiVersion = apiVersion[1:]

### OEA Initializations

In [67]:
%run OEA/modules/Ed-Fi/v0.8/src/utilities/edfi_v0_8_edfi_py

In [68]:
# NOTE(review): `datetime` was already imported earlier in this notebook; this repeat may be
# guarding against the preceding %run rebinding the name - confirm before removing.
from datetime import datetime
# Instantiate EdFiOEAChild and point it at the `workspace` this run was given.
oea = EdFiOEAChild()   
oea.set_workspace(workspace)

### Error Logging Initializations

In [69]:
# Notebook-level ErrorLogging instance; later used to resolve the run-parameters log URL.
error_logger = ErrorLogging(spark=spark, oea=oea, logger=logger)

### Other Initializations

In [70]:
# NOTE: 2024-03-27 Feature update to automate prepareEdFiMetadata
from py4j.protocol import Py4JJavaError

# When metadata preparation was not explicitly requested, request it automatically
# unless BOTH metadata files are already present under the metadata-assets folder.
if not prepareEdFiMetadata:
    try:
        metadata_root_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets'
        metadata_root_url = oea.to_url(metadata_root_path)
        files = mssparkutils.fs.ls(metadata_root_url)

        # File names are compared case-insensitively.
        existing_names = {file.name.lower() for file in files}
        metadata_exists = 'metadata.csv' in existing_names
        freq_etl_exists = 'frequency_etl.csv' in existing_names

        # Preparation is needed only when at least one of the two files is missing.
        # (The previous expression enabled preparation when both files existed and
        # disabled it when one was missing, contradicting the except branch below.)
        prepareEdFiMetaData = prepareEdFiMetadata = not (metadata_exists and freq_etl_exists)
        if prepareEdFiMetadata:
            logger.info("Ed-Fi metadata files not detected - attempting to create them first")
    except Py4JJavaError as e:
        # The listing itself failed (presumably the folder does not exist yet):
        # treat that the same as "metadata missing".
        logger.info("Ed-Fi metadata files not detected - attempting to create them first")
        prepareEdFiMetaData = prepareEdFiMetadata = True

In [71]:
import random
import string

# Random 10-character token used to build a fallback execution id.
characters = string.ascii_letters + string.digits
ten_digit_alphanumeric = ''.join([random.choice(characters) for _ in range(10)])

# TODO: 2024-03-26 Under Review
if 'pipelineExecutionId' not in globals():
    # No id was supplied from outside: fall back to a generated TEST_ id.
    print("pipelineExecutionId does not exist in global variables")
    pipelineExecutionId = executionId = f'TEST_{ten_digit_alphanumeric}'
else:
    pipelineExecutionId = globals()['pipelineExecutionId']
    # Strip the known prefix (matched case-insensitively) when it is present.
    known_prefix = 'edgraph_dw_edfi_'
    if pipelineExecutionId.lower().startswith(known_prefix):
        pipelineExecutionId = pipelineExecutionId[len(known_prefix):]
    executionId = pipelineExecutionId
    print("pipelineExecutionId exists in global variables and its value is:", pipelineExecutionId)

In [72]:
# FIXME: Do not pass unnecessary params
# Arguments handed to every child notebook. Several values appear under more than
# one key spelling; key order is kept stable because the dict is also serialised
# to the run-parameters log.
input_params = {
    # Key vault / workspace
    "kvSecret_devMode": kvSecret_devMode,
    "kvName": kvName,
    "kVName": kvName,
    "workspace": workspace,
    # API endpoint and instance
    "apiUrl": apiUrl,
    "ApiUrl": apiUrl,
    "instanceId": instanceId,
    "InstanceId": instanceId,
    "moduleName": moduleName,
    "apiLimit": apiLimit,
    "minChangeVer": minChangeVer,
    "maxChangeVer": maxChangeVer,
    # School year / district
    "schoolYear": schoolYear,
    "SchoolYear": schoolYear,
    "districtId": districtId,
    "districtID": districtId,
    "DistrictId": districtId,
    "districtPath": districtId,
    "edfi_url": apiUrl,
    "pipelineExecutionId": pipelineExecutionId,
    "batchLimit": apiLimit,
    "metadataUrl": metadataUrl,
    # Credentials parameters
    "client_id": kvSecret_clientId,
    "client_secret_id": kvSecret_clientSecret,
    "kvSecret_clientId": kvSecret_clientId,
    "kvSecret_clientSecret": kvSecret_clientSecret,
    # Run behaviour
    "prepareEdFiMetadata": prepareEdFiMetadata,
    "parameterized": parameterized,
    "highFrequentDelta": highFrequentDelta,
    "moderateFrequentDelta": moderateFrequentDelta,
    "lowFrequentDelta": lowFrequentDelta,
    "descriptorsDelta": descriptorsDelta,
    "etlProcessing": etlProcessing,
    "fetchHistory": fetchHistory,
    "ingestionHistoryMode": ingestionHistoryMode,
    "landingDateTimeFormat": landingDateTimeFormat,
    # Stage switches
    "landData": landData,
    "ingestData": ingestData,
    "refineData": refineData,
}

In [73]:
def execute_etl_step(condition,
                     nb_path,
                     nb_timeout,
                     nb_params,
                     nb_error,
                     etl_stage):
    """Run one ETL stage notebook, record a stage-level log entry, and report the outcome.

    The stage is attempted only when ``nb_error`` is None. When an error from an
    earlier stage is passed in, nothing is run and the global ``etl_status`` is left
    as it was. A stage log entry is written in every case.

    Args:
        condition: Truthy to run the notebook; falsy marks the stage as skipped.
        nb_path: Path of the notebook passed to ``mssparkutils.notebook.run``.
        nb_timeout: Timeout in seconds passed to ``mssparkutils.notebook.run``.
        nb_params: Arguments passed to the notebook.
        nb_error: Error carried over from a previous stage, or None.
        etl_stage: Stage label used in status strings and in the log entry.

    Returns:
        Tuple ``(etl_status, error)``. ``error`` is None when the status ends in
        SUCCEEDED or SKIPPED, otherwise the new or carried-over error.

    Side effects:
        Rebinds the module-level ``etl_status`` and appends a 'stage' log entry via
        the module-level ``pipeline_error_logger``.
    """
    global etl_status
    # FIXME: Parameterize etl_status
    stage_start_time = datetime.now()

    if nb_error is None:
        etl_status = f"{etl_stage} - RUNNING"
        try:
            if not condition:
                etl_status = f"{etl_stage} - SKIPPED"
            else:
                mssparkutils.notebook.run(path=nb_path,
                                          timeout_seconds=nb_timeout,
                                          arguments=nb_params)
                etl_status = f"{etl_stage} - SUCCEEDED"
        except Exception as error_msg:
            # Keep the failure so that later stages are not attempted.
            etl_status = f"{etl_stage} - FAILED"
            logger.info(f"{error_msg}")
            nb_error = error_msg

    stage_end_time = datetime.now()
    stage_run_status = 'FAILED' if nb_error is not None else 'SUCCEEDED'
    log_data = pipeline_error_logger.create_log_dict(
        uniqueId=pipeline_error_logger.generate_random_alphanumeric(10),  # random 10-character alphanumeric value
        pipelineExecutionId=pipelineExecutionId,
        sparkSessionId=spark.sparkContext.applicationId,
        stageName=etl_stage,
        start_time=stage_start_time,
        end_time=stage_end_time,
        run_status=stage_run_status,
        etl_status=etl_status,
        error_description=str(nb_error),
    )
    pipeline_error_logger.consolidate_logs(log_data, 'stage')

    finished_cleanly = ' - SUCCEEDED' in etl_status or ' - SKIPPED' in etl_status
    return etl_status, (None if finished_cleanly else nb_error)

def get_entity_logs_status(df, executionId, stageName):
    """Summarise the entity-level logs of one stage of one pipeline execution.

    A row counts as failed when ``totalNumOutputRows - numInputRows`` is non-zero
    and as clean when that difference is zero. Rows where the difference is null
    are neither.

    Args:
        df: Spark DataFrame with the columns pipelineExecutionId, stageName,
            totalNumOutputRows and numInputRows.
        executionId: Value matched against the pipelineExecutionId column.
        stageName: Value matched against the stageName column.

    Returns:
        'success' when every matching row is clean (including when no row matches),
        'failure' when every matching row failed,
        'partial-success' when some but not all matching rows failed,
        'unknown' otherwise (no failed rows, but some differences are null).
    """
    stage_logs = df.filter(
        (F.col('pipelineExecutionId') == executionId) & (F.col('stageName') == stageName)
    )
    records_failed = F.col('totalNumOutputRows') - F.col('numInputRows')

    # One aggregation instead of several separate count()/sum() actions.
    counts = stage_logs.agg(
        F.count(F.lit(1)).alias('total'),
        F.sum(F.when(records_failed == 0, 1).otherwise(0)).alias('clean'),
        F.sum(F.when(records_failed != 0, 1).otherwise(0)).alias('failed'),
    ).collect()[0]

    total = counts['total']
    # sum() yields null when no row matched the filter.
    clean = counts['clean'] or 0
    failed = counts['failed'] or 0

    if clean == total:
        return 'success'
    if failed == total:
        return 'failure'
    # Decide on the number of failed rows rather than on the signed sum of the
    # differences: positive and negative differences can cancel out to zero.
    if failed > 0:
        return 'partial-success'
    return 'unknown'

### Main Code

In [78]:
# Child notebooks are addressed as f"{nb_root}/{nb_version}_<stage>".
nb_version = "edfi_v0_8"
nb_root = "OEA/modules/Ed-Fi/v0.8/src/main"

# Run-state trackers, updated while the stages execute.
run_status, etl_status = "INITIATED", ""
nb_error = None

In [79]:
# Imported here so serialising the run parameters does not depend on a name
# provided by one of the %run notebooks.
import json

pipeline_error_logger = ErrorLogging(spark=spark, oea=oea, logger=logger)
pipeline_nb_error = None
try:
    run_status = "RUNNING"

    # (condition, notebook suffix, timeout in seconds, stage label), in execution order.
    etl_stages = [
        (prepareEdFiMetadata, "prepare_metadata", 300, "ed-fi: Metadata Preparation"),
        (landData, "land", 3600, "ed-fi: Landing"),
        (ingestData, "ingest", 7200, "ed-fi: Ingestion"),
        (refineData, "refine", 14400, "ed-fi: Refinement"),
    ]
    # Every stage is passed through execute_etl_step even after a failure: the
    # carried-over nb_error stops later notebooks from running while each stage
    # still gets its log entry.
    for stage_condition, stage_notebook, stage_timeout, stage_label in etl_stages:
        etl_status, nb_error = execute_etl_step(condition=stage_condition,
                                                nb_path=f"{nb_root}/{nb_version}_{stage_notebook}",
                                                nb_timeout=stage_timeout,
                                                nb_params=input_params,
                                                nb_error=nb_error,
                                                etl_stage=stage_label)

    if nb_error is not None:
        raise Exception(f"{nb_error}")
    run_status = "SUCCEEDED"

    # TODO: 2024-03-26 Under Review
    # Persist the parameters of this run next to the ETL logs.
    # NOTE(review): input_params carries the kvSecret_clientId / kvSecret_clientSecret
    # values - confirm these are secret names rather than secret values before
    # writing them to storage.
    param_destination_path = f'etl-logs/log_type=run_params/ed-fi/{apiVersion}/pipelineExecutionId={pipelineExecutionId}/run_params.json'
    param_destination_url = error_logger.to_logs_url(param_destination_path)
    input_params_str = json.dumps(input_params)
    mssparkutils.fs.put(param_destination_url, input_params_str, True)  # Overwrite if the file existed
except Exception as pipeline_exc:
    # Bind under a different name and copy it over: the `as` target of an except
    # clause is deleted when the handler ends, which would otherwise remove
    # `pipeline_nb_error` altogether.
    pipeline_nb_error = pipeline_exc
    run_status = "FAILED"
    print(f'Pipeline Executed with Error {pipeline_nb_error}')

In [ ]:
# The frequency lookup file is loaded from and written back to this path.
destination_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/frequency-etl/frequency_etl.csv'

# The four *Delta settings come from the run parameters; the values previously
# noted beside them were 0.005, 5, 10 and 360.
processor = EntityFrequencyProcessor(
    oea=oea,
    filepath=destination_path,
    highFrequentDelta=highFrequentDelta,
    moderateFrequentDelta=moderateFrequentDelta,
    lowFrequentDelta=lowFrequentDelta,
    descriptorsDelta=descriptorsDelta,
)

# Load the lookup, work out which entities to ETL, then update and persist the lookup.
processor.load_lookup_df(runStatusTracked=False)
entities_to_etl = processor.return_entities_to_etl()
processor.update_lookup_df()
processor.write_lookup_df(destination_path)


### Pipeline Level Logs

In [ ]:
log_type = 'pipeline'

# One pipeline-level record for the whole run, from start_time until now.
end_time = datetime.now()
log_data = pipeline_error_logger.create_log_dict(
    uniqueId=pipeline_error_logger.generate_random_alphanumeric(10),  # random 10-character alphanumeric value
    pipelineExecutionId=pipelineExecutionId,
    sparkSessionId=spark.sparkContext.applicationId,
    start_time=start_time,
    end_time=end_time,
    run_status=run_status,
    etl_status=str(etl_status),
)
pipeline_error_logger.consolidate_logs(log_data, log_type)
df = pipeline_error_logger.create_spark_df(log_type)

# Write the record to the logs location, then register the logs in the lake database.
pipeline_error_logger.write_logs_to_delta_lake(
    df=df,
    log_type=log_type,
    destination_url=pipeline_error_logger.to_logs_url('etl-logs/log_type=pipeline'),
)
pipeline_error_logger.add_etl_logs_to_lake_db(
    db_name=f'ldb_{workspace}_edfi_etl_logs',
    logs_base_path='etl-logs',
    log_type=log_type,
    overwrite=False,
)

### Stage Level Logs

In [ ]:
# FIXME: WIP
log_type = 'stage'

# Stage entries gathered during the run; stage_status is a fixed 'Unknown' for now.
df = pipeline_error_logger.create_spark_df('stage').withColumn('stage_status', F.lit('Unknown'))

destination_url = pipeline_error_logger.to_logs_url('etl-logs/log_type=stage')

# Write the entries to the logs location, then register the logs in the lake database.
pipeline_error_logger.write_logs_to_delta_lake(
    df=df,
    log_type=log_type,
    destination_url=destination_url,
)
pipeline_error_logger.add_etl_logs_to_lake_db(
    db_name=f'ldb_{workspace}_edfi_etl_logs',
    logs_base_path='etl-logs',
    log_type=log_type,
    overwrite=False,
)

In [ ]:
# Final step of the notebook: stop the current session through mssparkutils.
mssparkutils.session.stop()