In [40]:
# Re-bind the incoming notebook parameters under alternative spellings.
# Every right-hand name (instanceId, apiUrl, schoolYear, districtId, batchLimit,
# prepareSAPMetadata, sap_pipeline) must already exist when this cell runs.
# NOTE(review): presumably the aliases serve the notebooks pulled in via %run — confirm consumers.
instance = InstanceId = instanceId
ApiUrl = apiUrl
SchoolYear = schoolYear

# All district spellings are bound to the value of `districtId`.
districtPath = DistrictId = DistrictID  = districtID = districtId
apiLimit = batchLimit

prepareSAPMetaData = prepareSAPMetadata
# `zone` and `submissionsType` both mirror `sap_pipeline`.
zone = submissionsType = sap_pipeline

In [41]:
def assign_default_variable(variable_name, default_value):
    """Give the global ``variable_name`` a fallback value when it is not defined yet.

    Args:
        variable_name: Name to look up in this module's global namespace.
        default_value: Value bound to that name when the name is missing.

    An existing binding is left untouched. When the fallback is applied, an
    info message is written through the module-level ``logger``.
    """
    module_scope = globals()
    if variable_name in module_scope:
        return
    module_scope[variable_name] = default_value
    logger.info(f'{variable_name} not found - using system default')

### Pre-Requisites (Dev)

In [42]:
from notebookutils import mssparkutils
import configparser
import random

# Local (driver) path the Ed-Fi credentials file is copied to and read from.
config_path = "/tmp/conf.ini"
def copy_config_to_temp():
    """Copy the Ed-Fi credentials ini file from the lake to ``config_path`` on the local file system.

    The destination is derived from ``config_path`` so the copy target and the
    path later handed to ``read_edfi_credentials`` cannot drift apart.
    Relies on the module-level ``oea`` object to resolve the lake URL.
    """
    mssparkutils.fs.cp(oea.to_url("stage1/Transactional/SAP/metadata-assets/edfi-configs.ini"), f"file:{config_path}")

def read_edfi_credentials(config_path):
    """Load the Ed-Fi client id and secret from an ini file.

    Args:
        config_path: Path of the ini file to parse.

    Returns:
        ``{'client_id': ..., 'client_secret': ...}`` taken from the ``[EdFi]``
        section, with ``''`` for an option that is absent. An empty dict is
        returned when the file has no ``[EdFi]`` section (a missing file is
        treated the same way, because ``ConfigParser.read`` skips it).
    """
    parser = configparser.ConfigParser()
    parser.read(config_path)

    if 'EdFi' not in parser:
        return {}

    edfi_section = parser['EdFi']
    return {
        option: edfi_section.get(option, '')
        for option in ('client_id', 'client_secret')
    }

### Actual Code

In [43]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import * #StringType, StructType, StructField, IntegerType, DateType

import pyspark.sql.functions as F
from pyspark.sql.functions import col, struct, concat, lit, round, concat, array
from pyspark.sql.functions import regexp_replace, expr, when, date_format, to_date

import uuid
from datetime import datetime
import logging
import json
import csv
import copy

import threading
import requests
from requests.auth import HTTPBasicAuth

import random
import string

In [44]:
# Debug cell: echo every expected pipeline parameter.
# Referencing a parameter that was not supplied raises NameError, which is
# caught below and printed instead of stopping the notebook.
print('Submissions - TESTING PARAMETERIZATION')
try:
    print(kVName)
    print(workspace)
    print(apiUrl)
    print(instanceId)
    print(moduleName)
    print(apiLimit)
    print(minChangeVer)
    print(maxChangeVer)
    print(sapVersion)
    print(prepareSAPMetaData)
    print(submissions)
    print(sap_pipeline)
    print(sap_pipelineType)
    print(schoolYear)
    print(districtID)
    print(pipelineExecutionId)

    # These aliases are only created when every print above succeeded,
    # because they live inside the same try block.
    kvName = kVName
    districtId = districtID
    districtPath = districtId
except Exception as params_error:
    print('CATCHING ERROR!!!')
    print(params_error)

### URL Initializations

In [45]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [46]:
instance_id = instanceId
school_year = schoolYear
api_year = school_year
api_url = apiUrl


def _load_edfi_api_settings(api_url, instance_id, api_year):
    """Create an ``EdFiApiManager`` and collect the URLs and API version used by this notebook.

    Args:
        api_url: Base URL of the Ed-Fi API.
        instance_id: Ed-Fi instance identifier.
        api_year: School year passed to the manager; ``''`` to build it without a year.

    Returns:
        dict with the manager itself plus the dependencies, OpenAPI metadata,
        data management, auth, change-queries and resources (swagger) URLs and
        the API version without a leading ``'v'``.

    Raises:
        Whatever ``EdFiApiManager`` raises while resolving the URLs.
    """
    manager = EdFiApiManager(api_url, instance_id, api_year)
    manager.update_urls()
    manager.set_other_metadata()

    settings = {
        'edfi_api_manager': manager,
        'dependenciesUrl': manager.dependencies_url,
        'openApiMetadataUrl': manager.openapi_metadata_url,
        'dataManagementUrl': manager.data_management_url,
        'authUrl': manager.auth_url,
    }

    change_queries_url = manager.get_referenced_url('Change-Queries')
    # Strip the last 13 characters and the '/metadata/' segment.
    # NOTE(review): the 13-character suffix is presumably '/swagger.json' — confirm.
    settings['changeQueriesUrl'] = change_queries_url[:-13].replace('/metadata/', '/')
    settings['swaggerUrl'] = manager.get_referenced_url('Resources')

    api_version = manager.api_version
    settings['apiVersion'] = api_version[1:] if api_version.startswith('v') else api_version
    return settings


# FIXME: 2024-01-31 TEMP FIX FOR FY
# First try with the school year; if anything fails, retry once without a year.
try:
    _edfi_settings = _load_edfi_api_settings(api_url, instance_id, api_year)
except Exception as error:
    # Surface the first failure instead of swallowing it silently.
    print(f'Ed-Fi URL initialization failed for api_year={api_year!r}; retrying without a year: {error}')
    _edfi_settings = _load_edfi_api_settings(api_url, instance_id, '')

edfi_api_manager = _edfi_settings['edfi_api_manager']

dependenciesUrl = _edfi_settings['dependenciesUrl']
openApiMetadataUrl = _edfi_settings['openApiMetadataUrl']
dataManagementUrl = _edfi_settings['dataManagementUrl']
authUrl = _edfi_settings['authUrl']

changeQueriesUrl = _edfi_settings['changeQueriesUrl']
swagger_url = swaggerUrl = _edfi_settings['swaggerUrl']

apiVersion = _edfi_settings['apiVersion']

### OEA Initializations

In [47]:
%run EdGraph/modules/SAP_PEIMS/v0.6/src/utilities/sap_peim_v0_6_sap_py

In [48]:
# TODO: Rename Relevant Child Class to follow a more intuitive naming convention
# Create the OEA helper used throughout the notebook (URL resolution, folder
# listing) and point it at the workspace supplied as a parameter.
from datetime import datetime
oea = SAPEdFiOEAChild()   
oea.set_workspace(workspace)

In [49]:
# swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
# Build the OpenAPI helper from the Resources swagger URL; `oea_utils` and
# `schema_gen` are two names for the same object.
oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
oea_utils.create_definitions()
# Spark schemas produced by the helper after its definitions were created.
schemas = schema_gen.create_spark_schemas()

In [50]:
# Set Ed-Fi Credentials
# copy_config_to_temp()

# credentials = read_edfi_credentials(config_path)
# client_id = credentials.get('client_id')
# client_secret_id = credentials.get('client_secret')

### Metadata Processing

In [51]:
# Lake-relative path of the SAP-to-Ed-Fi mapping document and its resolved URL.
metadata_path = "stage1/Transactional/SAP/metadata-assets/sap-to-edfi.json"
metadata_url = oea.to_url(metadata_path)

In [52]:
# Read the (multi-line) JSON mapping document through Spark and cache it.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Only the first row is used: it is serialised back to a JSON string and parsed
# into a plain Python dict. An empty read raises IndexError here.
json_string = jsonDF.toJSON().collect()[0]
config_data = json.loads(json_string)

### SAP & Error Logging Initializations

In [53]:
# Error/ETL logging helper. `logger` must already be defined at this point
# (it is only re-bound later in this notebook) — presumably by a %run notebook; verify.
error_logger = ErrorLogging(spark = spark, 
                            oea = oea, 
                            logger = logger)

In [54]:
# SAP helper object bound to the current Spark session and OEA instance.
sap_utilities = SAPUtilities(spark = spark, 
                             oea = oea)

In [55]:
# Split the parsed metadata document into the sections used by later cells.
# Missing sections fall back to an empty dict (or list for `descriptors`).
sap_to_edfi_complex = config_data.get('sap_to_edfi_complex', {})
_ext_TX_cols = config_data.get('_ext_TX_cols', {})
descriptorsDFRef = config_data.get('descriptorsDFRef', {})
descriptors = config_data.get('descriptors', [])

# Per-entity output columns, with the 'rundate' column removed.
final_columns = {
    entity: [column_name for column_name in column_names if column_name != 'rundate']
    for entity, column_names in config_data.get('final_columns', {}).items()
}

In [56]:
nine_digit_number = random.randint(100000000, 999999999)
# Default the name that is actually read on the next line. Defaulting
# `pipeline_execution_id` instead had no effect, because that name is
# overwritten immediately from `pipelineExecutionId`.
assign_default_variable(variable_name = 'pipelineExecutionId', 
                        default_value = 'Test_1234')
pipeline_execution_id = pipelineExecutionId

# Random 15-character alphanumeric suffix identifying this notebook run.
characters = string.ascii_letters + string.digits
random_word = ''.join(random.choice(characters) for _ in range(15))
run_id = f"runid_{random_word}"

### EdFi Client Initializations

In [57]:
logger = logging.getLogger('EdFiAPIClient')
# Holds the client construction error, if any, for later inspection.
exception = None
edfiAPIClient = edfiLandClient = None

from datetime import datetime
# Change-version bounds are reset here, so the client is built without them.
minChangeVer = None
maxChangeVer = None

# The Key Vault secrets are read below, but their only assignments in this
# notebook are commented out. Default them to None when nothing else defined them.
# NOTE(review): assumes SAPEdFiClient accepts None for both — confirm.
# kvSecret_clientId = None # oea._get_secret("oea-edfi-api-client-id")
# kvSecret_clientSecret = None # oea._get_secret("oea-edfi-api-client-secret")
globals().setdefault('kvSecret_clientId', None)
globals().setdefault('kvSecret_clientSecret', None)

# No `global` statement here: this is module-level code, and declaring a name
# global after assigning it in the same block is a SyntaxError.
try:
    edfiAPIClient = SAPEdFiClient(workspace = workspace, 
                                    kvName = kvName, #NOTE: Default to None 
                                    moduleName = moduleName, 
                                    authUrl = authUrl, 
                                    dataManagementUrl = dataManagementUrl, 
                                    changeQueriesUrl = changeQueriesUrl, 
                                    dependenciesUrl = dependenciesUrl, 
                                    apiVersion = apiVersion, 
                                    batchLimit = batchLimit, 
                                    minChangeVer = minChangeVer, 
                                    maxChangeVer = maxChangeVer,
                                    oea = oea,
                                    schoolYear = schoolYear,
                                    districtId = districtId,
                                    final_columns  = final_columns,
                                    lookup_table_name = 'submissions_lookup_table',
                                    lookup_table_base_path = 'stage1/Transactional/SAP/metadata-assets',
                                    lookup_db_name = f'ldb_{workspace}_sap_etl_logs',
                                    kvSecret_clientId = kvSecret_clientId,
                                    kvSecret_clientSecret = kvSecret_clientSecret)
except Exception as client_error:
    # Bind under a different name: `except ... as exception` would unbind the
    # module-level `exception` sentinel when the handler exits.
    exception = client_error
    logger.exception(client_error)

In [58]:
# Entity list reported by the client; fails with AttributeError when the
# client construction above failed and `edfiAPIClient` is still None.
entities_info = edfiAPIClient.getEntities()

### Utility Functions
1. EdFiSubmissionProcessor Class
2. Utilities To Selectively Delete
3. Utilities To Return only the relevant ed-fi entities

In [59]:
from datetime import datetime
from pyspark.sql.functions import lit

class EdfiSubmissionProcessor:
    """Posts stage 3 resources to the Ed-Fi API and prepares/writes the submission logs.

    The processor is a thin coordinator around the injected collaborators:
    ``oea`` (lake access), ``edfiAPIClient`` (API calls), ``error_logger``
    (entity-level logging), ``logger`` and ``spark``.
    """

    # Resources posted under the core '/ed-fi' namespace; all others go to '/TX'.
    _CORE_EDFI_RESOURCES = frozenset({
        'staffs',
        'staffEducationOrganizationEmploymentAssociations',
        'staffEducationOrganizationAssignmentAssociations',
    })

    def __init__(self, oea, edfiAPIClient, error_logger, logger, spark, pipelineExecutionId, test_mode=False):
        """Store the collaborators; the submission type starts unset (None)."""
        self.oea = oea
        self.edfiAPIClient = edfiAPIClient
        self.error_logger = error_logger
        self.logger = logger
        self.spark = spark
        self.test_mode = test_mode
        self.pipelineExecutionId = pipelineExecutionId
        # Defined up front so methods reading them never hit AttributeError
        # when set_submission_type() has not been called yet.
        self.sap_pipeline = None
        self.sap_pipelineType = None
    
    def set_submission_type(self, sap_pipeline, sap_pipelineType):
        """Record the pipeline and pipeline type used for stage 3 paths and log rows."""
        self.sap_pipeline = sap_pipeline
        self.sap_pipelineType = sap_pipelineType

    def extract_resources_dict(self, file_path):
        """Load the POST payloads for every resource folder below ``file_path``.

        Folders named 'data-submission-logs' and folders whose name ends with
        'descriptors' (case-insensitive) are skipped.

        Returns:
            ``(resource_json_dict, resource_names)`` where ``resource_names``
            maps each resource name to itself.
        """
        items = self.oea.get_folders(file_path)
        resource_names = [
            item for item in items
            if item != 'data-submission-logs' and not item.lower().endswith('descriptors')
        ]

        resource_json_dict = self.edfiAPIClient.getDataForEdFiPosts(
            resource_names = resource_names,
            file_path = file_path,
            resource_json_dict = dict())
        resource_names = {resource: resource for resource in resource_names}
        return resource_json_dict, resource_names

    def _namespace_for(self, stage3Name):
        """Return ``(url_prefix, entityType)`` for a stage 3 resource name."""
        if stage3Name in self._CORE_EDFI_RESOURCES:
            return 'ed-fi', 'ed-fi'
        return 'TX', 'tx'

    def post_to_edfi_resources(self, 
                               run_id = None,
                               resource_json_dict = None, 
                               resource_names = None,
                               data_sample = None, 
                               test_mode = True):
        """POST every resource in ``resource_names`` and write one entity log per resource.

        Args:
            run_id: Identifier forwarded to the API client.
            resource_json_dict: Mapping of Ed-Fi resource name to the records to post.
            resource_names: Mapping of stage 3 name to Ed-Fi resource name;
                ``None`` or empty means there is nothing to post.
            data_sample: Records posted instead of the real payload when
                ``test_mode`` is true and the sample is non-empty.
            test_mode: Enables the ``data_sample`` substitution.

        A failure while posting or logging one resource is logged with its
        traceback and does not stop the remaining resources.
        """
        # global pipelineExecutionId
        if not resource_names:
            return

        for stage3Name, edfiName in resource_names.items():
            start_time = datetime.now()

            if test_mode and data_sample:
                data_slice = data_sample
            else:
                data_slice = resource_json_dict[edfiName]

            namespace, entity_type = self._namespace_for(stage3Name)

            try:
                self.logger.info(f"POST Requests for the Resource: {stage3Name}")
                self.edfiAPIClient.upsert_records(
                    pipeline_execution_id = self.pipelineExecutionId,
                    run_id = run_id,
                    resource = f'/{namespace}/{edfiName}',
                    resource_name = stage3Name,
                    records = data_slice,
                    chunk_size = 500,
                    num_threads = 10,
                    function_name = 'post',
                    success_logging = True,
                    error_logging = True
                )

                end_time = datetime.now()
                log_data = self.error_logger.create_log_dict(
                    uniqueId = self.error_logger.generate_random_alphanumeric(10),
                    pipelineExecutionId = self.pipelineExecutionId,
                    sparkSessionId = self.spark.sparkContext.applicationId,
                    etlType = self.sap_pipeline,
                    stageName = "Submission",
                    schemaFormat = 'ed-fi',
                    entityType = entity_type,
                    entityName = edfiName,
                    numInputRows = len(data_slice),
                    totalNumOutputRows = len(data_slice),
                    numTargetRowsInserted = 0,
                    numTargetRowsUpdated = 0,
                    numRecordsSkipped = 0,
                    # numRecordsDeleted = 0,
                    start_time = start_time,
                    end_time = end_time,
                    insertionType = 'upsert',
                    emptySchemaMetadata = False
                )

                self.error_logger.consolidate_logs(log_data, 'entity')
            except Exception as e:
                # Log at error level with the traceback so a failed resource is visible.
                self.logger.exception(f"Exception while posting resource {stage3Name}: {e}")

    def process_logs(self, df, sap_pipeline, sap_pipelineType, is_post_success, run_date):
        """Add the constant submission columns and the entity columns to a log frame.

        ``entityType`` and ``entityName`` are elements 1 and 2 of the
        ``resource`` column split on '/'.
        NOTE(review): assumes ``resource`` looks like '/<type>/<name>' — confirm.
        """
        df = df.withColumn('sap_pipeline', lit(sap_pipeline))
        df = df.withColumn('sap_pipelineType', lit(sap_pipelineType))
        df = df.withColumn('is_post_success', lit(is_post_success))
        df = df.withColumn('run_date', lit(run_date))
        df = df.withColumn('entityType', F.split(col('resource'), '/')[1])
        df = df.withColumn('entityName', F.split(col('resource'), '/')[2])
        df = df.withColumn('log_type', lit('submission'))
        return df

    def write_logs(self, df, log_file_url):
        """Append ``df`` to the Delta log at ``log_file_url`` unless in test mode.

        Returns:
            ``(df, log_file_url)`` in test mode (nothing is written),
            otherwise ``(None, None)`` after the append.
        """
        if self.test_mode:
            self.logger.info('TEST MODE - LOGS NOT WRITTEN')
            return df, log_file_url
        else:
            # self.spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
            df.write.format('delta').partitionBy('sap_pipeline', 'sap_pipelineType', 'entityName').mode('append').save(log_file_url)
            return None, None

    def return_resource_json_dict(self, apiVersion, ext_type):
        """Load the stage 3 payloads for the current submission type.

        Returns:
            ``(resource_json_dict, resource_names)``; both are empty dicts when
            loading fails, so the result can always be passed on to
            ``post_to_edfi_resources``. The failure is logged with its traceback.
        """
        file_path = f'stage3/pipeline={self.sap_pipeline}/pipelineType={self.sap_pipelineType}/{apiVersion}/general/{ext_type}'
        try:
            resource_json_dict, resource_names = self.extract_resources_dict(file_path)
        except Exception:
            self.logger.exception(f'[POST TO ED-FI] Loading Data From Stage 3 {file_path}')
            resource_json_dict = dict()
            # Same type as the success path (a dict), not a list.
            resource_names = dict()
        return resource_json_dict, resource_names

In [60]:
def return_last_submission_logs(entity_name):
    """Fetch the successfully submitted records of the latest run for one entity.

    The query first picks the ``pipelineExecutionId`` of the most recent
    non-delete log row for ``entity_name`` (ordered by start and end time), then
    returns the distinct ``edfi_id``, ``edfi_location`` and ``NATURAL_KEY_HASH``
    of that execution's rows whose status code starts with '2' but is not '204'.

    Args:
        entity_name: Value matched against the ``entity_name`` log column.

    Returns:
        The DataFrame produced by ``spark.sql`` for that query.
    """
    latest_run_sql = f"""WITH maxPipelineExecutionIdCTE AS
                (
                    SELECT pipelineExecutionId
                    FROM ldb_{workspace}_sap_etl_logs.etlsubmissionslogs
                    WHERE entity_name = '{entity_name}'
                      AND operation_type != 'delete'
                    ORDER BY start_time desc, end_time desc
                    LIMIT 1
                )
                SELECT DISTINCT edfi_id,
                                edfi_location,
                                NATURAL_KEY_HASH 
                FROM ldb_{workspace}_sap_etl_logs.etlsubmissionslogs
                INNER JOIN maxPipelineExecutionIdCTE 
                    ON etlsubmissionslogs.pipelineExecutionId = maxPipelineExecutionIdCTE.pipelineExecutionId
                WHERE entity_name = '{entity_name}'
                  AND response_status_code LIKE '2%' 
                  AND response_status_code NOT LIKE '204'
                """
    return spark.sql(latest_run_sql)

def return_records_to_be_deleted(entity_name):
    """Return the previously submitted records that no longer appear in the recent data.

    Left-joins the temp view ``temp_vw_last_logs_df`` to ``temp_vw_recent_df``
    on ``NATURAL_KEY_HASH`` and keeps the distinct ``edfi_id`` /
    ``edfi_location`` pairs without a match. Both views must be registered by
    the caller beforehand.

    Args:
        entity_name: Not used by the query; kept for the caller's signature.

    Returns:
        The DataFrame produced by ``spark.sql`` for that query.
    """
    missing_records_sql = f"""SELECT DISTINCT temp_vw_last_logs_df.edfi_id,
                                temp_vw_last_logs_df.edfi_location
                FROM temp_vw_last_logs_df
                LEFT JOIN temp_vw_recent_df
                    ON temp_vw_last_logs_df.NATURAL_KEY_HASH = temp_vw_recent_df.NATURAL_KEY_HASH
                ---WHERE temp_vw_last_logs_df.NATURAL_KEY_HASH IS NOT NULL
                WHERE temp_vw_recent_df.NATURAL_KEY_HASH IS NULL
            """
    return spark.sql(missing_records_sql)

In [61]:
def return_sap_entities(sap_pipeline, sap_pipelineType):
    """List the SAP entity codes that belong to a TEA submission type.

    Args:
        sap_pipeline: Pipeline name; only 'TEA' has entities defined.
        sap_pipelineType: Submission type/period, e.g. 'PEIMS_FALL'.

    Returns:
        A new list of SAP entity codes, or ``None`` when the pipeline is not
        'TEA' or the pipeline type is unknown.
    """
    if sap_pipeline != 'TEA':
        return None

    entities_by_pipeline_type = (
        ('PEIMS_FALL', ('YHROHPM04', 'YHROHPM07', 'YHROHPM08', 'YHROHPM09', 'YHROHPM10', 'YFMOHPEIM')),
        ('PEIMS_MIDYR', ('YFIOHPEIM',)),
        ('PEIMS_EXYR', ('YHROHPM04',)),
        ('TSDS_ECDS_KG', ('YHROHPM03',)),
        ('TSDS_CLASS_ROSTER_FALL', ('YHROHPM05',)),
        ('TSDS_ECDS_PK', ('YHROHPM02',)),
    )
    for known_type, entity_codes in entities_by_pipeline_type:
        if sap_pipelineType == known_type:
            # Hand out a fresh list on every call, as callers may mutate it.
            return list(entity_codes)
    return None
    
def return_entities_to_delete(tables_source, 
                              items):
    """Translate the SAP entities found under ``tables_source`` to Ed-Fi entity names.

    Args:
        tables_source: Lake path whose sub-folders are the available SAP entities.
        items: SAP entity codes to consider, or ``None`` to consider every
            folder found under ``tables_source``. Codes without a folder are
            ignored; the order of ``items`` is preserved.

    Returns:
        List of Ed-Fi entity names looked up in the module-level
        ``sap_to_edfi_complex`` mapping. SAP entities without a mapping are
        skipped with a warning rather than returned as ``None``, because
        callers treat every element as a string.
    """
    available_folders = oea.get_folders(tables_source)
    if items is None:
        candidate_items = available_folders
    else:
        available = set(available_folders)
        candidate_items = [item for item in items if item in available]

    edfi_items = list()
    for item in candidate_items:
        edfi_name = sap_to_edfi_complex.get(item)
        if edfi_name is None:
            logger.warning(f'{item} has no Ed-Fi mapping in sap_to_edfi_complex - skipped')
            continue
        edfi_items.append(edfi_name)
    return edfi_items

### Entities To Delete
1. Return Only Submission Type and Period Specific Entities 
2. Return Only Entities to Delete from the above ones

In [62]:
# SAP entities relevant for this submission type (None when the pipeline /
# pipeline type combination is not listed in return_sap_entities).
items = return_sap_entities(sap_pipeline, sap_pipelineType)
if sap_pipeline != 'analytics':
    # Keep only the entities that exist as folders in stage 1 for this
    # district and school year, translated to their Ed-Fi names.
    entities_to_delete = return_entities_to_delete(tables_source = f'stage1/Transactional/SAP/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}', 
                                                   items = items)
else:
    # The analytics pipeline never deletes anything.
    entities_to_delete = list()

### Main Code

In [64]:
# Single switch for test mode; it is forwarded to the processor below.
test_mode = False
testEntity = 'contractedInstructionalStaffFTEExts'

edfiSubmissionProcessor = EdfiSubmissionProcessor(oea = oea, 
                                                  edfiAPIClient = edfiAPIClient, 
                                                  error_logger = error_logger, 
                                                  logger = logger, 
                                                  spark = spark, 
                                                  pipelineExecutionId = pipelineExecutionId, 
                                                  test_mode = test_mode)
# Record the submission type so methods that read it (stage 3 paths, entity
# logs) work on this instance.
edfiSubmissionProcessor.set_submission_type(sap_pipeline, sap_pipelineType)

In [69]:
# Parent entity -> entities whose records must be deleted before the parent's.
tables_with_dependencies = {'staffs': ['payrollExts', 
                                       'staffEducationOrganizationEmploymentAssociations',
                                       'staffEducationOrganizationAssignmentAssociations']}

# Ed-Fi ids that are still marked as not deleted in the lookup table.
query = f"""SELECT edfi_id, 
                   edfi_id_modified,
                   edfi_location,
                   entity_name,
                   staffUniqueId
                FROM ldb_{workspace}_sap_etl_logs.edfi_ids_lookup_table
                WHERE isDeleted = False
            """
entity_ids_df = spark.sql(query).cache()
entity_ids_df.createOrReplaceTempView('temp_edfi_ids_lookup_table')


def _delete_edfi_records(resource_name, records_df):
    """Issue delete requests through the API client for every row of ``records_df``.

    Resources whose name ends with 'exts' (case-insensitive) are addressed
    under '/TX', all others under '/ed-fi'.
    """
    records = records_df.toJSON().map(lambda x: json.loads(x)).collect()
    namespace = 'TX' if resource_name.lower().endswith('exts') else 'ed-fi'
    edfiAPIClient.upsert_records(pipeline_execution_id = pipelineExecutionId,
                                 run_id = run_id,
                                 resource = f"/{namespace}/{resource_name}",
                                 resource_name = resource_name,
                                 records = records,
                                 chunk_size = 500,
                                 num_threads = 10,
                                 function_name = 'delete',
                                 success_logging = True,
                                 error_logging = True)


for entity_name in entities_to_delete:
    if not entity_name:
        # An unmapped SAP entity yields no Ed-Fi name; there is nothing to delete for it.
        logger.warning('Skipping an entity without an Ed-Fi name in entities_to_delete')
        continue

    # TODO: Review or optimize?
    # Stage 3 folders use a lower-case 'tx' segment for extension entities.
    if entity_name.lower().endswith('exts'):
        ext_type = 'tx'
    else:
        ext_type = 'ed-fi'
    stage3_path = f'stage3/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{apiVersion}/general/{ext_type}/{entity_name}'

    # NOTE: FETCH RECORDS BY SY AND DI
    df = edfiAPIClient.load_by_SY_DI(stage3_path)
    submission_type = edfiAPIClient.return_submission_type(df)

    # NOTE: FETCH THE LATEST RECORDS FROM STAGE 3 I.E. HAVING RUNDATE >= LASTSUBMISSIONDATE
    df = edfiAPIClient.get_latest_submission_records(df = df, 
                                                    lookup_table_name = edfiAPIClient.lookup_table_name,#'submissions_lookup_table', 
                                                    filtering_date = 'rundate',
                                                    resource_name = entity_name,
                                                    sap_pipeline = sap_pipeline,
                                                    sap_pipelineType = sap_pipelineType,
                                                    operationType = 'delete',
                                                    debugMode = False)
    # Only records flagged as inactive are candidates for deletion.
    df = df.filter(df['SUBMISSION_RECORD_IS_ACTIVE'] == False)
    if entity_name == 'staffs':
        # staffUniqueId is needed to find the dependent records below.
        df = df.select('NATURAL_KEY_HASH', 'lakeId', 'edfi_id', 'edfi_id_modified', 'staffUniqueId', 'edfi_location')
    else:
        df = df.select('NATURAL_KEY_HASH', 'lakeId', 'edfi_id', 'edfi_id_modified', 'edfi_location')

    # NOTE: Temp Session Views
    df.createOrReplaceTempView('temp_vw_recent_df')
    df.createOrReplaceTempView('temp_vw_record_deletes')

    # TODO: Assess if the logging output is semantically correct or not
    logger.info(f'{entity_name} - NUMBER OF RECORDS AS INCOMING RECORDS FOR DELETES    : {df.count()}')
    if entity_name in tables_with_dependencies:
        # FIXME: 2024-03-15: Need to check the side effects of v0.6 delete codes on dependent entities
        logger.info(f'Removing the dependent entities')
        for dependent_entity_name in tables_with_dependencies[entity_name]:
            if entity_name != 'staffs':
                # Only the staffUniqueId join is implemented. Without this guard a
                # stale frame from an earlier iteration would be sent for deletion.
                logger.warning(f'{dependent_entity_name} - no dependent lookup defined for parent {entity_name}; skipped')
                continue
            temp_df = spark.sql(f"""SELECT temp_edfi_ids_lookup_table.edfi_id,
                                                     temp_edfi_ids_lookup_table.edfi_location
                                FROM temp_edfi_ids_lookup_table
                                INNER JOIN temp_vw_recent_df
                                    ON temp_edfi_ids_lookup_table.staffUniqueId = temp_vw_recent_df.staffUniqueId
                                WHERE LOWER(entity_name) = '{dependent_entity_name.lower()}'
                                """)
            temp_df.createOrReplaceTempView('temp_vw_dependent_record_deletes')
            logger.info(f'{dependent_entity_name} - NUMBER OF RECORDS TO BE DELETED: {temp_df.count()}')
            _delete_edfi_records(dependent_entity_name, temp_df)

    # NOTE: FILTER OUT THE RECORDS THAT HAVE ALREADY BEEN DELETED
    temp_df = spark.sql(f"""SELECT DISTINCT temp_vw_record_deletes.edfi_id,
                                            temp_vw_record_deletes.edfi_location
                            FROM temp_vw_record_deletes 
                            INNER JOIN temp_edfi_ids_lookup_table
                                ON temp_vw_record_deletes.edfi_id = temp_edfi_ids_lookup_table.edfi_id
                               AND temp_vw_record_deletes.edfi_location = temp_edfi_ids_lookup_table.edfi_location
                        """)
    temp_df.createOrReplaceTempView('temp_vw_record_deletes')
    logger.info(f'{entity_name} - NUMBER OF RECORDS TO BE DELETED: {temp_df.count()}')
    _delete_edfi_records(entity_name, temp_df)

### Ed-Fi IDs - Lookup Table

In [31]:
# Lake-relative location of the Delta table that tracks Ed-Fi ids.
edfi_ids_lookup_table_path = 'stage1/Transactional/SAP/metadata-assets/edfi_ids_lookup_table'
# Only runs when the client collected id records and the pipeline is not 'analytics'.
if edfiAPIClient.edfi_id_records != [] and sap_pipeline != 'analytics':
    edfi_id_records_df = spark.createDataFrame(edfiAPIClient.edfi_id_records, schema = edfiAPIClient.edfi_id_record_schema) if len(edfiAPIClient.edfi_id_records) != 0 else spark.createDataFrame([], schema = edfiAPIClient.edfi_id_record_schema)
    primary_key = ['edfi_id', 'resource']
    # On a key match only the isDeleted flag is copied from the new records.
    update_cols = {"sink.isDeleted": "updates.isDeleted"}

    destination_url = oea.to_url(edfi_ids_lookup_table_path)
    pk_statement = oea.return_pk_statement(primary_key)
    # NOTE(review): DeltaTable is not imported in the visible cells — presumably provided by a %run notebook; verify.
    if DeltaTable.isDeltaTable(spark, destination_url):
        logger.info('TRUE UPSERT')
        delta_table_sink = DeltaTable.forPath(spark, destination_url)
        # Update-only merge: the insert clause for unmatched rows is commented out.
        delta_table_sink.alias('sink').merge(edfi_id_records_df.alias('updates'), pk_statement).whenMatchedUpdate(set = update_cols).execute()# .whenNotMatchedInsert(values = insert_cols).execute()
    else:
        logger.error(f'Invalid Path - {edfi_ids_lookup_table_path}')

    # Re-register the lake database table on top of the Delta location
    # (this also runs when the path check above failed).
    spark.sql(f'CREATE DATABASE IF NOT EXISTS ldb_{workspace}_sap_etl_logs')
    spark.sql(f"drop table if exists ldb_{workspace}_sap_etl_logs.edfi_ids_lookup_table")
    spark.sql(f"create table if not exists ldb_{workspace}_sap_etl_logs.edfi_ids_lookup_table using DELTA location '{oea.to_url(edfi_ids_lookup_table_path)}'")

### Submissions - Lookup Table

In [32]:
# Persist the submissions lookup table only when the client recorded run dates,
# then (re-)register it in the lake database.
if edfiAPIClient.max_rundates != []:
    edfiAPIClient.dump_lookup_table()
    edfiAPIClient.add_lookup_table_to_lake_db(overwrite = True)

### Error Logging

In [33]:
# Turn the client's in-memory error/success logs into DataFrames. An empty log
# list still yields an (empty) frame with the log schema.
if sap_pipeline != 'analytics':
    error_df = spark.createDataFrame(edfiAPIClient.error_logs, schema = edfiAPIClient.log_schema) if len(edfiAPIClient.error_logs) != 0 else spark.createDataFrame([], schema = edfiAPIClient.log_schema)
    success_df = spark.createDataFrame(edfiAPIClient.success_logs, schema = edfiAPIClient.log_schema) if len(edfiAPIClient.success_logs) != 0 else spark.createDataFrame([], schema = edfiAPIClient.log_schema)

    # Run date (YYYY-MM-DD) stamped on every log row by the next cell.
    current_datetime = datetime.today()
    run_date = current_datetime.strftime("%Y-%m-%d")
    test_mode = False

In [37]:
# Enrich both log frames with the submission columns and combine them.
# Requires success_df, error_df and run_date from the previous cell.
if sap_pipeline != 'analytics':
    # NOTE: If all is passed all will be the submission_type
    success_df = edfiSubmissionProcessor.process_logs(df = success_df, 
                                sap_pipeline = sap_pipeline,
                                sap_pipelineType =  sap_pipelineType,
                                is_post_success = True, 
                                run_date = run_date)
    # NOTE: If all is passed all will be the submission_type
    error_df = edfiSubmissionProcessor.process_logs(df = error_df, 
                                sap_pipeline = sap_pipeline,
                                sap_pipelineType = sap_pipelineType,
                                is_post_success = False, 
                                run_date = run_date)
    # union() matches columns by position; both frames went through the same steps.
    logs_df = success_df.union(error_df)

In [38]:
# Append the combined submission logs to the Delta log location and register
# the log table in the lake database.
if sap_pipeline != 'analytics':
   log_file_url = error_logger.to_logs_url('etl-logs/log_type=submissions')
   # write_logs returns (None, None) after writing, or (df, url) in test mode.
   df_delta, _ = edfiSubmissionProcessor.write_logs(df = logs_df, 
                           log_file_url = log_file_url)
   edfiSubmissionProcessor.error_logger.add_etl_logs_to_lake_db(db_name = f'ldb_{workspace}_sap_etl_logs',
                                       logs_base_path = 'etl-logs',
                                       log_type = 'submissions',
                                       overwrite = True)

In [44]:
# TODO: Create entity level logs for deletes as well
# TODO: Test this with some actual data
# NOTE: Current implementation assumes that numInputRows is for only those rows that go to the API
#       That is, if the code filters out records before delete api calls => they are NOT counted
#       as part of input rows
from pyspark.sql import Window

# `logs_df` is only built for non-analytics pipelines, so the same guard as in
# the cells above applies here. The former `logs_df.count() >= 0` check was
# always true and only triggered an extra Spark job; an empty `logs_df` still
# produces (empty) entity logs, exactly as before.
# TODO: Log for zero records being posted but how?
if sap_pipeline != 'analytics':
    logs_df.createOrReplaceTempView('temp_df_logs_df')
    # One summary row per entity / pipeline / execution for the delete calls;
    # a response code of 204 counts as a deleted record.
    query = f"""
            WITH entity_logs AS (
                SELECT * FROM temp_df_logs_df
                WHERE operation_type = 'delete'
            )
            SELECT  max(end_time) as end_time,
                    entityName as entityName,
                    entityType as entityType,
                    sap_pipeline as sap_pipeline,
                    sap_pipelineType as sap_pipelineType,
                    'entity' as log_type,
                    count(*) as numInputRows,
                    count(CASE WHEN response_status_code like '204' THEN 1 END) AS numRecordsDeleted,
                    0 as numRecordsSkipped,
                    0 as numTargetRowsInserted,
                    0 as numTargetRowsUpdated,
                    pipelineExecutionId,
                    'ed-fi' as schemaFormat,
                    '{spark.sparkContext.applicationId}' as sparkSessionId,
                    'Deletion' as stageName,
                    min(start_time) as start_time,
                    0 as totalNumOutputRows,
                    'NA' as insertionType,
                    False as emptySchemaMetadata
            FROM entity_logs
            GROUP BY entityName,
                    entityType,
                    sap_pipeline,
                    sap_pipelineType,
                    pipelineExecutionId
    """
    entity_logs = spark.sql(query)
    # Number the rows so each one can be paired with a generated unique id.
    entity_logs = entity_logs.withColumn("row_idx", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

    temp_uniqueIDs = [{'uniqueId': error_logger.generate_random_alphanumeric(10)} for i in range(entity_logs.count())]
    temp_uniqueIDsDF = spark.createDataFrame(data = temp_uniqueIDs, 
                                             schema = StructType([
                                                                StructField("uniqueId", StringType(), True)
                                                                ]))
    # `F` is the functions alias imported by this notebook (the lowercase `f` used before is not).
    temp_uniqueIDsDF = temp_uniqueIDsDF.withColumn("row_idx", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
    entity_logs = entity_logs.join(temp_uniqueIDsDF, entity_logs.row_idx == temp_uniqueIDsDF.row_idx).drop("row_idx")

    error_logger.write_logs_to_delta_lake(df = entity_logs, 
                                          log_type = 'entity',
                                          destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'))
    error_logger.add_etl_logs_to_lake_db(db_name = f'ldb_{workspace}_sap_etl_logs',
                                         logs_base_path = 'etl-logs',
                                         log_type = 'entity',
                                         overwrite = False)