In [39]:
# Bind the incoming parameter names to the spellings used by the rest of the
# notebook. NOTE(review): InstanceId, ApiUrl, SchoolYear, districtID,
# batchLimit, prepareSAPMetadata and sap_pipeline are not defined in any
# visible cell - presumably injected as pipeline parameters; confirm.
instance = InstanceId
instanceId = InstanceId
apiUrl = ApiUrl
schoolYear = SchoolYear

# Every casing of the district identifier refers to the same value.
DistrictId = districtID
DistrictID = districtID
districtId = districtID

apiLimit = batchLimit
prepareSAPMetaData = prepareSAPMetadata

# The pipeline name is also exposed as `zone` and `submissionsType`.
zone = sap_pipeline
submissionsType = sap_pipeline

In [ ]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

from pyspark.sql.types import StructField, StructType
from pyspark.sql.functions import col, expr

from pyspark.sql.functions import col
from pyspark.sql.functions import col, substring, regexp_extract, split, lit, lower

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import input_file_name, lit, expr

In [ ]:
# Parameter smoke test: echo each expected notebook parameter, then bind the
# aliases used further down.
print('Stage 3 Submission - TESTING PARAMETERIZATION')
try:
    # Each print doubles as an existence check: the first undefined name
    # raises and jumps to the handler below.
    print(kVName)
    print(workspace)
    print(apiUrl)
    print(instanceId)
    print(moduleName)
    print(apiLimit)
    print(minChangeVer)
    print(maxChangeVer)
    print(sapVersion)
    print(prepareSAPMetaData)
    print(submissions)
    print(submissionsType)
    print(schoolYear)
    print(districtID)
    print(pipelineExecutionId)

    # These aliases are only assigned when every print above succeeded.
    kvName = kVName
    districtId = districtID
    districtPath = districtId
    
except Exception as params_error:
    # NOTE(review): the error is only printed and execution continues, so the
    # aliases above may not have been assigned when later cells run.
    print('CATCHING ERROR!!!')
    print(params_error)

### URL Initializations

In [ ]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [ ]:
instance_id = instanceId
school_year = schoolYear
api_year = school_year
api_url = apiUrl


def _discover_edfi_endpoints(url, instance, year):
    """Create an EdFiApiManager for ``year`` and read the endpoints this notebook uses.

    Args:
        url: Base API URL handed to ``EdFiApiManager``.
        instance: Instance identifier handed to ``EdFiApiManager``.
        year: API year handed to ``EdFiApiManager`` (may be an empty string).

    Returns:
        dict with the manager (``'manager'``), the dependency, OpenAPI-metadata,
        data-management, auth, change-queries and swagger URLs, and the API
        version with a leading ``'v'`` removed (``'api_version'``).

    Raises:
        Whatever ``EdFiApiManager`` or its methods raise; nothing is caught here.
    """
    manager = EdFiApiManager(url, instance, year)
    manager.update_urls()
    manager.set_other_metadata()

    endpoints = {
        'manager': manager,
        'dependencies': manager.dependencies_url,
        'openapi_metadata': manager.openapi_metadata_url,
        'data_management': manager.data_management_url,
        'auth': manager.auth_url,
    }

    # NOTE(review): the slice drops the last 13 characters of the referenced
    # URL (presumably a trailing '/swagger.json' - confirm) and then replaces
    # '/metadata/' with '/'.
    change_queries = manager.get_referenced_url('Change-Queries')
    endpoints['change_queries'] = change_queries[:-13].replace('/metadata/', '/')
    endpoints['swagger'] = manager.get_referenced_url('Resources')

    version = manager.api_version
    endpoints['api_version'] = version[1:] if version.startswith('v') else version
    return endpoints


# FIXME: 2024-01-31 TEMP FIX FOR FY
# First try with the school year; if anything fails, retry once with an empty
# year. A failure of the retry propagates to the caller.
try:
    _edfi_endpoints = _discover_edfi_endpoints(api_url, instance_id, api_year)
except Exception as error:
    print(f'Ed-Fi endpoint discovery failed for year {api_year!r}: {error}. Retrying with an empty year.')
    _edfi_endpoints = _discover_edfi_endpoints(api_url, instance_id, '')

edfi_api_manager = _edfi_endpoints['manager']

dependenciesUrl = _edfi_endpoints['dependencies']
openApiMetadataUrl = _edfi_endpoints['openapi_metadata']
dataManagementUrl = _edfi_endpoints['data_management']
authUrl = _edfi_endpoints['auth']

changeQueriesUrl = _edfi_endpoints['change_queries']
swagger_url = swaggerUrl = _edfi_endpoints['swagger']

apiVersion = _edfi_endpoints['api_version']

### OEA Initializations

In [ ]:
%run EdGraph/modules/SAP_PEIMS/v0.6/src/utilities/sap_peim_v0_6_sap_py

In [ ]:
# `datetime` is used later by upsert_with_logging to time each upsert.
from datetime import datetime
# Create the OEA helper and point it at the workspace supplied to the notebook.
# NOTE(review): SAPEdFiOEAChild is not defined in any visible cell - presumably
# provided by the %run of the SAP utilities notebook; confirm.
oea = SAPEdFiOEAChild()   
oea.set_workspace(workspace)

In [ ]:
# One OpenAPI helper built from the swagger URL, reachable under both names
# used in this notebook.
schema_gen = SAPOpenAPIUtilChild(swagger_url)
oea_utils = schema_gen

oea_utils.create_definitions()
schemas = schema_gen.create_spark_schemas()

# Type names treated as primitive datatypes.
primitive_datatypes = [
    'timestamp',
    'date',
    'decimal',
    'boolean',
    'integer',
    'string',
    'long',
]

### Metadata For Processing

In [ ]:
# Lake-relative path of the SAP-to-Ed-Fi mapping file, and the URL that
# oea.to_url returns for it.
metadata_path = "stage1/Transactional/SAP/metadata-assets/sap-to-edfi.json"
metadata_url = oea.to_url(metadata_path)

In [ ]:
# `json` is used below but is not imported by any visible cell of this
# notebook; import it here so the cell does not depend on another notebook
# having leaked it into the namespace.
import json

# Read the mapping file as one multi-line JSON document.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Serialise the first row back to a JSON string and parse it into plain
# Python objects.
json_string = jsonDF.toJSON().collect()[0]
config_data = json.loads(json_string)

### SAP & Error Logging Initializations

In [25]:
# Collects the ETL log entries written at the end of the notebook.
error_logger = ErrorLogging(spark = spark, oea = oea, logger = logger)

In [26]:
# SAP helper bound to the same Spark session and OEA instance.
sap_utilities = SAPUtilities(spark = spark, oea = oea)

In [27]:
# sap_pipeline = "peims-submissions"

# Pull each section out of the parsed mapping file; a missing section falls
# back to an empty dict (or an empty list for `descriptors`).
sap_to_edfi_complex = config_data.get('sap_to_edfi_complex', {})
final_columns = config_data.get('final_columns', {})
_ext_TX_cols = config_data.get('_ext_TX_cols', {})
descriptorsDFRef = config_data.get('descriptorsDFRef', {})
descriptors = config_data.get('descriptors', [])

# Reverse lookup of sap_to_edfi_complex: its values become the keys.
edfi_to_sap_complex = dict(
    (mapped_name, original_name)
    for original_name, mapped_name in sap_to_edfi_complex.items()
)

### Main Code

In [34]:
def add_metadata_columns(df, **kwargs):
    """Add a constant-valued column for every keyword whose name ``df`` lacks.

    Args:
        df: DataFrame to extend; columns it already has are left untouched.
        **kwargs: ``column_name=constant_value`` pairs.

    Returns:
        The DataFrame with the missing columns appended, or ``df`` itself when
        nothing had to be added.
    """
    for column_name, constant_value in kwargs.items():
        if column_name not in df.columns:
            # `lit` is imported at the top of the notebook; the previous
            # `F.lit` relied on an `F` alias that no visible cell imports.
            df = df.withColumn(column_name, lit(constant_value))
    return df

def get_latest_s2r_changes(source_path, sink_path, filtering_date = 'rundate', debugMode = False, sap_pipeline = 'PLACEHOLDER', sap_pipelineType = 'PLACEHOLDER', districtId = '101912', schoolYear = 'PLACEHOLDER'):
    """Return the source rows that are newer than what the sink already holds.

    The newest ``filtering_date`` value in ``sink_path`` is used as a watermark.
    Both the watermark query and the source query are restricted to the given
    district, school year, pipeline and pipeline type.

    Args:
        source_path: Path queried for candidate rows.
        sink_path: Path queried for the current watermark.
        filtering_date: Column compared against the watermark.
        debugMode: When True the watermark is ignored and all matching source
            rows are returned.
        sap_pipeline, sap_pipelineType, districtId, schoolYear: Values used in
            the row filter.

    Returns:
        The DataFrame returned by ``oea.query`` for the source, additionally
        filtered to ``filtering_date > watermark`` when a watermark exists and
        ``debugMode`` is False.
    """
    # Built once so the sink and the source are always filtered identically.
    partition_filter = (
        f"DistrictId = '{districtId}' "
        f"AND SchoolYear = '{schoolYear}' "
        f"AND sap_pipeline = '{sap_pipeline}' "
        f"AND sap_pipelineType = '{sap_pipelineType}' "
    )

    maxdatetime = None
    try:
        sink_df = oea.query(source_path = sink_path,
                            query_str = f"select max({filtering_date}) maxdatetime ",
                            criteria_str = partition_filter)
        maxdatetime = sink_df.first()['maxdatetime']
    except AnalysisException as e:
        # Best effort: without a readable sink there is no watermark, so every
        # matching source row is returned. NOTE(review): presumably this is the
        # first load, before the sink table exists - confirm.
        logger.info(f'No watermark read from {sink_path}; returning all matching rows ({e})')

    changes_df = oea.query(source_path = source_path,
                           query_str = "select * ",
                           criteria_str = partition_filter)
    if maxdatetime and not debugMode:
        changes_df = changes_df.where(f"{filtering_date} > '{maxdatetime}'")
    return changes_df

def upsert_with_logging(df,
                        sap_pipeline,
                        destination_path,
                        primary_key,
                        partitioning,
                        partitioning_cols,
                        table_name,
                        ext_entity,
                        sap_pipelineType = None):
    """Upsert ``df`` into ``destination_path`` and record an entity log entry.

    Args:
        df: DataFrame to upsert.
        sap_pipeline: Pipeline name written to the log entry.
        destination_path: Path handed to ``oea.upsert``.
        primary_key: Key columns handed to ``oea.upsert``.
        partitioning: Partitioning flag handed to ``oea.upsert``.
        partitioning_cols: Partition columns handed to ``oea.upsert``.
        table_name: Written to the log entry as ``entityName``.
        ext_entity: Written to the log entry as ``entityType``.
        sap_pipelineType: Pipeline type written to the log entry. When omitted
            the notebook-level ``sap_pipelineType`` variable is used, which is
            what this function always did before the parameter existed.

    Returns:
        None. The log entry is handed to ``error_logger.consolidate_logs``.
    """
    if sap_pipelineType is None:
        # Backward compatibility for callers that do not pass the value.
        sap_pipelineType = globals().get('sap_pipelineType')

    start_time = datetime.now()
    # The caller's `partitioning` flag is forwarded; it used to be ignored in
    # favour of a hard-coded True.
    numInputRows, numOutputRows, numTargetRowsInserted, numTargetRowsUpdated = oea.upsert(
        df = df,
        destination_path = destination_path,
        primary_key = primary_key,  # e.g. ['RECORD', 'DistrictId', 'SchoolYear']
        partitioning = partitioning,
        partitioning_cols = partitioning_cols,
        surrogate_key = False)
    end_time = datetime.now()

    log_data = error_logger.create_log_dict(
        uniqueId = error_logger.generate_random_alphanumeric(10),  # random 10-character alphanumeric value
        pipelineExecutionId = pipelineExecutionId,
        sparkSessionId = spark.sparkContext.applicationId,
        sap_pipeline = sap_pipeline,
        sap_pipelineType = sap_pipelineType,
        stageName = "S2R-To-Stage3",
        schemaFormat = 'ed-fi',
        entityType = ext_entity,
        entityName = table_name,
        numInputRows = numInputRows,
        totalNumOutputRows = numOutputRows,
        numTargetRowsInserted = numTargetRowsInserted,
        numTargetRowsUpdated = numTargetRowsUpdated,
        numRecordsSkipped = 0,
        numRecordsDeleted = 0,
        start_time = start_time,
        end_time = end_time,
        insertionType = 'upsert',
        emptySchemaMetadata = False)
    error_logger.consolidate_logs(log_data, 'entity')


def dump_to_stage3(source_path,
                   sap_pipeline,
                   sap_pipelineType,
                   destination_path,
                   primary_key,
                   partitioning,
                   partitioning_cols,
                   extension,
                   schoolYear,
                   districtId):
    """Copy new rows of every table folder under ``source_path`` to stage 3.

    For each folder returned by ``oea.get_folders(source_path)`` (except
    ``metadata.csv``) the rows newer than the sink's watermark are fetched,
    placeholder ``edfi_*`` columns are added when missing, and the result is
    upserted into ``<destination_path>/<folder>``. Work is only done when
    ``source_path`` contains ``'/general/'``.

    Args:
        source_path: Folder whose sub-folders are the tables to transfer.
        sap_pipeline, sap_pipelineType: Used for row filtering and logging.
        destination_path: Base path of the stage 3 tables.
        primary_key, partitioning, partitioning_cols: Forwarded to the upsert.
        extension: Logged as the entity type; ``None`` is logged as 'ed-fi'.
        schoolYear, districtId: Used for row filtering.

    Returns:
        None.
    """
    items = oea.get_folders(source_path)
    for item in items:
        table_path = source_path + '/' + item
        sink_path = f'{destination_path}/{item}'

        if item == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue

        # NOTE: The commented code below makes no impact in the loop. Hence it's under review
        # elif metadata_pipeline_type is not None and not(metadata_pipeline_type.lower().startswith('peims_')) and sap_pipeline == 'peims-submissions':
        #     pass
        # elif metadata_pipeline_type is not None and not(metadata_pipeline_type.lower().startswith('tsds_')) and sap_pipeline == 'tsds_crf-submissions':
        #     # FIXME: Temporary fix for current dir. structure
        #     sap_pipeline_arg = sap_pipeline = 'peims-submissions'
        try:
            logger.info(f'Transfering {item} ({sap_pipeline}, {sap_pipelineType}) to stage 3 for submissions pipeline')
            if '/general/' in source_path:
                df_changes = get_latest_s2r_changes(source_path = table_path,
                                                    sink_path = sink_path,
                                                    filtering_date = 'rundate',
                                                    debugMode = False,
                                                    sap_pipeline = sap_pipeline,
                                                    sap_pipelineType = sap_pipelineType,
                                                    schoolYear = schoolYear,
                                                    districtId = districtId)
                # TODO: UNDER REVIEW
                df_changes = add_metadata_columns(df = df_changes,
                                                  edfi_location = 'INVALID_VALUE_PLACEHOLDER',
                                                  edfi_id = 'INVALID_VALUE_PLACEHOLDER',
                                                  edfi_id_modified = 'INVALID_VALUE_PLACEHOLDER')
                # Only emptiness matters here, so stop after the first row
                # instead of counting the whole DataFrame.
                if df_changes.limit(1).count() > 0:
                    upsert_with_logging(df = df_changes,
                                        sap_pipeline = sap_pipeline,
                                        destination_path = sink_path,
                                        primary_key = primary_key,
                                        partitioning = partitioning,
                                        partitioning_cols = partitioning_cols,
                                        table_name = item,
                                        ext_entity = 'ed-fi' if extension is None else extension,
                                        sap_pipelineType = sap_pipelineType)
                else:
                    logger.info(f'No Ingress Records - {item}')
        except AnalysisException as e:
            # This means the table may have not been properly refined due to errors with the primary key
            # not aligning with columns expected in the lookup table.
            logger.info(e)
        else:
            # Reported only when the transfer did not raise; it used to be
            # logged after a failure as well.
            logger.info('Refined table: ' + item + ' from: ' + table_path)

In [35]:
# Settings shared by every dump_to_stage3 call below: the columns used as the
# upsert key, whether to partition, and the partition columns.
primary_key = ['NATURAL_KEY_HASH','DistrictId', 'SchoolYear']
partitioning = True
partitioning_cols = ['DistrictId', 'SchoolYear']

# Disabled: per-pipeline submission period lookup.
# submission_periods = {sap_pipeline : submissionPeriod}
# # elif sap_pipeline == 'All':
# #    submission_periods = {'peims-submissions': submissionPeriod , 
# #                         'tsds_crf-submissions':'winter'}

In [36]:
# Transfer both refined schema folders to stage 3. The two runs differ only in
# the schema folder name and the extension passed for logging, so they share
# one loop. The loop variables keep the notebook-level names `schema`,
# `extension`, `source_path` and `destination_path`, which end up holding the
# values of the last ('tx') run.
for schema, extension in (('ed-fi', None), ('tx', 'tx')):
    source_path = f'stage2/Refined/SAP/data-submissions/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/general/{schema}'
    destination_path = f'stage3/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{apiVersion}/general/{schema}'

    dump_to_stage3(source_path = source_path,
                   sap_pipeline = sap_pipeline,
                   sap_pipelineType = sap_pipelineType,
                   destination_path = destination_path,
                   primary_key = primary_key,
                   partitioning = partitioning,
                   partitioning_cols = partitioning_cols,
                   extension = extension,
                   schoolYear = schoolYear,
                   districtId = districtId)

### Error Logging

In [38]:
# Write the entity-level log entries collected during this run, if any, and
# register them in the workspace's ETL-logs lake database.
if error_logger.entity_logs != []:
    df_logs = error_logger.create_spark_df('entity')
    error_logger.write_logs_to_delta_lake(
        df = df_logs,
        log_type = 'entity',
        destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'),
    )
    error_logger.add_etl_logs_to_lake_db(
        db_name = f'ldb_{workspace}_sap_etl_logs',
        logs_base_path = 'etl-logs',
        log_type = 'entity',
        overwrite = True,
    )