In [3]:
# Re-expose values defined outside this cell (presumably injected pipeline
# parameters - TODO confirm) under the spellings other cells refer to.
instanceId = InstanceId
instance = instanceId
apiUrl = ApiUrl
schoolYear = SchoolYear

# Every casing variant of the district identifier mirrors `districtID`.
districtId = districtID
DistrictID = districtId
DistrictId = districtId

apiLimit = batchLimit
prepareSAPMetaData = prepareSAPMetadata

# The SAP pipeline name is also exposed as the submissions type and the zone.
submissionsType = sap_pipeline
zone = submissionsType

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import input_file_name, lit, expr

In [5]:
# Smoke-test the notebook parameters: print each expected name in turn so the
# cell output shows how far the list got before a missing one raised.
print('INGESTION - TESTING PARAMETERIZATION')
try:
    print(kVName)
    print(workspace)
    print(apiUrl)
    print(instanceId)
    print(moduleName)
    print(apiLimit)
    print(minChangeVer)
    print(maxChangeVer)
    print(sapVersion)
    print(prepareSAPMetaData)
    print(submissions)
    print(submissionsType)
    print(schoolYear)
    print(districtID)
    print(pipelineExecutionId)

    # Aliases under alternative spellings; only reached when every print above succeeded.
    kvName = kVName
    districtId = districtID
    districtPath = districtId
    
except Exception as params_error:
    # NOTE(review): the failure is only printed, so execution continues and the
    # aliases above may be left unassigned by this cell.
    print('CATCHING ERROR!!!')
    print(params_error)

### URL Initializations

In [6]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [7]:
instance_id = instanceId
school_year = schoolYear
api_year = school_year
api_url = apiUrl


def _resolve_edfi_endpoints(api_url, instance_id, api_year):
    """Build an ``EdFiApiManager`` and collect the values this notebook reads from it.

    Args:
        api_url: Base API URL handed to ``EdFiApiManager``.
        instance_id: Instance identifier handed to ``EdFiApiManager``.
        api_year: Year handed to ``EdFiApiManager`` (``''`` on the fallback attempt).

    Returns:
        dict: Keyed by the notebook-level variable name each value is assigned to.

    Raises:
        Exception: Whatever ``EdFiApiManager`` or its methods raise is propagated.
    """
    manager = EdFiApiManager(api_url, instance_id, api_year)
    manager.update_urls()
    manager.set_other_metadata()

    endpoints = {
        'edfi_api_manager': manager,
        'dependenciesUrl': manager.dependencies_url,
        'openApiMetadataUrl': manager.openapi_metadata_url,
        'dataManagementUrl': manager.data_management_url,
        'authUrl': manager.auth_url,
    }

    change_queries_url = manager.get_referenced_url('Change-Queries')
    # NOTE(review): drops the last 13 characters of the referenced URL and the
    # '/metadata/' segment; presumably this strips a swagger document suffix - confirm.
    endpoints['changeQueriesUrl'] = change_queries_url[:-13].replace('/metadata/', '/')
    endpoints['swaggerUrl'] = manager.get_referenced_url('Resources')

    # Normalise 'v1.2.3' to '1.2.3'.
    api_version = manager.api_version
    endpoints['apiVersion'] = api_version[1:] if api_version.startswith('v') else api_version
    return endpoints


# FIXME: 2024-01-31 TEMP FIX FOR FY
# First attempt uses the school year; if anything in that attempt fails, the
# whole resolution is retried with an empty year.
try:
    _edfi_endpoints = _resolve_edfi_endpoints(api_url, instance_id, api_year)
except Exception as error:
    print(f'Ed-Fi URL initialization failed for api_year={api_year!r} ({error!r}); retrying with an empty year')
    _edfi_endpoints = _resolve_edfi_endpoints(api_url, instance_id, '')

edfi_api_manager = _edfi_endpoints['edfi_api_manager']

dependenciesUrl = _edfi_endpoints['dependenciesUrl']
openApiMetadataUrl = _edfi_endpoints['openApiMetadataUrl']
dataManagementUrl = _edfi_endpoints['dataManagementUrl']
authUrl = _edfi_endpoints['authUrl']

changeQueriesUrl = _edfi_endpoints['changeQueriesUrl']
swagger_url = swaggerUrl = _edfi_endpoints['swaggerUrl']

apiVersion = _edfi_endpoints['apiVersion']

### OEA Initializations

In [8]:
%run EdGraph/modules/SAP_PEIMS/v0.6/src/utilities/sap_peim_v0_6_sap_py

In [9]:
from datetime import datetime
# Instantiate the SAP/Ed-Fi OEA helper. The constructor receives the literal
# workspace 'dev'; the notebook's `workspace` value is applied right afterwards
# through set_workspace(). `ingestionHistoryMode` is defined outside this cell.
oea = SAPEdFiOEAChild(workspace='dev', 
                      logging_level=logging.INFO, 
                      storage_account=None, 
                      keyvault=None, 
                      timezone=None,
                      sap_pipeline = sap_pipeline,
                      sap_pipelineType = sap_pipelineType)   
oea.set_workspace(workspace)
oea.ingestionHistoryMode=ingestionHistoryMode

### Metadata For Processing

In [10]:
# Relative path of the metadata file (sap-to-edfi.json), turned into a URL by the OEA helper.
metadata_path = "stage1/Transactional/SAP/metadata-assets/sap-to-edfi.json"
metadata_url = oea.to_url(metadata_path)

In [11]:
# Read the metadata JSON (multiline mode) through Spark and cache the resulting DataFrame.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Serialise the first row back to a JSON string and parse it into a Python object.
# NOTE(review): `collect()[0]` raises IndexError when the read yields no rows, and
# `json` is not imported in the cells shown here - presumably a %run notebook provides it.
json_string = jsonDF.toJSON().collect()[0]
config_data = json.loads(json_string)

### SAP & Error Logging Initializations

In [12]:
# Logging helper used later to build and persist entity-level ETL log records.
# `logger` is not defined in the cells shown here - presumably a %run notebook provides it.
error_logger = ErrorLogging(spark = spark, 
                            oea = oea, 
                            logger = logger)

In [13]:
# `oea_utils` and `schema_gen` are two names for the same SAPOpenAPIUtilChild
# instance, built from the swagger ("Resources") URL.
oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
oea_utils.create_definitions()
schemas = schema_gen.create_spark_schemas()

# Names of the datatypes treated as primitive (not referenced again in the cells shown here).
primitive_datatypes = ['timestamp', 'date', 'decimal', 'boolean', 'integer', 'string', 'long']

In [14]:
# SAP helper bound to the Spark session and the OEA helper.
sap_utilities = SAPUtilities(spark = spark, 
                             oea = oea)

In [15]:
# Pull the individual sections out of the parsed metadata; a missing section
# falls back to an empty dict (or an empty list for `descriptors`).
sap_to_edfi_complex = config_data.get('sap_to_edfi_complex', {})
final_columns = config_data.get('final_columns', {})
_ext_TX_cols = config_data.get('_ext_TX_cols', {})
descriptorsDFRef = config_data.get('descriptorsDFRef', {})
descriptors = config_data.get('descriptors', [])
metadata_pipeline_type_mappings = config_data.get('metadata_pipeline_type_mappings', {})
# Only the 'sap' sub-section of the natural keys is kept.
natural_keys = config_data.get('natural_keys', {}).get('sap', {}) 

### Main Code

In [16]:
def should_ingest(entity_path):
    """Report whether the latest landed batch folder of an entity contains data.

    Relies on the notebook-level ``oea`` helper and ``logger``.

    Args:
        entity_path: Entity path relative to ``stage1/Transactional``.

    Returns:
        bool: True when the latest batch folder has a size greater than zero.
        False when the folder is empty or when any step of the lookup raises
        an ``Exception``; the reason is logged as a warning.
    """
    raw_path = f'stage1/Transactional/{entity_path}'
    try:
        batch_info = oea.get_batch_info(raw_path)
        # Only the first batch entry is considered.
        batch_type, source_data_format = batch_info[0]

        logger.info(f'Ingesting from: {raw_path}, batch type of: {batch_type}, source data format of: {source_data_format}')
        source_url = oea.to_url(f'{raw_path}/{batch_type}_batch_data')
        # Resolve the latest folder once and reuse it for both the log line and the path.
        latest_folder = oea.get_latest_folder(source_url)
        logger.info(f"{latest_folder}")
        source = f'{source_url}/{latest_folder}'

        return oea.get_folder_size(source) > 0
    except Exception as error:
        # Best effort: a failed lookup means "nothing to ingest", but the cause is
        # recorded. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        logger.warning(f'Skipping ingestion for {raw_path}: {error!r}')
        return False

In [17]:
def return_sap_entities(sap_pipeline, sap_pipelineType):
    """Look up the SAP entity names configured for a pipeline / pipeline type.

    Args:
        sap_pipeline: Pipeline name; only 'TEA' has entities configured.
        sap_pipelineType: Pipeline type within the pipeline (e.g. 'PEIMS_FALL').

    Returns:
        list[str] | None: The entity names for the combination, or None when
        the pipeline or the pipeline type is not recognised.
    """
    # NOTE(review): 'YFMOHPEIM' (fall) and 'YFIOHPEIM' (mid-year) differ by one
    # letter - confirm both spellings are intended.
    tea_entities_by_type = (
        ('PEIMS_FALL', ['YHROHPM04', 'YHROHPM07', 'YHROHPM08', 'YHROHPM09', 'YHROHPM10', 'YFMOHPEIM']),
        ('PEIMS_MIDYR', ['YFIOHPEIM']),
        ('PEIMS_EXYR', ['YHROHPM04']),
        ('TSDS_ECDS_KG', ['YHROHPM03']),
        ('TSDS_CLASS_ROSTER_FALL', ['YHROHPM05']),
        ('TSDS_ECDS_PK', ['YHROHPM02']),
    )

    if sap_pipeline == 'TEA':
        for pipeline_type, entity_names in tea_entities_by_type:
            if sap_pipelineType == pipeline_type:
                return entity_names
    return None

In [18]:
from datetime import datetime

def threaded_task(input_tuple):
    """Ingest one SAP entity folder and record an entity-level ETL log entry.

    Relies on notebook-level names: ``oea``, ``logger``, ``error_logger``,
    ``spark``, ``should_ingest``, ``pipelineExecutionId``, ``sap_pipeline``,
    ``sap_pipelineType``, ``landingDateTimeFormat`` and ``ingestionHistoryMode``.

    Args:
        input_tuple: ``(item, entity_base_path, tables_source,
            create_natural_key, natural_keys)`` optionally followed by
            ``primary_key``. When the sixth element is absent the
            notebook-level ``primary_key`` is used, as before.

    Returns:
        None. ``AnalysisException`` is logged and swallowed; any other
        exception propagates to the caller / executor.
    """
    item, entity_base_path, tables_source, create_natural_key, natural_keys, *optional = input_tuple
    # Legacy five-element tuples carry no primary key; fall back to the notebook global.
    task_primary_key = optional[0] if optional else primary_key
    try:
        entity_path = entity_base_path + '/' + item
        if item == 'metadata.csv' or item == 'descriptorTables':
            logger.info('Ignore Metadata, since this is not a table to be ingested')
            return

        start_time = datetime.now()
        number_of_inbound_changes = 0
        if should_ingest(entity_path):
            if create_natural_key:
                hashing = True
                # Entities without a configured natural key fall back to ['RECORD'].
                natural_key = natural_keys.get(item, ['RECORD'])
            else:
                hashing = False
                natural_key = None
            number_of_inbound_changes = oea.ingest(entity_path,
                                                   primary_key = task_primary_key,
                                                   hashing = hashing,
                                                   natural_key = natural_key,
                                                   landingDateTimeFormat = landingDateTimeFormat,
                                                   ingestionHistoryMode = ingestionHistoryMode)
        end_time = datetime.now()

        # NOTE: Output Rows = Input Rows = Inbound Rows
        numInputRows = numOutputRows = number_of_inbound_changes
        log_data = error_logger.create_log_dict(uniqueId = error_logger.generate_random_alphanumeric(10),  # random 10-character alphanumeric value
                                                pipelineExecutionId = pipelineExecutionId,
                                                sparkSessionId = spark.sparkContext.applicationId,
                                                sap_pipeline = sap_pipeline,
                                                sap_pipelineType = sap_pipelineType,
                                                stageName = "Ingestion",
                                                schemaFormat = 'sap',
                                                entityType = 'sap',
                                                entityName = item,
                                                numInputRows = numInputRows,
                                                totalNumOutputRows = numOutputRows,
                                                numTargetRowsInserted = numOutputRows,  # TODO: to be reviewed
                                                numTargetRowsUpdated = 0,
                                                numRecordsSkipped = 0,
                                                numRecordsDeleted = 0,
                                                start_time = start_time,
                                                end_time = end_time,
                                                insertionType = 'append' if ingestionHistoryMode else 'upsert',
                                                emptySchemaMetadata = False)
        error_logger.consolidate_logs(log_data, 'entity')
    except AnalysisException as e:
        # The table may not have been properly refined, e.g. the primary key does
        # not line up with the columns expected in the lookup table.
        logger.info(f"Analysis Exception - {e}")


def ingest_sap_dataset(tables_source, zone, items = None, primary_key = 'RECORD', create_natural_key = False, natural_keys = None):
    """Ingest the entity folders found under ``tables_source`` with two worker threads.

    Args:
        tables_source: Path of the folder holding one sub-folder per entity. The
            first two path segments are dropped to form the entity base path.
        zone: Accepted for backward compatibility; not used by this function.
        items: Optional collection of entity names. When given, only names that
            also exist under ``tables_source`` are ingested; when None, every
            folder is.
        primary_key: Primary key forwarded to ``oea.ingest`` for every entity.
        create_natural_key: When True, entities are ingested with hashing and
            their natural key from ``natural_keys``.
        natural_keys: Mapping of entity name to natural-key columns; None means
            an empty mapping.

    Returns:
        None. Worker failures are logged through ``logger.error`` and do not
        stop the remaining entities.
    """
    natural_keys = {} if natural_keys is None else natural_keys

    entity_base_path = '/'.join(tables_source.split('/')[2:])
    if items is None:
        items = oea.get_folders(tables_source)
    else:
        available_items = set(oea.get_folders(tables_source))
        items = list(available_items.intersection(items))

    with ThreadPoolExecutor(max_workers=2) as tpe:
        logger.info('[INGESTION THREAD] Entered Threadpool')
        submitted = [
            (item, tpe.submit(threaded_task,
                              (item, entity_base_path, tables_source, create_natural_key, natural_keys, primary_key)))
            for item in items
        ]

    # Leaving the `with` block waits for every task, so each future is finished here.
    # Surface failures that would otherwise stay hidden inside the futures.
    for item, future in submitted:
        error = future.exception()
        if error is not None:
            logger.error(f'[INGESTION THREAD] Ingestion failed for {item}: {error!r}')

In [51]:
# Driver cell: ingest the entities configured for the current pipeline / pipeline type.
# `items` is None when return_sap_entities() does not recognise the combination, in
# which case ingest_sap_dataset() falls back to every folder under `tables_source`.
create_natural_key = True
primary_key = 'NATURAL_KEY_HASH' # NOTE: Use 'NATURAL_KEY_HASH' when create_natural_key = True and 'RECORD' is not right PK
items = return_sap_entities(sap_pipeline, sap_pipelineType)

ingest_sap_dataset(tables_source = f'stage1/Transactional/SAP/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}', 
                   zone = zone,
                   items = items,
                   primary_key = primary_key,
                   create_natural_key = create_natural_key,
                   natural_keys = natural_keys)

In [45]:
# Persist the entity-level logs gathered during ingestion, but only when at
# least one was recorded: write them to the logs location, then register them
# in the workspace-specific ETL logs database.
if error_logger.entity_logs != []:
    logger.info('Writing Entity Level Error Logs')
    df = error_logger.create_spark_df('entity')
    error_logger.write_logs_to_delta_lake(df = df, 
                                log_type = 'entity',
                                destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'))
    error_logger.add_etl_logs_to_lake_db(db_name = f'ldb_{workspace}_sap_etl_logs',
                                        logs_base_path = 'etl-logs',
                                        log_type = 'entity',
                                        overwrite = True)