In [50]:
# NOTE: Under Experimentation
# Suffix associated with each pipeline name; both PEIMS pipelines share '_peims'.
# NOTE(review): presumably the lake-db table suffix (cf. the '_analytics' default
# used by the empty-table helpers) - confirm.
pipeline_suffix_mappings = dict(
    PEIMS_FALL='_peims',
    PEIMS_MIDYR='_peims',
    TSDS_CLASS_ROSTER_FALL='_tsds',
    ANALYTICS='_analytics',
)

In [51]:
# Bind the externally supplied parameters to every spelling used further down
# the notebook. The right-hand names (InstanceId, ApiUrl, SchoolYear, districtID,
# batchLimit, prepareSAPMetadata, sap_pipeline) are not defined in this notebook;
# presumably they are injected as pipeline parameters - confirm.
instance = instanceId = InstanceId
apiUrl = ApiUrl
schoolYear = SchoolYear
# All casings of the district id are kept equal to `districtID`.
DistrictId = DistrictID = districtId = districtID
apiLimit = batchLimit

prepareSAPMetaData = prepareSAPMetadata
# `zone` and `submissionsType` both mirror the pipeline name.
zone = submissionsType = sap_pipeline

### Pre-Requisites (Dev)

In [52]:
from notebookutils import mssparkutils
import configparser

config_path = "/tmp/conf.ini"
def copy_config_to_temp():
    """Copy the Ed-Fi credentials .ini from the lake to the local `config_path`.

    The destination is derived from the module-level `config_path` (read at call
    time) so the copy target and the path later handed to
    `read_edfi_credentials` cannot drift apart.

    Relies on the notebook-level `oea` client, which is created in a later cell,
    so this must not be called before that cell has run.
    """
    mssparkutils.fs.cp(oea.to_url("stage1/Transactional/SAP/metadata-assets/edfi-configs.ini"),
                       f"file:{config_path}")

def read_edfi_credentials(config_path):
    """Read the Ed-Fi client id and secret from an .ini file.

    Args:
        config_path: Path of the .ini file to parse.

    Returns:
        A dict with 'client_id' and 'client_secret' (each '' when the option is
        absent) if the file contains an [EdFi] section; otherwise an empty dict.
    """
    parser = configparser.ConfigParser()
    parser.read(config_path)

    if 'EdFi' not in parser:
        return {}

    edfi_section = parser['EdFi']
    return {option: edfi_section.get(option, '')
            for option in ('client_id', 'client_secret')}

### Actual Code

In [53]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StringType, StructType, StructField, IntegerType

from pyspark.sql.functions import col, substring, regexp_extract, split, lit, struct, to_date, from_unixtime, date_format
from pyspark.sql.functions import create_map, lit, when, array, coalesce, concat_ws
from pyspark.sql.functions import collect_list, create_map, lit, struct, array, concat
from pyspark.sql.functions import expr

from pyspark.sql import functions as F
import pyspark.sql.functions as f
from pyspark.sql.functions import udf

import json
import os
import pandas as pd
import re

import copy
from itertools import chain

In [54]:
print('REFINEMENT - TESTING PARAMETERIZATION')
# Echo every expected notebook parameter. Referencing a parameter that was not
# supplied raises NameError, which is reported below instead of aborting the
# notebook; in that case the aliases at the end of the `try` are NOT created.
try:
    print(kVName)
    print(workspace)
    print(apiUrl)
    print(instanceId)
    print(moduleName)
    print(apiLimit)
    print(minChangeVer)
    print(maxChangeVer)
    print(sapVersion)
    print(prepareSAPMetaData)
    print(submissions)
    print(submissionsType)
    print(schoolYear)
    print(districtID)
    print(pipelineExecutionId)

    # Normalise the spellings used by later cells.
    kvName = kVName
    districtId = districtID
except Exception as params_error:
    print('CATCHING ERROR!!!')
    print(params_error)

### URLs Initializations

In [55]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [56]:
instance_id = instanceId
school_year = schoolYear
api_year = school_year
api_url = apiUrl


def _resolve_edfi_endpoints(year):
    """Build an EdFiApiManager for `year` and collect the values later cells use.

    Args:
        year: School-year value passed to EdFiApiManager; '' requests the
            manager without a year.

    Returns:
        dict keyed by the notebook-level variable name each value is bound to.

    Raises:
        Whatever EdFiApiManager raises while resolving its URLs/metadata.
    """
    manager = EdFiApiManager(api_url, instance_id, year)
    manager.update_urls()
    manager.set_other_metadata()

    endpoints = {
        'edfi_api_manager': manager,
        'dependenciesUrl': manager.dependencies_url,
        'openApiMetadataUrl': manager.openapi_metadata_url,
        'dataManagementUrl': manager.data_management_url,
        'authUrl': manager.auth_url,
    }

    change_queries_url = manager.get_referenced_url('Change-Queries')
    # Drop the last 13 characters and collapse the '/metadata/' segment.
    # NOTE(review): 13 is presumably len('/swagger.json') - confirm.
    endpoints['changeQueriesUrl'] = change_queries_url[:-13].replace('/metadata/', '/')
    endpoints['swaggerUrl'] = manager.get_referenced_url('Resources')

    api_version = manager.api_version
    # Strip a leading 'v' so only the numeric part of the version remains.
    endpoints['apiVersion'] = api_version[1:] if api_version.startswith('v') else api_version
    return endpoints


# FIXME: 2024-01-31 TEMP FIX FOR FY
# First try with the school year; if anything in the resolution fails, retry the
# whole resolution without a year (same fallback as before, now written once).
try:
    _edfi_endpoints = _resolve_edfi_endpoints(api_year)
except Exception as error:
    print(f'Ed-Fi URL resolution failed for year {api_year!r}; retrying without a year: {error}')
    _edfi_endpoints = _resolve_edfi_endpoints('')

edfi_api_manager = _edfi_endpoints['edfi_api_manager']

dependenciesUrl = _edfi_endpoints['dependenciesUrl']
openApiMetadataUrl = _edfi_endpoints['openApiMetadataUrl']
dataManagementUrl = _edfi_endpoints['dataManagementUrl']
authUrl = _edfi_endpoints['authUrl']

changeQueriesUrl = _edfi_endpoints['changeQueriesUrl']
swagger_url = swaggerUrl = _edfi_endpoints['swaggerUrl']

apiVersion = _edfi_endpoints['apiVersion']

### OEA Initializations

In [57]:
%run EdGraph/modules/SAP_PEIMS/v0.6/src/utilities/sap_peim_v0_6_sap_py

In [58]:
from datetime import datetime
# Build the OEA client used by every later cell. It is constructed against the
# literal 'dev' workspace and then switched to the `workspace` parameter by the
# set_workspace() call that follows.
# `SAPEdFiOEAChild` and `logging` are not defined in this notebook section;
# presumably they come from the module pulled in by the preceding %run - confirm.
oea = SAPEdFiOEAChild(workspace='dev', 
                      logging_level=logging.INFO, 
                      storage_account=None, 
                      keyvault=None, 
                      timezone=None,
                      sap_pipeline = sap_pipeline,
                      sap_pipelineType = sap_pipelineType)   
oea.set_workspace(workspace)
# `ingestionHistoryMode` is not defined in the visible cells; presumably a
# pipeline parameter - confirm.
oea.ingestionHistoryMode = ingestionHistoryMode

In [59]:
# swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
# `oea_utils` and `schema_gen` are two names for the same SAPOpenAPIUtilChild
# instance, built from the 'Resources' swagger URL resolved earlier.
oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
# Definitions are created before the Spark schemas are generated.
oea_utils.create_definitions()
schemas = schema_gen.create_spark_schemas()

In [60]:
# Set Ed-Fi Credentials
# copy_config_to_temp()

# credentials = read_edfi_credentials(config_path)
# client_id = credentials.get('client_id')
# client_secret_id = credentials.get('client_secret')

### Metadata for Processing

In [61]:
# Lake-relative location of the SAP-to-Ed-Fi metadata JSON, and the URL
# `oea.to_url` produces for it.
metadata_path = "stage1/Transactional/SAP/metadata-assets/sap-to-edfi.json"
metadata_url = oea.to_url(metadata_path)

In [62]:
# Read the (multi-line) metadata JSON with Spark and cache the resulting frame.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Serialise the first row back to a JSON string and parse it into a plain
# Python dict, so the configuration can be read with dict.get() below.
json_string = jsonDF.toJSON().collect()[0]
config_data = json.loads(json_string)

### SAP & Error Logging Initializations

In [63]:
# Error-logging helper wired to the Spark session, the OEA client and `logger`.
# `ErrorLogging` and `logger` are not defined in the visible cells; presumably
# they are provided by a %run'd module - confirm.
error_logger = ErrorLogging(spark = spark, 
                            oea = oea, 
                            logger = logger)

In [64]:
# oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
# oea_utils.create_definitions()
# schemas = schema_gen.create_spark_schemas()

# Type names treated as primitive datatypes.
primitive_datatypes = ['timestamp', 'date', 'decimal', 'boolean', 'integer', 'string', 'long']
# Pull each configuration section out of the metadata JSON; a missing section
# falls back to an empty container instead of raising KeyError.
sap_to_edfi_complex = config_data.get('sap_to_edfi_complex', {})
final_columns = config_data.get('final_columns', {})
_ext_TX_cols = config_data.get('_ext_TX_cols', {})
descriptorsDFRef = config_data.get('descriptorsDFRef', {})
descriptors = config_data.get('descriptors', [])
# NOTE(review): the trailing comment shows a list of column names, but the
# fallback here is an empty dict - confirm which type consumers expect.
sap_essential_columns = config_data.get('sap_essential_columns', {}) # ['DistrictId', 'SchoolYear', 'metadata_pipeline_type', 'lakeId', 'validationRecordId', 'LastModifiedDate', 'rundate', 'stage1_source_url', 'RECORD', 'NATURAL_KEY_HASH', 'RECORD_HASH']

In [65]:
# Shared SAP helper object used by the refinement functions below.
# `SAPUtilities` is not defined in the visible cells; presumably it comes from
# a %run'd module - confirm.
sap_utilities = SAPUtilities(spark = spark, 
                             oea = oea,
                             sap_essential_columns = sap_essential_columns)

In [66]:
# Processing client configured with the sections read from the metadata JSON.
# Later cells read `sap_to_edfi_complex`, `descriptors` and `descriptorsDFRef`
# back from this object and call its process* methods.
sap_process_client = SAPProcessClient(spark = spark, 
                                      oea = oea,
                                      sap_utilities = sap_utilities,
                                      sap_to_edfi_complex = sap_to_edfi_complex,
                                      final_columns = final_columns,
                                      _ext_TX_cols = _ext_TX_cols, 
                                      descriptorsDFRef = descriptorsDFRef,
                                      descriptors = descriptors) 

In [67]:
def initialize_sap_pipeline_vars(sapVersion = '1.0',
                                 districtId = '101912',
                                 schoolYear = '2023',
                                 base_path = '',
                                 column_mapping_file_path = ''):
    """Cache every descriptor DataFrame and load the refined column mappings.

    For each name in `sap_process_client.descriptors` the descriptor is loaded
    from `<base_path>/ed-fi/<name>`; if that fails for any reason it is loaded
    from the same path with every '/ed-fi' replaced by '/TX'. The cached frame
    is stored in `sap_process_client.descriptorsDFRef[<name>]`.

    Args:
        sapVersion, districtId, schoolYear: Accepted but not used by this body.
        base_path: Prefix of the descriptor locations.
        column_mapping_file_path: Lake path of the column-mapping file.

    Returns:
        Whatever `sap_utilities.extract_refined_cols_mapping` returns for the
        URL of `column_mapping_file_path`.
    """
    for descriptor_name in sap_process_client.descriptors:
        edfi_location = f"{base_path}/ed-fi/{descriptor_name}"
        # FIXME: Temp Fix
        try:
            sap_process_client.descriptorsDFRef[descriptor_name] = (
                sap_utilities.loadDescriptors(edfi_location).cache()
            )
        except Exception:
            tx_location = edfi_location.replace('/ed-fi', '/TX')
            sap_process_client.descriptorsDFRef[descriptor_name] = (
                sap_utilities.loadDescriptors(tx_location).cache()
            )

    mapping_url = oea.to_url(column_mapping_file_path)
    return sap_utilities.extract_refined_cols_mapping(mapping_url)

### REFINEMENT - UTILITIES

In [68]:
def add_metadata_columns(df, overwrite, **kwargs):
    """Attach constant-valued columns to `df`.

    Args:
        df: DataFrame to extend.
        overwrite: When truthy every given column is (re)written; otherwise
            only columns not already present in `df.columns` are added.
        **kwargs: column name -> constant value (wrapped with `F.lit`).

    Returns:
        The DataFrame with the requested literal columns applied.
    """
    for name, value in kwargs.items():
        if overwrite or name not in df.columns:
            df = df.withColumn(name, F.lit(value))
    return df

def apply_data_transformations(df, edfi_item, source_path):
    """Apply the per-entity column fix-ups required before descriptor joins.

    - budgetExts / actualExts: set `educationOrganizationId` to the district id
      extracted from `source_path`.
    - staffs: prefix '0' to `raceDescriptor` and `pkTeacherRequirementDescriptor`
      when those columns exist.
    - staffEducationOrganizationAssignmentAssociations: run
      `sap_utilities.format_digit_vals` on `staffClassificationDescriptor`.
    - anything else (including staffEducationOrganizationEmploymentAssociations):
      returned unchanged.
    """
    if edfi_item in ('budgetExts', 'actualExts'):
        return df.withColumn('educationOrganizationId', lit(sap_utilities.extract_district_id(source_path)))

    if edfi_item == 'staffs':
        for padded_column in ('raceDescriptor', 'pkTeacherRequirementDescriptor'):
            if padded_column in df.columns:
                df = df.withColumn(padded_column, concat(lit('0'), col(padded_column)))
        return df

    if edfi_item == 'staffEducationOrganizationAssignmentAssociations':
        return sap_utilities.format_digit_vals(df, 'staffClassificationDescriptor')

    return df

def transform_descriptor_columns(df, descriptor_columns, edfi_item=None):
    """Resolve descriptor columns against the cached descriptor DataFrames.

    Every descriptor column except those starting with 'race' is passed to
    `sap_utilities.transform_dataframe` together with the matching frame from
    `sap_process_client.descriptorsDFRef` (key = column name + 's';
    'highestCompletedLevelOfEducationDescriptor' uses the
    'levelOfEducationDescriptors' frame).

    Args:
        df: DataFrame whose descriptor columns are transformed.
        descriptor_columns: Iterable of descriptor column names.
        edfi_item: Optional entity name, used only to label the mismatch log.
            Previously the log read an undefined/global `edfi_item`, which could
            raise NameError exactly when a mismatch needed reporting.

    Returns:
        The transformed DataFrame, or None when the row count after the
        transformations differs from the row count before them.
    """
    rows_before = df.count()
    for descriptor in descriptor_columns:
        if descriptor.startswith('race'):
            continue
        if descriptor == 'highestCompletedLevelOfEducationDescriptor':
            descriptor_key = 'levelOfEducationDescriptor'
        else:
            descriptor_key = descriptor
        descriptor_key = f"{descriptor_key}s"
        df = sap_utilities.transform_dataframe(df, descriptor, sap_process_client.descriptorsDFRef[descriptor_key])
    rows_after = df.count()

    if rows_before != rows_after:
        # Fall back to the descriptor list when the caller did not name the entity.
        label = edfi_item if edfi_item is not None else list(descriptor_columns)
        logger.info(f"[REFINEMENT DESCRIPTOR JOIN] NUMBER OF RECORDS MISMATCHED - {label} ({rows_before} -> {rows_after})")
        return None
    return df

def process_edfi_item(df, edfi_item):
    """Run the entity-specific `sap_process_client` processor, if one exists.

    Entities without a registered processor are returned unchanged. For
    'payrollExts' the `beginDate` column is additionally parsed as a date with
    the pattern "M/d/yyyy" after processing.
    """
    dispatch = {
        'budgetExts': sap_process_client.processBudgetExts,
        'actualExts': sap_process_client.processActualExts,
        'staffEducationOrganizationAssignmentAssociations': sap_process_client.processStaffEducationOrganizationAssignmentAssociations,
        'payrollExts': sap_process_client.processPayrollExts,
        'contractedInstructionalStaffFTEExts': sap_process_client.processContractedInstructionalStaffFTEExts,
        'staffs': sap_process_client.processStaffs,
        'staffEducationOrganizationEmploymentAssociations': sap_process_client.processStaffEducationOrganizationEmploymentAssociations,
    }

    if edfi_item not in dispatch:
        return df

    processed = dispatch[edfi_item](df)
    if edfi_item == 'payrollExts':
        processed = processed.withColumn('beginDate', to_date(col('beginDate'), "M/d/yyyy"))
    return processed

def transform_sap_to_edfi(source_path, sap_pipeline,sap_pipelineType, item,edfi_item=None, edfi_version=None):
    """Turn the latest SAP changes for one entity into Ed-Fi shaped rows.

    Steps, in order: resolve the 'data-submissions' sink path, look up the
    column mapping for `tx/<edfi_item>` in the notebook-level `column_mappings`,
    fetch the latest changes relative to that sink, apply hard-value and column
    mappings, then the entity fix-ups, descriptor transformations and the
    entity processor.

    Args:
        source_path: Path of the ingested SAP table.
        sap_pipeline, sap_pipelineType: Keys into `column_mappings`.
        item: Accepted but not used by this body.
        edfi_item: Ed-Fi entity name.
        edfi_version: Passed through to the sink-path helper.

    Returns:
        Whatever `process_edfi_item` returns for the transformed changes.
    """
    global column_mappings
    logger.info("[REFINEMENT TRANSFORMING SAP TO EDFI]")

    refined_sink, _sensitive_sink = sap_utilities.get_sink_general_sensitive_paths(
        source_path = source_path,
        edfi_version = edfi_version,
        edfi_item = edfi_item,
        partitioning = True,
        SAP_SUB='data-submissions',
        TEST_MODE = False)
    mapping_for_entity = column_mappings[sap_pipeline][sap_pipelineType][f'tx/{edfi_item}'].asDict()

    changes = oea.get_latest_changes(source_path,
                                     refined_sink,
                                     filtering_date = 'rundate',
                                     primary_key = ['NATURAL_KEY_HASH'],
                                     debugMode = False)
    changes = sap_utilities.map_to_hard_values(changes, edfi_item)
    changes = sap_utilities.map_columns(changes, mapping_for_entity)
    # Descriptor columns are inferred before the entity fix-ups touch the frame.
    descriptor_columns = sap_utilities.infer_descriptor_columns(changes.columns)

    changes = apply_data_transformations(changes, edfi_item, source_path)
    changes = transform_descriptor_columns(changes, descriptor_columns)
    return process_edfi_item(changes, edfi_item)

def upsert_edfi_complex_data(df_changes, sink_general_path, primary_key):
    """Upsert refined rows into the nested delta table and register it in the lake db.

    Args:
        df_changes: Rows to upsert.
        sink_general_path: Destination delta path.
        primary_key: Ignored - the key is always DistrictId, SchoolYear,
            sap_pipeline, sap_pipelineType and NATURAL_KEY_HASH.

    Returns:
        The number of rows in `df_changes` (counted once, before the upsert).
    """
    # TODO: Make if better for passing the right params instead of global declarations
    global districtId, schoolYear
    partitioning_cols = ['DistrictId', 'SchoolYear', 'sap_pipeline', 'sap_pipelineType']
    primary_key = partitioning_cols + ['NATURAL_KEY_HASH']

    # FIXME: 2024-01-29 - storing df_changes.count() to changes_count as a temp fix
    changes_count = df_changes.count()
    if changes_count <= 0:
        logger.info('[REFINEMENT UPSERT EDFI COMPLEX / NESTED] SAP to Ed-Fi API: No updated rows to process.')
        return changes_count

    # FIXME: 2024-01-29 - JOIN BASED UPSERT Under Review
    oea.upsert(df_changes,
               sink_general_path,
               primary_key = primary_key,
               partitioning = True,
               partitioning_cols = partitioning_cols,
               join_based_upsert = False)
    oea.add_to_lake_db(sink_general_path, overwrite=True)
    # FIXME: 2024-01-29 - df_changes.count() during logging is 0 (zero) - Why?
    logger.info(f'[REFINEMENT UPSERT EDFI COMPLEX / NESTED] SAP to Ed-Fi API: Processed {changes_count} updated rows into stage2/Refined')
    return changes_count

def refine_item_to_edfi_complex(table_path, edfi_version, item, sap_pipeline, sap_pipelineType,districtId, schoolYear):
    """Refine one SAP item to the nested Ed-Fi shape and stamp identifier columns.

    The Ed-Fi entity name is looked up in `sap_process_client.sap_to_edfi_complex`
    unless `item` ends with 'descriptors' (case-insensitive), in which case the
    item name itself is used. After transformation, DistrictId, SchoolYear,
    RECORD_VERSION and SUBMISSION_RECORD_IS_ACTIVE are added when missing.

    `lakeId` and `validationRecordId` are both set to
    '<DistrictId>_<SchoolYear>_<NATURAL_KEY_HASH>' when the frame has a RECORD
    column, and to NULL strings otherwise.

    Returns:
        The refined DataFrame.
    """
    logger.info(f"[REFINEMENT ITEM TO EDFI COMPLEX / NESTED] Refining the item {item} to Ed-Fi Complex Standard")
    if item.lower().endswith('descriptors'):
        edfi_item = item
    else:
        edfi_item = sap_process_client.sap_to_edfi_complex[item]

    refined = transform_sap_to_edfi(table_path, sap_pipeline, sap_pipelineType, item, edfi_item, edfi_version)
    refined = add_metadata_columns(refined,
                                   overwrite = False,
                                   DistrictId = districtId,
                                   SchoolYear = schoolYear,
                                   RECORD_VERSION = 1,
                                   SUBMISSION_RECORD_IS_ACTIVE = True)

    if sap_utilities.has_column(refined, 'RECORD'):
        def record_identifier():
            # Built fresh for each column, mirroring the two separate expressions.
            return f.concat_ws('_', f.col('DistrictId'), f.col('SchoolYear'), f.col('NATURAL_KEY_HASH')).cast("String")
    else:
        def record_identifier():
            return f.lit(None).cast("String")

    for id_column in ('lakeId', 'validationRecordId'):
        refined = refined.withColumn(id_column, record_identifier())
    return refined

def process_and_refine_sap_entity(tables_source, edfi_version, sap_pipeline, sap_pipelineType, item, test_mode, schoolYear, districtId):
    """Refine one ingested SAP table and upsert it into the nested Ed-Fi delta table.

    Args:
        tables_source: Parent path of the ingested tables.
        edfi_version: Passed through to the refinement / sink-path helpers.
        sap_pipeline, sap_pipelineType: Pipeline identifiers used for refinement.
        item: SAP table (folder) name; 'metadata.csv' is skipped.
        test_mode: Accepted but not used by this body.
        schoolYear, districtId: Stamped on the refined rows.

    Returns:
        - None when `item` is 'metadata.csv'.
        - The caught exception instance (returned, not raised) when anything
          after the refinement step fails; callers must check for this.
        - Otherwise the rows re-loaded from the sink whose `rundate` is at
          least the sink's maximum `rundate`.

    The delete-merge / join-based de-duplication is best effort: if it fails
    with an ordinary exception the join-based step is disabled and processing
    continues. Only `Exception` is caught there, so KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    global deletePrevSubmissions
    table_path = tables_source + '/' + item
    if item == 'metadata.csv':
        logger.info('Ignore metadata processing, since this is not a table to be ingested')
        return None

    df_processed = refine_item_to_edfi_complex(table_path, edfi_version, item, sap_pipeline, sap_pipelineType, districtId, schoolYear)
    try:
        sink_general_path, _ = sap_utilities.get_sink_general_sensitive_paths(source_path = table_path,
                                                                              edfi_version = edfi_version,
                                                                              edfi_item = sap_process_client.sap_to_edfi_complex[item],
                                                                              partitioning = True,
                                                                              SAP_SUB='data-submissions',
                                                                              TEST_MODE = False)

        # FIXME: 2024-01-29 - JOIN BASED UPSERT Under Review
        try:
            oea.merge_deletes_into_delta_lake(df = df_processed.select('NATURAL_KEY_HASH', 'rundate').cache(),
                                              destination_path = sink_general_path,
                                              func_enabled = deletePrevSubmissions)
            df_processed = oea.get_df_latest_records_by_join(df = df_processed,
                                                             destination_path = sink_general_path,
                                                             func_enabled = True)
        except Exception as join_error:
            logger.info('[REFINEMENT PROCESS AND REFINE SAP] Delta Lake does not exist JOIN BASED UPSERT disabled')
            # Record the real cause: the message above is only an assumption.
            logger.info(f'[REFINEMENT PROCESS AND REFINE SAP] JOIN BASED UPSERT fallback reason: {join_error}')
            df_processed = oea.get_df_latest_records_by_join(df = df_processed,
                                                             destination_path = sink_general_path,
                                                             func_enabled = False)
        upsert_edfi_complex_data(df_changes = df_processed,
                                 sink_general_path = sink_general_path,
                                 primary_key = 'lakeId')

        # FIXME: 2024-01-30: TEMP FIX TO BYPASS LEFT-ANTI DATA CORRUPTION
        # Re-read the freshly written rows from the sink instead of reusing the
        # in-memory frame.
        sink_df = oea.query(sink_general_path, 'select max(rundate) maxdatetime')
        maxdatetime = sink_df.first()['maxdatetime']
        df_processed = oea.load(sink_general_path).where(f"rundate >= '{maxdatetime}'")
        logger.info(f"[REFINEMENT PROCESS AND REFINE SAP] SAP to Ed-Fi API: ALERT!!! - LATEST RECORDS LOADED FROM THE NESTED DELTA FOR ANALYTICS - {df_processed.count()}")
    except Exception as e:
        logger.exception(f"[REFINEMENT PROCESS AND REFINE SAP] {e}")
        return e

    return df_processed

### Flatten + Transform

In [69]:
def process_transform_and_refine_SAP_entity(item, tables_source, edfi_version, sap_pipeline, sap_pipelineType, test_mode):
    """Refine one SAP item (data-submissions) and then flatten it (analytics).

    Stage 1 (non-descriptor items only): if the ingested table has changes
    relative to the 'data-submissions' sink, refine and upsert them through
    `process_and_refine_sap_entity`.
    Stage 2: if the ingested table has changes relative to the 'analytics'
    sink, stamp the metadata columns and hand the rows to
    `sap_to_edfi_client.transform`. For non-descriptor items the rows used are
    the refined ones from stage 1.

    If stage 2 has pending rows but stage 1 produced no usable DataFrame (it was
    skipped, returned None, or returned an exception instance), the item is
    skipped with a warning. Previously this case hit an unbound local `df`.

    Reads the notebook-level names `districtId`, `schoolYear`, `schema_name`
    and `ext_entity`.

    Returns:
        None. All exceptions are logged and suppressed.
    """
    global districtId, schoolYear
    table_path = tables_source + '/' + item
    is_descriptor = item.lower().endswith('descriptors')

    if is_descriptor:
        edfi_item = item
    else:
        edfi_item = sap_process_client.sap_to_edfi_complex.get(item)
        if edfi_item is None:
            logger.info(f'[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] New SAP item detected - {item} (Mapping Not Available)')
            return None

    logger.info(f"[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] Processing table: {edfi_item}")
    try:
        source_path = f'stage2/Ingested/{table_path}'
        df_refined = None

        if not is_descriptor:
            # FIXME: Under Review
            sink_general_path, sink_sensitive_path = sap_utilities.get_sink_general_sensitive_paths(source_path,
                                                                                                  edfi_version,
                                                                                                  edfi_item,
                                                                                                  partitioning=True,
                                                                                                  SAP_SUB='data-submissions',  # 'FINAL',
                                                                                                  TEST_MODE=test_mode)
            df_changes = oea.get_latest_changes(source_path,
                                                sink_general_path,
                                                filtering_date = 'rundate',
                                                primary_key = ['NATURAL_KEY_HASH'],
                                                debugMode = False)
            if df_changes.count() > 0:
                logger.info('[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] SAP to Ed-Fi API: ' + item + ' from: ' + table_path)
                df_refined = process_and_refine_sap_entity(f"stage2/Ingested/{tables_source}",
                                                           edfi_version=edfi_version,
                                                           sap_pipeline=sap_pipeline,
                                                           sap_pipelineType = sap_pipelineType,
                                                           item=item,
                                                           test_mode=test_mode,
                                                           districtId = districtId,
                                                           schoolYear = schoolYear)

        sink_general_path, sink_sensitive_path = sap_utilities.get_sink_general_sensitive_paths(source_path,
                                                                                              edfi_version,
                                                                                              edfi_item,
                                                                                              partitioning=True,
                                                                                              SAP_SUB='analytics',  # 'FINAL',
                                                                                              TEST_MODE=test_mode)
        df_changes = oea.get_latest_changes(source_path,
                                            sink_general_path,
                                            filtering_date = 'rundate',
                                            primary_key = ['NATURAL_KEY_HASH'],
                                            debugMode = False)
        # Count once: each count() is a separate Spark action.
        pending_rows = df_changes.count()
        logger.info(f"[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] {pending_rows}")
        if pending_rows > 0:
            logger.info('[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] Ed-Fi to Ed-Fi Relationship Model: ' + edfi_item)

            # NOTE: descriptor tables are transformed straight from the ingested
            # changes; SAP tables must use the refined rows from stage 1.
            if not is_descriptor:
                if df_refined is None or isinstance(df_refined, Exception):
                    logger.warning(f'[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] No refined data available for {edfi_item} ({df_refined!r}); skipping Ed-Fi Relationship Model transform.')
                    return None
                df_changes = df_refined

            current_timestamp = datetime.now()
            df_changes = add_metadata_columns(df_changes,
                                              overwrite = False,
                                              DistrictId = districtId,
                                              SchoolYear = schoolYear,
                                              LastModifiedDate = current_timestamp,
                                              RECORD_VERSION = 1,
                                              SUBMISSION_RECORD_IS_ACTIVE = True)

            sap_to_edfi_client.transform(df = df_changes,
                                         schema_name = schema_name,
                                         table_name = edfi_item,
                                         primary_key = 'NATURAL_KEY_HASH',
                                         ext_entity = ext_entity,
                                         sink_general_path = sink_general_path,
                                         parent_schema_name = None,
                                         parent_table_name = None)
        else:
            logger.info(f'[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] Ed-Fi to Ed-Fi Relationship Model: No updated rows in {source_path} to process.')

    except Exception as e:
        # AnalysisException is an Exception subclass and was handled identically.
        logger.exception(f"[REFINEMENT PROCESS, TRANSFORM AND REFINE SAP] {e}")

def refine_and_transform_descriptor(item, tables_source, edfi_version, sap_pipeline, sap_pipelineType, test_mode):
    """Flatten one ingested descriptor table into its analytics sink.

    The sink path is derived from the parsed source path: 'Ingested' becomes
    'Refined', every 'SAP' becomes 'SAP/analytics', '/general/<item>' is
    appended, and any 'DistrictId=.../' / 'SchoolYear=.../' segments are removed.
    When the source has changes relative to that sink, metadata columns are
    stamped and the rows are passed to `sap_to_edfi_client.transform`.

    Args:
        item: Descriptor table (folder) name; also used as the Ed-Fi item name.
        tables_source: Parent path of the ingested tables.
        edfi_version, sap_pipeline, sap_pipelineType, test_mode: Accepted but
            not used by this body.

    Reads the notebook-level names `districtId`, `schoolYear`, `schema_name`
    and `ext_entity`.

    Returns:
        None. AnalysisException is logged at info level; any other exception is
        now logged with its traceback instead of disappearing inside the
        thread-pool future this function normally runs in.
    """
    global districtId, schoolYear
    table_path = tables_source + '/' + item

    edfi_item = item
    logger.info(f"Processing table: {edfi_item}")
    try:
        source_path = f'stage2/Ingested/{table_path}'

        path_dict = oea.parse_path(source_path)
        entity_parent_path = path_dict['entity_parent_path']
        sink_general_path = entity_parent_path.replace('Ingested', 'Refined').replace('SAP', 'SAP/analytics') + '/general/' + edfi_item
        # Strip the partition segments from the sink path.
        sink_general_path = re.sub(r'DistrictId=.*?/|SchoolYear=.*?/', '', sink_general_path)

        df_changes = oea.get_latest_changes(source_path,
                                            sink_general_path,
                                            filtering_date = 'rundate',
                                            primary_key = ['NATURAL_KEY_HASH'],
                                            debugMode = False)
        if df_changes.count() > 0:
            logger.info('[REFINEMENT PROCESS, TRANSFORM AND REFINE DESCRIPTOR] Ed-Fi to Ed-Fi Relationship Model: ' + edfi_item)

            current_timestamp = datetime.now()
            df_changes = add_metadata_columns(df_changes,
                                              overwrite = False,
                                              DistrictId = districtId,
                                              SchoolYear = schoolYear,
                                              LastModifiedDate = current_timestamp,
                                              RECORD_VERSION = 1,
                                              SUBMISSION_RECORD_IS_ACTIVE = True)

            sap_to_edfi_client.transform(df = df_changes,
                                         schema_name = schema_name,
                                         table_name = edfi_item,
                                         primary_key = 'NATURAL_KEY_HASH',
                                         ext_entity = ext_entity,
                                         sink_general_path = sink_general_path,
                                         parent_schema_name = None,
                                         parent_table_name = None)
        else:
            logger.info(f'Ed-Fi to Ed-Fi Relationship Model: No updated rows in {source_path} to process.')

    except AnalysisException as e:
        logger.info(e)
    except Exception as e:
        logger.exception(f"[REFINEMENT PROCESS, TRANSFORM AND REFINE DESCRIPTOR] {e}")


def refine_and_explode_data(sap_pipeline, sap_pipelineType, schema_name, tables_source, ext_entity, transform_mode, test_mode, items, sapVersion, edfi_version):
    """Refine every ingested table under `tables_source` on a thread pool.

    Args:
        sap_pipeline, sap_pipelineType, edfi_version, test_mode: Forwarded to
            the per-item workers.
        schema_name, ext_entity, transform_mode, sapVersion: Accepted but not
            forwarded. NOTE(review): the workers read the notebook-level
            `schema_name` / `ext_entity` instead of these arguments - confirm
            that is intended.
        tables_source: Folder below 'stage2/Ingested/' holding the tables.
        items: Optional subset of table names; only names that also exist as
            folders are processed. None processes every folder.

    'metadata.csv' and 'descriptorTables' are skipped. Items ending in
    'descriptors' (case-insensitive) go to `refine_and_transform_descriptor`,
    everything else to `process_transform_and_refine_SAP_entity`.

    Worker exceptions are collected after the pool has drained and logged, so a
    failing table no longer disappears silently. Nothing is raised or returned.
    """
    available_items = oea.get_folders(f"stage2/Ingested/{tables_source}")
    if items is None:
        items = available_items
    else:
        items = list(set(available_items).intersection(items))

    submitted = {}
    with ThreadPoolExecutor(max_workers=8) as tpe:
        logger.info('[REFINEMENT REFINE AND EXPLODE SAP] Entered Threadpool')
        for item in items:
            if item == 'metadata.csv' or item == 'descriptorTables':
                logger.info('Ignore Metadata, since this is not a table to be ingested')
                continue
            if item.lower().endswith('descriptors'):
                worker = refine_and_transform_descriptor
            else:
                worker = process_transform_and_refine_SAP_entity
            future = tpe.submit(worker, item, tables_source, edfi_version, sap_pipeline, sap_pipelineType, test_mode)
            submitted[future] = item

    # Leaving the `with` block waits for every task, so exception() never blocks.
    for future, item in submitted.items():
        worker_error = future.exception()
        if worker_error is not None:
            logger.error(f'[REFINEMENT REFINE AND EXPLODE SAP] {item} failed: {worker_error}', exc_info=worker_error)

### Empty Schema - Utilities

In [70]:
def upsert_data(df_changes,
                metadata,
                schema_name,
                transform_mode,
                table_name,
                primary_key,
                ext_entity,
                sink_general_path,
                sink_sensitive_path):
    """Pseudonymize `df_changes` and hand the result to `sap_to_edfi_client.transform`.

    `primary_key` and `sink_sensitive_path` are accepted but not used: the
    transform is always keyed on 'NATURAL_KEY_HASH', and the lookup frame
    returned by the pseudonymization is discarded. Returns None.
    """
    # NOTE: Here using debugging = True so as to circumvent the issue of metadata = []
    df_pseudo, _df_lookup = oea.pseudonymize(df_changes, metadata, transform_mode, True)

    transform_arguments = dict(df = df_pseudo,
                               schema_name = schema_name,
                               table_name = table_name,
                               primary_key = 'NATURAL_KEY_HASH',
                               ext_entity = ext_entity,
                               sink_general_path = sink_general_path,
                               parent_schema_name = None,
                               parent_table_name = None)
    sap_to_edfi_client.transform(**transform_arguments)

def threaded_task_empty_schema(input_tuple):
    """Create an empty, schema-only table for one entity if its sink is missing.

    Args:
        input_tuple: (item, schema_name, s2r_path, ext_entity, transform_mode,
            districtId, schoolYear, sap_pipeline, sap_pipelineType).

    When the cleaned general sink path does not exist, an empty DataFrame with
    the entity's schema is built, placeholder metadata columns are written onto
    it (overwriting any existing ones), and - only if the frame has an 'id'
    column - it is passed to `upsert_data`. Every exception is logged and
    suppressed. Returns None.
    """
    (item, schema_name, s2r_path, ext_entity, transform_mode,
     districtId, schoolYear, sap_pipeline, sap_pipelineType) = input_tuple
    table_name = item
    metadata = []
    try:
        sink_general_path = sap_to_edfi_client.sink_path_cleanup(f'{s2r_path}/general/{schema_name}/{item}')
        sink_sensitive_path = sap_to_edfi_client.sink_path_cleanup(f'{s2r_path}/sensitive/{schema_name}/{item}_lookup')
        if oea.path_exists(sink_general_path):
            return

        logger.info(f'[REFINEMENT EMPTY SCHEMA DUMPING THREAD] Path does not exist - attempting to create empty data frame - {item}')
        target_schema = copy.deepcopy(sap_to_edfi_client.schemas[table_name])
        empty_frame = spark.createDataFrame(data = [], schema = target_schema)

        placeholder_columns = dict(DistrictId = districtId,
                                   SchoolYear = schoolYear,
                                   LastModifiedDate = datetime.now(),
                                   sap_pipeline = sap_pipeline,
                                   sap_pipelineType = sap_pipelineType,
                                   RECORD = '1',
                                   rundate = '2023-01-01',
                                   NATURAL_KEY_HASH = 'PLACEHOLDER',
                                   RECORD_HASH = 'PLACEHOLDER',
                                   RECORD_VERSION = 1,
                                   SUBMISSION_RECORD_IS_ACTIVE = True)
        empty_frame = add_metadata_columns(empty_frame, overwrite = True, **placeholder_columns)

        if 'id' not in empty_frame.columns:
            # Entities without an `id` column are skipped (flagged for future).
            return

        upsert_data(empty_frame,
                    metadata, #NOTE: This is empty list
                    schema_name,
                    transform_mode,
                    table_name,
                    'id',
                    ext_entity,
                    sink_general_path,
                    sink_sensitive_path)
    except Exception as error:
        logger.exception(f"[REFINEMENT EMPTY SCHEMA DUMPING THREAD] {item} {error}")

def dump_empty_schemas(sap_pipeline,
                       sap_pipelineType,
                       schema_name, 
                       s2r_path,
                       ext_entity,
                       transform_mode, 
                       items = None):
    """Run ``threaded_task_empty_schema`` for every entity in ``items`` on a thread pool.

    Each worker receives one positional tuple:
    (item, schema, s2r_path, ext_entity, transform_mode, districtId, schoolYear,
    sap_pipeline, sap_pipelineType).

    Args:
        sap_pipeline: Forwarded unchanged to the worker.
        sap_pipelineType: Forwarded unchanged to the worker.
        schema_name: Schema passed for regular entities; ``None`` means 'ed-fi'.
            Entities whose name ends with 'exts' (case-insensitive) always get 'tx'.
        s2r_path: Forwarded unchanged to the worker.
        ext_entity: Forwarded unchanged to the worker.
        transform_mode: Forwarded unchanged to the worker.
        items: Iterable of entity names. ``None`` (the default) or an empty
            iterable means there is nothing to dump.

    Returns:
        None. An exception that escapes a worker is logged, not raised.
    """
    if schema_name is None:
        schema_name = 'ed-fi'

    # A None default avoids the shared mutable ``[]`` and lets callers pass None safely.
    entity_names = [] if items is None else list(items)

    # districtId / schoolYear are notebook-level globals; they are only read here.
    task_args = [(item,
                  schema_name if not item.lower().endswith('exts') else 'tx',
                  s2r_path,
                  ext_entity,
                  transform_mode,
                  districtId,
                  schoolYear,
                  sap_pipeline,
                  sap_pipelineType)
                 for item in entity_names]

    with ThreadPoolExecutor(max_workers=12) as tpe:
        logger.info('[REFINEMENT EMPTY SCHEMA DUMPING] Entered Threadpool')
        # submit() instead of an unconsumed map(): the futures are kept so that
        # failures can be inspected once the pool has finished.
        futures = [(args[0], tpe.submit(threaded_task_empty_schema, args)) for args in task_args]

    # Leaving the ``with`` block joins the pool, so every future is done here.
    for item, future in futures:
        error = future.exception()
        if error is not None:
            logger.error(f'[REFINEMENT EMPTY SCHEMA DUMPING] {item} {error}')
 

def get_non_ext_entities(entities_meta_info):
    """Return the last '/'-separated segment of each entry's 'resource' value.

    Args:
        entities_meta_info: Iterable of mappings, each with a 'resource' string.

    Returns:
        A new list of segment names, in the same order as the input.
    """
    return [meta['resource'].rsplit('/', 1)[-1] for meta in entities_meta_info]

def add_all_empty_tables_to_lake_db(empty_tables_path, schema_name, emptyTables = None, suffix = '_analytics'):
    """Register every empty-schema table below ``empty_tables_path`` in the lake database.

    One ``add_empty_table_to_lake_db`` call per entity is submitted to a thread
    pool, always with ``overwrite=False``.

    Args:
        empty_tables_path: Base path of the empty-schema folders; each entity
            path is built as ``empty_tables_path + '/' + item``.
        schema_name: 'ed-fi' registers tables under their plain name; any other
            value is forwarded as the table-name extension.
        emptyTables: Optional explicit list of entity names. When ``None`` the
            names are discovered with ``oea.get_folders``.
        suffix: Lake database suffix forwarded to ``add_empty_table_to_lake_db``.

    Returns:
        None. An exception that escapes a worker is logged, not raised.
    """
    if emptyTables is None:
        # No explicit list supplied: discover the entity folders under the path.
        items = oea.get_folders(oea.to_url(empty_tables_path))
    else:
        items = emptyTables

    extension = None if schema_name == 'ed-fi' else schema_name

    futures = []
    with ThreadPoolExecutor(max_workers=12) as tpe:
        logger.info('[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] Entered Threadpool')
        for item in items:
            source_entity_path = empty_tables_path + '/' + item
            futures.append((source_entity_path,
                            tpe.submit(add_empty_table_to_lake_db, source_entity_path, False, extension, suffix)))

    # The pool is joined when the ``with`` block exits. Surface anything a worker
    # raised outside its own try/except instead of dropping it with the future.
    for source_entity_path, future in futures:
        error = future.exception()
        if error is not None:
            logger.error(f'[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] {source_entity_path} {error}')

def add_empty_table_to_lake_db(source_entity_path, overwrite = False, extension = None, suffix = '_analytics'):
    """Register an empty-schema Delta folder as a table in the matching lake db.

    The lake db is created if it doesn't already exist, and the table is created
    only if it isn't already registered (``overwrite`` drops it first).
    Only paths containing '/emptySchemas/' are handled; any other path is skipped.
    eg: add_empty_table_to_lake_db('stage2/Refined/SAP/analytics/emptySchemas/general/ed-fi/students')

    Note that a spark db that points to source data in the delta format can't be queried via SQL serverless pool. More info here: https://docs.microsoft.com/en-us/azure/synapse-analytics/sql/resources-self-help-sql-on-demand#delta-lake

    Args:
        source_entity_path: Path of the entity folder; parsed with ``oea.parse_path``.
        overwrite: When True, drop an existing table of the same name first.
        extension: Optional table-name suffix; a leading '_' is added when missing.
        suffix: Appended to the parsed lake db name to select the target database.

    Returns:
        None. Failures while creating the database or table are logged, not raised.
    """
    # FIXME: Temporary Fix for Empty Schemas
    source_dict = oea.parse_path(source_entity_path)
    if '/emptySchemas/' not in source_entity_path:
        # Make the skip visible instead of returning silently.
        logger.info(f'[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] Skipped (not an emptySchemas path): {source_entity_path}')
        return

    try:
        table_name = source_dict['entity']
        if extension is not None:
            if not extension.startswith('_'):
                extension = '_' + extension
            table_name = table_name + str(extension)
        db_name = source_dict['ldb_name'] + suffix

        logger.info(f"[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] Adding: Lake DB: {db_name}; Table: {table_name}")
        spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
        if overwrite:
            spark.sql(f"drop table if exists {db_name}.{table_name}")

        spark.sql(f"create table if not exists {db_name}.{table_name} using DELTA location '{oea.to_url(source_dict['entity_path'])}'")
    except Exception as error:
        # logger.exception keeps the traceback, matching the empty-schema dumping worker.
        logger.exception(f'[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] {error}')

### Main Tables

In [71]:
def return_sap_entities(sap_pipeline, sap_pipelineType):
    """Look up the SAP entity codes for a pipeline / pipeline-type pair.

    Args:
        sap_pipeline: Pipeline name; only 'TEA' has entities defined.
        sap_pipelineType: Pipeline type key, e.g. 'PEIMS_FALL'.

    Returns:
        A new list of entity codes, or ``None`` when the pipeline is not 'TEA'
        or the pipeline type is not recognised.
    """
    if sap_pipeline != 'TEA':
        return None

    # Rebuilt on every call so callers always receive a fresh list.
    tea_entities_by_type = {
        'PEIMS_FALL': ['YHROHPM04', 'YHROHPM07', 'YHROHPM08', 'YHROHPM09', 'YHROHPM10', 'YFMOHPEIM'],
        'PEIMS_MIDYR': ['YFIOHPEIM'],
        'PEIMS_EXYR': ['YHROHPM04'],
        'TSDS_ECDS_KG': ['YHROHPM03'],
        'TSDS_CLASS_ROSTER_FALL': ['YHROHPM05'],
        'TSDS_ECDS_PK': ['YHROHPM02'],
    }
    return tea_entities_by_type.get(sap_pipelineType)

In [72]:
# Notebook-level settings read as globals by the refinement cells below.
# sap_pipeline = "peims-submissions"
# FIXME: Ed-Fi pseudonymization metadata is NOT being used in here 
metadata = []#oea.get_metadata_from_path(path = f'stage1/Transactional/SAP/pipeline={sap_pipeline}/{sapVersion}')

schema_name = 'ed-fi'
ext_entity = 'TX'
natural_upsert_mode = True
# items is None when sap_pipeline is not 'TEA' or the pipeline type is not recognised.
items = return_sap_entities(sap_pipeline, sap_pipelineType)

In [73]:
# NOTE(review): datetime.now() is called by the empty-schema worker code defined in an
# earlier cell; consider moving this import to the notebook's import cell.
from datetime import datetime
# Descriptor tables location (scoped by district and school year) and the ingestion
# column-mapping file, both handed to initialize_sap_pipeline_vars below.
descriptors_base_path = f'stage2/Ingested/SAP/descriptorTables/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}'
column_mapping_file_path = f"stage1/Transactional/SAP/metadata-assets/ingestion-mappings.json"
    
column_mappings = initialize_sap_pipeline_vars(sapVersion = sapVersion,
                                    districtId = districtId,
                                    schoolYear = schoolYear,
                                    base_path = descriptors_base_path,
                                    column_mapping_file_path = column_mapping_file_path)

### PEIMS

In [74]:
# Main-table refinement: runs only when etlProcessing is truthy and the pipeline is 'TEA'
# with a pipeline type starting with 'peims' or 'tsds' (case-insensitive).
test_mode = False
transform_mode = True
edfi_version = apiVersion

if etlProcessing:
    if sap_pipeline == 'TEA' and (sap_pipelineType.lower().startswith('peims') or sap_pipelineType.lower().startswith('tsds')):
        # Source folder for this pipeline / pipeline type, scoped by district and school year.
        tables_source = f'SAP/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}'
        # NOTE(review): test_mode is passed as a literal False here, not the test_mode variable above.
        sap_to_edfi_client = SAPToEdFiRefine(workspace = workspace, 
                                            oea = oea, 
                                            spark = spark,
                                            sap_oea_utils = oea_utils,
                                            sap_process_client = sap_process_client,
                                            logger = logger,
                                            schema_gen = schema_gen, 
                                            moduleName = moduleName, 
                                            authUrl = authUrl, 
                                            swaggerUrl = swaggerUrl, 
                                            dataManagementUrl = dataManagementUrl, 
                                            changeQueriesUrl = changeQueriesUrl, 
                                            dependenciesUrl = dependenciesUrl, 
                                            apiVersion = apiVersion, 
                                            schoolYear = schoolYear,
                                            districtId = districtId, 
                                            test_mode = False,
                                            pipelineExecutionId = pipelineExecutionId,
                                            error_logger = error_logger,
                                            natural_upsert_mode = natural_upsert_mode,
                                            sap_essential_columns = sap_essential_columns)
        sap_to_edfi_client.set_params(params = {'sap_pipeline': sap_pipeline,
                                                'sap_pipelineType': sap_pipelineType})

        # items comes from return_sap_entities and is None for an unrecognised pipeline type.
        df = refine_and_explode_data(sap_pipeline = sap_pipeline, 
                                    sap_pipelineType = sap_pipelineType,
                                    schema_name = schema_name, 
                                    tables_source = tables_source,
                                    ext_entity = ext_entity,
                                    transform_mode = transform_mode, 
                                    test_mode = test_mode,
                                    items = items,
                                    sapVersion = sapVersion,
                                    edfi_version = edfi_version) 

### Descriptor Tables

In [28]:
# Descriptor-table refinement: one pass per descriptor folder ('ed-fi', 'TX', 'tpdm').
if prepareSAPMetaData: # and sap_pipeline in ['All', 'analytics', 'validation']:
    for edfi_extension in ['ed-fi', 'TX', 'tpdm']:
        # The loop variable only changes the source folder; schema_name and ext_entity
        # passed further down are the notebook globals and do not vary per iteration.
        tables_source = f'SAP/descriptorTables/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/{edfi_extension}'

        test_mode = False
        transform_mode = True
        edfi_version = apiVersion

        # Rebinds the notebook-level swaggerUrl / oea_utils / schema_gen to the
        # 'Descriptors' swagger definition; later cells see these new values.
        swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
        oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
        oea_utils.create_definitions()
        # NOTE(review): schemas is assigned but not read anywhere in this cell.
        schemas = schema_gen.create_spark_schemas()

        sap_to_edfi_client = SAPToEdFiRefine( workspace = workspace, 
                                                oea = oea, 
                                                spark = spark,
                                                sap_oea_utils = oea_utils,
                                                sap_process_client = sap_process_client,
                                                logger = logger,
                                                schema_gen = schema_gen, 
                                                moduleName = moduleName, 
                                                authUrl = authUrl, 
                                                swaggerUrl = swaggerUrl, 
                                                dataManagementUrl = dataManagementUrl, 
                                                changeQueriesUrl = changeQueriesUrl, 
                                                dependenciesUrl = dependenciesUrl, 
                                                apiVersion = apiVersion, 
                                                schoolYear = schoolYear,
                                                districtId = districtId, 
                                                test_mode = False,
                                                pipelineExecutionId = pipelineExecutionId,
                                                error_logger = error_logger,
                                                natural_upsert_mode = natural_upsert_mode,
                                                sap_essential_columns = sap_essential_columns)

        sap_to_edfi_client.set_params(params = {'sap_pipeline': sap_pipeline,
                                                'sap_pipelineType': sap_pipelineType})

        # items is passed as None here rather than the entity list used for the main tables.
        df = refine_and_explode_data(sap_pipeline = sap_pipeline,
                                    sap_pipelineType = sap_pipelineType,
                                        schema_name = schema_name, 
                                        tables_source = tables_source,
                                        ext_entity = ext_entity,
                                        transform_mode = transform_mode, 
                                        test_mode = test_mode,
                                        items = None,#items,
                                        sapVersion = sapVersion,
                                        edfi_version = edfi_version)

### Entity Level Logs

In [29]:
# Persist entity-level ETL logs, but only when the error logger collected any.
if error_logger.entity_logs != []:
    # Rebinds the notebook-level df to the log data frame.
    df = error_logger.create_spark_df('entity')
    error_logger.write_logs_to_delta_lake(df = df, 
                                log_type = 'entity',
                                destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'))
    # Register the entity logs in the workspace's ETL-logs lake db (overwrite = True).
    error_logger.add_etl_logs_to_lake_db(db_name = f'ldb_{workspace}_sap_etl_logs',
                                        logs_base_path = 'etl-logs',
                                        log_type = 'entity',
                                        overwrite = True)

### Empty Schemas

In [30]:
# Empty-schema generation: for the 'Resources' and 'Descriptors' swagger definitions,
# dump an empty table for every entity name reported by the Ed-Fi API client.
# FIXME: Undergoing Major Changes
# prepareSAPMetaData = True
transform_mode = True

if prepareSAPMetaData: # and sap_pipeline in ['All', 'analytics']:
    # NOTE(review): sap_pipelines is computed but not read anywhere else in this cell.
    if sap_pipeline == 'All':
        sap_pipelines = ['peims-submissions', 'tsds_crf-submissions', 'analytics']
    else:
        sap_pipelines = [sap_pipeline]

    edfiAPIClient = edfi = EdFiClient(workspace = workspace, 
                                    kvName = kvName, #NOTE: Default to None 
                                    moduleName = moduleName, 
                                    authUrl = authUrl, 
                                    dataManagementUrl = dataManagementUrl, 
                                    changeQueriesUrl = changeQueriesUrl, 
                                    dependenciesUrl = dependenciesUrl, 
                                    apiVersion = apiVersion, 
                                    batchLimit = batchLimit, 
                                    minChangeVer = minChangeVer, 
                                    maxChangeVer = maxChangeVer,
                                    schoolYear = schoolYear,
                                    districtId = districtId,
                                    kvSecret_clientId = kvSecret_clientId,
                                    kvSecret_clientSecret = kvSecret_clientSecret,
                                    retry_strategy = None, 
                                    threadMode = False, 
                                    devMode = False)
    
    # Entity names are the last path segment of each 'resource'; 'schoolYearTypes' is prepended.
    entities_meta_info = edfiAPIClient.getEntities()#[0]['resource']
    non_ext_table_names = get_non_ext_entities(entities_meta_info) #TODO: To Be Reviewed
    non_ext_table_names = ['schoolYearTypes'] + non_ext_table_names

    for swagger_resource_type in ['Resources', 'Descriptors']:
        # Rebinds the notebook-level swaggerUrl / oea_utils / schema_gen for this definition.
        swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url(swagger_resource_type)
        oea_utils = schema_gen = SAPOpenAPIUtilChild(swagger_url)
        oea_utils.create_definitions()
        schemas = schema_gen.create_spark_schemas()
        
        # NOTE(review): unlike the other SAPToEdFiRefine constructions in this notebook,
        # sap_essential_columns is not passed here - confirm this is intentional.
        sap_to_edfi_client = SAPToEdFiRefine(workspace = workspace, 
                                                    oea = oea, 
                                                    spark = spark,
                                                    sap_oea_utils = oea_utils,
                                                    sap_process_client = sap_process_client,
                                                    logger = logger,
                                                    schema_gen = schema_gen, 
                                                    moduleName = moduleName, 
                                                    authUrl = authUrl, 
                                                    swaggerUrl = swaggerUrl, 
                                                    dataManagementUrl = dataManagementUrl, 
                                                    changeQueriesUrl = changeQueriesUrl, 
                                                    dependenciesUrl = dependenciesUrl, 
                                                    apiVersion = apiVersion, 
                                                    schoolYear = schoolYear,
                                                    districtId = districtId, 
                                                    test_mode = False,
                                                    pipelineExecutionId = pipelineExecutionId,
                                                    error_logger = error_logger,
                                                    natural_upsert_mode = natural_upsert_mode)
        sap_to_edfi_client.set_params(params = {'sap_pipeline': sap_pipeline,
                                                'sap_pipelineType': sap_pipelineType})

        # Split the entity names: descriptors go with the 'Descriptors' definition,
        # everything else with 'Resources'.
        # non_ext_table_names = sap_to_edfi_client.return_non_ext_tables()  
        if swagger_resource_type == 'Resources':
            transform_items = [item for item in non_ext_table_names if not(item.lower().endswith('descriptors'))]
        elif swagger_resource_type == 'Descriptors':
            transform_items = [item for item in non_ext_table_names if item.lower().endswith('descriptors')]
      
        s2r_path = f'stage2/Refined/SAP/analytics/emptySchemas'

        # sap_pipeline / sap_pipelineType are deliberately passed as None for the empty tables.
        dump_empty_schemas(sap_pipeline = None, 
                        sap_pipelineType = None,#NOTE: To Be Reviewed
                        schema_name = schema_name , 
                        s2r_path = s2r_path,
                        ext_entity = ext_entity,
                        transform_mode = transform_mode, 
                        items = transform_items)

In [31]:
# Register in the lake db every empty-schema table that has no refined main table
# and no ingested descriptor table of the same name.
if prepareSAPMetaData:
    # FIXME: Under Review
    if True:#sap_pipeline == 'analytics' or sap_pipeline == 'All':
        # NOTE(review): tables_source is assigned but not read anywhere else in this cell.
        tables_source = f'SAP/pipeline={sap_pipeline}/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}'
        
        # Refined main tables that already exist for this pipeline / pipeline type.
        edfi_mainTables = oea.get_folders(f"stage2/Refined/SAP/analytics/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/general/ed-fi")#oea.get_folders(f"stage2/Ingested/{tables_source}")
        tx_mainTables = oea.get_folders(f"stage2/Refined/SAP/analytics/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{sapVersion}/general/tx")#oea.get_folders(f"stage2/Ingested/{tables_source}") 
        mainTables = edfi_mainTables + tx_mainTables
        # if mainTables != []:
        #    mainTables = [sap_to_edfi_complex.get(item) for item in mainTables if item != 'descriptorTables']
        
        # Ingested descriptor folders, renamed through sap_to_edfi_complex when a mapping
        # exists (the folder name itself is kept otherwise).
        txDescriptorTables = [sap_to_edfi_complex.get(item, item) for item in oea.get_folders(f"stage2/Ingested/SAP/descriptorTables/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/TX") if item != 'descriptorTables']
        edfiDescriptorTables = [sap_to_edfi_complex.get(item, item) for item in oea.get_folders(f"stage2/Ingested/SAP/descriptorTables/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/ed-fi") if item != 'descriptorTables']
        tpdmDescriptorTables = [sap_to_edfi_complex.get(item, item) for item in oea.get_folders(f"stage2/Ingested/SAP/descriptorTables/{sapVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/tpdm") if item != 'descriptorTables']
        descriptorTables = txDescriptorTables + edfiDescriptorTables + tpdmDescriptorTables

        # sap_to_edfi_client is whichever instance an earlier cell bound last.
        # presumably non_common_elements returns the entries of the first list that are
        # absent from the second - verify against SAPToEdFiRefine.
        edfi_emptyTables = oea.get_folders('stage2/Refined/SAP/analytics/emptySchemas/general/ed-fi')
        edfi_emptyTables = sap_to_edfi_client.non_common_elements(edfi_emptyTables, 
                                                             mainTables + descriptorTables)
        tx_emptyTables = oea.get_folders('stage2/Refined/SAP/analytics/emptySchemas/general/tx')
        tx_emptyTables = sap_to_edfi_client.non_common_elements(tx_emptyTables, 
                                                             mainTables + descriptorTables)

        # NOTE(review): pipeline_suffix_mappings only has keys for PEIMS_FALL, PEIMS_MIDYR,
        # TSDS_CLASS_ROSTER_FALL and ANALYTICS; the lookups below raise KeyError for the other
        # pipeline types handled by return_sap_entities (PEIMS_EXYR, TSDS_ECDS_KG, TSDS_ECDS_PK).
        emptyTables_path = f'stage2/Refined/SAP/analytics/emptySchemas/general/ed-fi'
        add_all_empty_tables_to_lake_db(emptyTables_path, 'ed-fi', edfi_emptyTables, pipeline_suffix_mappings[sap_pipelineType])

        emptyTables_path = f'stage2/Refined/SAP/analytics/emptySchemas/general/tx'
        add_all_empty_tables_to_lake_db(emptyTables_path, 'tx', tx_emptyTables, pipeline_suffix_mappings[sap_pipelineType])