In [3]:
# Later cells refer to the same pipeline parameter under several spellings,
# so every spelling is bound to the incoming value here.

# instanceId
instance = instanceId
InstanceId = instanceId

# apiUrl / schoolYear
ApiUrl = apiUrl
SchoolYear = schoolYear

# districtId
districtPath = districtId
DistrictId = districtId
DistrictID = districtId
districtID = districtId

# batchLimit
apiLimit = batchLimit

# prepareSAPMetadata (note the different capitalisation of the alias)
prepareSAPMetaData = prepareSAPMetadata

# sap_pipeline
zone = sap_pipeline
submissions_type = sap_pipeline
submissionsType = sap_pipeline

In [4]:
def assign_default_variable(variable_name, default_value):
    """Bind ``default_value`` to the global ``variable_name`` when it is unset.

    A name that already exists in this module's globals is left untouched.
    When the default is applied, an info message is emitted through the
    module-level ``logger``.

    Args:
        variable_name: Name of the global variable, as a string.
        default_value: Value to bind when the name does not exist yet.
    """
    module_namespace = globals()
    if variable_name in module_namespace:
        return
    module_namespace[variable_name] = default_value
    logger.info(f'{variable_name} not found - using system default')

### Pre-Requisites (Dev)

In [5]:
from notebookutils import mssparkutils
import configparser
import random

config_path = "/tmp/conf.ini"
def copy_config_to_temp():
    """Copy the Ed-Fi credentials INI file to the local path ``config_path``.

    The source is ``stage1/Transactional/SAP/metadata-assets/edfi-configs.ini``,
    resolved to a URL with ``oea.to_url``. The destination is built from the
    module-level ``config_path`` (with the ``file:`` scheme) instead of a second
    hard-coded literal, so the copy target and the path handed to
    ``read_edfi_credentials`` cannot drift apart.
    """
    source_url = oea.to_url("stage1/Transactional/SAP/metadata-assets/edfi-configs.ini")
    mssparkutils.fs.cp(source_url, f"file:{config_path}")

def read_edfi_credentials(config_path):
    """Read the Ed-Fi API client credentials from an INI file.

    Args:
        config_path: Path of the INI file to parse.

    Returns:
        dict: ``{'client_id': ..., 'client_secret': ...}`` taken from the
        ``[EdFi]`` section, with a missing option mapped to ``''``. An empty
        dict is returned when the file cannot be read or has no ``[EdFi]``
        section.
    """
    # Interpolation is switched off so values are returned verbatim. With the
    # default BasicInterpolation a credential containing '%' raises
    # InterpolationSyntaxError as soon as it is read.
    config = configparser.ConfigParser(interpolation=None)
    # ConfigParser.read silently skips files it cannot open.
    config.read(config_path)

    if 'EdFi' not in config:
        return {}

    edfi_section = config['EdFi']
    return {
        'client_id': edfi_section.get('client_id', ''),
        'client_secret': edfi_section.get('client_secret', ''),
    }

### Actual Code

In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import * #StringType, StructType, StructField, IntegerType, DateType

import pyspark.sql.functions as F
from pyspark.sql.functions import col, struct, concat, lit, round, concat, array
from pyspark.sql.functions import regexp_replace, expr, when, date_format, to_date

import uuid
from datetime import datetime
import logging
import json
import csv
import copy

import threading
import requests
from requests.auth import HTTPBasicAuth

import random
import string

In [7]:
print('Submissions - TESTING PARAMETERIZATION')

# Echo every expected pipeline parameter. Each name is looked up on its own so
# that one missing parameter is reported without hiding the values that follow
# it and without skipping the alias assignments below.
_expected_parameters = (
    'kVName',
    'workspace',
    'apiUrl',
    'instanceId',
    'moduleName',
    'apiLimit',
    'minChangeVer',
    'maxChangeVer',
    'sapVersion',
    'prepareSAPMetaData',
    'submissions',
    'sap_pipeline',
    'sap_pipelineType',
    'schoolYear',
    'districtID',
    'pipelineExecutionId',
)
for _parameter_name in _expected_parameters:
    if _parameter_name in globals():
        print(globals()[_parameter_name])
    else:
        print('CATCHING ERROR!!!')
        print(f"name '{_parameter_name}' is not defined")

# Aliases used by later cells. They are assigned independently of the echo
# above and of each other, so one missing parameter does not block the rest.
try:
    kvName = kVName
except NameError as params_error:
    print('CATCHING ERROR!!!')
    print(params_error)

try:
    districtId = districtID
    districtPath = districtId
except NameError as params_error:
    print('CATCHING ERROR!!!')
    print(params_error)

### URL Initializations

In [8]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [9]:
instance_id = instanceId
school_year = schoolYear
api_year = school_year
api_url = apiUrl


def _resolve_edfi_endpoints(api_url, instance_id, api_year):
    """Discover the Ed-Fi endpoints for one API url / instance / year combination.

    Args:
        api_url: Base url handed to ``EdFiApiManager``.
        instance_id: Instance identifier handed to ``EdFiApiManager``.
        api_year: Year handed to ``EdFiApiManager`` (``''`` for no year).

    Returns:
        dict: the manager plus every url / version string the later cells read.

    Raises:
        Exception: whatever ``EdFiApiManager`` raises while discovering urls.
    """
    manager = EdFiApiManager(api_url, instance_id, api_year)
    manager.update_urls()
    manager.set_other_metadata()

    endpoints = {
        'manager': manager,
        'dependencies_url': manager.dependencies_url,
        'openapi_metadata_url': manager.openapi_metadata_url,
        'data_management_url': manager.data_management_url,
        'auth_url': manager.auth_url,
    }

    # Drop the last 13 characters of the referenced url and collapse the
    # '/metadata/' path segment.
    # NOTE(review): the 13-character suffix is not named anywhere in this file - confirm what it strips.
    change_queries_url = manager.get_referenced_url('Change-Queries')
    endpoints['change_queries_url'] = change_queries_url[:-13].replace('/metadata/', '/')
    endpoints['swagger_url'] = manager.get_referenced_url('Resources')

    # The version is used without a leading 'v'.
    api_version = manager.api_version
    endpoints['api_version'] = api_version[1:] if api_version.startswith('v') else api_version
    return endpoints


# FIXME: 2024-01-31 TEMP FIX FOR FY
# First try with the school year; if anything in the discovery fails, retry
# once without a year. The first failure is logged instead of being dropped.
try:
    _edfi_endpoints = _resolve_edfi_endpoints(api_url, instance_id, api_year)
except Exception as error:
    logging.warning(f'Ed-Fi endpoint discovery failed for api_year={api_year!r} ({error!r}); retrying without a year')
    _edfi_endpoints = _resolve_edfi_endpoints(api_url, instance_id, '')

edfi_api_manager = _edfi_endpoints['manager']

dependenciesUrl = _edfi_endpoints['dependencies_url']
openApiMetadataUrl = _edfi_endpoints['openapi_metadata_url']
dataManagementUrl = _edfi_endpoints['data_management_url']
authUrl = _edfi_endpoints['auth_url']

changeQueriesUrl = _edfi_endpoints['change_queries_url']
swagger_url = swaggerUrl = _edfi_endpoints['swagger_url']

apiVersion = _edfi_endpoints['api_version']

### OEA Initializations

In [10]:
%run EdGraph/modules/SAP_PEIMS/v0.6/src/utilities/sap_peim_v0_6_sap_py

In [11]:
from datetime import datetime
# Build the OEA helper used by the rest of the notebook. It is constructed with
# the literal workspace 'dev' and then pointed at the requested `workspace`
# through set_workspace().
# NOTE(review): presumably the constructor needs some valid workspace before the
# real one is applied - confirm against SAPEdFiOEAChild.
oea = SAPEdFiOEAChild(workspace='dev', 
                      logging_level=logging.INFO, 
                      storage_account=None, 
                      keyvault=None, 
                      timezone=None,
                      sap_pipeline = sap_pipeline,
                      sap_pipelineType = sap_pipelineType)   
oea.set_workspace(workspace)
# `ingestionHistoryMode` is not assigned in any visible cell; it has to be
# supplied from outside (pipeline parameter or a %run notebook).
oea.ingestionHistoryMode = ingestionHistoryMode

In [12]:
# swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
# A single OpenAPI helper instance, bound to both `schema_gen` and `oea_utils`.
schema_gen = SAPOpenAPIUtilChild(swagger_url)
oea_utils = schema_gen

# Definitions are created first; the Spark schemas are generated afterwards.
schema_gen.create_definitions()
schemas = oea_utils.create_spark_schemas()

In [13]:
# Set Ed-Fi Credentials
# copy_config_to_temp()

# credentials = read_edfi_credentials(config_path)
# client_id = credentials.get('client_id')
# client_secret_id = credentials.get('client_secret')

### Metadata Processing

In [14]:
# Location of the SAP-to-Ed-Fi mapping document, first as a path and then
# resolved to a full URL through the OEA helper.
metadata_path = "stage1/Transactional/SAP/metadata-assets/sap-to-edfi.json"
metadata_url = oea.to_url(metadata_path)

In [15]:
# Read the mapping document as multi-line JSON; the DataFrame stays cached
# under `jsonDF`.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Only the first row is used, so fetch just that row instead of collecting the
# whole DataFrame, and fail with a clear message when nothing was read (the
# previous `collect()[0]` surfaced this as a bare IndexError).
_metadata_rows = jsonDF.toJSON().take(1)
if not _metadata_rows:
    raise ValueError(f'No metadata rows could be read from {metadata_url}')

json_string = _metadata_rows[0]
config_data = json.loads(json_string)

### SAP & Error Logging Initializations

In [16]:
# Helper that accumulates and persists the ETL log records used further down.
# NOTE(review): `logger` is only assigned in a later cell of this notebook, so
# at this point it presumably comes from one of the %run notebooks - confirm.
error_logger = ErrorLogging(spark = spark, 
                            oea = oea, 
                            logger = logger)

In [17]:
# SAP helper bound to the active Spark session and the OEA helper.
# `sap_utilities` is not referenced again in the visible cells.
sap_utilities = SAPUtilities(spark = spark, 
                             oea = oea)

In [18]:
# Pull the individual sections out of the mapping document. A missing section
# falls back to an empty container of the matching kind.
sap_to_edfi_complex = config_data.get('sap_to_edfi_complex', {})

# Column lists per key, with the 'rundate' column dropped from every list.
final_columns = {
    key: [column_name for column_name in columns if column_name != 'rundate']
    for key, columns in config_data.get('final_columns', {}).items()
}

_ext_TX_cols = config_data.get('_ext_TX_cols', {})
descriptorsDFRef = config_data.get('descriptorsDFRef', {})
descriptors = config_data.get('descriptors', [])

In [19]:
nine_digit_number = random.randint(100000000, 999999999)

# Fall back to a placeholder execution id when the notebook runs without the
# pipeline parameter. The default is registered under 'pipelineExecutionId',
# the name that is actually read on the next line and by later cells;
# registering it under 'pipeline_execution_id' had no effect because that name
# is overwritten immediately, and a missing parameter still raised NameError.
assign_default_variable(variable_name = 'pipelineExecutionId', 
                        default_value = 'Test_1234')
pipeline_execution_id = pipelineExecutionId

# Random 15-character alphanumeric suffix that makes the run id unique.
characters = string.ascii_letters + string.digits
random_word = ''.join(random.choice(characters) for _ in range(15))
run_id = f"runid_{random_word}"

### EdFi Client Initializations

In [20]:
logger = logging.getLogger('EdFiAPIClient')
exception = None
edfiAPIClient = edfiLandClient = None

from datetime import datetime
# The change-version window is reset here, so the client is built without one.
minChangeVer = None
maxChangeVer = None

# kvSecret_clientId = None # oea._get_secret("oea-edfi-api-client-id")
# kvSecret_clientSecret = None # oea._get_secret("oea-edfi-api-client-secret")
# NOTE(review): kvSecret_clientId / kvSecret_clientSecret are not assigned in any
# visible cell; they must be supplied from outside or the construction below
# fails with NameError (which is logged and leaves edfiAPIClient as None).

# No `global` statement is needed (or allowed) here: this code already runs at
# module level, and declaring a name global after it has been assigned in the
# same scope is a SyntaxError.
try:
    edfiAPIClient = SAPEdFiClient(workspace = workspace, 
                                    kvName = kvName, #NOTE: Default to None 
                                    moduleName = moduleName, 
                                    authUrl = authUrl, 
                                    dataManagementUrl = dataManagementUrl, 
                                    changeQueriesUrl = changeQueriesUrl, 
                                    dependenciesUrl = dependenciesUrl, 
                                    apiVersion = apiVersion, 
                                    batchLimit = batchLimit, 
                                    minChangeVer = minChangeVer, 
                                    maxChangeVer = maxChangeVer,
                                    oea = oea,
                                    schoolYear = schoolYear,
                                    districtId = districtId,
                                    final_columns  = final_columns,
                                    lookup_table_name = 'submissions_lookup_table',
                                    lookup_table_base_path = 'stage1/Transactional/SAP/metadata-assets',
                                    lookup_db_name = f'ldb_{workspace}_sap_etl_logs',
                                    kvSecret_clientId = kvSecret_clientId,
                                    kvSecret_clientSecret = kvSecret_clientSecret)
except Exception as client_error:
    # Keep the failure in `exception`. Binding it directly with
    # `except ... as exception` would unbind the name once the handler ends.
    exception = client_error
    logger.exception(client_error)

# FIXME: Temporary workaround to deal with REDACTED values
# edfiAPIClient.clientId = client_id#oea._get_secret('oea-edfi-api-client-id')
# edfiAPIClient.clientSecret = client_secret_id#oea._get_secret('oea-edfi-api-client-secret')

In [21]:
# Fetch the entity information from the Ed-Fi client.
# NOTE: `edfiAPIClient` is still None when its construction in the previous cell
# failed, in which case this line raises AttributeError.
entities_info = edfiAPIClient.getEntities()

### Main Code

In [22]:
from datetime import datetime
from pyspark.sql.functions import lit

class EdfiSubmissionProcessor:
    """Posts stage 3 data to the Ed-Fi API and shapes the resulting submission logs.

    The processor is a thin coordinator around collaborators supplied by the
    caller: the OEA helper (folder listing), the Ed-Fi API client (loading and
    upserting records), the error logger (entity-level log records), a standard
    logger and the Spark session. ``set_submission_type`` must be called before
    ``post_to_edfi_resources`` or ``return_resource_json_dict``, because both
    read ``self.sap_pipeline`` / ``self.sap_pipelineType``.
    """

    # Resources posted under the core '/ed-fi' namespace; every other resource
    # is posted under '/TX'.
    EDFI_CORE_RESOURCES = frozenset((
        'staffs',
        'staffEducationOrganizationEmploymentAssociations',
        'staffEducationOrganizationAssignmentAssociations',
    ))

    def __init__(self, oea, edfiAPIClient, error_logger, logger, spark, pipelineExecutionId, test_mode=False):
        """Store the collaborators and run-level settings.

        Args:
            oea: OEA helper; only ``get_folders`` is used by this class.
            edfiAPIClient: Client exposing ``getDataForEdFiPosts`` and ``upsert_records``.
            error_logger: Log helper exposing ``create_log_dict``,
                ``generate_random_alphanumeric`` and ``consolidate_logs``.
            logger: Standard logger used for progress and failure messages.
            spark: Spark session; its application id is recorded in log records.
            pipelineExecutionId: Identifier of the current pipeline execution.
            test_mode: When true, ``write_logs`` does not write anything.
        """
        self.oea = oea
        self.edfiAPIClient = edfiAPIClient
        self.error_logger = error_logger
        self.logger = logger
        self.spark = spark
        self.test_mode = test_mode
        self.pipelineExecutionId = pipelineExecutionId

    def set_submission_type(self, sap_pipeline, sap_pipelineType):
        """Record the pipeline and pipeline type used for paths and log records."""
        self.sap_pipeline = sap_pipeline
        self.sap_pipelineType = sap_pipelineType

    def extract_resources_dict(self, file_path):
        """Load the records of every resource folder below ``file_path``.

        Folders named 'data-submission-logs' and folders whose name ends with
        'descriptors' (case-insensitive) are skipped.

        Returns:
            tuple: ``(resource_json_dict, resource_names)``. The first item is
            whatever ``edfiAPIClient.getDataForEdFiPosts`` returns; the second
            maps every resource name to itself.
        """
        items = self.oea.get_folders(file_path)
        resource_names = [
            item for item in items
            if item != 'data-submission-logs' and not item.lower().endswith('descriptors')
        ]

        resource_json_dict = self.edfiAPIClient.getDataForEdFiPosts(
            resource_names = resource_names,
            file_path = file_path,
            resource_json_dict = dict())
        return resource_json_dict, {resource: resource for resource in resource_names}

    def post_to_edfi_resources(self, 
                               run_id = None,
                               resource_json_dict = None, 
                               resource_names = None,
                               data_sample = None, 
                               test_mode = True):
        """POST the records of every requested resource and log one entity record each.

        Args:
            run_id: Identifier forwarded to ``upsert_records``.
            resource_json_dict: Mapping of Ed-Fi resource name to its records.
            resource_names: Mapping of stage 3 name to Ed-Fi resource name.
                ``None`` or an empty mapping makes the call a no-op.
            data_sample: Records posted instead of the resource's own records,
                but only when ``test_mode`` is true and the sample is non-empty.
            test_mode: Enables the ``data_sample`` override (default ``True``).

        Raises:
            KeyError: when a resource is missing from ``resource_json_dict``
                (and no sample overrides it).

        A failure while posting or logging one resource is logged with its
        traceback and does not stop the remaining resources.
        """
        # global pipelineExecutionId
        for stage3Name, edfiName in (resource_names or {}).items():
            start_time = datetime.now()

            if test_mode and data_sample:
                data_slice = data_sample
            else:
                data_slice = resource_json_dict[edfiName]

            # The namespace decides both the POST path and the logged entity type.
            if stage3Name in self.EDFI_CORE_RESOURCES:
                namespace, entity_type = 'ed-fi', 'ed-fi'
            else:
                namespace, entity_type = 'TX', 'tx'

            try:
                self.logger.info(f"POST Requests for the Resource: {stage3Name}")
                self.edfiAPIClient.upsert_records(
                    pipeline_execution_id = self.pipelineExecutionId,
                    run_id = run_id,
                    resource = f'/{namespace}/{edfiName}',
                    resource_name = stage3Name,
                    records = data_slice,
                    chunk_size = 500,
                    num_threads = 10,
                    function_name = 'post',
                    success_logging = True,
                    error_logging = True
                )

                end_time = datetime.now()
                log_data = self.error_logger.create_log_dict(
                    uniqueId = self.error_logger.generate_random_alphanumeric(10),
                    pipelineExecutionId = self.pipelineExecutionId,
                    sparkSessionId = self.spark.sparkContext.applicationId,
                    sap_pipeline = self.sap_pipeline,
                    sap_pipelineType = self.sap_pipelineType,
                    stageName = "Submission",
                    schemaFormat = 'ed-fi',
                    entityType = entity_type,
                    entityName = edfiName,
                    numInputRows = len(data_slice),
                    totalNumOutputRows = len(data_slice),
                    numTargetRowsInserted = 0,
                    numTargetRowsUpdated = 0,
                    numRecordsSkipped = 0,
                    numRecordsDeleted = 0,
                    start_time = start_time,
                    end_time = end_time,
                    insertionType = 'upsert',
                    emptySchemaMetadata = False
                )

                self.error_logger.consolidate_logs(log_data, 'entity')
            except Exception:
                # Logged at error level with the traceback; an INFO line that
                # only carries the message is easy to miss and hard to debug.
                self.logger.exception(f"[POST TO ED-FI] Submission failed for the resource: {stage3Name}")

    def process_logs(self, df, sap_pipeline, sap_pipelineType, is_post_success, run_date):
        """Add the submission-level columns to a log DataFrame.

        ``entityType`` and ``entityName`` are the second and third elements of
        the ``resource`` column split on '/'.
        NOTE(review): this assumes ``resource`` holds the '/<namespace>/<name>'
        value passed to ``upsert_records`` - confirm against the client's log schema.

        Returns:
            The DataFrame with the additional columns.
        """
        resource_parts = F.split(col('resource'), '/')
        return (
            df.withColumn('sap_pipeline', lit(sap_pipeline))
              .withColumn('sap_pipelineType', lit(sap_pipelineType))
              .withColumn('is_post_success', lit(is_post_success))
              .withColumn('run_date', lit(run_date))
              .withColumn('entityType', resource_parts[1])
              .withColumn('entityName', resource_parts[2])
              .withColumn('log_type', lit('submission'))
        )

    def write_logs(self, df, log_file_url):
        """Append the logs to the delta location, unless the processor is in test mode.

        Returns:
            tuple: ``(df, log_file_url)`` unchanged in test mode (nothing is
            written), otherwise ``(None, None)`` after the append.
        """
        if self.test_mode:
            self.logger.info('TEST MODE - LOGS NOT WRITTEN')
            return df, log_file_url

        # self.spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
        df.write.format('delta').partitionBy('sap_pipeline', 'sap_pipelineType', 'entityName').mode('append').save(log_file_url)
        return None, None

    def return_resource_json_dict(self, apiVersion, ext_type):
        """Load the stage 3 records for one extension type ('ed-fi' or 'tx').

        Returns:
            tuple: ``(resource_json_dict, resource_names)`` as produced by
            ``extract_resources_dict``. When loading fails the error is logged
            and two empty dicts are returned, so callers always receive
            mappings (the names used to come back as a list in that case).
        """
        file_path = f'stage3/pipeline={self.sap_pipeline}/pipelineType={self.sap_pipelineType}/{apiVersion}/general/{ext_type}'
        try:
            resource_json_dict, resource_names = self.extract_resources_dict(file_path)
        except Exception:
            self.logger.exception(f'[POST TO ED-FI] Loading Data From Stage 3 {file_path}')
            resource_json_dict = dict()
            resource_names = dict()
        return resource_json_dict, resource_names

In [23]:
# Notebook-level switches: with `test_mode` on, the submission cell posts only
# `testEntity` instead of every resource.
test_mode = False
testEntity = 'contractedInstructionalStaffFTEExts'
# Re-bind the pipeline name (and its `zone` alias) from `submissions_type`.
sap_pipeline = zone = submissions_type #= 'peims-submissions' 

# NOTE(review): the processor receives the literal False rather than the
# `test_mode` variable above, so flipping the variable does not stop
# write_logs() from writing - confirm this is intended.
edfiSubmissionProcessor = EdfiSubmissionProcessor(oea = oea, 
                                                  edfiAPIClient = edfiAPIClient, 
                                                  error_logger = error_logger, 
                                                  logger = logger, 
                                                  spark = spark, 
                                                  pipelineExecutionId = pipelineExecutionId, 
                                                  test_mode = False)

In [24]:
edfiSubmissionProcessor.set_submission_type(sap_pipeline = sap_pipeline,
                                            sap_pipelineType = sap_pipelineType)

# Load the stage 3 records: TX extension first, then core Ed-Fi.
tx_resource_json_dict, tx_resource_names = edfiSubmissionProcessor.return_resource_json_dict(
    apiVersion = apiVersion,
    ext_type = 'tx')
edfi_resource_json_dict, edfi_resource_names = edfiSubmissionProcessor.return_resource_json_dict(
    apiVersion = apiVersion,
    ext_type = 'ed-fi')

# In test mode the sample is the test entity's records (core Ed-Fi takes
# precedence over TX); outside test mode there is no sample.
if not test_mode:
    data_sample = None
elif testEntity in edfi_resource_json_dict:
    data_sample = edfi_resource_json_dict[testEntity]
elif testEntity in tx_resource_json_dict:
    data_sample = tx_resource_json_dict[testEntity]

# Post core Ed-Fi resources first, then the TX extension. Each pass works on a
# deep copy of the loaded records.
_submission_passes = (
    ('ed-fi', edfi_resource_json_dict, edfi_resource_names),
    ('tx', tx_resource_json_dict, tx_resource_names),
)
for ext_type, _loaded_records, _loaded_names in _submission_passes:
    copy_resource_json_dict = copy.deepcopy(_loaded_records)
    resource_names = _loaded_names

    if test_mode:
        # Only the test entity is posted, and only in the pass that holds it.
        if testEntity in copy_resource_json_dict:
            edfiSubmissionProcessor.post_to_edfi_resources(
                run_id = run_id,
                resource_json_dict = copy_resource_json_dict,
                resource_names = {testEntity: testEntity},
                data_sample = data_sample)
    elif copy_resource_json_dict != dict():
        edfiSubmissionProcessor.post_to_edfi_resources(
            run_id = run_id,
            resource_json_dict = copy_resource_json_dict,
            resource_names = resource_names,
            data_sample = None)

### Ed-Fi IDs - Lookup Table

In [25]:
# TODO: Partitioning by ['entity_name']
# NOTE: current version of upsert has bug
# UPSERT is only valid for partitionings DI and SY; otherwise dynamic overwrites
# For Partitions
edfi_ids_lookup_table_path = 'stage1/Transactional/SAP/metadata-assets/edfi_ids_lookup_table'
if edfiAPIClient.edfi_id_records != [] and sap_pipeline != 'analytics':
    # Build the DataFrame of Ed-Fi id records collected by the client; an
    # empty collection yields an empty DataFrame with the same schema.
    if len(edfiAPIClient.edfi_id_records) != 0:
        edfi_id_records_df = spark.createDataFrame(edfiAPIClient.edfi_id_records,
                                                   schema = edfiAPIClient.edfi_id_record_schema)
    else:
        edfi_id_records_df = spark.createDataFrame([],
                                                   schema = edfiAPIClient.edfi_id_record_schema)

    # Unpartitioned upsert keyed on 'edfi_location'.
    oea.upsert(
        df = edfi_id_records_df,
        destination_path = edfi_ids_lookup_table_path,
        primary_key = 'edfi_location',
        partitioning = False,
        partitioning_cols = [],
        surrogate_key = False,
    )

    # (Re)register the delta location as a table in the logs database.
    spark.sql(f'CREATE DATABASE IF NOT EXISTS ldb_{workspace}_sap_etl_logs')
    spark.sql(f"drop table if exists ldb_{workspace}_sap_etl_logs.edfi_ids_lookup_table")
    spark.sql(f"create table if not exists ldb_{workspace}_sap_etl_logs.edfi_ids_lookup_table using DELTA location '{oea.to_url(edfi_ids_lookup_table_path)}'")

### Submissions - Lookup Table

In [26]:
# Persist the submissions lookup table only when the client recorded any
# max run dates, then (re)register it in the lake database.
if edfiAPIClient.max_rundates != []:
    edfiAPIClient.dump_lookup_table()
    edfiAPIClient.add_lookup_table_to_lake_db(overwrite = True)

### Error Logging

In [27]:
# Persist the entity-level log records collected during the submission, then
# register them in the lake database. Skipped when there are no records or for
# the 'analytics' pipeline.
_entity_log_helper = edfiSubmissionProcessor.error_logger
if _entity_log_helper.entity_logs != [] and sap_pipeline != 'analytics':
    df_logs = _entity_log_helper.create_spark_df('entity')
    _entity_log_helper.write_logs_to_delta_lake(
        df = df_logs,
        log_type = 'entity',
        destination_url = _entity_log_helper.to_logs_url('etl-logs/log_type=entity'),
    )
    _entity_log_helper.add_etl_logs_to_lake_db(
        db_name = f'ldb_{workspace}_sap_etl_logs',
        logs_base_path = 'etl-logs',
        log_type = 'entity',
        overwrite = False,
    )

In [28]:
if sap_pipeline != 'analytics':
    # One DataFrame per outcome, both using the client's log schema. An empty
    # log list yields an empty DataFrame with that schema.
    if len(edfiAPIClient.error_logs) != 0:
        error_df = spark.createDataFrame(edfiAPIClient.error_logs, schema = edfiAPIClient.log_schema)
    else:
        error_df = spark.createDataFrame([], schema = edfiAPIClient.log_schema)

    if len(edfiAPIClient.success_logs) != 0:
        success_df = spark.createDataFrame(edfiAPIClient.success_logs, schema = edfiAPIClient.log_schema)
    else:
        success_df = spark.createDataFrame([], schema = edfiAPIClient.log_schema)

    # Run date as YYYY-MM-DD, taken from the current local time.
    current_datetime = datetime.today()
    run_date = current_datetime.strftime("%Y-%m-%d")
    test_mode = False

In [29]:
if sap_pipeline != 'analytics':
    # Columns shared by the success and error logs.
    # NOTE: If all is passed all will be the submission_type
    _shared_log_columns = dict(sap_pipeline = sap_pipeline,
                               sap_pipelineType = sap_pipelineType,
                               run_date = run_date)

    success_df = edfiSubmissionProcessor.process_logs(df = success_df,
                                                      is_post_success = True,
                                                      **_shared_log_columns)
    error_df = edfiSubmissionProcessor.process_logs(df = error_df,
                                                    is_post_success = False,
                                                    **_shared_log_columns)

    # Successes first, then errors, in one DataFrame.
    logs_df = success_df.union(error_df)

In [30]:
if sap_pipeline != 'analytics':
    # Append the combined submission logs to their delta location and
    # register them in the lake database.
    _submission_log_helper = edfiSubmissionProcessor.error_logger
    log_file_url = _submission_log_helper.to_logs_url('etl-logs/log_type=submissions')
    df_delta, _ = edfiSubmissionProcessor.write_logs(df = logs_df,
                                                     log_file_url = log_file_url)
    _submission_log_helper.add_etl_logs_to_lake_db(
        db_name = f'ldb_{workspace}_sap_etl_logs',
        logs_base_path = 'etl-logs',
        log_type = 'submissions',
        overwrite = True,
    )

### Update Stage 3

In [31]:
ext_types = ['ed-fi', 'tx']
primary_key = ['NATURAL_KEY_HASH', 'sap_pipeline','sap_pipelineType']
# Only the Ed-Fi identifiers are written back; all other sink columns stay as they are.
update_cols = {"sink.edfi_location": "updates.edfi_location", 
               "sink.edfi_id": "updates.edfi_id", 
               "sink.edfi_id_modified": "updates.edfi_id_modified"}

if sap_pipeline != 'analytics':
    # `logs_df` is only built for non-analytics runs, so the temp view is
    # registered inside the guard; outside it an analytics run failed with
    # NameError before reaching this check.
    logs_df.createOrReplaceTempView('temp_edfi_logs_table')

    for ext_type in ext_types:
        file_path = f'stage3/pipeline={sap_pipeline}/pipelineType={sap_pipelineType}/{apiVersion}/general/{ext_type}'
        items = oea.get_folders(file_path)
        for item in items:
            logger.info(f'Updating stage 3 delta lake for the item - {item}')
            # Successful submissions of this entity, with the identifiers
            # returned for them.
            query = f"""SELECT NATURAL_KEY_HASH, 
                                         sap_pipeline as sap_pipeline,
                                         sap_pipelineType as sap_pipelineType,
                                         edfi_location, 
                                         edfi_id, 
                                         edfi_id_modified 
                      FROM temp_edfi_logs_table
                      WHERE is_post_success = True
                        AND entityName = '{item}'
                        """
            temp_df = spark.sql(query)
            entity_path = f"{file_path}/{item}"
            destination_url = oea.to_url(entity_path)

            pk_statement = oea.return_pk_statement(primary_key)
            if DeltaTable.isDeltaTable(spark, destination_url):
                logger.info('TRUE UPSERT')
                delta_table_sink = DeltaTable.forPath(spark, destination_url)
                # Update-only merge: matched rows get the new identifiers,
                # unmatched log rows are not inserted.
                delta_table_sink.alias('sink').merge(temp_df.alias('updates'), pk_statement).whenMatchedUpdate(set = update_cols).execute()# .whenNotMatchedInsert(values = insert_cols).execute()
            else:
                logger.error(f'Invalid stage 3 delta location for the item - {item}')