In [2]:
# Re-bind the incoming parameter values under every spelling used in this notebook.
# NOTE(review): InstanceId, ApiUrl, SchoolYear, districtID, batchLimit and
# prepareEdFiMetadata are not defined in this notebook's visible cells - presumably
# they are injected as pipeline/notebook parameters; confirm.
instanceId = InstanceId
instance = InstanceId
apiUrl = ApiUrl
schoolYear = SchoolYear

# Every district-id spelling resolves to the single incoming `districtID` value.
districtId = districtID
DistrictID = districtID
DistrictId = districtID

apiLimit = batchLimit

prepareEdFiMetaData = prepareEdFiMetadata

### URL Initializations

In [2]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [3]:
# Snake-case aliases of the notebook parameters, used to construct the API manager.
instance_id = instanceId
school_year = schoolYear
api_url = apiUrl

# NOTE(review): EdFiApiManager is not defined in this notebook - presumably it comes
# from the %run'd fetch-urls utility notebook; confirm.
# update_urls() and set_other_metadata() are called before any attribute is read below.
edfi_api_manager = EdFiApiManager(api_url, instance_id, school_year)
edfi_api_manager.update_urls()
edfi_api_manager.set_other_metadata()

# Re-publish the manager's URL attributes under the camelCase names used by the notebook.
dependenciesUrl = edfi_api_manager.dependencies_url
openApiMetadataUrl = edfi_api_manager.openapi_metadata_url
dataManagementUrl = edfi_api_manager.data_management_url
authUrl = edfi_api_manager.auth_url

# Change-queries URL: drop the last 13 characters, then collapse '/metadata/' to '/'.
# NOTE(review): 13 == len('/swagger.json'), so the slice presumably strips that suffix;
# it will silently cut the wrong characters if the referenced URL ends differently - confirm.
changeQueriesUrl = edfi_api_manager.get_referenced_url('Change-Queries')
changeQueriesUrl = changeQueriesUrl[:-13].replace('/metadata/', '/')
# The 'Resources' URL is bound under both naming styles.
swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Resources')

# Normalise the API version by removing a single leading 'v' when present;
# the result is interpolated into the stage1 paths built later in the notebook.
apiVersion = edfi_api_manager.api_version
apiVersion = apiVersion[1:] if apiVersion.startswith('v') else apiVersion

### OEA Initializations

In [4]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_edfi_py

In [5]:
# Create the OEA helper object that later cells use for to_url(), get_folders() and
# ingest(), then call set_workspace() with the `workspace` parameter.
# NOTE(review): EdFiOEAChild is presumably defined by the %run'd edfi_py utility notebook - confirm.
oea = EdFiOEAChild()   
oea.set_workspace(workspace)

### Error Logging Initializations

In [ ]:
# Shared ETL log collector: the ingestion tasks add entity-level records to it and the
# final cell of the notebook writes them out.
error_logger = ErrorLogging(spark=spark, oea=oea, logger=logger)

### Main Code

In [13]:
from concurrent.futures import ThreadPoolExecutor


def threaded_task(input_tuple):
    """Ingest a single entity folder via ``oea.ingest`` and record an entity-level ETL log.

    Takes one tuple argument so it can be mapped over a thread pool.

    Reads these notebook globals: ``logger``, ``oea``, ``error_logger``, ``spark``,
    ``pipelineExecutionId``, ``landingDateTimeFormat`` and ``ingestionHistoryMode``.

    Args:
        input_tuple: ``(item, full_source_path, tables_source)``.
            ``item`` is the entity folder name, ``tables_source`` is the path prefix the
            entity lives under. ``full_source_path`` is accepted for call compatibility
            but is not needed here.

    Returns:
        None. Exceptions derived from ``Exception`` are logged with the entity path
        instead of being raised, so one failing entity does not abort the others.
    """
    item, _full_source_path, tables_source = input_tuple
    entity_path = f"{tables_source}/{item}"
    options = {'source_format': 'json', 'multiline': False}
    try:
        logger.info(oea.to_url(f'stage2/Ingested/{entity_path}'))

        # These two names are bookkeeping artefacts that sit next to the entity folders.
        if item in ('metadata.csv', 'parameterizedRunLogs'):
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            return

        start_time = datetime.now()
        number_of_inbound_changes = oea.ingest(entity_path,
                                               primary_key='id',
                                               hashing = False,
                                               natural_key = None,
                                               landingDateTimeFormat = landingDateTimeFormat,
                                               ingestionHistoryMode = ingestionHistoryMode,
                                               options=options)
        end_time = datetime.now()

        # The count returned by oea.ingest is reported as input, output and inserted rows;
        # updated / skipped / deleted counts are always reported as 0 at this stage.
        log_data = error_logger.create_log_dict(uniqueId = error_logger.generate_random_alphanumeric(10), # Generate a random 10-character alphanumeric value
                                                pipelineExecutionId = pipelineExecutionId,
                                                sparkSessionId = spark.sparkContext.applicationId,
                                                stageName = "ed-fi: Ingestion",
                                                schemaFormat = 'ed-fi: nested',
                                                entityType =  'ed-fi+tx',
                                                entityName = item,
                                                numInputRows = number_of_inbound_changes,
                                                totalNumOutputRows = number_of_inbound_changes,
                                                numTargetRowsInserted = number_of_inbound_changes,
                                                numTargetRowsUpdated = 0,
                                                numRecordsSkipped = 0,
                                                numRecordsDeleted = 0,
                                                start_time = start_time,
                                                end_time = end_time,
                                                insertionType = 'append' if ingestionHistoryMode else 'upsert')
        error_logger.consolidate_logs(log_data,'entity')

    except AnalysisException as error:
        # Previously swallowed silently. Still non-fatal, but now visible in the run log
        # so a skipped entity can be traced.
        logger.warning(f'[INGESTION THREAD] Skipped {entity_path} (AnalysisException): {error}')
    except Exception as error:
        # Include the entity path: with several worker threads a bare error message
        # cannot be attributed to an entity.
        logger.error(f'[INGESTION THREAD] Failed to ingest {entity_path}: {error}')

def ingest_edfi_dataset(tables_source, 
                        items = 'All',
                        max_workers = 8):
    """Run ``threaded_task`` for every requested entity under ``tables_source`` using a thread pool.

    Args:
        tables_source: Path below ``stage1/Transactional/`` that holds the entity folders.
        items: Either the string ``'All'`` (list every folder under the source path via
            ``oea.get_folders``) or an iterable of entity folder names.
        max_workers: Size of the thread pool. Defaults to 8, the previously hard-coded value.

    Returns:
        None. The call returns once all submitted tasks have finished, because leaving the
        executor's ``with`` block waits for pending work.
    """
    full_source_path = f"stage1/Transactional/{tables_source}"
    if items == 'All':
        items = oea.get_folders(full_source_path)

    # One argument tuple per entity, in the shape threaded_task unpacks.
    tasks = [(item, full_source_path, tables_source) for item in items]
    if not tasks:
        # Nothing is due for this source; avoid starting a pool for no work.
        logger.info(f'[INGESTION THREAD] No entities to ingest for {tables_source}')
        return

    with ThreadPoolExecutor(max_workers=max_workers) as tpe:
        logger.info('[INGESTION THREAD] Entered Threadpool')
        # The map results are intentionally not consumed: threaded_task returns None and
        # logs its own failures.
        tpe.map(threaded_task, tasks)

In [11]:
from datetime import datetime
import math
# All frequency-ETL assets for this district / school year live under one stage1 folder.
# NOTE(review): the module segment is hard-coded as 'Ed-Fi' here while the ingestion
# cell builds its paths from `moduleName` - confirm the two always agree.
frequency_assets_root = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets'

# Lookup file driving which entities are due for ETL; the destination currently
# aliases the same file.
source_path = f'{frequency_assets_root}/frequency_etl.csv'
destination_path = source_path
# One run-log file name per calendar day.
logs_path = f"{frequency_assets_root}/_frequency_etl_logs/run_logs_{datetime.today().strftime('%Y-%m-%d')}.csv"

# The four delta thresholds are notebook parameters (example values previously noted
# inline: 0.005, 5, 10, 360 - units are not visible in this notebook).
processor = EntityFrequencyProcessor(oea = oea,
                                     filepath = source_path,
                                     highFrequentDelta = highFrequentDelta,
                                     moderateFrequentDelta = moderateFrequentDelta,
                                     lowFrequentDelta = lowFrequentDelta,
                                     descriptorsDelta = descriptorsDelta)

processor.load_lookup_df()
_, entities_to_etl = processor.return_entities_to_etl()

# Entities due for ETL per namespace; a missing namespace yields an empty list.
# To force a full run instead, assign 'All' - ingest_edfi_dataset then lists every
# folder under the source path.
edfiEntities = entities_to_etl.get('ed-fi', [])
tpdmEntities = entities_to_etl.get('tpdm', [])

In [14]:
from datetime import datetime
# Ingest each namespace in turn (ed-fi first, then tpdm) with the entity selection
# computed for it by the frequency processor.
for _namespace, _entities in (('ed-fi', edfiEntities), ('tpdm', tpdmEntities)):
    ingest_edfi_dataset(
        f'{moduleName}/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/{_namespace}',
        _entities,
    )

### Writing Logs

In [ ]:
# Write the collected entity-level logs only when the collector is non-empty.
if error_logger.entity_logs != []:
    logger.info('[INGESTION ERROR LOGGING] Writing Entity Level Error Logs')

    # Build the Spark DataFrame first, then resolve the destination and write it.
    df = error_logger.create_spark_df('entity')
    _entity_logs_url = error_logger.to_logs_url('etl-logs/log_type=entity')
    error_logger.write_logs_to_delta_lake(df=df,
                                          log_type='entity',
                                          destination_url=_entity_logs_url)

    # Register the entity logs in the workspace's ETL-logs lake database.
    error_logger.add_etl_logs_to_lake_db(
        db_name=f'ldb_{workspace}_edfi_etl_logs',
        logs_base_path='etl-logs',
        log_type='entity',
        overwrite=True,
    )