In [16]:
import copy
import pyspark.sql.functions as f
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [17]:
from notebookutils import mssparkutils
import configparser

# Local (driver-side) path where the Ed-Fi credentials INI file is staged so that
# configparser can read it.
config_path = "/tmp/conf.ini"
def copy_config_to_temp(source_url = None, local_path = None):
    """Copy the Ed-Fi credentials INI file from the data lake to the local file system.

    Args:
        source_url: URL of the INI file to copy. Defaults to the sandbox
            configuration file used by this notebook.
            NOTE(review): the default storage account and dated file name look
            environment-specific - confirm before running elsewhere.
        local_path: Local destination path. Defaults to the notebook-level
            ``config_path`` so the copy target and the path later handed to
            ``read_edfi_credentials`` cannot drift apart.
    """
    if source_url is None:
        source_url = "abfss://oea@yourstorageaccount.dfs.core.windows.net/sandboxes/configs/edfi-configs-2024-02-01.ini"
    if local_path is None:
        local_path = config_path
    # The "file:" scheme makes mssparkutils write to the local file system.
    mssparkutils.fs.cp(source_url, f"file:{local_path}")

def read_edfi_credentials(config_path):
    """Read the Ed-Fi API credentials from an INI file.

    Args:
        config_path: Path of the INI file to parse.

    Returns:
        A dict with the keys ``client_id``, ``client_secret`` and
        ``instance_id`` taken from the ``[EdFi]`` section; an option that is
        absent yields ``''``. When the file has no ``[EdFi]`` section the
        result is an empty dict. ``ConfigParser.read`` skips files it cannot
        open, so a missing file also yields an empty dict.
    """
    parser = configparser.ConfigParser()
    parser.read(config_path)

    if not parser.has_section('EdFi'):
        return {}

    return {
        option: parser.get('EdFi', option, fallback = '')
        for option in ('client_id', 'client_secret', 'instance_id')
    }

# Stage the credentials file locally, parse it, and expose the three settings as
# notebook-level variables (client_id, client_secret_id, instanceId).
try:
    copy_config_to_temp()
    edfi_credentials = read_edfi_credentials(config_path)
    # read_edfi_credentials returns an empty dict when the [EdFi] section is
    # missing, in which case the first lookup below raises KeyError.
    client_id = edfi_credentials['client_id']
    client_secret_id = edfi_credentials['client_secret']
    instanceId = edfi_credentials['instance_id']
except Exception as error:
    # Best-effort: the failure is only printed and the notebook keeps running.
    # NOTE(review): on failure the three variables above are not (re)assigned
    # here, so later cells see whatever values - if any - they had before.
    print(f'Error Message - {error}')

In [18]:
# Bind alternative spellings of the run parameters so that code using any of the
# naming variants resolves to the same value.
# NOTE(review): apiUrl, schoolYear, districtId, batchLimit and prepareEdFiMetadata
# are not defined in the visible cells - presumably notebook/pipeline parameters;
# confirm they are set before this cell runs.
instance = InstanceId = instanceId
ApiUrl = apiUrl
SchoolYear = schoolYear
DistrictId = DistrictID = districtID = districtId
apiLimit = batchLimit

# Both capitalisations are used further down (prepareEdFiMetaData / prepareEdFiMetadata).
prepareEdFiMetaData = prepareEdFiMetadata

### URL Initializations

In [19]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [20]:
# snake_case copies of the run parameters, passed to the URL manager below.
instance_id = instanceId
school_year = schoolYear
api_url = apiUrl

# EdFiApiManager is not defined in the visible cells - presumably provided by the
# %run of edfi_v0_7_fetch_urls above.
edfi_api_manager = EdFiApiManager(api_url, instance_id, school_year)
edfi_api_manager.update_urls()
edfi_api_manager.set_other_metadata()

# Endpoint URLs exposed by the manager, re-bound to the names used by the rest of
# the notebook.
dependenciesUrl = edfi_api_manager.dependencies_url
openApiMetadataUrl = edfi_api_manager.openapi_metadata_url
dataManagementUrl = edfi_api_manager.data_management_url
authUrl = edfi_api_manager.auth_url

changeQueriesUrl = edfi_api_manager.get_referenced_url('Change-Queries')
# Drops the last 13 characters and replaces '/metadata/' with '/'.
# NOTE(review): 13 is presumably len('/swagger.json'); confirm the referenced URL
# always ends with that suffix, otherwise this slice truncates the wrong text.
changeQueriesUrl = changeQueriesUrl[:-13].replace('/metadata/', '/')
swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Resources')

# Strip a leading 'v' from the API version, if present.
apiVersion = edfi_api_manager.api_version
apiVersion = apiVersion[1:] if apiVersion.startswith('v') else apiVersion

### OEA Initializations

In [21]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_edfi_py

In [22]:
# Create the OEA helper used throughout the notebook and point it at the target workspace.
# EdFiOEAChild, workspace and ingestionHistoryMode are not defined in the visible
# cells - presumably supplied by the %run above and by notebook parameters.
# FIXME: 2024-02-12: ingestionHistoryMode Under Review
oea = EdFiOEAChild()   
oea.set_workspace(workspace)
oea.ingestionHistoryMode = ingestionHistoryMode

In [23]:
# Build the OpenAPI helper from the 'Resources' swagger URL resolved earlier and
# derive Spark schemas from it. oea_utils and schema_gen are two names for the
# same object.
# swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
oea_utils = schema_gen = OpenAPIUtil(swagger_url)
oea_utils.create_definitions()
schemas = schema_gen.create_spark_schemas()

### Error Logging Initializations

In [24]:
# Shared error logger; its entity_logs are written out in the final cell.
# NOTE(review): ErrorLogging, spark and logger are not defined in the visible
# cells - presumably provided by the session and the %run notebooks.
error_logger = ErrorLogging(spark = spark, 
                            oea = oea, 
                            logger = logger)

### Threading Utilities

In [25]:
def upsert_data(df_changes,
                metadata,
                schema_name,
                transform_mode,
                table_name,
                primary_key,
                ext_entity,
                sink_general_path,
                sink_sensitive_path):
    """Pseudonymize a batch of changes and persist both resulting frames.

    The pseudonymized frame is handed to ``edfiRefineAgent.transform`` for the
    general sink. Unless the sensitive sink is an empty-schema path, the lookup
    frame is upserted to the sensitive sink and registered in the lake database.

    Args:
        df_changes: DataFrame holding the rows to process.
        metadata: Metadata passed through to ``oea.pseudonymize``.
        schema_name: Schema name forwarded to the transform step.
        transform_mode: Forwarded to ``oea.pseudonymize``.
        table_name: Table name forwarded to the transform step.
        primary_key: Accepted for interface compatibility but not used; the
            transform step always receives ``'id_pseudonym'`` and the lookup
            upsert always uses ``'id'``.
        ext_entity: Extension entity forwarded to the transform step.
        sink_general_path: Destination of the transformed data.
        sink_sensitive_path: Destination of the lookup data. When it contains
            ``'/emptySchemas/'`` the lookup upsert and lake-db registration
            are skipped.
    """
    pseudonymized_df, lookup_df = oea.pseudonymize(df_changes,
                                                   metadata,
                                                   transform_mode,
                                                   True)

    edfiRefineAgent.transform(df = pseudonymized_df,
                              schema_name = schema_name,
                              table_name = table_name,
                              primary_key = 'id_pseudonym',
                              ext_entity = ext_entity,
                              sink_general_path = sink_general_path,
                              districtId_col_name = 'DistrictId',
                              schoolYear_col_name = 'SchoolYear')

    # Empty-schema runs have no lookup data worth persisting.
    if '/emptySchemas/' in sink_sensitive_path:
        return

    oea.upsert(df = lookup_df,
               destination_path = sink_sensitive_path,
               primary_key = 'id',
               partitioning = True,
               partitioning_cols = ['DistrictId', 'SchoolYear'])
    oea.add_to_lake_db(source_entity_path = sink_sensitive_path,
                       overwrite = True,
                       extension = None)

In [26]:
def threaded_task_empty_schema(input_tuple):
    """Worker: write an empty, schema-only table for one entity if none exists yet.

    Args:
        input_tuple: ``(item, schema_name, s2r_path, ext_entity, metadata,
            transform_mode, districtId, schoolYear)``. ``item`` is the entity
            name and also the key into ``edfiRefineAgent.schemas``.

    Any exception raised after the tuple is unpacked is logged with a traceback
    and swallowed, so one failing entity does not abort the pool.
    """
    # FIXME: 2024-02-07: Threading Under Dev
    (item, schema_name, s2r_path, ext_entity,
     metadata, transform_mode, districtId, schoolYear) = input_tuple

    table_name = item #sap_to_edfi_complex[item]
    try:
        logger.info('[REFINEMENT EMPTY SCHEMA DUMPING THREAD] Path does not exist - attempting to create empty data frame')
        general_sink = f'{s2r_path}/general/{schema_name}/{item}'
        sensitive_sink = f'{s2r_path}/sensitive/{schema_name}/{item}_lookup'

        general_sink = edfiRefineAgent.sink_path_cleanup(general_sink)
        sensitive_sink = edfiRefineAgent.sink_path_cleanup(sensitive_sink)

        # Nothing to do when the general sink has already been written.
        if oea.path_exists(general_sink):
            return

        # FIXME: 2024-02-07 TEMP FIX
        # Work on a copy of the schema so the shared definition is not touched.
        entity_schema = copy.deepcopy(edfiRefineAgent.schemas[table_name])
        empty_df = spark.createDataFrame(data = [], schema = entity_schema)
        empty_df = (empty_df
                    .withColumn('DistrictId', F.lit(districtId))
                    .withColumn('SchoolYear', F.lit(schoolYear)))

        # Audit/bookkeeping columns, appended in this exact order.
        stamp = datetime.now()
        audit_columns = (('LastModifiedDate', stamp),
                         ('rowIsActive', True),
                         ('rundate', stamp),
                         ('stage1_source_url', 'placeholder'))
        for column_name, column_value in audit_columns:
            empty_df = empty_df.withColumn(column_name, F.lit(column_value))

        if 'id' not in empty_df.columns:
            logger.info(f'[REFINEMENT EMPTY SCHEMA DUMPING THREAD] {item} does not have id as primary key - flagged for future')
            return

        upsert_data(empty_df,
                    metadata,
                    schema_name,
                    transform_mode,
                    table_name,
                    'id',
                    ext_entity,
                    general_sink,
                    sensitive_sink)
    except Exception as error:
        logger.exception(f"[REFINEMENT EMPTY SCHEMA DUMPING THREAD] {error}")

def dump_empty_schemas(schema_name, 
                       s2r_path,
                       ext_entity,
                       transform_mode, 
                       items = None):
    """Write empty, schema-only tables for the given entities using a thread pool.

    Reads the notebook-level ``districtId``, ``schoolYear`` and ``metadata``
    and passes them to every worker.

    Args:
        schema_name: Target schema; ``None`` is treated as ``'ed-fi'``.
        s2r_path: Base path under which the tables are written.
        ext_entity: Extension entity forwarded to the worker.
        transform_mode: Forwarded to the worker.
        items: Entity names to process. ``None`` (the default) or an empty
            list means nothing is processed. Items whose name ends in
            ``'exts'`` (case-insensitive) are written under schema ``'tx'``
            instead of ``schema_name``.
    """
    if schema_name is None:
        schema_name = 'ed-fi'

    work_items = [
        (item,
         'tx' if item.lower().endswith('exts') else schema_name,
         s2r_path,
         ext_entity,
         metadata,
         transform_mode,
         districtId,
         schoolYear)
        for item in (items or [])
    ]

    with ThreadPoolExecutor(max_workers=8) as tpe:
        logger.info('[REFINEMENT EMPTY SCHEMA DUMPING] Entered Threadpool')
        futures = [tpe.submit(threaded_task_empty_schema, work_item) for work_item in work_items]

    # Leaving the `with` block waits for every task. Inspect each future so an
    # exception that escaped a worker is reported instead of silently dropped.
    for work_item, future in zip(work_items, futures):
        failure = future.exception()
        if failure is not None:
            logger.error(f'[REFINEMENT EMPTY SCHEMA DUMPING] {work_item[0]}: {failure}')
    
    
def threaded_task(input_tuple):
    """Worker: refine one ingested table from stage2/Ingested.

    Reads the notebook-level ``districtId`` and ``schoolYear`` in addition to
    the values carried in ``input_tuple``.

    Args:
        input_tuple: ``(item, schema_name, tables_source, ext_entity, metadata,
            transform_mode, test_mode)``. ``item`` is the table folder name;
            ``test_mode`` is unpacked but not used.

    Behaviour:
        * ``'metadata.csv'`` is skipped, as is a table with no ingested folder.
        * With ``transform_mode`` falsy the table goes through ``oea.refine``.
        * Otherwise the latest changes are read, stamped with DistrictId,
          SchoolYear (except for ``'schoolYearTypes'``) and LastModifiedDate,
          and upserted when at least one row changed.

    Any exception raised during processing is logged with a traceback and
    swallowed, so one failing table does not abort the pool.
    """
    item, schema_name, tables_source, ext_entity, metadata, transform_mode, test_mode = input_tuple

    table_name = item #sap_to_edfi_complex[item]
    table_path = f"{tables_source}/{item}"
    source_path = f'stage2/Ingested/{table_path}'
    logger.info(f"[REFINEMENT ETL TABLE THREAD] Processing schema/table: {schema_name}/{table_name}")
    if item == 'metadata.csv':
        logger.info('ignore metadata processing, since this is not a table to be ingested')
        return

    try:
        if not oea.path_exists(source_path):
            # Previously a silent `pass`; say why the table was skipped.
            logger.info(f'[REFINEMENT ETL TABLE THREAD] {source_path} does not exist - skipping.')
            return

        if not transform_mode:
            oea.refine(table_path,
                       metadata = metadata[item],
                       primary_key = 'id')
            return

        logger.info('[REFINEMENT ETL TABLE THREAD] Ed-Fi to Ed-Fi Relationship Model: ' + table_name)
        sink_general_path, sink_sensitive_path = oea.get_sink_general_sensitive_paths(source_path)

        sink_general_path = edfiRefineAgent.sink_path_cleanup(sink_general_path)
        sink_sensitive_path = edfiRefineAgent.sink_path_cleanup(sink_sensitive_path)
        df_changes = oea.get_latest_changes(source_path, sink_general_path, filtering_date = 'rundate')

        df_changes = df_changes.withColumn('DistrictId', F.lit(districtId))

        # FIXME TO BE REVISED
        # schoolYearTypes is deliberately left without the SchoolYear literal.
        if item != 'schoolYearTypes':
            df_changes = df_changes.withColumn('SchoolYear', F.lit(schoolYear))

        current_timestamp = datetime.now()
        df_changes = df_changes.withColumn('LastModifiedDate', F.lit(current_timestamp))

        if df_changes.count() > 0:
            upsert_data(df_changes,
                        metadata,
                        schema_name,
                        transform_mode,
                        table_name,
                        'id_pseudonym',
                        ext_entity,
                        sink_general_path,
                        sink_sensitive_path)
        else:
            logger.info(f'[REFINEMENT ETL TABLE THREAD] No updated rows in {source_path} to process.')
    except Exception as e:
        # One handler replaces the two identical ones (AnalysisException is an
        # Exception). Failures are logged with a traceback, like the
        # empty-schema worker does, instead of at INFO level.
        logger.exception(f"[REFINEMENT ETL TABLE THREAD] {e}")

def refine_and_explode_data(schema_name, 
                            tables_source,
                            ext_entity,
                            metadata, 
                            transform_mode, 
                            test_mode,
                            items = None):
    """Refine the given ingested tables in parallel, one ``threaded_task`` per table.

    Args:
        schema_name: Schema name forwarded to each worker.
        tables_source: Path below ``stage2/Ingested`` that holds the table folders.
        ext_entity: Extension entity forwarded to each worker.
        metadata: Metadata forwarded to each worker.
        transform_mode: Forwarded to each worker.
        test_mode: Forwarded to each worker.
        items: Table names to process, or the string ``'All'`` to process every
            folder under ``stage2/Ingested/{tables_source}`` plus
            ``'schoolYearTypes'``. ``None`` (the default) processes nothing.

    Returns:
        None. All work happens through side effects of the workers.
    """
    if items is None:
        items = []
    if items == 'All':
        items = list(oea.get_folders(f"stage2/Ingested/{tables_source}"))
        # Queue schoolYearTypes only once, so the same table is not upserted
        # concurrently by two workers when its folder is already listed.
        if 'schoolYearTypes' not in items:
            items.append('schoolYearTypes')

    work_items = [(item, schema_name, tables_source, ext_entity, metadata, transform_mode, test_mode)
                  for item in items]

    with ThreadPoolExecutor(max_workers=8) as tpe:
        logger.info('[REFINEMENT ETL TABLES] Entered Threadpool')
        futures = [tpe.submit(threaded_task, work_item) for work_item in work_items]

    # Leaving the `with` block waits for every task. Inspect each future so an
    # exception that escaped a worker is reported instead of silently dropped.
    for work_item, future in zip(work_items, futures):
        failure = future.exception()
        if failure is not None:
            logger.error(f'[REFINEMENT ETL TABLES] {work_item[0]}: {failure}')
            

def get_non_ext_entities(entities_meta_info):
    """Return the table name of every entity description.

    The table name is the text after the last ``'/'`` of the entry's
    ``'resource'`` value. No entries are filtered out here, despite the
    function name.

    Args:
        entities_meta_info: Iterable of dicts that each carry a ``'resource'`` string.

    Returns:
        A list of table names in input order.
    """
    return [meta['resource'].rsplit('/', 1)[-1] for meta in entities_meta_info]

def add_all_empty_tables_to_lake_db(empty_tables_path, schema_name, emptyTables = None):
    """Register every empty-schema table under a path in the lake database, in parallel.

    Args:
        empty_tables_path: Path whose sub-folders are the empty tables.
        schema_name: ``'ed-fi'`` registers tables without a name extension;
            any other value is passed on as the table-name extension.
        emptyTables: Table folder names to register. When ``None`` the folders
            are listed from ``empty_tables_path``.
    """
    if emptyTables is None:
        items = oea.get_folders(oea.to_url(empty_tables_path))
    else:
        items = emptyTables

    extension = None if schema_name == 'ed-fi' else schema_name

    with ThreadPoolExecutor(max_workers=8) as tpe:
        logger.info('[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] Entered Threadpool')
        submitted = []
        for item in items:
            source_entity_path = empty_tables_path + '/' + item
            submitted.append((source_entity_path,
                              tpe.submit(add_empty_table_to_lake_db, source_entity_path, False, extension)))

    # Leaving the `with` block waits for every task. Inspect each future so an
    # exception that escaped a worker (for example while parsing the path) is
    # reported instead of silently dropped.
    for source_entity_path, future in submitted:
        failure = future.exception()
        if failure is not None:
            logger.error(f'[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] {source_entity_path}: {failure}')

def add_empty_table_to_lake_db(source_entity_path, overwrite = False, extension = None):
    # FIXME: Temporary Fix for Empty Schemas
    """Register an empty-schema entity as a Delta table in its lake database.

    The database and table names come from ``oea.parse_path``. The database is
    created when missing. Paths that do not contain ``'/emptySchemas/'`` are
    ignored. Errors raised while registering are logged and swallowed.

    Args:
        source_entity_path: Path of the entity folder.
        overwrite: When true, an existing table is dropped before being created.
        extension: Optional suffix appended to the table name; a leading
            ``'_'`` is added when it is missing.

    Note that a spark db that points to source data in the delta format can't be
    queried via SQL serverless pool. More info here:
    https://docs.microsoft.com/en-us/azure/synapse-analytics/sql/resources-self-help-sql-on-demand#delta-lake
    """
    source_dict = oea.parse_path(source_entity_path)
    if '/emptySchemas/' not in source_entity_path:
        return

    try:
        # No submission-type suffix is applied to the database name at present.
        submission_type = ''
        entity_name = source_dict['entity']
        lake_db = source_dict['ldb_name']

        if extension is not None:
            suffix = extension if extension.startswith('_') else '_' + extension
            source_dict['entity'] = entity_name + str(suffix)

        db_name = lake_db + submission_type
        qualified_table = f"{db_name}.{source_dict['entity']}"

        logger.info(f"[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] Adding: Lake DB: {db_name}; Table: {source_dict['entity']}")
        spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
        if overwrite:
            spark.sql(f"drop table if exists {qualified_table}")

        spark.sql(f"create table if not exists {qualified_table} using DELTA location '{oea.to_url(source_dict['entity_path'])}'")
    except Exception as error:
        logger.error(f'[REFINEMENT EMPTY SCHEMA ADD TO LAKE DB] {error}')

### Main Code

In [27]:
# Refinement agent used by the threaded workers (sink_path_cleanup, schemas,
# transform). It is built from the 'Resources' swagger URL and schema generator
# set up earlier, and is re-created per swagger resource type in the
# empty-schema cell further down.
# NOTE(review): EdFiRefine, moduleName and pipelineExecutionId are not defined in
# the visible cells - presumably supplied by the %run notebooks / parameters.
edfiRefineAgent = EdFiRefine(workspace = workspace, 
                             oea = oea, 
                             spark = spark,
                             schema_gen = schema_gen,
                             moduleName = moduleName, 
                             authUrl = authUrl,
                             swaggerUrl = swaggerUrl, 
                             dataManagementUrl = dataManagementUrl, 
                             changeQueriesUrl = changeQueriesUrl, 
                             dependenciesUrl = dependenciesUrl, 
                             apiVersion = apiVersion, 
                             schoolYear = schoolYear, 
                             districtId = districtId,
                             pipelineExecutionId = pipelineExecutionId,
                             error_logger = error_logger,
                             test_mode = False)

In [28]:
# Decide which entities are due for ETL in this run, based on the frequency lookup file.
# `datetime` is already imported in the first cell; `math` is not used in the visible cells.
from datetime import datetime
import math
source_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/frequency_etl.csv'  
# destination_path is the same file as source_path; the commented-out value is an alternative target.
destination_path = source_path #f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/frequency_based_etl.csv'  
# One log file per calendar day.
logs_path = f"stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/_frequency_etl_logs/run_logs_{datetime.today().strftime('%Y-%m-%d')}.csv"

# The four *Delta values are not defined in the visible cells - presumably
# notebook parameters; the trailing comments show example values.
processor = EntityFrequencyProcessor(oea = oea, 
                                     filepath = source_path, 
                                     highFrequentDelta = highFrequentDelta,#0.005, 
                                     moderateFrequentDelta = moderateFrequentDelta, #5, 
                                     lowFrequentDelta = lowFrequentDelta, #10, 
                                     descriptorsDelta = descriptorsDelta) #360)

processor.load_lookup_df()
# Only the second element of the returned pair is used; it is read below as a
# mapping from schema name to entity list.
_, entities_to_etl = processor.return_entities_to_etl()

# These two "All" defaults are overwritten immediately below and never take effect.
edfiEntities = "All" #['schoolYearTypes']
tpdmEntities = 'All'

# Entities due for ETL per schema; an absent schema yields an empty list.
edfiEntities = entities_to_etl.get('ed-fi', [])
tpdmEntities = entities_to_etl.get('tpdm', [])

In [29]:
# Parameters for the refinement run over the core 'ed-fi' schema.
# `datetime` is already imported in the first cell.
from datetime import datetime
schema_name = 'ed-fi'
ext_entity = 'TPDM'
test_mode = False
transform_mode = True
# Path below stage2/Ingested that holds this schema's table folders.
tables_source = f'{moduleName}/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/{schema_name}'
# Entities selected by the frequency processor in the previous cell.
transform_items = edfiEntities #"All#['staffs', 'students']#" #non_ext_table_names#edfiEntities 

# Create or overwrite Metadata.csv
# NOTE(review): the line below only reads metadata via oea.get_metadata_from_path;
# whether anything is created or overwritten cannot be confirmed from this cell.
metadataPath = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets'
metadata = oea.get_metadata_from_path(metadataPath) # metadata = oea.get_metadata_from_url(metadataUrl)

In [30]:
# Run the threaded refinement when ETL processing is enabled.
# NOTE(review): etlProcessing is not defined in the visible cells - presumably a
# notebook parameter.
# refine_and_explode_data has no return statement, so `df` is bound to None here;
# the work happens through side effects.
if etlProcessing:
    df = refine_and_explode_data(schema_name, 
                            tables_source,
                            ext_entity,
                            metadata,
                            transform_mode, 
                            test_mode,
                            transform_items)

### Empty Schemas

In [31]:
# Empty-schema generation: when enabled, fetch the entity list from the Ed-Fi API
# and write an empty, schema-only table for each entity - first for 'Resources',
# then for 'Descriptors'.
# `datetime` is already imported in the first cell.
from datetime import datetime
transform_mode = True

if prepareEdFiMetaData:
    # Retry policy handed to the API client: up to 3 retries with backoff on
    # throttling (429) and server errors (5xx).
    retry_strategy = Retry(total = 3,
                       backoff_factor = 1,
                       status_forcelist = [429, 500, 502, 503, 504],
                       allowed_methods = ["HEAD", "GET", "OPTIONS", "POST", "DELETE"])

    # client_id / client_secret_id come from the credentials cell near the top of
    # the notebook. kvName, minChangeVer and maxChangeVer are not defined in the
    # visible cells - presumably notebook parameters.
    edfiAPIClient = EdFiClient(workspace = workspace, 
                                    kvName = kvName, #NOTE: Default to None 
                                    moduleName = moduleName, 
                                    authUrl = authUrl, 
                                    dataManagementUrl = dataManagementUrl, 
                                    changeQueriesUrl = changeQueriesUrl, 
                                    dependenciesUrl = dependenciesUrl, 
                                    apiVersion = apiVersion, 
                                    batchLimit = batchLimit, 
                                    minChangeVer = minChangeVer, 
                                    maxChangeVer = maxChangeVer,
                                    schoolYear = schoolYear,
                                    districtId = districtId,
                                    kvSecret_clientId = client_id,
                                    kvSecret_clientSecret = client_secret_id,
                                    retry_strategy = retry_strategy,
                                    threadMode = True,
                                    devMode = True)

    entities_meta_info = edfiAPIClient.getEntities()#[0]['resource']
    non_ext_table_names = get_non_ext_entities(entities_meta_info) #TODO: To Be Reviewed
    # schoolYearTypes is added explicitly in front of the names taken from the API.
    non_ext_table_names = ['schoolYearTypes'] + non_ext_table_names

    for swagger_resource_type in ['Resources', 'Descriptors']:
        # Rebinds the notebook-level swagger URL, schema generator, schemas and
        # refinement agent for this resource type; after the loop they all
        # refer to 'Descriptors'.
        swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url(swagger_resource_type)
        oea_utils = schema_gen = OpenAPIUtil(swagger_url)
        oea_utils.create_definitions()
        schemas = schema_gen.create_spark_schemas()

        
        edfiRefineAgent = EdFiRefine(workspace = workspace, 
                             oea = oea, 
                             spark = spark,
                             schema_gen = schema_gen,
                             moduleName = moduleName, 
                             authUrl = authUrl,
                             swaggerUrl = swaggerUrl, 
                             dataManagementUrl = dataManagementUrl, 
                             changeQueriesUrl = changeQueriesUrl, 
                             dependenciesUrl = dependenciesUrl, 
                             apiVersion = apiVersion, 
                             schoolYear = schoolYear, 
                             districtId = districtId,
                             pipelineExecutionId = pipelineExecutionId,
                             error_logger = error_logger,
                             test_mode = False)

        # Split the entity names by whether they end in 'descriptors' (case-insensitive).
        # non_ext_table_names = sap_to_edfi_client.return_non_ext_tables()  
        if swagger_resource_type == 'Resources':
            transform_items = [item for item in non_ext_table_names if not(item.lower().endswith('descriptors'))]
        elif swagger_resource_type == 'Descriptors':
            transform_items = [item for item in non_ext_table_names if item.lower().endswith('descriptors')]
      
        s2r_path = f'stage2/Refined/Ed-Fi/{apiVersion}/emptySchemas'

        # schema_name and ext_entity keep the values set in the ETL parameter cell.
        dump_empty_schemas(schema_name = schema_name , 
                         s2r_path = s2r_path,
                         ext_entity = ext_entity,
                         transform_mode = transform_mode, 
                         items = transform_items)

In [32]:
# Register empty-schema tables in the lake database, for the core schema and for
# the extension schema.
if prepareEdFiMetadata:
    # Tables already present under stage2/Refined for the core schema ...
    tables_source = f'Ed-Fi/{apiVersion}/ed-fi'
    mainTables = [item for item in oea.get_folders(f"stage2/Refined/{tables_source}/general") if item != 'descriptorTables']

    # ... and for the extension schema; both lists exclude 'descriptorTables'.
    tables_source = f'Ed-Fi/{apiVersion}/{ext_entity.lower()}'
    extTables = [item for item in oea.get_folders(f"stage2/Refined/{tables_source}/general") if item != 'descriptorTables']
    if extTables != []:
        mainTables = mainTables + extTables
    # NOTE(review): non_empty_elements is defined outside the visible cells;
    # presumably it drops empty-schema names that already exist in mainTables -
    # confirm against EdFiRefine.
    edfi_emptyTables = oea.get_folders(f'stage2/Refined/Ed-Fi/{apiVersion}/emptySchemas/general/ed-fi')
    edfi_emptyTables = edfiRefineAgent.non_empty_elements(edfi_emptyTables, 
                                                             mainTables)
    ext_emptyTables = oea.get_folders(f'stage2/Refined/Ed-Fi/{apiVersion}/emptySchemas/general/{ext_entity.lower()}')
    ext_emptyTables = edfiRefineAgent.non_empty_elements(ext_emptyTables, 
                                                             mainTables)

    # Register the remaining core-schema tables, if any.
    emptyTables_path = f'stage2/Refined/Ed-Fi/{apiVersion}/emptySchemas/general/ed-fi'
    if edfi_emptyTables != list():
        add_all_empty_tables_to_lake_db(emptyTables_path, 'ed-fi', edfi_emptyTables)

    # Register the remaining extension tables. The schema name is the literal
    # 'tpdm', while the path uses ext_entity.lower().
    emptyTables_path = f'stage2/Refined/Ed-Fi/{apiVersion}/emptySchemas/general/{ext_entity.lower()}'
    if ext_emptyTables != list():
        add_all_empty_tables_to_lake_db(emptyTables_path, 'tpdm', ext_emptyTables)

In [482]:
# Write the entity-level error logs collected during the run, if there are any,
# to the logs location and register them in the ETL logs lake database.
# NOTE(review): this cell's execution count (482) is out of sequence with the
# cells above; verify the notebook with a fresh Restart & Run All.
if error_logger.entity_logs != []:
    logger.info('[REFINEMENT ERROR LOGGING] Writing Entity Level Error Logs')
    df = error_logger.create_spark_df('entity')
    error_logger.write_logs_to_delta_lake(df = df, 
                                log_type = 'entity',
                                destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'))
    error_logger.add_etl_logs_to_lake_db(db_name = f'ldb_{workspace}_edfi_etl_logs',
                                        logs_base_path = 'etl-logs',
                                        log_type = 'entity',
                                        overwrite = True)