### Initializations

In [3]:
%run OEA/modules/Ed-Fi/v0.6/src/utilities/edfi_v0_6_edfi_py

In [4]:
%run EdGraph/modules/EdGraph_DW/v0.5/src/utilities/edgraph_dw_v0_5_dw_builder

In [5]:
from datetime import datetime
# Shared helper objects used by the rest of the notebook.
# EdFiOEAChild, ErrorLogging, `spark` and `logger` are not defined in this notebook;
# presumably they come from the notebooks pulled in by the %run cells above or from
# the notebook runtime — TODO confirm.
oea = EdFiOEAChild()   
# The error logger is handed the Spark session, the OEA helper and the shared logger.
error_logger = ErrorLogging(spark = spark,
                            oea = oea,
                            logger = logger)

### Global Parameters

In [6]:
def assign_default_variable(variable_name, default_value):
    """Define a global variable only if it does not exist yet.

    Args:
        variable_name: Name of the global to look up, as a string.
        default_value: Value bound to that name when it is missing.

    Returns:
        None. When the name is missing, it is added to this module's / notebook's
        global namespace and an info message is logged; otherwise nothing happens.
    """
    namespace = globals()
    # An existing value always wins over the default.
    if variable_name in namespace:
        return
    namespace[variable_name] = default_value
    logger.info(f'{variable_name} not found - using system default')

In [7]:
# Use 'sandbox2' only when no `workspace` global is already defined
# (presumably it can be supplied as a notebook parameter — TODO confirm).
assign_default_variable(variable_name = 'workspace', 
                        default_value = 'sandbox2')
# Point the OEA helper at the selected workspace.
oea.set_workspace(workspace)

In [8]:
# Relative path of the JSON metadata file that is later read into `original_metadata`.
metadata_path = "stage3/EdGraph_DW/build-metadata/stage3-query-assets.json"
# Converted by the OEA helper into the URL passed to spark.read further down
# (presumably a full data-lake URL for the current workspace — TODO confirm).
metadata_url = oea.to_url(metadata_path)

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import IntegerType

#### Global Query Parameters

In [10]:
from datetime import datetime, timedelta

# Execution window handed to the parameterized queries.
incremental_prefix = ''
current_datetime = datetime.now()
last_datetime = current_datetime - timedelta(days=30) # TODO: Example Offset

# "%f" renders six microsecond digits; the literal trailing "0" adds a seventh digit.
_execution_timestamp_format = "%Y-%m-%dT%H:%M:%S.%f0"
current_execution_datetime_str = current_datetime.strftime(_execution_timestamp_format)
last_execution_datetime_str = last_datetime.strftime(_execution_timestamp_format)

# (name, default) pairs, applied in order and only to names that are not already
# defined as globals.
_query_parameter_defaults = (
    ('stage2_db_name', 'ldb_sandbox2_s2r_ed_fi'),
    ('base_table_db_name', 'ldb_sandbox2_s3_ed_fi'),
    ('stage3_db_name', 'ldb_sandbox2_s3_ed_fi'),
    ('current_school_year', 'SchoolYear'),
    ('schoolYear_varParam', 'SchoolYear'),
    ('districtId_varParam', 'DistrictId'),
    ('present_year', '2024'),  # TODO: Fix
)

for _parameter_name, _parameter_default in _query_parameter_defaults:
    assign_default_variable(variable_name = _parameter_name,
                            default_value = _parameter_default)

### Main Code

#### DimDate

In [55]:
def dump_dim_date(table_name, table_generator, metadata_processor):
    """Build the DimDate view (generated dates plus one 'Not Specified' row) and publish it.

    Steps:
      1. Ask ``table_generator`` for the date dimension (``num_years=20``).
      2. Cast ``SchoolYearShort`` to integer and add a null integer ``DatSkey`` column,
         then register the result as temp view ``staging_vw_DimDate``.
      3. UNION ALL the staging view with a single hard-coded 'Not Specified' row
         (DateKey -1, date 1900-01-01) and register that as ``dbo_vw_DimDate``.
      4. Hand over to ``metadata_processor`` with step prefix ``dbo_vw``.

    Args:
        table_name: Table name forwarded to both ``metadata_processor`` calls.
        table_generator: Object providing ``generate_fiscal_month``,
            ``generate_first_day_of_week`` and ``generate_dim_date``.
        metadata_processor: Object providing ``dump_to_stage3_delta_lake`` and
            ``add_to_lake_db_stage3``.

    Relies on the notebook-level ``spark`` session. Returns None.
    """
    fiscal_month = table_generator.generate_fiscal_month()
    # The result is not used here; the call itself is kept in case the generator
    # depends on it having run.
    table_generator.generate_first_day_of_week()

    generated_dates = table_generator.generate_dim_date(fiscal_month, num_years=20)
    staging_dates = (
        generated_dates
        .withColumn("SchoolYearShort", generated_dates["SchoolYearShort"].cast(IntegerType()))
        .withColumn("DatSkey", lit(None).cast(IntegerType()))
    )
    staging_dates.createOrReplaceTempView("staging_vw_DimDate")

    # UNION ALL pairs columns by position, so the sentinel row below has to list its
    # values in the same column order as the staging view.
    # NOTE(review): the sentinel supplies DatSkey as the string '-1' while the staging
    # column is cast to IntegerType above — confirm the resulting column type is intended.
    sentinel_union_sql = """ SELECT * from staging_vw_dimDate UNION ALL
    SELECT
    1900 AS CalendarYear,'2023-10-17 15:06:01.1438416 +05:30' AS DW_CreatedDateTime,'2023-10-17 15:06:01.1438416 +05:30' AS DW_ModifiedDateTime,CAST('1900-01-01' AS DATE) AS Date,
    -1 AS DateKey,-1 AS DayOfMonth,'-1' AS DayOfMonthWithSuffix,-1 AS DayOfWeek,-1 AS DayOfYear,
    -1 AS Month,'Not Specified' AS MonthName,'Not Specified' AS MonthNameShort,'Not Specified' AS SchoolYear,1900 AS SchoolYearShort,
    'Not Specified' AS WeekdayName,-1 AS WeekOfMonth,-1 AS WeekOfYear,'Not Specified' AS WeekdayNameShort,'-1' as DatSkey
    """
    spark.sql(sentinel_union_sql).createOrReplaceTempView("dbo_vw_DimDate")

    # Both calls address the view registered above through the 'dbo_vw' step prefix.
    view_prefix = 'dbo_vw'
    metadata_processor.dump_to_stage3_delta_lake(step_prefix = view_prefix,
                                                 table_name = table_name,
                                                 surrogate_key = True)
    metadata_processor.add_to_lake_db_stage3(step_prefix = view_prefix,
                                             table_name = table_name,
                                             overwrite = True)

def common_elements_preserve_order(list1, list2):
    """Return the items of ``list1`` that also occur in ``list2``, in ``list1`` order.

    Used to restrict the ordered table list to the entities selected for ETL.

    Args:
        list1: Ordered iterable of hashable items; its order (and any duplicates)
            is preserved in the result.
        list2: Iterable of hashable items to keep. It is read exactly once, so a
            generator or other one-shot iterable is safe to pass.

    Returns:
        A new list with every element of ``list1`` that is present in ``list2``.
    """
    # Build the lookup once. Recomputing a set intersection for every element is
    # O(len(list1) * len(list2)) and exhausts a one-shot iterable after the first
    # element, silently dropping later matches.
    wanted = set(list2)
    return [elem for elem in list1 if elem in wanted]

In [56]:
# table_generator = SparkTableGenerator(spark, base_table_db_name, present_year, current_datetime)
# fiscal_month = table_generator.generate_fiscal_month()
# first_day_of_week = table_generator.generate_first_day_of_week()

# DimDate = table_generator.generate_dim_date(fiscal_month)
# DimDate.createOrReplaceTempView("dbo_vw_DimDate")

#### Automated (via MetadataProcessor)

In [57]:
# `json` is used below but is not imported anywhere in this notebook; import it here
# so the cell does not depend on a %run-included notebook having imported it.
import json

# The metadata document is read with the multiline JSON option and cached;
# the cache is released by the jsonDF.unpersist() call near the end of the notebook.
jsonDF = spark.read.option("multiline", "true").json(metadata_url).cache()

# Serialise the first row back to JSON text and parse it into plain Python objects.
json_string = jsonDF.toJSON().collect()[0]
original_metadata = json.loads(json_string)

In [58]:
# Suffix appended to the stage 3 folder name; with the empty string the path is
# 'stage3/EdGraph_DW'.
test_version = ''
# Builder driven by the metadata loaded above. EdgraphDWHBuilder is not defined in
# this notebook (presumably provided by the %run-included dw_builder notebook — TODO confirm).
metadata_processor = EdgraphDWHBuilder(original_metadata = original_metadata, 
                                       stage3_db_name = stage3_db_name,
                                       stage_3_path = f'stage3/EdGraph_DW{test_version}', 
                                       partitioning = False, 
                                       spark = spark, 
                                       oea = oea, 
                                       logger = logger,
                                       error_logger = error_logger)
# Must run before return_schema_queries_in_order is called in the main loop cell
# (presumably it prepares the per-schema query ordering — TODO confirm).
metadata_processor.process_metadata()
# Generator later passed to dump_dim_date to produce the DimDate rows.
table_generator = SparkTableGenerator(spark, base_table_db_name, present_year, current_datetime)

In [59]:
from datetime import datetime
import math
# apiVersion, districtId and schoolYear are not assigned anywhere in this notebook;
# presumably they are supplied as notebook parameters or by a %run-included
# notebook — TODO confirm.
# Lookup CSV handed to EntityFrequencyProcessor as its file path.
source_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/edgraph_frequency_etl.csv'  
# The updated lookup is written back to the same file it was read from; the
# commented-out alternative would write to a separate frequency_based_etl.csv.
destination_path = source_path #f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/frequency_based_etl.csv'  
# Log file name carries today's date (YYYY-MM-DD), so there is one path per calendar day.
logs_path = f"stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/_edgraph_frequency_etl_logs/run_logs_{datetime.today().strftime('%Y-%m-%d')}.csv"

In [60]:
# Frequency processor configured with `source_path` as its file path.
# The four *Delta values are not assigned in this notebook (presumably notebook
# parameters — TODO confirm); the trailing comments look like example values.
processor = EntityFrequencyProcessor(oea = oea, 
                                     filepath = source_path, 
                                     highFrequentDelta = highFrequentDelta,#0.005, 
                                     moderateFrequentDelta = moderateFrequentDelta, #5, 
                                     lowFrequentDelta = lowFrequentDelta, #10, 
                                     descriptorsDelta = descriptorsDelta) #360)

In [61]:
# Load the frequency lookup, then write it to the dated logs path before anything
# is updated (presumably as a snapshot of the pre-run state — TODO confirm).
processor.load_lookup_df()
processor.write_lookup_df(logs_path)
# Only the first element of the returned pair is used; it restricts which tables
# the main loop builds.
entities_to_etl, _ = processor.edgraph_return_entities_to_etl()

# NOTE: this helper is also defined earlier in the notebook (DimDate section); this
# later definition is the one in effect when the main loop runs. Keep the two in
# sync, or remove one of them.
def common_elements_preserve_order(list1, list2):
    """Return the items of ``list1`` that also occur in ``list2``, in ``list1`` order.

    Args:
        list1: Ordered iterable of hashable items; its order (and any duplicates)
            is preserved in the result.
        list2: Iterable of hashable items to keep. It is read exactly once, so a
            generator or other one-shot iterable is safe to pass.

    Returns:
        A new list with every element of ``list1`` that is present in ``list2``.
    """
    # Build the lookup once. Recomputing a set intersection for every element is
    # O(len(list1) * len(list2)) and exhausts a one-shot iterable after the first
    # element, silently dropping later matches.
    wanted = set(list2)
    return [elem for elem in list1 if elem in wanted]

# if len(entities_to_etl) != 0:
#     edfiAPIClient.landEntities(entities_to_etl, False)
# else:
#     logger.info("No entities to be fetched has been specified")

In [74]:
from datetime import datetime
# Ordered queries and their parameters, keyed by schema name.
queries_in_order = {}
queries_params_in_order = {}
parameterized_queries = {}
schema_names = ['dbo', 'config', 'auth']
# Tables executed with surrogate_key = False.
non_surrogate_tables = ['DimSchoolYear']

# Every schema is loaded up front: the DimDate branch below needs the 'config'
# queries regardless of which schema is currently being processed.
for schema_name in schema_names:
    queries_in_order[schema_name], queries_params_in_order[schema_name] = metadata_processor.return_schema_queries_in_order(schema_name)

# Keyword arguments common to every parameterize_table_queries call. `query_params`
# is deliberately left out so that each call still receives its own fresh dict.
common_query_kwargs = dict(
    stage2_db_name = stage2_db_name,
    stage3_db_name = stage3_db_name,
    base_table_db_name = base_table_db_name,
    current_execution_datetime_str = current_execution_datetime_str,
    last_execution_datetime_str = last_execution_datetime_str,
    current_school_year = current_school_year,
    schoolYear_varParam = schoolYear_varParam,
    districtId_varParam = districtId_varParam,
    incremental_prefix = incremental_prefix,
)

for schema_name in schema_names:
    schema_queries_in_order = queries_in_order[schema_name]
    schema_queries_params_in_order = queries_params_in_order[schema_name]

    # Keep the metadata ordering, restricted to the entities selected for this run.
    table_names = list(schema_queries_in_order.keys())
    table_names = common_elements_preserve_order(table_names, entities_to_etl)

    for table_name in table_names:
        # Failures are logged per table so one bad table does not stop the others.
        try:
            if table_name == 'DimDate':
                # config.Parameter is built first, then DimDate is generated in code
                # instead of from metadata queries.
                logger.info(f'Dependent TABLE CREATION - config.Parameter')
                logger.info(f'TABLE CREATION - {table_name}')
                parameterized_queries = metadata_processor.parameterize_table_queries(
                    schema_queries = queries_in_order['config'],
                    schema_queries_params = queries_params_in_order['config'],
                    table_name = 'Parameter',
                    **common_query_kwargs,
                    query_params = {})
                metadata_processor.execute_table_queries('config',
                                                         'Parameter',
                                                         parameterized_queries,
                                                         surrogate_key = True)
                dump_dim_date(table_name, table_generator, metadata_processor)

            elif table_name != 'Parameter' or schema_name != 'config':
                # config.Parameter itself is skipped here; it is only built as part
                # of the DimDate branch above.
                logger.info(f'TABLE CREATION - {table_name}')
                parameterized_queries = metadata_processor.parameterize_table_queries(
                    schema_queries = schema_queries_in_order,
                    schema_queries_params = schema_queries_params_in_order,
                    table_name = table_name,
                    **common_query_kwargs,
                    query_params = {})
                metadata_processor.execute_table_queries(schema_name,
                                                         table_name,
                                                         parameterized_queries,
                                                         surrogate_key = table_name not in non_surrogate_tables)
            print()
        except Exception as error:
            logger.exception(error)

In [75]:
# After the ETL loop: update the frequency lookup and write it to destination_path,
# which in this notebook is the same file as source_path.
processor.update_lookup_df()
processor.write_lookup_df(destination_path)


In [76]:
# Turn the 'entity' log records held by error_logger into a Spark DataFrame and
# write them under the etl-logs location for that log type.
df_logs = error_logger.create_spark_df('entity')
error_logger.write_logs_to_delta_lake(df = df_logs, 
                                      log_type = 'entity',
                                      destination_url = error_logger.to_logs_url('etl-logs/log_type=entity'))

# Register the entity logs in the 'edgraph_etl_logs' database. overwrite = False is
# presumably meant to keep what is already registered — TODO confirm against ErrorLogging.
error_logger.add_etl_logs_to_lake_db(db_name = 'edgraph_etl_logs',
                                     logs_base_path = 'etl-logs',
                                     log_type = 'entity',
                                     overwrite = False)

In [77]:
# Release the cache taken on jsonDF when the metadata JSON was read.
jsonDF.unpersist()
logger.info('Cached Data Removed From Memory')

In [78]:
# %run Dev/s2r_to_edgraphDwh/v0_1/Auth tables

In [82]:
# %run Dev/s2r_to_edgraphDwh/v0.5/main/dev_v0_5_populate_SQL_DB

In [81]:
# %run Dev/s2r_to_edgraphDwh/v0.5/main/dev_v0_5_semantic_views