In [66]:
# Alternate spellings of the notebook inputs (instanceId, apiUrl, schoolYear,
# districtId, batchLimit, prepareEdFiMetadata), which are defined outside this cell.
# Presumably the notebooks pulled in via %run expect some of these spellings -- TODO confirm.
instance = instanceId
InstanceId = instanceId
ApiUrl = apiUrl
SchoolYear = schoolYear
DistrictId = districtId
DistrictID = districtId
districtID = districtId
apiLimit = batchLimit

prepareEdFiMetaData = prepareEdFiMetadata

In [ ]:
from datetime import datetime, timedelta
import pandas as pd
from notebookutils import mssparkutils
from io import StringIO

### URL Initializations

In [67]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_fetch_urls

In [68]:
# Snake-case copies of the notebook parameters, handed to the URL manager below.
instance_id, school_year, api_url = instanceId, schoolYear, apiUrl

# Let the API manager resolve the endpoint URLs and remaining metadata.
edfi_api_manager = EdFiApiManager(api_url, instance_id, school_year)
edfi_api_manager.update_urls()
edfi_api_manager.set_other_metadata()

dependenciesUrl = edfi_api_manager.dependencies_url
openApiMetadataUrl = edfi_api_manager.openapi_metadata_url
dataManagementUrl = edfi_api_manager.data_management_url
authUrl = edfi_api_manager.auth_url

# Change-queries base URL: drop the last 13 characters of the referenced URL
# (presumably a trailing '/swagger.json' -- TODO confirm) and collapse the
# '/metadata/' path segment.
changeQueriesUrl = (
    edfi_api_manager.get_referenced_url('Change-Queries')[:-13]
    .replace('/metadata/', '/')
)

# Both spellings of the Resources swagger URL are kept for downstream cells.
swaggerUrl = edfi_api_manager.get_referenced_url('Resources')
swagger_url = swaggerUrl

# API version without a leading 'v' (e.g. 'v5.3' -> '5.3').
apiVersion = edfi_api_manager.api_version
if apiVersion.startswith('v'):
    apiVersion = apiVersion[1:]

resourcesUrl = edfi_api_manager.get_referenced_url('Resources')
descriptorsUrl = edfi_api_manager.get_referenced_url('Descriptors')

# The enrollment composite URL is derived from the resources URL by swapping path segments.
# NOTE(review): this keeps the version segment of the resources URL (e.g. '/v3/');
# confirm the composites endpoint is served under the same version segment.
enrollmentUrl = (
    resourcesUrl
    .replace('/data/', '/composites/')
    .replace('/resources/', '/ed-fi/enrollment/')
)

### OEA Initializations

In [69]:
%run OEA/modules/Ed-Fi/v0.7/src/utilities/edfi_v0_7_edfi_py

In [70]:
# Create the OEA helper used for path/URL handling and bind it to the target workspace.
oea = EdFiOEAChild()
oea.set_workspace(workspace)

In [71]:
# Create the OpenAPI helper for swagger_url and generate its definitions and Spark schemas.
# To work from the Descriptors document instead, first re-point the URL:
#   swagger_url = swaggerUrl = edfi_api_manager.get_referenced_url('Descriptors')
schema_gen = OpenAPIUtil(swagger_url)
oea_utils = schema_gen  # same object under the second name used in this notebook

schema_gen.create_definitions()
schemas = schema_gen.create_spark_schemas()

### Main Code

In [77]:
from notebookutils import mssparkutils
import json

# Swagger documents to process, in this order. Example URL shapes:
#   <api-root>/metadata/data/v3/<instance-id>/<school-year>/resources/swagger.json
#   <api-root>/metadata/data/v3/<instance-id>/<school-year>/descriptors/swagger.json
#   <api-root>/metadata/composites/v1/<instance-id>/<school-year>/ed-fi/enrollment/swagger.json
swagger_urls = [resourcesUrl, descriptorsUrl, enrollmentUrl]

# Location of metadata.csv (created or overwritten by the next cell).
metadata_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/metadata.csv'
metadata_url = oea.to_url(metadata_path)

In [78]:
# Generate metadata.csv from the Swagger documents listed in swagger_urls.
# File layout: a header line, then for every entity one row carrying only the
# entity name, followed by one row per attribute
# (attribute name, attribute data type, pseudonymization rule).
definitions = dict()
mssparkutils.fs.put(metadata_url, 'Entity Name,Attribute Name,Attribute Data Type,Pseudonymization\n', True)

for url in swagger_urls:
    logger.info(url)
    response = requests.get(url)
    # Surface HTTP failures here with their status code, instead of as an opaque
    # JSON decoding error or KeyError further down.
    response.raise_for_status()
    swagger = response.json()

    # Build all definitions. A definition name already collected from an earlier
    # Swagger document is left untouched (first occurrence wins).
    for key, definition_body in swagger['definitions'].items():
        logger.info(key)

        if key not in definitions:
            definitions[key] = {}
            for prop, prop_spec in definition_body["properties"].items():
                # Use explicit placeholders: previously the spec was passed as a
                # format argument without a placeholder in the message, so it never
                # showed up in the log output.
                logger.info("%s %s", prop, prop_spec)

                if "$ref" in prop_spec:
                    # For now, don't add refs to the definitions.
                    continue

                prop_type = prop_spec["type"]
                if prop == "id":
                    # The resource id is the only attribute marked for hashing.
                    definitions[key][prop] = ["", prop, prop_type, "hash"]
                elif prop_type == "array":
                    # Arrays are recorded as plain strings.
                    definitions[key][prop] = ["", prop, "string", "no-op"]
                elif prop_type == "number":
                    definitions[key][prop] = ["", prop, "float", "no-op"]
                else:
                    definitions[key][prop] = ["", prop, prop_type, "no-op"]
            logger.info("--------------------------------------")

    # Iterate over the entities by path and write metadata for each entity to file
    for entity in swagger["paths"].keys():
        logger.info(entity)
        # Assumes paths of the form '/<namespace>/<entity>[/...]': the entity name
        # is the third piece after splitting on '/'.
        entity_key = entity.split("/")[2]
        logger.info(entity_key)
        ok_response = swagger["paths"][entity]["get"]["responses"]["200"]
        logger.info(ok_response)
        rows = []

        # Only responses whose schema is a direct "$ref" to a definition contribute
        # attribute rows; any other schema shape yields no rows for this path.
        schema_ref = ok_response['schema'].get("$ref")
        if schema_ref is not None:
            definition = schema_ref.split("/")[-1]
            logger.info(definitions[definition])
            rows.extend(definitions[definition].values())

        if len(rows) > 0:
            # Entity header row precedes its attribute rows.
            rows.insert(0, [entity_key, "", "", ""])
            # Append to Metadata.csv
            mssparkutils.fs.append(metadata_url, '\n'.join(','.join(row) for row in rows) + '\n', True)

        logger.info("------------------------------")


### Frequency Based ETL Metadata

In [ ]:
from datetime import datetime
import math

exception = None
entitiesToFetch = "All"

# Retry policy handed to the API client: up to 3 retries with a backoff factor of 1
# for the listed HTTP status codes and methods.
retry_strategy = Retry(
    total = 3,
    backoff_factor = 1,
    status_forcelist = [429, 500, 502, 503, 504],
    allowed_methods = ["HEAD", "GET", "OPTIONS", "POST", "DELETE"],
)

# Build the Ed-Fi API client. Any failure is stored in `exception`, logged and re-raised.
try:
    edfiAPIClient = EdFiClient(
        workspace = workspace,
        kvName = kvName,  # NOTE: Default to None
        moduleName = moduleName,
        authUrl = authUrl,
        dataManagementUrl = dataManagementUrl,
        changeQueriesUrl = changeQueriesUrl,
        dependenciesUrl = dependenciesUrl,
        apiVersion = apiVersion,
        batchLimit = batchLimit,
        minChangeVer = minChangeVer,
        maxChangeVer = maxChangeVer,
        landingDateTimeFormat = "yyyyMMddHHmmss",
        schoolYear = schoolYear,
        districtId = districtId,
        kvSecret_clientId = client_id,
        kvSecret_clientSecret = client_secret_id,
        retry_strategy = retry_strategy,
        threadMode = True,
        devMode = True,
    )
except Exception as e:
    exception = e
    logger.error(exception)

    raise exception

In [ ]:
entities_info = edfiAPIClient.getEntities()

# Column-oriented lookup table: every key is a column name mapped to an (initially
# empty) list of values; the columns are filled in by the next cell.
entity_frequency_lookup = {
    column_name: []
    for column_name in (
        'resource_full_name',
        'resource_domain',
        'resource_sub_name',
        'resource_frequency_code',
        'lastrundatetime',
        'lastrundate',
    )
}

# Placeholder "last run" timestamp: 50 days before the current local time.
datetime_placeholder = datetime.today() - timedelta(days = 50)

In [ ]:
# Populate the frequency lookup: one row per entry of entities_info plus a manually
# added row for schoolYearTypes. The columns are rebuilt from scratch on every run,
# so re-running this cell no longer appends duplicate rows to the existing lists.

# Descriptors get an older placeholder timestamp than other resources.
# NOTE: the offset is 360 days, i.e. slightly less than a calendar year.
datetime_oneYearBefore = datetime_placeholder - timedelta(days = 360)

frequency_rows = []
for entity_info in entities_info:
    resource_full_name = entity_info['resource']
    # Expects exactly three '/'-separated parts, the first being discarded
    # (e.g. '/ed-fi/schoolYearTypes' -> '', 'ed-fi', 'schoolYearTypes');
    # any other shape raises ValueError.
    _, resource_domain, resource_sub_name = resource_full_name.split('/')

    if resource_full_name.lower().endswith('descriptors'):
        frequency_code, last_run = 'descriptor', datetime_oneYearBefore
    else:
        frequency_code, last_run = 'high', datetime_placeholder

    frequency_rows.append((
        resource_full_name,
        resource_domain,
        resource_sub_name,
        frequency_code,
        last_run,
        last_run.date(),
    ))

# schoolYearTypes is always added by hand
# (presumably it is not part of entities_info -- TODO confirm).
frequency_rows.append((
    '/ed-fi/schoolYearTypes',
    'ed-fi',
    'schoolYearTypes',
    'high',
    datetime_placeholder,
    datetime_placeholder.date(),
))

# Transpose the rows back into the column lists of the lookup table.
# frequency_rows is never empty here because of the manual row above.
frequency_columns = (
    'resource_full_name',
    'resource_domain',
    'resource_sub_name',
    'resource_frequency_code',
    'lastrundatetime',
    'lastrundate',
)
for column_name, column_values in zip(frequency_columns, zip(*frequency_rows)):
    entity_frequency_lookup[column_name] = list(column_values)

In [ ]:
# Persist the frequency lookup as frequency_etl.csv in the metadata-assets folder.
destination_path = f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets/frequency_etl.csv'
destination_url = oea.to_url(destination_path)

# Serialize the lookup table to CSV text without the DataFrame index column.
entity_frequency_lookup_df = pd.DataFrame(entity_frequency_lookup)
data_str = entity_frequency_lookup_df.to_csv(index=False)

# Third argument True: overwrite the file if it already exists.
mssparkutils.fs.put(destination_url, data_str, True)

In [64]:
# metadata = oea.get_metadata_from_path(f'stage1/Transactional/Ed-Fi/{apiVersion}/DistrictId={districtId}/SchoolYear={schoolYear}/metadata-assets')
# spark_schema = oea.to_spark_schema(metadata['students'])