# About this Notebook

- This notebook contains the common functions used by the main ETL notebook, [nb-city-safety.ipynb](./nb-city-safety.ipynb).
- Unit test cases for this notebook are created in another notebook - [test_nb-city-safety-common.ipynb](../../tests/test_nb-city-safety-common.ipynb).

To do:
- Ensure type formatting
- Add docstrings

## External parameters

In [None]:
# Execution-mode switch read by the later cells of this notebook. The value
# "testing" enables the dummy parameters and the development-only formatter.
common_execution_mode = "testing"  # Global for controlling the execution of notebook cells.

In [None]:
# Enable the following for local testing
# Placeholder values for the parameters that the functions in this notebook
# read as globals. They are assigned only when running in "testing" mode;
# presumably the calling notebook supplies the real values otherwise.
if common_execution_mode == "testing":
    onelake_table_path = "dummy_path"
    table_name = "test_table"
    onelake_name = "onelake_name"
    workspace_name = "dummy_workspace"
    lakehouse_name = "dummy_lakehouse"
    job_exec_instance = "dummy_exec_id"
    user_name = "current_user"

## Formatting the Notebook - Run only when developing the Notebook
Reference: https://learn.microsoft.com/en-us/fabric/data-engineering/author-notebook-format-code#extend-fabric-notebooks 


In [None]:
#  %load_ext jupyter_black
# Load the jupyter_black formatter only in "testing" mode, so the import is
# not attempted in other execution modes.
if common_execution_mode == "testing":
    import jupyter_black

    jupyter_black.load()

## Common functions

- Note that the statements below include some additional imports that are not used here but are used by the main code. This is *NOT* good practice; generally, imports should be placed where they are used.

In [None]:
import sys
from delta.tables import DeltaTable
import logging
from typing import Optional, Union
from opentelemetry import trace
from opentelemetry.trace.status import StatusCode
from opentelemetry.trace import SpanKind
import requests
import urllib.parse
import re
from pyspark.sql.functions import (
    lit,
    to_utc_timestamp,
    current_timestamp,
    unix_timestamp,
    avg,
    max,
    min,
    sum,
    count,
)

### Common utility functions

- NOTE: Consider moving this to a separate file

In [None]:
def query_app_insights(
    log_workspace_id: str, query: str, timedelta_in_mins: Optional[int] = 15
) -> Union[None, int]:
    """Run a workspace log query and return the first cell of its first row.

    The query is sent through the globally available ``client`` object and is
    limited to the most recent ``timedelta_in_mins`` minutes. ``client``,
    ``timedelta``, ``LogsQueryStatus`` and ``HttpResponseError`` are not
    imported in this notebook and must already be in scope.

    Args:
        log_workspace_id: Identifier of the workspace to query.
        query: Query text, passed unchanged to ``client.query_workspace``.
        timedelta_in_mins: Size of the look-back window in minutes.

    Returns:
        The value at row 0, column 0 of the first result table. ``None`` is
        returned when the query does not fully succeed, when it yields no
        tables or rows, or when it fails with ``HttpResponseError``.
    """
    # Only the service call itself is expected to raise HttpResponseError.
    try:
        response = client.query_workspace(
            log_workspace_id, query, timespan=timedelta(minutes=timedelta_in_mins)
        )
    except HttpResponseError as err:
        print("something fatal happened")
        print(err)
        return None

    if response.status != LogsQueryStatus.SUCCESS:
        # Partial result: report the error; there is no trustworthy value.
        print(response.partial_error)
        return None

    tables = response.tables
    # An empty result set has no first row to read, so report "no value"
    # instead of failing with IndexError.
    if not tables or not tables[0].rows:
        return None

    return tables[0].rows[0][0]


def store_unit_test_results(unit_tests_results: object) -> None:
    """Print a one-line summary parsed from the tail of a test run's output.

    Only the last three newline-separated segments of
    ``unit_tests_results.stdout`` are examined. ANSI colour sequences and
    ``=`` characters are removed before the error, passed and failed counts
    and the runtime text are extracted. A count that is not present is
    reported as 0, and a missing runtime as ``"N/A"``.

    Args:
        unit_tests_results: Object exposing the captured output as a string
            attribute named ``stdout``.
    """
    tail = " ".join(unit_tests_results.stdout.split("\n")[-3:])
    summary = re.sub(r"\x1b\[[0-9;]*m", "", tail).replace("=", "")

    def first_count(pattern: str) -> int:
        # First number captured by `pattern` in the summary, or 0 if absent.
        found = re.search(pattern, summary)
        return int(found.group(1)) if found else 0

    num_errors = first_count(r"(\d+)\s+error")
    num_passes = first_count(r"(\d+)\s+passed")
    num_fails = first_count(r"(\d+)\s+failed")

    elapsed = re.search(r"\s+in\s+(.+)\s+", summary)
    runtime = elapsed.group(1).strip() if elapsed else "N/A"

    # TO DO: Store these results somewhere
    # Details to include: workspace details, deployment identifiers, release name, configs/params used, user information, test statuses etc
    print(f"{num_errors =}, {num_passes =}, {num_fails =}, {runtime =}")


# Generic helper for calling Fabric REST APIs (for example, to get a workspace id)
def make_fabric_api_call(token: str, url: str, call_type: str, payload: str) -> object:
    """Send a bearer-authenticated GET or PUT request and return the response.

    Args:
        token: Access token placed in the ``Authorization: Bearer`` header.
        url: Target URL of the request.
        call_type: Either ``"get"`` or ``"put"`` (exact, lower-case match).
        payload: Request body, passed to ``requests`` as ``data``.

    Returns:
        The response object returned by ``requests``. The status code is not
        inspected here.

    Raises:
        ValueError: If ``call_type`` is neither ``"get"`` nor ``"put"``.
        Exception: Any error raised while sending the request is logged and
            re-raised unchanged.
    """
    auth_headers = {"Authorization": f"Bearer {token}"}
    try:
        # Validation stays inside the try so that an invalid call type is
        # logged like any other failure before it propagates.
        if call_type not in ("get", "put"):
            raise ValueError(
                f"Invalid {call_type = }. It must be either 'get' or 'put'."
            )
        send = requests.get if call_type == "get" else requests.put
        response = send(url, data=payload, headers=auth_headers)
    except Exception as exc:
        logger.error(f"Failed with error {exc}")
        raise
    return response


def verify_onelake_connection():
    """Check that the target lakehouse table path exists, exiting the notebook if not.

    Reads the globals ``onelake_table_path``, ``onelake_name``,
    ``workspace_name``, ``lakehouse_name`` and ``wasbs_path``. On failure the
    exception is logged and recorded on the current span, the span is marked
    as an error and ``notebookutils.notebook.exit`` is called with an
    explanatory message. On success the span is marked OK and a listing of
    ``wasbs_path`` is logged.
    """
    active_span = trace.get_current_span()

    # Message handed to the notebook exit call when the check fails.
    exit_message = f"Specfied lakehouse table path {onelake_table_path} doesn't exist. Ensure onelake={onelake_name}, workspace={workspace_name} and lakehouse={lakehouse_name} exist."
    try:
        path_found = notebookutils.fs.exists(onelake_table_path)
        if not path_found:
            raise ValueError(
                "Encountered error while checking for Lakehouse table path specified."
            )
    except Exception as exc:
        logger.exception(f"Error message: {exc}")
        active_span.record_exception(exc)
        active_span.set_status(
            StatusCode.ERROR, "Onelake connection verification failed."
        )
        # no further execution but Session is still active
        notebookutils.notebook.exit(exit_message)
    else:
        active_span.set_status(StatusCode.OK)
        source_listing = notebookutils.fs.ls(wasbs_path)
        logger.info(
            f"Target table path: {onelake_table_path} is valid and exists.\nListing source data contents to check connectivity\n{source_listing}"
        )


def identify_table_load_mode(table_name: str, span_obj: object) -> str:
    """Choose the write mode for a table and record it on the given span.

    The check is made against the Delta path
    ``{onelake_table_path}/{table_name}`` rather than a default lakehouse.

    Args:
        table_name: Name of the table under the global ``onelake_table_path``.
        span_obj: Span on which the ``load_mode`` attribute is set. It is
            passed in explicitly instead of being looked up with
            ``trace.get_current_span()``.

    Returns:
        ``"append"`` if the path is already a Delta table, otherwise
        ``"overwrite"``.
    """
    table_path = f"{onelake_table_path}/{table_name}"
    table_exists = DeltaTable.isDeltaTable(spark, table_path)
    load_mode = "append" if table_exists else "overwrite"

    span_obj.set_attribute("load_mode", load_mode)

    return load_mode


def delete_delta_table(table_name: str) -> None:
    """Recursively delete a Delta table's directory if it exists.

    The directory is ``{onelake_table_path}/{table_name}``, built from the
    global ``onelake_table_path``. Nothing is returned; the outcome is
    reported through the logger.

    Args:
        table_name: Name of the table to remove.

    Raises:
        Exception: Any error raised by ``notebookutils.fs.rm`` is logged and
            re-raised unchanged.
    """
    delta_table_path = f"{onelake_table_path}/{table_name}"

    if not notebookutils.fs.exists(delta_table_path):
        logger.info("The specified delta table doesn't exist. No need for deletion.")
        return

    logger.info(
        f"Attempting to delete existing delta table with {delta_table_path = }...."
    )
    try:
        notebookutils.fs.rm(dir=delta_table_path, recurse=True)
    except Exception as e:
        logger.error(f"Deletion failed with the error:\n===={e}\n=====")
        raise

    logger.info(f"Deleted existing delta table: {table_name}.")

### Common functions specific to this business process

NOTE: These functions assume all required parameters are set outside of this code. Note that `tracer` is used as a decorator for one of the functions. So, this must be already set before running these functions.

In [None]:
def transform_data(city: str, data_frame: object) -> object:
    """Add UTC-timestamp and audit columns to a city's data frame.

    Columns added, in order: ``dateTimeUTC`` (the ``dateTime`` column
    converted to UTC from the city's timezone), ``City``, ``jobExecId``,
    ``lastUpdateUTC`` (the current timestamp converted to UTC from the Spark
    session timezone) and ``lastUpdateUser``. The globals
    ``job_exec_instance``, ``user_name`` and ``spark`` are read.

    Args:
        city: City name. Boston and NewYorkCity map to America/New_York,
            Seattle and SanFrancisco map to America/Los_Angeles, and any
            other value maps to America/Chicago.
        data_frame: Frame that has a ``dateTime`` column.

    Returns:
        The frame with the additional columns.
    """
    # Need timezone to convert to UTC
    zones_by_city_group = (
        (("Boston", "NewYorkCity"), "America/New_York"),
        (("Seattle", "SanFrancisco"), "America/Los_Angeles"),
    )
    timezone = next(
        (zone for names, zone in zones_by_city_group if city in names),
        "America/Chicago",
    )

    with_event_utc = data_frame.withColumn(
        "dateTimeUTC", to_utc_timestamp(data_frame.dateTime, timezone)
    )
    with_city = with_event_utc.withColumn("City", lit(city))
    with_job_id = with_city.withColumn("jobExecId", lit(job_exec_instance))

    session_timezone = spark.conf.get("spark.sql.session.timeZone")
    with_update_time = with_job_id.withColumn(
        "lastUpdateUTC", to_utc_timestamp(current_timestamp(), session_timezone)
    )

    return with_update_time.withColumn("lastUpdateUser", lit(user_name))


def city_data_etl(table_name: str, cities: tuple, current_span: object):
    """Extract, transform and load the data of each city into one Delta table.

    For every city, a start and an end event are added to ``current_span``
    and the work runs inside a child span named ``etl_steps_city-<city>``.
    The city's parquet data is read from ``{wasbs_path}/city=<city>``, passed
    through ``transform_data`` and written to
    ``{onelake_table_path}/{table_name}`` using the mode chosen by
    ``identify_table_load_mode``. The globals ``tracer``, ``spark``,
    ``wasbs_path`` and ``onelake_table_path`` must already be set.

    Args:
        table_name: Name of the target Delta table.
        cities: City names to process, in order.
        current_span: Parent span that receives the per-city events.
    """
    target_path = f"{onelake_table_path}/{table_name}"

    for city in cities:
        current_span.add_event(name=f"etl start for {city}", attributes=None)
        logger.info(f"ETL started for {city = }.")

        # One child span per city, opened with a "with" context.
        with tracer.start_as_current_span(f"etl_steps_city-{city}") as city_span:
            city_span.set_attribute("city_name", city)

            # Extract
            logger.info("\t Data Extraction in progress.")
            source_path = f"{wasbs_path}/city={city}"
            city_span.add_event(
                name="Data Extraction in progress.",
                attributes={"etl.city.source_path": source_path},
            )
            raw_df = spark.read.parquet(source_path)

            # Transform
            rows_read = raw_df.count()
            logger.info(f"\t Read {rows_read} records for {city = }.")
            city_span.add_event(
                name="Data transformation in progress.",
                attributes={"record_count": rows_read},
            )
            transformed_df = transform_data(city, raw_df)

            # Load
            write_mode = identify_table_load_mode(table_name, city_span)
            logger.info(f"\t Data loading in inprogress using {write_mode} mode.")
            city_span.add_event(
                name="Data loading in progress.", attributes={"delta_mode": write_mode}
            )
            transformed_df.write.format("delta").mode(write_mode).save(target_path)

            city_span.set_status(StatusCode.OK)

        current_span.add_event(name=f"etl end for {city}", attributes=None)
        logger.info(f"ETL completed for {city = }.")


# The span is created by the decorator, so `tracer` must already be defined
# and globally available when this definition is executed.
@tracer.start_as_current_span("etl_steps")
def etl_steps(table_name: str, cities: list, cleanup: Optional[bool] = True) -> None:
    """Run the full ETL for the given cities inside the ``etl_steps`` span.

    The table name and cleanup flag are recorded as span attributes. When
    ``cleanup`` is truthy the existing Delta table is deleted first. The
    per-city ETL then runs and the span is marked OK.

    Args:
        table_name: Name of the target Delta table.
        cities: City names to load.
        cleanup: Whether to delete the existing table before loading.
    """
    span = trace.get_current_span()
    span.set_attributes({"etl.table_name": table_name, "etl.cleanup": cleanup})

    delta_table_path = f"{onelake_table_path}/{table_name}"
    if not cleanup:
        logger.info("No request for cleanup. Proceeding to ETL steps.")
    else:
        # Optionally delete existing contents
        delete_delta_table(table_name)
        logger.info(
            f"A new delta table '{table_name}' will be created with {delta_table_path = }"
        )

    city_data_etl(table_name, cities, span)

    logger.info(
        f"\n=====\nCity safety data is loaded into {table_name =} for {cities =}\n====="
    )
    span.set_status(StatusCode.OK)


def gather_city_level_metrics(table_name: str, counter: object) -> None:
    """Count the records loaded by the current job and report the total.

    Rows of the Delta table at ``{onelake_table_path}/{table_name}`` whose
    ``jobExecId`` equals the global ``job_exec_instance`` are counted per
    city. The overall total is attached as the ``record_count_total``
    attribute of a single ``counter.add`` call, the per-city counts are
    displayed and the total is logged.

    Args:
        table_name: Name of the Delta table to inspect.
        counter: Metric instrument exposing ``add(amount=..., attributes=...)``.
    """
    delta_table = spark.read.format("delta").load(f"{onelake_table_path}/{table_name}")
    logger.info(f"Gathering metrics for: {table_name =} where {job_exec_instance = }")
    # `count` and `sum` are the pyspark.sql.functions versions imported at
    # the top of this notebook, not the Python builtins.
    city_metrics = (
        delta_table.filter(delta_table.jobExecId == job_exec_instance)
        .groupBy("city")
        .agg(count("*").alias("count"))
    )

    # Collect the total once: each collect() runs the aggregation again, and
    # reusing the value keeps the metric and the log line in agreement.
    record_count_total = city_metrics.agg(sum("count")).collect()[0][0]

    # add the metric
    counter.add(amount=1, attributes={"record_count_total": record_count_total})
    display(city_metrics)
    logger.info(f"total:{record_count_total}")

    return None

## Control the execution flow

- Control the execution behavior when this is used as a notebook as opposed to a Python script/module. The following code lets users run only the function definitions, skipping any other execution steps, based on user arguments.



In [None]:
if __name__ == "__main__":
    # Each branch sets its own marker variable (run_mode1 or run_mode2) that
    # the calling notebook can see, then reports which mode is in effect.
    if common_execution_mode != "testing":
        # your non-testing related goes here.
        run_mode2 = common_execution_mode  # dummy parameter visible to the calling notebook
        print(f"{common_execution_mode = }. '__main__' code will be run.")
    else:
        # your testing related code goes here. These can be unit testcases if you are using same notebook for testing as well.
        run_mode1 = common_execution_mode  # dummy parameter visible to the calling notebook
        print(f"{common_execution_mode = }. '__main__' code will not be run.")