In [None]:
import logging
from typing import List, Optional, Dict
import re
import pandas as pd
from metadata.generated.schema.api.data.createPipeline import CreatePipelineRequest
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
from metadata.generated.schema.entity.data.pipeline import Pipeline
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.pipelineService import PipelineService
from metadata.generated.schema.type.entityLineage import ColumnLineage, EntitiesEdge, LineageDetails
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.ometa.ometa_api import OpenMetadata

In [None]:
# config
# Path of the lineage CSV file, located under DS_ROOT_PATH (defined outside this cell).
lineage_path = f"{DS_ROOT_PATH}/lineage_test.csv"
# Schema fully qualified name, assembled as <db service>.<database>.<schema>.
schema_fqn = f"{DB_SERVICE_NAME}.{DB_NAME}.{SCHEMA_NAME}"

In [None]:


def get_pipeline_entity_by_fqn(om_conn: OpenMetadata, pipeline_fqn: str):
    """
    This function takes a pipeline fqn, then looks up the matching pipeline entity
    :param om_conn: openmetadata connection
    :type om_conn: OpenMetadata
    :param pipeline_fqn: pipeline fully qualified name
    :type pipeline_fqn: str
    :return: the result of ``om_conn.get_by_name`` for the Pipeline entity type
    :rtype:
    """
    pipeline_entity = om_conn.get_by_name(entity=Pipeline, fqn=pipeline_fqn)
    return pipeline_entity


# list the distinct values of the pipeline fqn column (OM_PIPE_FQN_COL_NAME)
def get_df_pipeline_fqn(lineage_df):
    """
    This function collects the distinct values found in the OM_PIPE_FQN_COL_NAME
    column of the lineage dataframe and returns them as a plain python list
    :param lineage_df: lineage dataframe which contains the OM_PIPE_FQN_COL_NAME column
    :type lineage_df:
    :return: the distinct values of the column, in order of first appearance
    :rtype: list
    """
    pipeline_fqn_col = lineage_df[OM_PIPE_FQN_COL_NAME]
    distinct_fqns = pipeline_fqn_col.unique()
    return distinct_fqns.tolist()


def get_query_summary(code_ref_file_path: str) -> Optional[str]:
    """
    This function takes the code ref (a file path), reads the script and builds a summary
    made of the string literals assigned to variables named query1, query2, query3, etc.
    The queries are joined with "; \\n " in the order they appear in the file.

    Single, double and triple quoted literals are supported. The closing quote must be the
    same as the opening one, so a query can contain the other kind of quote. Escaped quotes
    inside a literal and prefixed literals (e.g. f"...") are not handled.
    :param code_ref_file_path: path of the script which transforms the table
    :type code_ref_file_path: str
    :return: the joined queries, or an empty string if the path is empty or no query is found
    :rtype: str
    :raises FileNotFoundError: if the given path does not exist
    """
    if not code_ref_file_path:
        return ""
    # group 1 captures the opening quote (triple quotes are tried first), group 2 the query
    # text, and the \1 back-reference forces the closing quote to match the opening one.
    # \b avoids matching names which only end with "query<N>" (e.g. subquery1).
    query_pattern = re.compile(r'\bquery\d+\s*=\s*("""|\'\'\'|"|\')(.*?)\1', re.DOTALL)
    # python sources are utf-8 by default, do not depend on the locale encoding
    with open(code_ref_file_path, "r", encoding="utf-8") as file:
        file_content = file.read()

    queries = [match.group(2) for match in query_pattern.finditer(file_content)]
    # join instead of accumulating, so the summary does not start with a separator;
    # empty literals are skipped as they carry no information
    return "; \n ".join(query for query in queries if query)


def get_pipeline_service_by_name(om_conn: OpenMetadata, pipeline_service_fqn: str) -> PipelineService:
    """
    This function looks up a pipeline service entity from its fqn
    :param om_conn: openmetadata connection
    :type om_conn: OpenMetadata
    :param pipeline_service_fqn: pipeline service fully qualified name
    :type pipeline_service_fqn: str
    :return: the result of ``om_conn.get_by_name`` for the PipelineService entity type
    :rtype: PipelineService
    """
    pipeline_service = om_conn.get_by_name(entity=PipelineService, fqn=pipeline_service_fqn)
    return pipeline_service


def get_table_by_name(om_conn: OpenMetadata, table_fqn: str) -> Table:
    """
    This function looks up a table entity from its fqn
    :param om_conn: openmetadata connection
    :type om_conn: OpenMetadata
    :param table_fqn: table fully qualified name
    :type table_fqn: str
    :return: the result of ``om_conn.get_by_name`` for the Table entity type
    :rtype: Table
    """
    table_entity = om_conn.get_by_name(entity=Table, fqn=table_fqn)
    return table_entity


def generate_pipeline_params(code_ref: str) -> Dict:
    """
    This function builds the parameters used to create a pipeline. The code ref is only used
    as a switch: the returned values are fixed and do not depend on its content.
    :param code_ref: the code reference (e.g. file path, url)
    :type code_ref: str
    :return: a dict with the keys "name", "description", "air_url" and "pipeline_loc",
             or None if the code ref is empty
    :rtype: Dict
    """
    # nothing to describe without a code reference
    if not code_ref:
        return None

    dag_id = "snds-constance-transformation"
    return {
        "name": dag_id,
        "description": "this pipeline transform the snds raw data into constance simplyfy table",
        "air_url": f"https://meta-ingestion.casd.local/dags/{dag_id}/grid",
        "pipeline_loc": "/opt/airflow/dags/airflow_metadata_extraction.py",
    }


def build_query_pipeline(om_conn: OpenMetadata, code_ref: str, pipeline_service_fqn: str) -> Pipeline:
    """
    This function takes the code ref, builds a pipeline from the parameters generated for
    this ref, then creates (or updates) it in the OM server under the given pipeline service
    :param om_conn: Open metadata server connexion
    :type om_conn: OpenMetadata
    :param code_ref: A fqn of the code reference(e.g. file path, url)
    :type code_ref: str
    :param pipeline_service_fqn: The target pipeline service
    :type pipeline_service_fqn: str
    :return: the generated pipeline
    :rtype: Pipeline
    :raises ValueError: if no pipeline parameters can be generated from the code ref, or if
                        the pipeline service can not be found in the OM server
    """
    # generate_pipeline_params returns None for an empty code ref: fail with a clear message
    # instead of a TypeError when the params are read below
    pipeline_params = generate_pipeline_params(code_ref)
    if not pipeline_params:
        raise ValueError(f'Can not build a pipeline from the code reference {code_ref!r}')

    # the lookup yields nothing when the service is unknown: fail before dereferencing it
    pipeline_service = get_pipeline_service_by_name(om_conn, pipeline_service_fqn)
    if pipeline_service is None:
        raise ValueError(f'Can not find the pipeline service {pipeline_service_fqn} in the OM server')

    create_pipeline = CreatePipelineRequest(
        # pipeline name generated from the code reference
        name=pipeline_params["name"],
        # pipeline description generated from the code reference
        description=pipeline_params["description"],
        sourceUrl=pipeline_params["air_url"],
        concurrency=5,
        pipelineLocation=pipeline_params["pipeline_loc"],
        service=pipeline_service.fullyQualifiedName, )
    return om_conn.create_or_update(data=create_pipeline)


def build_column_lineage(lineage_df: pd.DataFrame, source_tab_fqn: str, dest_tab_fqn: str) -> List[ColumnLineage]:
    """
    This function takes a filtered lineage dataframe which only contains one pair of source and dest table, it builds
    all corresponding column lineage of the given pair. If no column lineage provided, return an empty list.
    Rows where the source or the dest column name is missing (NaN) are ignored, so they can
    not produce a column fqn such as "<table>.nan".
    :param lineage_df: A filtered table/column lineage dataframe, it must contain the columns
                       named by SRC_COL_NAME and DEST_COL_NAME
    :type lineage_df: pd.Dataframe
    :param source_tab_fqn: fully qualified name of the source table, used as prefix of the source columns
    :type source_tab_fqn: str
    :param dest_tab_fqn: fully qualified name of the dest table, used as prefix of the dest columns
    :type dest_tab_fqn: str
    :return: one ColumnLineage per distinct dest column
    :rtype: List[ColumnLineage]
    """
    # keep only the rows which describe a complete column to column link
    column_rows = lineage_df.dropna(subset=[SRC_COL_NAME, DEST_COL_NAME])
    if column_rows.empty:
        return []

    # group by the dest col name, and collect all linked source col in a list
    sources_by_dest = column_rows.groupby(DEST_COL_NAME)[SRC_COL_NAME].agg(lambda x: list(x.unique()))

    col_lineage_list = []
    for dest_col_name, source_col_names in sources_by_dest.items():
        # complete the column fqn with the table fqn
        col_lineage_list.append(
            ColumnLineage(
                fromColumns=[f"{source_tab_fqn}.{col_name}" for col_name in source_col_names],
                toColumn=f"{dest_tab_fqn}.{dest_col_name}"))
    return col_lineage_list


def find_pipeline_fqn_by_dag(dag_file: str, om_conn: OpenMetadata):
    """
    This function extracts the dag id from a dag file (via get_dag_info), builds the
    pipeline fqn as "<PIPELINE_SERVICE_NAME>.<dag id>" and checks that a pipeline entity
    with this fqn exists inside the OM server.
    :param dag_file: path of the dag file
    :type dag_file: str
    :param om_conn: openmetadata connection
    :type om_conn: OpenMetadata
    :return: the pipeline fqn if the entity exists, None if dag_file is None or empty
    :rtype:
    :raises FileNotFoundError: re-raised from get_dag_info after logging
    :raises ValueError: re-raised from get_dag_info after logging, or raised when the
                        pipeline entity does not exist in the OM server
    """
    # no dag file, no pipeline to look for
    if dag_file is None or dag_file == '':
        return None

    try:
        dag_id = get_dag_info(dag_file)
    except FileNotFoundError:
        logger.error(f'DAG file {dag_file} not found')
        raise
    except ValueError:
        logger.error(f'DAG file {dag_file} has invalid format')
        raise

    pipeline_fqn = f"{PIPELINE_SERVICE_NAME}.{dag_id}"
    pipeline_entity = get_pipeline_entity_by_fqn(om_conn, pipeline_fqn)
    if not pipeline_entity:
        raise ValueError(
            f'Can not find the pipeline {pipeline_fqn} in the OM server. Load the dag file to airflow first please')

    # the fqn is returned rather than the entity id, because type(pipeline_entity.id) is
    # metadata.generated.schema.type.basic.Uuid, which can't be stored in a dataframe
    logger.info(f"find the pipeline {pipeline_entity.id}")
    return pipeline_fqn