<a href="https://colab.research.google.com/github/DOJO-Smart-Ways/DOJO-Beam-Transforms/blob/pbi-footprint/pbi_footprint/pbi_pfx_dims_pipeline_to_trusted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configs

In [None]:
!pip install git+https://github.com/DOJO-Smart-Ways/DOJO-Beam-Transforms.git@pbi-footprint#egg=dojo-beam-transforms

In [None]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

import os

from google.cloud import bigquery

# Google Auth: authenticate the current Colab user before any GCP client is used.
# NOTE(review): presumably this opens Colab's interactive credential prompt — confirm
# when running outside of Colab, where `google.colab` is not available.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

# GCP Project: expose the project id through the environment for the Google Cloud
# client libraries used in this notebook.
os.environ["GOOGLE_CLOUD_PROJECT"]= 'nidec-ga'

Authenticated


In [None]:
from pipeline_components.input_file import read_bq
from pipeline_components import data_enrichment as de
from pipeline_components import data_cleaning as dc
from pipeline_components import data_understand as du

# Pipeline Options

In [None]:
# Options for the shared Beam pipeline: Dataflow runner in us-central1, with
# staging/temp/template artifacts under the nidec-ga-temp bucket and a custom
# SDK container image.
# NOTE(review): with 'template_location' set, the Dataflow runner presumably
# stages a template at that path instead of launching a job — confirm.
pipeline_options = {
    'project':'nidec-ga',
    'runner':'DataflowRunner',
    'region':'us-central1',
    'staging_location':'gs://nidec-ga-temp/data-flow-pipelines/pbi-footprint/staging',
    'temp_location':'gs://nidec-ga-temp/data-flow-pipelines/pbi-footprint/temp',
    'template_location':'gs://nidec-ga-temp/data-flow-pipelines/pbi-footprint/template/pbi-pfx-bw-dims',
    'sdk_container_image': 'us-central1-docker.pkg.dev/nidec-ga/dojo-beam/dojo_beam',
    'sdk_location': 'container'
}


# Alternative minimal options (project only), kept disabled.
# NOTE(review): presumably used for local/interactive runs — confirm or remove.
# pipeline_options = {
#     'project':'nidec-ga'
# }

# The dict above is replaced by a PipelineOptions object under the same name;
# `pipeline` is the single module-level pipeline that the build functions
# below attach their transforms to and run.
pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
pipeline = beam.Pipeline(options=pipeline_options)

# Aux Methods

In [None]:
def make_path(*args: str) -> str:
    """
    Joins path segments into a Google Cloud Storage URI.

    Args:
        *args (str): Path segments, starting with the bucket name.

    Returns:
        str: "gs://" followed by the segments joined with "/", or an empty
        string when no segments are given.
    """
    # Guard clause: no segments means there is no path to build.
    if not args:
        return ""
    return f"gs://{'/'.join(args)}"

In [None]:
def build_schema(column_tuples):
    """
    Converts (column name, data type) pairs into a BigQuery schema dictionary.

    Args:
        column_tuples (list): Pairs of column name and BigQuery data type,
            e.g. ``[('ORG', 'STRING')]``.

    Returns:
        dict: A dictionary with a single 'fields' key holding one
        ``{'name': ..., 'type': ...}`` entry per input pair, in input order.
    """
    field_entries = [
        {'name': column_name, 'type': column_type}
        for column_name, column_type in column_tuples
    ]
    return {'fields': field_entries}

In [None]:
def get_pbi_dim_material_query(table: str) -> str:
    """
    Returns the SQL that selects the distinct material dimension rows.

    Every selected column except PBI_KEY is wrapped in UPPER(); MATERIAL is
    exposed under the alias COMPONENT.

    Args:
        table (str): Fully qualified source table name. It is interpolated
            into the query between backticks without any escaping.

    Returns:
        str: The SELECT DISTINCT statement reading from ``table``.
    """
    return f"""
      SELECT DISTINCT
        UPPER(ORG) AS ORG,
        UPPER(MATERIAL) AS COMPONENT,
        UPPER(SIMILAR) AS SIMILAR,
        UPPER(MATERIAL_DESCRIPTION) AS MATERIAL_DESCRIPTION,
        UPPER(UOM) AS UOM,
        UPPER(CATEGORY_LOCAL) AS CATEGORY_LOCAL,
        UPPER(FAMILY_LOCAL) AS FAMILY_LOCAL,
        UPPER(DRI_CODE) AS DRI_CODE,
        UPPER(CATEGORY) AS CATEGORY,
        UPPER(FAMILY) AS FAMILY,
        UPPER(`GROUP`) AS `GROUP`,
        PBI_KEY
      FROM
        `{table}`
    """

In [None]:
def get_pbi_dim_vendor_query(table: str) -> str:
    """
    Returns the SQL that selects the distinct vendor dimension rows.

    The vendor columns are selected as stored, without any transformation.

    Args:
        table (str): Fully qualified source table name. It is interpolated
            into the query between backticks without any escaping.

    Returns:
        str: The SELECT DISTINCT statement reading from ``table``.
    """
    return f"""
        SELECT DISTINCT
          VENDOR,
          ORG,
          VENDOR_COUNTRY,
          REGIONAL_DIRECTOR,
          VENDOR_LOCAL_NAME,
          VENDOR_NAME,
          VENDOR_TYPE,
          VENDOR_MANAGEMENT,
          REGIONAL_ACCOUNT_MANAGER,
          GLOBAL_ACCOUNT_MANAGER,
          GLOBAL_DIRECTOR,
          LEAD_NEGOTIATOR,
          COMMERCIAL_STRATEGY
        FROM
          `{table}`
      """


# Pipeline builds

In [None]:
# Material dimension: read from the source table, write to BigQuery, run the pipeline
def build_material_dim(source_table, target_table, temp_location):
    """
    Reads the material dimension from BigQuery, writes it to the target
    table and runs the pipeline.

    Args:
        source_table (str): Table the material query selects from.
        target_table (str): BigQuery table that receives the result. It is
            created if needed and truncated before writing.
        temp_location (str): GCS location passed to both the read step and
            the BigQuery write step as temporary storage.

    NOTE(review): this attaches its steps to the module-level `pipeline` and
    then calls `run()` on it, so any steps added to that same pipeline by
    earlier calls are presumably executed again — confirm this is intended.
    """
    material_rows = read_bq(
        pipeline,
        get_pbi_dim_material_query(source_table),
        temp_location,
        identifier="DIM Material",
    )

    # Every output column is a STRING, so only the names need listing.
    column_names = (
        'ORG',
        'COMPONENT',
        'SIMILAR',
        'MATERIAL_DESCRIPTION',
        'UOM',
        'CATEGORY_LOCAL',
        'FAMILY_LOCAL',
        'DRI_CODE',
        'CATEGORY',
        'FAMILY',
        'GROUP',
        'PBI_KEY',
    )
    table_schema = build_schema([(column_name, 'STRING') for column_name in column_names])

    bigquery_sink = beam.io.WriteToBigQuery(
        target_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        custom_gcs_temp_location=temp_location,
    )
    material_rows | 'Write To BigQuery for pbi_dim_material' >> bigquery_sink

    # Blocks until the submitted pipeline run reports completion.
    pipeline.run().wait_until_finish()

In [None]:
# Vendor dimension: read from the source table, write to BigQuery, run the pipeline
def build_vendor_dim(source_table, target_table, temp_location):
    """
    Reads the vendor dimension from BigQuery, writes it to the target table
    and runs the pipeline.

    Args:
        source_table (str): Table the vendor query selects from.
        target_table (str): BigQuery table that receives the result. It is
            created if needed and truncated before writing.
        temp_location (str): GCS location passed to both the read step and
            the BigQuery write step as temporary storage.

    NOTE(review): this attaches its steps to the module-level `pipeline` and
    then calls `run()` on it, so any steps added to that same pipeline by
    earlier calls are presumably executed again — confirm this is intended.
    """
    vendor_rows = read_bq(
        pipeline,
        get_pbi_dim_vendor_query(source_table),
        temp_location,
        identifier="DIM Vendor",
    )

    # Every output column is a STRING, so only the names need listing.
    column_names = (
        'VENDOR',
        'ORG',
        'VENDOR_COUNTRY',
        'REGIONAL_DIRECTOR',
        'VENDOR_LOCAL_NAME',
        'VENDOR_NAME',
        'VENDOR_TYPE',
        'VENDOR_MANAGEMENT',
        'REGIONAL_ACCOUNT_MANAGER',
        'GLOBAL_ACCOUNT_MANAGER',
        'GLOBAL_DIRECTOR',
        'LEAD_NEGOTIATOR',
        'COMMERCIAL_STRATEGY',
    )
    table_schema = build_schema([(column_name, 'STRING') for column_name in column_names])

    bigquery_sink = beam.io.WriteToBigQuery(
        target_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        custom_gcs_temp_location=temp_location,
    )
    vendor_rows | 'Write To BigQuery pbi_dim_vendor' >> bigquery_sink

    # Blocks until the submitted pipeline run reports completion.
    pipeline.run().wait_until_finish()


In [None]:
# Call Pipeline and pass parameters
# Entry point: builds both dimension tables from the same refined source table.
if __name__ == '__main__':
  # GCS temp location handed to the BigQuery read and write steps.
  # NOTE(review): this path uses "data_flow_pipelines" (underscores) while the
  # pipeline options' temp_location uses "data-flow-pipelines" (hyphens) —
  # confirm whether the two are meant to point at the same folder.
  temp_location = make_path("nidec-ga-temp", "data_flow_pipelines", "pbi-footprint", "temp")

  # Source table shared by both dimension builds.
  pbi_source_table = "nidec-ga.bq_refined.PBI_PFX"

  # Material dimension: target table, then build and run.
  dim_material_output_table = "nidec-ga.bq_trusted.PBI_MATERIAL_DIM_DF"
  build_material_dim(pbi_source_table, dim_material_output_table, temp_location)

  # Vendor dimension: target table, then build and run.
  # NOTE(review): both build functions run the same module-level pipeline, so
  # this second call presumably re-submits the material steps too — confirm.
  dim_vendors_output_table = "nidec-ga.bq_trusted.PBI_VENDORS_DIM_DF"
  build_vendor_dim(pbi_source_table, dim_vendors_output_table, temp_location)