In [1]:
# !pip3 install google-cloud-aiplatform --upgrade
# !pip3 install kfp google-cloud-pipeline-components==0.1.1 --upgrade

In [2]:
# Importaciones 
from typing import NamedTuple
from kfp.v2 import dsl
import kfp
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components import aiplatform as gcc_aip

## Constants

In [3]:
# Google Cloud project the pipeline job is created in.
PROJECT_ID = "tc-sc-bi-bigdata-corp-tsod-dev"
# Location passed to the Vertex AI pipeline job.
REGION = "us-west1"
# Cloud Storage bucket URI that the pipeline root is built from.
BUCKET = "gs://test_pod_images"

# Folder under BUCKET handed to the pipeline as its `pipeline_root`.
PIPELINE_ROOT = BUCKET + "/PIPELINES/"

# Get data Component

In [4]:
@component(
    packages_to_install=[
        # Only third-party distributions belong here. `datetime` and
        # `multiprocessing` are standard-library modules; the PyPI project
        # called `multiprocessing` is a Python 2 backport that does not
        # install on Python 3.
        "numpy",
        "pandas",
        # NOTE(review): `opencv-python` may need system GL libraries that the
        # base image might not ship; confirm, or consider the headless wheel.
        "opencv-python",
        # Both are imported by the component body but are not part of the
        # plain python:3.9 image. The `pandas` extra installs what
        # `to_dataframe()` requires.
        "google-cloud-bigquery[pandas]",
        "google-cloud-storage",
    ],
    base_image="python:3.9",
    output_component_file="get_data.yaml"
)
def get_data_parallel(query: str):
    """Download the images listed by a BigQuery query and store them in GCS.

    The query result must expose a ``url`` column. Up to 30 distinct URLs are
    downloaded concurrently, re-encoded as PNG and uploaded to the
    ``pod_images`` bucket under a folder named after the current date
    (``YYYYMMDD``). A URL that fails at any stage is logged and skipped; it
    does not abort the remaining downloads.

    Args:
        query: SQL statement executed with the BigQuery client.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container.
    import datetime
    import logging
    import urllib.request
    from multiprocessing import cpu_count
    from multiprocessing.pool import ThreadPool

    import cv2
    import numpy as np

    from google.cloud import bigquery
    from google.cloud import storage

    # Upper bound on how many distinct URLs are processed per run.
    max_images = 30

    # strftime zero-pads month and day, so the folder is always YYYYMMDD.
    # (The previous hand-built name inserted a literal "0" before the day and
    # left the month unpadded, which is only right for days 1-9 of Oct-Dec.)
    date_prefix = datetime.datetime.now().strftime("%Y%m%d")
    client = bigquery.Client()

    def get_data(url):
        """Fetch one image URL, re-encode it as PNG and upload it to GCS."""
        try:
            file_name = url.split('/')[-1]
            # Close the HTTP response as soon as the payload has been read.
            with urllib.request.urlopen(url) as response:
                payload = response.read()

            image_cv = np.asarray(bytearray(payload), dtype="uint8")
            image = cv2.imdecode(image_cv, cv2.IMREAD_COLOR)
            if image is None:
                raise ValueError("payload could not be decoded as an image")

            # NOTE(review): imencode treats its input as BGR, so converting to
            # RGB first stores the PNG with red and blue swapped. Kept as in
            # the original; confirm whether downstream consumers rely on it.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            encoded_ok, png_buffer = cv2.imencode('.png', image)
            if not encoded_ok:
                raise ValueError("image could not be encoded as PNG")

            storage_client = storage.Client()
            bucket = storage_client.bucket('pod_images')
            blob = bucket.blob(f'{date_prefix}/{file_name}')
            # tobytes() replaces the deprecated tostring() alias. The explicit
            # content type avoids the 'text/plain' default for binary data.
            blob.upload_from_string(png_buffer.tobytes(), content_type='image/png')
        except Exception:
            # Best effort: one bad URL must not stop the others, but the
            # failure is logged instead of being silently discarded.
            logging.exception("Skipping image %s", url)

    df_images = client.query(query).to_dataframe()
    urls = df_images.drop_duplicates(['url']).loc[:, 'url'].iloc[:max_images]

    # Leave one CPU free, but never ask for fewer than one worker
    # (ThreadPool(0) raises ValueError on a single-CPU machine).
    workers = max(1, cpu_count() - 1)
    with ThreadPool(workers) as pool:
        # imap_unordered returns a lazy iterator. Draining it here makes the
        # component wait for every upload before the pool is torn down.
        for _ in pool.imap_unordered(get_data, urls):
            pass

# Build Pipeline

In [5]:
@kfp.dsl.pipeline(
    name="get-data",  # Your own naming for the pipeline.
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(query: str):
    """Single-step pipeline that forwards `query` to `get_data_parallel`.

    Args:
        query: SQL statement handed unchanged to the download component.
    """
    # The only step of the pipeline: the image download component.
    download_step = get_data_parallel(query=query)

In [6]:
# Compile the `pipeline` function into the job spec file "get-data.json".
compiler.Compiler().compile(package_path="get-data.json", pipeline_func=pipeline)



In [7]:
# Query handed to the pipeline as its `query` parameter. It selects the image
# URLs (plus shipment, provider and driver fields) of transport orders whose
# event date, in Santiago local time, is two days before today.
# CURRENT_DATE takes the same time zone as the left-hand side; without it the
# comparison mixed a Santiago-local date with the UTC calendar date.
sql = """
SELECT DISTINCT transport_ord_id as SOC, i.url as url, shipment.plate_num as plate_num, 
provider.doc_id as provider_id, 
provider.doc_verify_digit as provider_verify_digit,
provider.name as provider_name, driver.doc_id as driver_id, 
driver.doc_verify_digit as driver_verify_digit,
driver.name as driver_name, driver.last_name as driver_last_name,
DATETIME(event_crte_tmst, 'America/Santiago') as event_crte_tmst, dfl_crte_tmst
FROM 
`tc-sc-bi-bigdata-corp-tsod-dev.image_recognition.btd_scha_fal_trmg_api_transport_order_temp`,
unnest(image) as i
 
WHERE
  i.url is not null
  and provider.name is not null
  and provider.doc_id is not null
  and DATE(event_crte_tmst, 'America/Santiago') = CURRENT_DATE('America/Santiago') - 2

"""

# Caching is disabled on purpose. The query text is identical on every run
# while its result depends on the current date, so a cached step would be
# reused on later days and no new images would be downloaded.
job = aiplatform.PipelineJob(display_name = "get_data",
                             template_path = "get-data.json",
                             pipeline_root = PIPELINE_ROOT,
                             parameter_values = {"query":sql},
                             enable_caching = False,
                             project = PROJECT_ID,
                             location = REGION)

# Submit the run without blocking the notebook on its completion.
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/1003479373544/locations/us-west1/pipelineJobs/get-data-20221103180613
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/1003479373544/locations/us-west1/pipelineJobs/get-data-20221103180613')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-west1/pipelines/runs/get-data-20221103180613?project=1003479373544
