In [1]:
import os
import re
from IPython.core.display import display, HTML
from datetime import datetime
import mlflow
import pymysql

In [2]:
# Jupyter cell magic that writes the cell body to a file after substituting notebook variables.
# The body goes through str.format, so literal curly braces inside the cell must be doubled.
from IPython.core.magic import register_line_cell_magic
@register_line_cell_magic
def writetemplate(line, cell=None):
    """Render the cell body with the notebook globals and write it to a file.

    Args:
        line: Text following the magic name; used as the output file path.
        cell: Cell body, used as a ``str.format`` template. ``None`` when the
            magic is invoked in line mode (``%writetemplate``).

    Raises:
        ValueError: If the magic is invoked without a cell body.
        KeyError: If the template names a variable that is not in ``globals()``.
    """
    if cell is None:
        # Registered as a line *and* cell magic, so IPython may call it without a body.
        raise ValueError("writetemplate must be used as a cell magic: %%writetemplate <path>")
    # Strip stray whitespace around the path so it does not end up in the file name.
    with open(line.strip(), 'w') as f:
        f.write(cell.format(**globals()))

  and should_run_async(code)


In [3]:
experiment_name = "chicago-taxi"
mlflow.set_experiment(experiment_name)

mlflow_tracking_uri = mlflow.get_tracking_uri()
MLFLOW_TRACKING_EXTERNAL_URI = os.environ["MLFLOW_TRACKING_EXTERNAL_URI"]
MLFLOW_EXPERIMENTS_URI = os.environ["MLFLOW_EXPERIMENTS_URI"]
training_artifacts_uri = MLFLOW_EXPERIMENTS_URI+experiment_name
REGION=os.environ["MLOPS_REGION"]
ML_IMAGE_URI = os.environ["ML_IMAGE_URI"]
COMPOSER_NAME = os.environ["MLOPS_COMPOSER_NAME"]
MLFLOW_GCS_ROOT_URI = os.environ["MLFLOW_GCS_ROOT_URI"]

print(f"Cloud Composer instance name: {COMPOSER_NAME}")
print(f"Cloud Composer region: {REGION}")
print(f"MLflow tracking server URI: {mlflow_tracking_uri}")
print(f"MLflow articfacts store root: {MLFLOW_EXPERIMENTS_URI}")
print(f"GCS root: {MLFLOW_GCS_ROOT_URI}")

experiment_path = MLFLOW_EXPERIMENTS_URI.replace("gs://","")
display(HTML('<hr>You can check results of this test in MLflow and GCS folder:'))
display(HTML(f'<h4><a href="{MLFLOW_TRACKING_EXTERNAL_URI}" rel="noopener noreferrer" target="_blank">Click to open MLflow UI</a></h4>'))
display(HTML(f'<h4><a href="https://console.cloud.google.com/storage/browser/{experiment_path}" rel="noopener noreferrer" target="_blank">Click to open GCS folder</a></h4>'))

!mkdir -p ./package/training
!touch ./package/training/__init__.py

Cloud Composer instance name: mlops-43-af
Cloud Composer region: us-central1
MLflow tracking server URI: http://127.0.0.1:80
MLflow articfacts store root: gs://mlops-43-artifacts/experiments
GCS root: gs://mlops-43-artifacts


In [4]:
%%writefile ./package/setup.py
from setuptools import find_packages, setup

# Pinned runtime dependencies installed together with the trainer package.
REQUIRED_PACKAGES = [
    'mlflow==1.8.0',
    'PyMySQL==0.9.3',
]

# Package metadata; every package found next to this file is included.
setup(
    name='trainer',
    version='0.1',
    description='Customer training setup.',
    packages=find_packages(),
    include_package_data=True,
    install_requires=REQUIRED_PACKAGES,
)

Overwriting ./package/setup.py


In [5]:
%%writetemplate ./package/training/task.py

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.linear_model import LogisticRegression
import sys, stat
import argparse
import os

def train_model(args):
    """Train a placeholder model and record its score and artifact in MLflow.

    Args:
        args: Parsed command line arguments (currently unused).
    """
    print("Taxi fare estimation model training step started...")
    with mlflow.start_run(nested=True):
        # TODO: replace this toy dataset and model with the real taxi fare estimator.
        # Until then a tiny LogisticRegression keeps the step runnable end to end
        # instead of failing on the previously undefined names `score` and `lr`.
        X = np.array([-2, -1, 0, 1, 2, 1]).reshape(-1, 1)
        y = np.array([0, 0, 1, 1, 1, 0])
        lr = LogisticRegression()
        lr.fit(X, y)
        score = lr.score(X, y)
        mlflow.log_metric("score", score)
        mlflow.sklearn.log_model(lr, "model")
    print("Training finished.")

def main():
    """Parse the job arguments, point MLflow at the local server and run training."""
    cli_args = sys.argv[1:]
    print("Training arguments: " + " ".join(cli_args))

    # Only these options are recognised; anything else is tolerated and ignored.
    parser = argparse.ArgumentParser()
    for option, option_type in (("--epochs", int), ("--job-dir", str), ("--local_data", str)):
        parser.add_argument(option, type=option_type)
    args, _ = parser.parse_known_args()

    # CLOUD_ML_JOB contains other CAIP Training runtime parameters as a JSON object
    # job = os.environ["CLOUD_ML_JOB"]

    # MLflow is reachable locally
    mlflow.set_tracking_uri("http://127.0.0.1:80")
    mlflow.set_experiment("{experiment_name}")

    print("Training main started")
    train_model(args)

    # If --job-dir is provided in the 'ai-platform jobs submit' command, any training result can be uploaded there
    # if args.job_dir:
    # args.local_data, args.job_dir

# Run the training entry point only when this module is executed as a script.
if __name__ == "__main__":
    main()

In [6]:
submit_time = datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_NAME=f"training_job_{submit_time}"
JOB_DIR=f"{training_artifacts_uri}/training_{submit_time}"
print(f"Training job name: '{JOB_NAME}' will run in {REGION} region using image from:\n {ML_IMAGE_URI}\n")
!gcloud composer environments storage data import \
    --environment {COMPOSER_NAME} \
    --location {REGION} \
    --source ./package \
    --destination dual_model_trainer_dag


Training job name: 'training_job_20201020_084356' will run in us-central1 region using image from:
 gcr.io/edgeml-demo/mlops-43-mlimage:latest



In [7]:
%%writetemplate dual_model_trainer_dag.py
# Constants below are filled in from notebook variables when this cell is rendered,
# so the generated DAG file carries fixed values.
# GCS folder where dataset CSV files are stored
DATASET_GCS_FOLDER = "{MLFLOW_GCS_ROOT_URI}/data"
# Region value taken from the notebook configuration
REGION="{REGION}"
# Root location of the MLflow experiments
MLFLOW_EXPERIMENTS_URI="{MLFLOW_EXPERIMENTS_URI}"

# Output folder and name of the training job, used by the gcloud submit command later in this file.
# NOTE(review): both are fixed at render time, so every DAG run reuses the same job name - confirm this is intended.
JOB_DIR="{JOB_DIR}"
JOB_NAME="{JOB_NAME}"

In [10]:
%%writefile -a dual_model_trainer_dag.py

import os
import datetime
import logging
from datetime import timedelta
import tensorflow_data_validation as tfdv
import airflow
from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator

# Working with Airflow 1.10.10 version!
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator

# NOTE(review): default_args is not passed to the DAG(...) call in this file,
# so these settings currently have no effect - confirm whether that is intended.
default_args = dict(retries=1,start_date=airflow.utils.dates.days_ago(0))
INTERVAL = "@once"
START_DATE = datetime.datetime(2020, 9, 1)

PROJECT_ID = os.getenv("GCP_PROJECT", "edgeml-demo")
# Prefer the environment's location; otherwise keep the region written at the top
# of this file by the notebook instead of the invalid region name "us-central".
REGION = os.getenv("COMPOSER_LOCATION", REGION)

# Postfixes for temporary BQ tables and output CSV files
TRAINING_POSTFIX = "_training"
EVAL_POSTFIX = "_eval"
VALIDATION_POSTFIX = "_validation"

BQ_DATASET = "chicago_taxi_trips"
BQ_TABLE = "taxi_trips"
# The trailing {} placeholder receives the hash-bucket range that selects one split.
BQ_QUERY = """
    SELECT unique_key, taxi_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles, pickup_census_tract, 
        dropoff_census_tract, pickup_community_area, dropoff_community_area, fare, tips, tolls, extras, trip_total, 
        payment_type, company, pickup_latitude, pickup_longitude, pickup_location, dropoff_latitude, dropoff_longitude, dropoff_location
    FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` 
    WHERE
      dropoff_latitude IS NOT NULL and
      dropoff_longitude IS NOT NULL and
      dropoff_location  IS NOT NULL and
      MOD(ABS(FARM_FINGERPRINT(unique_key)), 100) {}
    LIMIT 100
    """

# Shell command that submits the AI Platform training job.
# NOTE(review): --master-image-uri receives JOB_DIR (a GCS path), which looks like a
# mistake; it presumably should be the training image URI, which is not defined in
# this file - confirm and export it from the notebook template.
# NOTE(review): no task in the visible DAG uses `command` yet.
command=f"""gcloud ai-platform jobs submit training {JOB_NAME} \
    --region {REGION} \
    --scale-tier BASIC \
    --job-dir {JOB_DIR} \
    --package-path /home/airflow/gcs/data/dual_model_trainer_dag/package/training/ \
    --module-name training.task \
    --master-image-uri {JOB_DIR} \
    -- \
    --mlflowuri {MLFLOW_EXPERIMENTS_URI} \
    --epochs 2"""

def generate_tfdv_statistics(gcs_file_name, delimiter="|", **context):
    """Compute TFDV statistics for one exported CSV file.

    Args:
        gcs_file_name: Location of the CSV file to analyse.
        delimiter: Field separator of the CSV file. Defaults to "|", the
            separator used by this DAG's BigQuery-to-GCS export tasks.
        **context: Extra keyword arguments, ignored. Accepting them keeps the
            callable usable when the operator is created with
            provide_context=True and passes the task context as keywords.

    Returns:
        None.
    """
    logging.info("Processing %s", gcs_file_name)
    # The delimiter must match the export; TFDV would otherwise split on ",".
    tfdv.generate_statistics_from_csv(gcs_file_name, delimiter=delimiter)
    # TODO: persist the statistics, e.g. next to the CSV as gcs_file_name + ".tfrecord"
    return None

with DAG("dual_model_trainer",
         description = "Train evaluate and validate two models on taxi fare dataset. Select the best one and register it to Mlflow v0.02",
         schedule_interval = INTERVAL,
         start_date = START_DATE,
         catchup = False,
         doc_md = __doc__
         ) as dag:
    
    # Each split keeps the rows whose hash bucket (0..99) falls into its range.
    # SQL BETWEEN includes both bounds, so neighbouring ranges must not share a
    # boundary value; otherwise the same rows would land in two splits.
    tasks = [{
            "postfix" : "training",
            "dataset_range" : "between 0 and 79"
        },{
            "postfix" : "eval",
            "dataset_range" : "between 80 and 94"
        },{
            "postfix" : "validation",
            "dataset_range" : "between 95 and 99"
        }]
    
    # Define task list
    for task in tasks:
        logging.info("task: %s", task)
        postfix = task.get("postfix")
        # Note: fixed table names cause a race condition when the DAG is triggered before the previous run finished.
        table_name = f"{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}_{postfix}"
        gcs_file_name = f"{DATASET_GCS_FOLDER}/ds_{postfix}.csv"
        
        # Deletes the temporary table left by a previous run
        task["delete_table"] = BigQueryTableDeleteOperator(
            task_id = "delete_table_" + postfix,
            deletion_dataset_table = table_name,
            ignore_if_missing = True)

        # Copies the 'dataset_range' segment of the source BQ table into its own table
        task["split_table"] = BigQueryOperator(
            task_id = "split_table_" + postfix,
            use_legacy_sql=False,
            destination_dataset_table = table_name,
            sql = BQ_QUERY.format(task["dataset_range"]),
            location = REGION)
        
        # Extracts the split table to a CSV file in GCS
        task["extract_to_gcs"] = BigQueryToCloudStorageOperator(
            task_id = "extract_to_gcs_" + postfix,
            source_project_dataset_table = table_name,
            destination_cloud_storage_uris = [gcs_file_name],
            field_delimiter = '|')
        
        # Generates statistics with TFDV.
        # provide_context is deliberately not set: it would pass the whole task
        # context as extra keyword arguments to the callable.
        task["tfdv_statisctics"] = PythonOperator(
            task_id = "tfdv_statistics_for_" + postfix,
            python_callable = generate_tfdv_statistics,
            op_kwargs={'gcs_file_name': gcs_file_name})

    # Wire the tasks of each split into a linear chain
    for task in tasks:
        task["delete_table"] >> task["split_table"] >> task["extract_to_gcs"] >> task["tfdv_statisctics"]
    
    # Train two models (two separate AI Platform Training Jobs) (PythonOperator)
    #  Input: data in GCS
    #  Output: model1.joblib model2.joblib
    #  Note: eval metric (one eval split) is stored in MLflow

    # Evaluate the previous model on the current eval split
    #  Input: experiment Id (fetch the last (registered) model)
    #  Output: eval stored in MLflow for the previous model

    # Validate the model (PythonOperator)
    #  Input: MLflow metric
    #  Output: which model (path) to register

    # Register the model (PythonOperator) 
    #  Input: Path of the winning model
    #  Output: Model in specific GCS location

Appending to dual_model_trainer_dag.py


  and should_run_async(code)


In [9]:
!gcloud composer environments storage dags import \
  --environment {COMPOSER_NAME}  \
  --location {REGION} \
  --source dual_model_trainer_dag.py