# Classifying your own images using transfer learning and Google Cloud ML Engine
---
## Introduction
This notebook classifies a new dataset of images using *transfer learning* on *Google Cloud Machine Learning Engine*.

It is based on the following GitHub repository: https://github.com/amygdala/tensorflow-workshop.git

The notebook is intended to be executed from inside the *__tensorflow-workshop/workshop_sections/transfer_learning/cloudml/__* directory.

## Setup

In [23]:
# Notebook-wide settings; every later cell reads these names.

# Used in the bucket name, the pre-processing script name (<project_name>_preproc.sh),
# the training job ID and the deployed model name.
project_name = "pathomatic"
# Used in the GCS paths and the training job ID; passed as USER to the pre-processing script.
user_name = "bardi"
# Version name handed to model.sh when deploying.
model_version = "v1"
# True: submit a Cloud ML Engine training job. False: run `gcloud ml-engine local train`.
train_on_cloud = True
# True: predict with the deployed model. False: run `gcloud ml-engine local predict`.
predict_on_cloud = True
# True: do not run the pre-processing script and read data from gs://asl_project/preproc instead.
skip_preproc = True
# True: add `--config hp_config.yaml` to the cloud training job.
optimize_hyper_parameters = False
# Passed to the trainer as --model_type; also part of the model name and the prediction image file name.
model_type = "multi_resolution" # Supported: "baseline" or "multi_resolution"

In [None]:
# General imports
from datetime import datetime

# Helper function for printing out streaming subprocess output
import subprocess
import sys
def exec_subprocess(cmd):
  """Run a shell command and echo its combined stdout/stderr as it is produced.

  Args:
    cmd: Command line to execute. It is run through the shell (shell=True),
      so only pass trusted strings.

  Returns:
    None. The child's output is written to sys.stdout line by line.
  """
  proc = subprocess.Popen(
      cmd,
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT,
      shell=True,
      # Text mode: readline() yields str (not bytes) on both Python 2 and 3.
      universal_newlines=True)
  try:
    # readline() returns '' only at EOF, so reading until then drains every
    # line (including output written just before the process exits) and
    # keeps each line's own line break.
    for line in iter(proc.stdout.readline, ''):
      sys.stdout.write(line)
      sys.stdout.flush()
  finally:
    proc.stdout.close()
    # Reap the child so it does not linger as a zombie process.
    proc.wait()

In [None]:
# Retrieve the Project ID
project_id_rd = !gcloud config list project --format "value(core.project)"
project_id = project_id_rd.fields()[0][0]
print ("Project ID: %s" % project_id)

In [None]:
# Google Cloud Storage bucket used by this notebook,
# named after the gcloud project ID and the project name.
bucket_name_parts = (project_id, project_name)
bucket = "gs://%s-%s-ml" % bucket_name_parts
print ("Bucket name: %s" % (bucket,))

## Pre-processing

In [None]:
# Timestamp identifying this pre-processing run (handed to the pre-processing script as DATE).
# Dashes are used as separators because Dataflow doesn't like underscores.
preproc_started_at = datetime.now()
timestamp_preproc = preproc_started_at.strftime("%Y%m%d-%H%M%S")
print("Time stamp: %s" % (timestamp_preproc,))

In [None]:
# Run the project's pre-processing script, unless pre-processing is skipped.
# The script is first made executable, then started with USER and DATE set
# in its environment and the bucket as its only argument.
if not skip_preproc:
  preproc_commands = (
    "chmod a+x ./%s_preproc.sh" % project_name,
    "USER=%s DATE=%s ./%s_preproc.sh %s" % (user_name, timestamp_preproc, project_name, bucket),
  )
  for preproc_command in preproc_commands:
    exec_subprocess(preproc_command)

In [None]:
# Location of the pre-processed data:
# a fixed, previously generated dataset when pre-processing is skipped,
# otherwise the per-user, per-run folder inside the bucket.
gcs_path_preproc = (
  "gs://asl_project/preproc"
  if skip_preproc
  else "%s/%s/preproc/%s" % (bucket, user_name, timestamp_preproc)
)
print ("Google Cloud Storage pre-processing path: %s" % (gcs_path_preproc,))

## Training

In [25]:
# Build the training Job ID from project name, user name and start time.
# Every dash in the resulting ID is replaced by an underscore.
timestamp_training = datetime.now().strftime("%Y%m%d-%H%M%S")
job_id_with_dashes = "%s_%s_%s" % (project_name, user_name, timestamp_training)
job_id = job_id_with_dashes.replace('-', "_")
print ("Job ID: %s" % (job_id,))

Job ID: pathomatic_bardi_20170804_171019


In [None]:
# Per-user, per-run folder in the bucket that receives the training output.
train_path_parts = (bucket, user_name, timestamp_training)
gcs_path_train = "%s/%s/train/%s" % train_path_parts
print ("Google Cloud Storage training path: %s" % (gcs_path_train,))

In [None]:
# Extra gcloud flag that enables hyper parameter tuning for the cloud job;
# left empty when tuning is disabled.
config_hp = " --config hp_config.yaml" if optimize_hyper_parameters else ""

In [None]:
# Run the training, either on CLOUD or locally
# ============================================
#
# Both variants output summary and model checkpoint information under <gcs_path>/training.
#
# Cloud training
# --------------
# If --package-path /my/code/path/trainer is specified and there is a setup.py file
# at /my/code/path/setup.py then that file will be invoked with sdist and the generated tar files
# will be uploaded to Cloud Storage. Otherwise a temporary setup.py file will be generated for the build.
#
# See https://cloud.google.com/sdk/gcloud/reference/ml-engine/jobs/submit/training
#
# The scale-tier story:
# > Options are: BASIC, BASIC_GPU, STANDARD_1, PREMIUM_1 or CUSTOM
# > By default there are 25 ML units available. A PREMIUM_1 scale-tier however requires 75 ML units.
# > To speed-up training we've requested an upgrade to 100 ML units
# > using the 'Cloud Machine Learning Engine Quota Request form' on https://cloud.google.com/ml-engine/quotas
#
# Reference for related gcloud flags (not all of them are used below):
#
# --config=CONFIG  (only added when hyper parameter tuning is enabled)
# > Path to the job configuration file. The file should be a YAML document (JSON also accepted)
# > containing a Job resource as defined in the API (all fields are optional)
# > https://cloud.google.com/ml/reference/rest/v1/projects.jobs
# > If an option is specified both in the configuration file and via command line arguments,
# > the command line arguments override the configuration file.
#
# --job-dir=JOB_DIR
# > A Google Cloud Storage path in which to store training outputs and other data needed for training.
# > This path will be passed to your TensorFlow program as --job_dir command-line arg.
# > The benefit of specifying this field is that Cloud ML Engine will validate the path for use in training.
# > If packages must be uploaded and --staging-bucket is not provided, this path will be used instead.
#
# --packages=[PACKAGE,...]
# > Path to Python archives used for training. These can be local paths (absolute or relative),
# > in which case they will be uploaded to the Cloud Storage bucket given by --staging-bucket,
# > or Cloud Storage URLs (gs://bucket-name/path/to/package.tar.gz).
#
# --staging-bucket=STAGING_BUCKET  (used below)
# > Bucket in which to stage training archives.
# > Required only if a file upload is necessary (that is, other flags include local paths)
# > and no other flags implicitly specify an upload path.
#
# --stream-logs
# > Block until job completion and stream the logs while the job runs.
# > Note that even if command execution is halted, the job will still run until cancelled with
# > `gcloud ml-engine jobs cancel <job_id>`.
#
# Local training
# --------------
# Note that max_steps is configured much lower.
# This is because local training is typically used for initial checks.
# Once local training is working, we can switch to cloud training.
#
# In both commands everything after the bare " --" is passed on to trainer.task.
if train_on_cloud:
  cloud_train_cmd = "".join([
    "gcloud ml-engine jobs submit training %s" % job_id,
    " --module-name trainer.task",
    " --package-path trainer",
    " --staging-bucket %s" % bucket,
    " --region us-central1",
    " --runtime-version 1.2",
    " --scale-tier PREMIUM_1",
    config_hp,
    " --",
    " --output_path %s" % (gcs_path_train + "/training"),
    " --eval_data_paths %s" % (gcs_path_preproc + "/eval*"),
    " --train_data_paths %s" % (gcs_path_preproc + "/train*"),
    " --eval_set_size 474",
    " --eval_batch_size 75",
    " --classifier_label_count 2",
    " --max_steps 10000",
    " --model_type %s" % model_type,
  ])
  exec_subprocess(cloud_train_cmd)
else:
  local_train_cmd = "".join([
    "gcloud ml-engine local train",
    " --module-name trainer.task",
    " --package-path trainer",
    " --",
    " --output_path %s" % (gcs_path_train + "/training"),
    " --eval_data_paths %s" % (gcs_path_preproc + "/eval*"),
    " --train_data_paths %s" % (gcs_path_preproc + "/train*"),
    " --eval_set_size 474",
    " --eval_batch_size 25",
    " --classifier_label_count 2",
    " --max_steps 10",
    " --model_type %s" % model_type,
  ])
  exec_subprocess(local_train_cmd)

In [None]:
# Monitor the training
# Logs can only be streamed for a job that was submitted to Cloud ML Engine.
# With local training no job named `job_id` was submitted, so the call is skipped.
if train_on_cloud:
  exec_subprocess("gcloud ml-engine jobs stream-logs %s" % (job_id))
else:
  print ("Training ran locally; there is no Cloud ML Engine job to stream logs from.")

In [None]:
# Inspect the training results in TensorBoard.
# The value returned by start() is kept in `pid`; it is what TensorBoard.stop() is called with later.
from google.datalab.ml import TensorBoard
tensorboard_logdir = "%s/training" % (gcs_path_train,)
pid = TensorBoard.start(tensorboard_logdir)

In [None]:
# List the TensorBoard instances that are currently running
TensorBoard.list()

In [None]:
# Execute this cell to stop the previously started TensorBoard process
# (`pid` is the value returned by TensorBoard.start()).
TensorBoard.stop(pid)

## Deployment

In [28]:
# Prepare the deployment of the model.
# The deployment script will give an error if the model already exists, but this is expected and OK.
exec_subprocess("chmod a+x ./model.sh")
model_name = "%s_%s" % (project_name, model_type)

# After hyper parameter tuning, describe the training job to check the relevant task id.
if optimize_hyper_parameters:
  describe_job_cmd = "gcloud ml-engine jobs describe %s" % job_id
  exec_subprocess(describe_job_cmd)


createTime: '2017-08-03T23:26:32Z'
endTime: '2017-08-04T02:43:16Z'
jobId: pathomatic_bardi_20170803_232452
startTime: '2017-08-03T23:26:35Z'
state: SUCCEEDED
trainingInput:
  args:
  - --output_path
  - gs://asl_project/bardi/train/20170803-232452/training
  - --eval_data_paths
  - gs://asl_project/preproc/eval*
  - --train_data_paths
  - gs://asl_project/preproc/train*
  - --eval_set_size
  - '474'
  - --eval_batch_size
  - '25'
  - --classifier_label_count
  - '2'
  - --max_steps
  - '100000'
  - --model_type=baseline
  hyperparameters:
    goal: MAXIMIZE
    maxParallelTrials: 1
    maxTrials: 10
    params:
    - maxValue: 200.0
      minValue: 50.0
      parameterName: batch_size
      scaleType: UNIT_LINEAR_SCALE
      type: INTEGER
  packageUris:
  - gs://asl_project/pathomatic_bardi_20170803_232452/71c8cb0ea5796d0fb889631b2fbc8321ae69304ef5e712dcf2a106009e388f1f/trainer-0.1.tar.gz
  pythonModule: trainer.task
  region: us-central1
  runtimeVersion: '1.2'
  scaleTier: PREMIUM_1


In [33]:
# Pick the folder that holds the exported model and deploy it with model.sh.
# model.sh receives this folder, the version name and the model name.
#
# The trainer is started with --output_path <gcs_path_train>/training, and local
# prediction reads the model from <gcs_path_train>/training/model, so the deploy
# path has to include the "training" folder in both branches.
if optimize_hyper_parameters:
  task_id = 3 #change to the task id with the best performance
  # With hyper parameter tuning the chosen task's folder below "training" is deployed.
  gcs_path_deploy = "%s/training/%s" % (gcs_path_train, task_id)
else:
  gcs_path_deploy = "%s/training" % gcs_path_train
print(gcs_path_deploy)
exec_subprocess("./model.sh %s %s %s" % (gcs_path_deploy, model_version, model_name))

gs://asl_project/bardi/train/20170803-232452/training/3

Using GCS_PATH:  gs://asl_project/bardi/train/20170803-232452/training/3
Using VERSION_NAME:  v1
Using MODEL NAME:  test

# Tell CloudML about a new type of model coming.  Think of a "model" here as
# a namespace for deployed Tensorflow graphs.  This will give an error
# if the model already exists.
gcloud ml-engine models create "$MODEL_NAME" --regions us-central1

set -e

# Each unique Tensorflow graph--with all the information it needs to execute--
# corresponds to a "version".  Creating a version actually deploys our
# Tensorflow graph to a Cloud instance, and gets it ready to serve (predict).
# This will give an error if the version name already exists.
gcloud ml-engine versions create "$VERSION_NAME" \
  --model "$MODEL_NAME" \
  --origin "${GCS_PATH}/model"
Creating version (this might take a few minutes)......
.......................................................................................................done.

# M

In [None]:
# Get a list of deployed models (useful to confirm that the deployment above succeeded)
!gcloud ml-engine models list

## Inference

In [None]:
# Convert the sample image for the selected model type into request.json,
# the input file used for the prediction request.
images_to_json_cmd = "python images_to_json.py -o request.json ./prediction_images/%s.png" % (model_type,)
exec_subprocess(images_to_json_cmd)

In [None]:
# Run predictions on the images in request.json:
# either against the deployed model (by name) or locally against the
# model folder below the training output path.
if predict_on_cloud:
  predict_cmd = "gcloud ml-engine predict --model %s --json-instances request.json " % model_name
else:
  predict_cmd = "gcloud ml-engine local predict --model-dir %s/training/model --json-instances request.json " % gcs_path_train
exec_subprocess(predict_cmd)

In [None]:
# If needed, run the following to update gcloud
#!yes | gcloud components update