In [None]:
import os

In [None]:
BUCKET = 'gs://edml'
os.environ['BUCKET'] = BUCKET

In [None]:
!python --version

In [None]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

## Local training / Python

In [None]:
%%bash
export PYTHONPATH=${PYTHONPATH}:~/event-driven-ml/edml-trainer
echo $PYTHONPATH

In [10]:
%%bash
echo "bucket=${BUCKET}"
rm -rf model_trained_test
export PYTHONPATH=${PYTHONPATH}:~/event-driven-ml/edml-trainer
python3 -m trainer.task \
  --bucket=${BUCKET} \
  --output-dir=model_trained_test \
  --job-dir=./tmp \
  --train-steps=100 \
  --nembeds 10 \
  --nnsize 10 5 \
  --eval-steps=2

bucket=


2020-04-22 16:44:56.448098: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
INFO:tensorflow:Using config: {'_model_dir': 'model_trained_test', '_tf_random_seed': 2810, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 300, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f29e96a7c50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': Tr

## Local prediction

In [None]:
%%writefile inputs.json
{"uuid": "b1", "dayofweek": 6, "hourofday": 7, "weekofyear": 48, "pickup_zone_name": "World Trade Center", "dropoff_zone_name": "Newark Airport", "passenger_count": 1, "distance": 18000.0}
{"uuid": "g1", "dayofweek": 3, "hourofday": 23, "weekofyear": 27, "pickup_zone_name": "World Trade Center", "dropoff_zone_name": "Times Sq/Theatre District", "passenger_count": 1, "distance": 3400.0}

I had this [issue](https://github.com/GoogleCloudPlatform/cloudml-samples/issues/415). Following instructions from [stackoverflow](https://stackoverflow.com/questions/48824381/gcloud-ml-engine-local-predict-runtimeerror-bad-magic-number-in-pyc-file)

First run command `gcloud ai-platform local predict` with `--verbosity debug` to find the correct path, then delete `*.pyc` files thes launch local train again.

```
%%bash
sudo find /usr/lib/google-cloud-sdk/lib/googlecloudsdk/command_lib/ml_engine -name '*.pyc' -delete
```

In [None]:
%%bash
MODEL_LOCATION=$(ls -d $(pwd)/model_trained_test/export/exporter/* | tail -1)
echo $MODEL_LOCATION
gcloud ai-platform local predict --model-dir=$MODEL_LOCATION --json-instances=inputs.json

## Local training / Gcloud

In [2]:
%%bash
gcloud ai-platform --help

NAME
    gcloud ai-platform - manage AI Platform jobs and models

SYNOPSIS
    gcloud ai-platform GROUP | COMMAND [GCLOUD_WIDE_FLAG ...]

DESCRIPTION
    The gcloud ai-platform command group lets you manage AI Platform jobs and
    training models.

    AI Platform is a managed service that enables you to easily build machine
    learning models, that work on any type of data, of any size. Create your
    model with the powerful TensorFlow framework that powers many Google
    products, from Google Photos to Google Cloud Speech.

    More information on AI Platform can be found here:
    https://cloud.google.com/ml and detailed documentation can be found here:
    https://cloud.google.com/ml/docs/

GCLOUD WIDE FLAGS
    These flags are available to all commands: --account, --billing-project,
    --configuration, --flags-file, --flatten, --format, --help,
    --impersonate-service-account, --log-http, --project, --quiet,
    --trace-token, --user-output-enabled, --verbosity.

    Run $ 

In [None]:
%%bash
echo $PATH

In [None]:
#    --package-path=~/event-driven-ml/edml-trainer \

In [9]:
%%bash
rm -rf model_trained_test
export PYTHONPATH=${PYTHONPATH}:~/event-driven-ml/edml-trainer:/usr/bin/python
gcloud ai-platform local train \
   --module-name=trainer.task \
   --package-path=../../edml-trainer/ \
   --verbosity=debug \
   -- \
   --train-steps=1000 \
   --output-dir=model_trained_test \
   --eval-steps=1

DEBUG: Running [gcloud.ai-platform.local.train] with arguments: [--module-name: "trainer.task", --package-path: "../../edml-trainer/", --verbosity: "debug"]
INFO: launching training process:
command: /opt/conda/bin/python -m trainer.task --train-steps=1000 --output-dir=model_trained_test --eval-steps=1
 config: {
  "cluster": {}, 
  "environment": "cloud", 
  "job": {
    "args": [
      "--train-steps=1000", 
      "--output-dir=model_trained_test", 
      "--eval-steps=1"
    ], 
    "job_name": "trainer.task"
  }, 
  "task": {}
}
1.15.2
, using the default primary node name, aka "chief" for cluster settings
INFO: Display format: "default"
DEBUG: Exception captured in Commands
Traceback (most recent call last):
  File "/usr/lib/google-cloud-sdk/lib/googlecloudsdk/core/metrics.py", line 631, in Wrapper
    return func(*args, **kwds)
  File "/usr/lib/google-cloud-sdk/lib/googlecloudsdk/core/metrics.py", line 764, in Commands
    error_extra_info_json=_GetErrorExtraInfo(error_extra_info

CalledProcessError: Command 'b'rm -rf model_trained_test\nexport PYTHONPATH=${PYTHONPATH}:~/event-driven-ml/edml-trainer:/usr/bin/python\ngcloud ai-platform local train \\\n   --module-name=trainer.task \\\n   --package-path=../../edml-trainer/ \\\n   --verbosity=debug \\\n   -- \\\n   --train-steps=1000 \\\n   --output-dir=model_trained_test \\\n   --eval-steps=1\n'' returned non-zero exit status 1.

In [None]:
%%bash
gsutil cp -r ~/event-driven-ml/edml-trainer/* gs://edml/ai-platform/edml-trainer/