# Creating & Uploading python distribution
https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

In [27]:
!python setup.py sdist

running sdist
running egg_info
writing manifest file 'dat_trainer.egg-info/SOURCES.txt'
running check


creating dat-trainer-0.1
creating dat-trainer-0.1/dat_trainer.egg-info
creating dat-trainer-0.1/trainer
creating dat-trainer-0.1/trainer/dat
creating dat-trainer-0.1/trainer/dat/compression
copying README.md -> dat-trainer-0.1
copying setup.py -> dat-trainer-0.1
copying dat_trainer.egg-info/PKG-INFO -> dat-trainer-0.1/dat_trainer.egg-info
copying dat_trainer.egg-info/SOURCES.txt -> dat-trainer-0.1/dat_trainer.egg-info
copying dat_trainer.egg-info/dependency_links.txt -> dat-trainer-0.1/dat_trainer.egg-info
copying dat_trainer.egg-info/requires.txt -> dat-trainer-0.1/dat_trainer.egg-info
copying dat_trainer.egg-info/top_level.txt -> dat-trainer-0.1/dat_trainer.egg-info
copying trainer/__init__.py -> dat-trainer-0.1/trainer
copying trainer/task.py -> dat-trainer-0.1/trainer
copying trainer/dat/__init__.py -> dat-trainer-0.1/trainer/dat
copying trainer/dat/attack.py

In [28]:
#!gcloud storage cp dist/dat_package-0.1.tar.gz gs://dat-package-bucket
!gsutil cp dist/dat-trainer-0.1.tar.gz gs://dat-project-bucket/dat-package/

Copying file://dist/dat-trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [1 files][ 15.9 KiB/ 15.9 KiB]                                                
Operation completed over 1 objects/15.9 KiB.                                     


# Creating CustomJob
https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-gcloud

# Operationalize Distributed Training with PyTorch on Google Cloud
https://www.youtube.com/watch?v=kXbDEmn2XOw

Pre-built-containers:
https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch

GPUs:
https://cloud.google.com/compute/docs/gpus
https://cloud.google.com/compute/gpus-pricing
https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table

In [32]:
#!pip install google-cloud-aiplatform

In [None]:
!pip install python-dotenv

In [30]:
import os

from google.cloud import aiplatform
from dotenv import load_dotenv

load_dotenv()

project_id = 'sdml-dat'
region = 'us-central1' #TODO: change to better region
bucket_name = 'dat-project-bucket'
package_uri = f'gs://{bucket_name}/dat-package/dat-trainer-0.1.tar.gz'

# Initialize Vertex AI SDK
aiplatform.init(project=project_id, location=region, staging_bucket=bucket_name)

API_KEY = os.getenv('TELEGRAM_API_KEY')
CHAT_ID = os.getenv('TELEGRAM_CHAT_ID')

print(API_KEY, CHAT_ID)

environment_variables = {
    'TELEGRAM_API_KEY': API_KEY,
    'TELEGRAM_CHAT_ID': CHAT_ID,
}

# Define worker pool spec
worker_pool_specs =  [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_K80",
            "accelerator_count": 1,
        },
        "replica_count": 1,  # Number of masters
        "python_package_spec": {
            "executor_image_uri": "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest",
            "package_uris": [package_uri],
            "python_module": "trainer.task",
            "args": [
                "--gcloud=True",
                "--dataset-path=/gcs/dat-project-bucket/datasets/",
                "--output-dir=/gcs/dat-project-bucket/dat-outputs/",
                "--batch-size=512",
                "--dataset=cifar",
                "--dist-backend=nccl",
                "--num-epochs=2",
                "--eval-epochs=1",
            ],
            "env": [
                    {'name': name, 'value': value}
                    for name, value in environment_variables.items()
            ],
        },
    }
    ,
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_K80",
            "accelerator_count": 1,
        },
        "replica_count": 1,  # Number of workers
        "python_package_spec": {
            "executor_image_uri": "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest",
            "package_uris": [package_uri],
            "python_module": "trainer.task",
            "args": [
                "--gcloud=True",
                "--dataset-path=/gcs/dat-project-bucket/datasets/",
                "--output-dir=/gcs/dat-project-bucket/dat-outputs/",
                "--batch-size=512",
                "--dataset=cifar",
                "--dist-backend=nccl",
                "--num-epochs=2",
                "--eval-epochs=1",
            ],
            "env": [
                    {'name': name, 'value': value}
                    for name, value in environment_variables.items()
            ],
        },
    },
]

# Define job name
job_name = 'dat-custom-gpu-pytorch-job'

# Create a custom job
job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

# Launch the job
job.run(sync=True)

print(f"Job {job_name} has been launched.")

Creating CustomJob
CustomJob created. Resource name: projects/65830871515/locations/us-central1/customJobs/384304465236197376
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/65830871515/locations/us-central1/customJobs/384304465236197376')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/384304465236197376?project=65830871515
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 cur

CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING


CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_RUNNING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_RUNNING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_RUNNING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_RUNNING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_RUNNING
CustomJob projects/65830871515/locations/us-central1/customJobs/384304465236197376 current state:
JobState.JOB_STATE_FAILED


RuntimeError: Job failed with:
code: 3
message: "The replica workerpool1-0 exited with a non-zero status of 1. Termination reason: Error. \nTraceback (most recent call last):\n  File \"/opt/conda/lib/python3.10/runpy.py\", line 196, in _run_module_as_main\n    return _run_code(code, main_globals, None,\n  File \"/opt/conda/lib/python3.10/runpy.py\", line 86, in _run_code\n    exec(code, run_globals)\n  File \"/root/.local/lib/python3.10/site-packages/trainer/task.py\", line 17, in <module>\n    from trainer.dat.helpers import send_telegram_message\n  File \"/root/.local/lib/python3.10/site-packages/trainer/dat/helpers.py\", line 4, in <module>\n    API_KEY = os.environ[\'TELEGRAM_API_KEY\']\n  File \"/opt/conda/lib/python3.10/os.py\", line 680, in __getitem__\n    raise KeyError(key) from None\nKeyError: \'TELEGRAM_API_KEY\'\n\nTo find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=65830871515&resource=ml_job%2Fjob_id%2F384304465236197376&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22384304465236197376%22"
