# Environment Variables

### Public documentation resources on working with GPUs

[1] https://cloud.google.com/vertex-ai/docs/training/distributed-training#additional-worker-pools

[2] https://cloud.google.com/compute/docs/gpus#a100-gpus

[3] https://cloud.google.com/compute/docs/gpus#h100-gpus

[4] https://cloud.google.com/compute/docs/gpus#t4-gpus

[5] https://cloud.google.com/compute/docs/general-purpose-machines#n1_machine_types

[6] https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types

[7] https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus

[8] https://cloud.google.com/vertex-ai/docs/general/locations#accelerators

[9] https://cloud.google.com/vertex-ai/docs/training/persistent-resource-overview

[10] https://cloud.google.com/vertex-ai/docs/quotas#training

[11] https://cloud.google.com/vertex-ai/docs/training/persistent-resource-create#create-persistent-resource-python

[12] https://cloud.google.com/vertex-ai/docs/training/persistent-resource-train#create_a_training_job_that_runs_on_a_persistent_resource

[13] https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v1/projects.locations.executions/create

[14] https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v1/ExecutionTemplate

[15] https://cloud.google.com/compute/docs/gpus/gpu-regions-zones

[16] https://cloud.google.com/vertex-ai/docs/training/pre-built-containers

---

- gcr.io/deeplearning-platform-release/pytorch-gpu.1-13.py310:latest
  - NVIDIA-SMI `550.54.15`
  - Driver Version: `550.54.15`
  - CUDA Version: `12.4`
  - Torch: `1.13.1+cu117`
  - Torch CUDA arch list (`torch.cuda.get_arch_list()`): `['sm_37', 'sm_50', 'sm_60', 'sm_70', 'sm_75', 'sm_80', 'sm_86']`

- gcr.io/deeplearning-platform-release/pytorch-gpu.2-1.py310:latest
  - NVIDIA-SMI `550.54.15`
  - Driver Version: `550.54.15`
  - CUDA Version: `12.4`
  - Torch: `2.1.0+cu121`
  - Torch CUDA arch list (`torch.cuda.get_arch_list()`): `['sm_50', 'sm_60', 'sm_70', 'sm_75', 'sm_80', 'sm_86', 'sm_90']`

- gcr.io/deeplearning-platform-release/pytorch-gpu.2-2.py310:latest
  - NVIDIA-SMI `550.54.15`
  - Driver Version: `550.54.15`
  - CUDA Version: `12.4`
  
PyTorch compatibility with NVIDIA GPUs:
  
https://developer.nvidia.com/cuda-gpus

https://discuss.pytorch.org/t/gpu-compute-capability-support-for-each-pytorch-version/62434/7?u=daniel_elias

https://discuss.pytorch.org/t/which-pytorch-version-is-compatible-with-h100-gpu-and-cuda-capability-sm-90/202966/2


---

[a] https://cloud.google.com/compute/docs/gpus#h100-gpus

[b] https://cloud.google.com/vertex-ai/docs/training/code-requirements#gpus

[c] https://discuss.pytorch.org/t/rnn-module-weights-are-not-part-of-single-contiguous-chunk-of-memory/6011

[d] https://discuss.pytorch.org/t/difference-between-cuda-0-vs-cuda-with-1-gpu/93080

[e] https://stackoverflow.com/questions/50495053/if-im-not-specifying-to-use-cpu-gpu-which-one-is-my-script-using

[f] https://stackoverflow.com/questions/72610665/in-the-latest-version-of-pytorch-what-is-best-practice-to-get-all-tensors-to-us

In [None]:
# ---------------------------------------------------------------------------
# Pipeline artefact: local file the compiled pipeline definition is written to.
# ---------------------------------------------------------------------------
COMPILED_PIPELINE_JSON = "custom_training_pipeline.json"

# ---------------------------------------------------------------------------
# Project, region and job identity. Replace the PROJECT-ID placeholders.
# ---------------------------------------------------------------------------
PROJECT_ID = "PROJECT-ID"
LOCATION = "us-central1"
STAGING_BUCKET = "gs://training-custom-gpus-unique-PROJECT-ID"
PIPELINE_DISPLAY_NAME = "training-with-gpus-job"
SCRIPT_PATH = "trainer.py"

# ---------------------------------------------------------------------------
# Training container.
# Pre-built images: https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
# Alternatives tried in this notebook:
#   gcr.io/deeplearning-platform-release/pytorch-gpu.1-13.py310:latest
#   gcr.io/deeplearning-platform-release/pytorch-gpu.2-1.py310:latest
#   gcr.io/deeplearning-platform-release/pytorch-gpu.2-2.py310:latest
# ---------------------------------------------------------------------------
CONTAINER_URI = "gcr.io/deeplearning-platform-release/pytorch-gpu.2-1.py310:latest"
CONTAINER_URI_NO_GPU = "python:3.9"

# ---------------------------------------------------------------------------
# Compute shape.
# Machine types:
#   https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types
#   https://cloud.google.com/compute/docs/general-purpose-machines#n1_machine_types
# GPUs:
#   https://cloud.google.com/vertex-ai/docs/training/configure-compute#specifying_gpus
#   https://cloud.google.com/compute/docs/gpus#a100-gpus
#   https://cloud.google.com/compute/docs/gpus#h100-gpus
#   https://cloud.google.com/compute/docs/gpus#t4-gpus
# ---------------------------------------------------------------------------
REPLICA_COUNT = 1
MACHINE_TYPE = "n1-highmem-96"
ACCELERATOR_TYPE = "NVIDIA_TESLA_T4"
ACCELERATOR_COUNT = 4

# ---------------------------------------------------------------------------
# Persistent resource the job is scheduled on, and boot disk settings.
# ---------------------------------------------------------------------------
PERSISTENT_RESOURCE_ID = "cluster-vertex-ai-training-t4-gpus"
BOOT_DISK_TYPE = "pd-standard"
BOOT_DISK_SIZE = 100

# ---------------------------------------------------------------------------
# Demo trainer script: prints the torch version and compiled CUDA arch list,
# then dumps GPU and memory information via nvidia-smi and /proc/meminfo.
# ---------------------------------------------------------------------------
SCRIPT = """
import os

print('hello world')

import torch
print(torch.__version__)
print(torch.cuda.get_arch_list())

os.system('nvidia-smi --query-gpu=compute_cap --format=csv')

os.system('nvidia-smi -L')
os.system('nvidia-smi')
os.system('cat /proc/meminfo')
"""

# Custom Training Job

You can run the training job directly in Vertex AI Training, without needing a Vertex AI Pipeline.

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform

In [None]:
def create_custom_training_job(
    project: str,
    location: str,
    display_name: str,
    staging_bucket: str,
    script_path: str,
    container_uri: str,
    replica_count: int = 0,
    machine_type: str = "n1-standard-4",
    accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
    accelerator_count: int = 0,
    boot_disk_type: str = "pd-ssd",
    boot_disk_size_gb: int = 100,
    persistent_resource_id: str = "",
    script: str = "print('hello world')",
    # Script arguments are not wired through yet; see the `args` parameter of
    # CustomJob.from_local_script if the trainer needs them.
):
    """Write a demo trainer script locally and run it as a Vertex AI CustomJob.

    The function initialises the Vertex AI SDK, writes ``script`` to
    ``script_path`` and then builds and runs a job with
    ``aiplatform.CustomJob.from_local_script``.

    ``CustomJob`` is used because it accepts ``persistent_resource_id``.
    ``CustomTrainingJob`` additionally uploads a model to the Model Registry;
    whether it supports persistent resources has not been confirmed here.
    https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.CustomJob#google_cloud_aiplatform_CustomJob_from_local_script
    https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.CustomTrainingJob#google_cloud_aiplatform_CustomTrainingJob

    Args:
        project: Project passed to ``aiplatform.init`` and to the job.
        location: Region passed to ``aiplatform.init`` and to the job.
        display_name: Display name of the custom job.
        staging_bucket: Cloud Storage bucket passed to ``aiplatform.init``
            and to the job.
        script_path: Local path the trainer script is written to and then
            handed to ``from_local_script``. An existing file is overwritten.
        container_uri: Container image the script runs in.
        replica_count: Number of worker replicas. Values below 1, including
            the default ``0``, are treated as 1, which is the value that was
            previously always used.
        machine_type: Machine type of the worker pool.
        accelerator_type: Accelerator type attached to each replica.
        accelerator_count: Number of accelerators per replica.
        boot_disk_type: Boot disk type of the worker pool.
        boot_disk_size_gb: Boot disk size in GB.
        persistent_resource_id: ID of the persistent resource to run on.
        script: Python source written to ``script_path``.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having run the import first.
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

    # Write the trainer script to be used - this is just for demonstration
    # purposes. The context manager closes the file even if the write fails.
    with open(script_path, "w", encoding="utf-8") as script_file:
        script_file.write(script)

    job = aiplatform.CustomJob.from_local_script(
        project=project,
        location=location,
        display_name=display_name,
        staging_bucket=staging_bucket,
        # The training script needs to be accessible from inside the
        # container that runs the job.
        script_path=script_path,
        container_uri=container_uri,
        # Honour the caller's value; fall back to one replica for 0 or less.
        replica_count=max(replica_count, 1),
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        persistent_resource_id=persistent_resource_id,
        boot_disk_type=boot_disk_type,
        boot_disk_size_gb=boot_disk_size_gb,
    )

    job.run()

In [None]:
#import os

#os.chdir('/home/jupyter')
#os.getcwd()

In [None]:
from google.cloud import aiplatform

# Gather the notebook-level configuration in one mapping, then launch the job.
# To try a CPU-only image instead, set "container_uri" to CONTAINER_URI_NO_GPU.
training_job_settings = {
    "project": PROJECT_ID,
    "location": LOCATION,
    "display_name": PIPELINE_DISPLAY_NAME,
    "staging_bucket": STAGING_BUCKET,
    "script_path": SCRIPT_PATH,
    "container_uri": CONTAINER_URI,
    "replica_count": REPLICA_COUNT,
    "machine_type": MACHINE_TYPE,
    "accelerator_type": ACCELERATOR_TYPE,
    "accelerator_count": ACCELERATOR_COUNT,
    "persistent_resource_id": PERSISTENT_RESOURCE_ID,
    "boot_disk_type": BOOT_DISK_TYPE,
    "boot_disk_size_gb": BOOT_DISK_SIZE,
    "script": SCRIPT,
}

create_custom_training_job(**training_job_settings)

# Pipeline to trigger a Custom Training Job

Use this section if you need to trigger the Vertex AI Training job from a Vertex AI Pipeline.

The Vertex AI Pipeline will also appear as a Vertex AI Training Custom Job

In [None]:
! pip install --quiet kfp

In [None]:
### Pipeline
# https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/pipelines_intro_kfp.ipynb

from kfp import dsl
from typing import Any, Callable
from kfp import compiler

# https://cloud.google.com/vertex-ai/docs/training/pre-built-containers
# Select an image that has aiplatform installed for example 'gcr.io/deeplearning-platform-release/pytorch-gpu.1-13.py310:latest'
# Alternative base image: 'gcr.io/deeplearning-platform-release/pytorch-gpu.1-13.py310:latest'
@dsl.component(base_image='gcr.io/deeplearning-platform-release/pytorch-gpu.2-2.py310:latest')
def create_custom_training_job(
    project: str,
    location: str,
    display_name: str,
    staging_bucket: str,
    script_path: str,
    container_uri: str,
    replica_count: int = 0,
    machine_type: str = "n1-standard-4",
    accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
    accelerator_count: int = 0,
    boot_disk_type: str = "pd-ssd",
    boot_disk_size_gb: int = 100,
    persistent_resource_id: str = "",
    script: str = "print('hello world')",
    # Script arguments are not wired through yet; see the `args` parameter of
    # CustomJob.from_local_script if the trainer needs them.
):
    """Pipeline component that writes a demo trainer script and runs it as a Vertex AI CustomJob.

    The component initialises the Vertex AI SDK, writes ``script`` to
    ``script_path`` and then builds and runs a job with
    ``aiplatform.CustomJob.from_local_script``.

    ``CustomJob`` is used because it accepts ``persistent_resource_id``.
    ``CustomTrainingJob`` additionally uploads a model to the Model Registry;
    whether it supports persistent resources has not been confirmed here.
    https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.CustomJob#google_cloud_aiplatform_CustomJob_from_local_script
    https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.CustomTrainingJob#google_cloud_aiplatform_CustomTrainingJob

    Args:
        project: Project passed to ``aiplatform.init`` and to the job.
        location: Region passed to ``aiplatform.init`` and to the job.
        display_name: Display name of the custom job.
        staging_bucket: Cloud Storage bucket passed to ``aiplatform.init``
            and to the job.
        script_path: Path the trainer script is written to and then handed
            to ``from_local_script``. An existing file is overwritten.
        container_uri: Container image the training script runs in.
        replica_count: Number of worker replicas. Values below 1, including
            the default ``0``, are treated as 1, which is the value that was
            previously always used.
        machine_type: Machine type of the worker pool.
        accelerator_type: Accelerator type attached to each replica.
        accelerator_count: Number of accelerators per replica.
        boot_disk_type: Boot disk type of the worker pool.
        boot_disk_size_gb: Boot disk size in GB.
        persistent_resource_id: ID of the persistent resource to run on.
        script: Python source written to ``script_path``.
    """
    # Imported inside the body, so the function does not rely on any
    # module-level import of aiplatform.
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

    # Write the trainer script to be used - this is just for demonstration
    # purposes. The context manager closes the file even if the write fails.
    with open(script_path, "w", encoding="utf-8") as script_file:
        script_file.write(script)

    job = aiplatform.CustomJob.from_local_script(
        project=project,
        location=location,
        display_name=display_name,
        staging_bucket=staging_bucket,
        # The training script needs to be accessible from inside the
        # container that runs the job.
        script_path=script_path,
        container_uri=container_uri,
        # Honour the caller's value; fall back to one replica for 0 or less.
        replica_count=max(replica_count, 1),
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        persistent_resource_id=persistent_resource_id,
        boot_disk_type=boot_disk_type,
        boot_disk_size_gb=boot_disk_size_gb,
    )

    job.run()

@dsl.pipeline(name=PIPELINE_DISPLAY_NAME + "-pipeline")
def custom_training_pipeline():
    # Single-step pipeline: invoke the create_custom_training_job component
    # with the notebook-level configuration constants.
    create_custom_training_job(
        project=PROJECT_ID,
        location=LOCATION,
        display_name=PIPELINE_DISPLAY_NAME,
        staging_bucket=STAGING_BUCKET,
        script_path=SCRIPT_PATH,
        script=SCRIPT,
        container_uri=CONTAINER_URI,
        persistent_resource_id=PERSISTENT_RESOURCE_ID,
        replica_count=REPLICA_COUNT,
        machine_type=MACHINE_TYPE,
        accelerator_type=ACCELERATOR_TYPE,
        accelerator_count=ACCELERATOR_COUNT,
        boot_disk_type=BOOT_DISK_TYPE,
        boot_disk_size_gb=BOOT_DISK_SIZE,
    )

def compile_pipeline(func: Callable, file_name: str) -> None:
    """Compile the pipeline function ``func`` and write the result to ``file_name``."""
    pipeline_compiler = compiler.Compiler()
    pipeline_compiler.compile(func, file_name)

In [None]:
# Compile the pipeline definition into a local template file.
compile_pipeline(custom_training_pipeline, COMPILED_PIPELINE_JSON)

from google.cloud import aiplatform
# https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/pipelines_intro_kfp.ipynb

# Pass project, location and pipeline root explicitly so this cell does not
# depend on aiplatform.init() having been called by the direct-training
# section, which is optional.
job = aiplatform.PipelineJob(
    display_name=PIPELINE_DISPLAY_NAME,
    template_path=COMPILED_PIPELINE_JSON,
    pipeline_root=STAGING_BUCKET,
    project=PROJECT_ID,
    location=LOCATION,
)

job.run()