# Creating & uploading the Python source distribution
https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

In [1]:
!python setup.py sdist

running sdist
running egg_info
writing dat_trainer.egg-info\PKG-INFO
writing dependency_links to dat_trainer.egg-info\dependency_links.txt
writing requirements to dat_trainer.egg-info\requires.txt
writing top-level names to dat_trainer.egg-info\top_level.txt
reading manifest file 'dat_trainer.egg-info\SOURCES.txt'
writing manifest file 'dat_trainer.egg-info\SOURCES.txt'
running check
creating dat-trainer-0.2
creating dat-trainer-0.2\dat_trainer.egg-info
creating dat-trainer-0.2\trainer
creating dat-trainer-0.2\trainer\dat
creating dat-trainer-0.2\trainer\dat\compression
copying files to dat-trainer-0.2...
copying README.md -> dat-trainer-0.2
copying setup.py -> dat-trainer-0.2
copying dat_trainer.egg-info\PKG-INFO -> dat-trainer-0.2\dat_trainer.egg-info
copying dat_trainer.egg-info\SOURCES.txt -> dat-trainer-0.2\dat_trainer.egg-info
copying dat_trainer.egg-info\dependency_links.txt -> dat-trainer-0.2\dat_trainer.egg-info
copying dat_trainer.egg-info\requires.txt -> dat-trainer-0.2\dat_

In [2]:
# Upload the freshly built sdist to the project bucket. The destination object must
# match `package_uri` in the CustomJob cell (gs://dat-project-bucket/dat-package/dat-trainer-0.2.tar.gz).
!gsutil cp dist/dat-trainer-0.2.tar.gz gs://dat-project-bucket/dat-package/

Copying file://dist\dat-trainer-0.2.tar.gz [Content-Type=application/x-tar]...
/ [0 files][    0.0 B/ 17.3 KiB]                                                
/ [1 files][ 17.3 KiB/ 17.3 KiB]                                                
-

Operation completed over 1 objects/17.3 KiB.                                     


# Creating CustomJob
https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-gcloud

# Operationalize Distributed Training with PyTorch on Google Cloud
https://www.youtube.com/watch?v=kXbDEmn2XOw

Pre-built containers:
https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch

GPUs:
https://cloud.google.com/compute/docs/gpus
https://cloud.google.com/compute/gpus-pricing
https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table

In [None]:
# One-time setup: uncomment to install the Vertex AI SDK (provides `google.cloud.aiplatform`).
#!pip install google-cloud-aiplatform

In [None]:
# One-time setup: uncomment to install python-dotenv (provides `dotenv.load_dotenv`).
# !pip install python-dotenv

In [3]:
import os
import datetime

from google.cloud import aiplatform
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) so the os.getenv calls
# below can pick up the Telegram / Weights & Biases secrets.
load_dotenv()

project_id = 'sdml-dat'
region = 'us-central1'
bucket_name = 'dat-project-bucket'
package_uri = f'gs://{bucket_name}/dat-package/dat-trainer-0.2.tar.gz'

# Initialize Vertex AI SDK. The staging bucket is passed as a full gs:// URI,
# which is the form the SDK documents for this argument.
aiplatform.init(project=project_id, location=region, staging_bucket=f'gs://{bucket_name}')

# Define job name
job_name = 'dat-baseline-pytorch-job'

machine_type = 'n1-standard-4' # 4 vCPUs, 15 GB RAM
accelerator_type = 'NVIDIA_TESLA_P100' # 16 GB GPU memory
accelerator_count = 1
replica_count = 7 # Keep in mind that total number of nodes is replica_count + 1
dataset='cifarext'
batch_size=2048
warmup_epochs=5
training_epochs=100
# Timestamp (month-day-hour-minute) forwarded to the trainer as --group_surfix.
group_surfix = f'{datetime.datetime.now().strftime("%m-%d-%H-%M")}'
TELEGRAM_API_KEY = os.getenv('TELEGRAM_API_KEY')
TELEGRAM_CHAT_ID = os.getenv('TELEGRAM_CHAT_ID')
WANDB_API_KEY = os.getenv('WANDB_API_KEY')

# Secrets forwarded to every training container as environment variables.
environment_variables = {
    'TELEGRAM_API_KEY': TELEGRAM_API_KEY,
    'TELEGRAM_CHAT_ID': TELEGRAM_CHAT_ID,
    'WANDB_API_KEY': WANDB_API_KEY,
}

# Fail fast before submitting a job: treat unset AND empty values as missing,
# and name the offending variables so the .env file can be fixed quickly.
missing_secrets = [name for name, value in environment_variables.items() if not value]
if missing_secrets:
    raise ValueError(
        f'Missing either TELEGRAM or WANDB API KEY: {", ".join(missing_secrets)}'
    )

# Pre-built PyTorch 1.13 GPU training image. Pulled from the US multi-region
# registry so that it is close to the job's region (us-central1).
executor_image_uri = "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest"

# Path under which the project bucket is exposed inside the training container.
gcs_mount = f"/gcs/{bucket_name}"


def build_worker_pool_spec(num_replicas: int) -> dict:
    """Build one worker pool spec for the CustomJob.

    The master pool and the worker pool run the same package on the same
    hardware with the same arguments; they differ only in the replica count.

    Args:
        num_replicas: Number of identical nodes in this pool.

    Returns:
        A dict with the "machine_spec", "replica_count" and
        "python_package_spec" entries for a single pool.
    """
    return {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": num_replicas,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": "trainer.task",
            "args": [
                "--gcloud=True",
                f"--dataset-path={gcs_mount}/datasets/",
                f"--output-dir={gcs_mount}/dat-outputs/",
                f"--batch-size={batch_size}",
                f"--group_surfix={group_surfix}",
                f"--dataset={dataset}",
                f"--machine_type={machine_type}",
                f"--accelerator_type={accelerator_type}",
                "--dist-backend=nccl",
                f"--warmup-epochs={warmup_epochs}",
                f"--num-epochs={training_epochs}",
                "--eval-epochs=0",
            ],
            "env": [
                {'name': name, 'value': value}
                for name, value in environment_variables.items()
            ],
        },
    }


# Define worker pool spec
worker_pool_specs = [
    build_worker_pool_spec(1),              # Number of masters
    build_worker_pool_spec(replica_count),  # Number of workers
]


# Create a custom job
job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

# Run the job. With sync=True this call blocks, polling the job state, until the
# job has ended, so the line below only executes after training is over.
job.run(sync=True)

print(f"Job {job_name} has finished (state: {job.state.name}).")

Creating CustomJob
CustomJob created. Resource name: projects/65830871515/locations/us-central1/customJobs/2076214266360233984
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/65830871515/locations/us-central1/customJobs/2076214266360233984')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2076214266360233984?project=65830871515
CustomJob projects/65830871515/locations/us-central1/customJobs/2076214266360233984 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/2076214266360233984 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/2076214266360233984 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/2076214266360233984 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/65830871515/locations/us-central1/customJobs/207621426636023

# Download profiler logs from the Cloud Storage bucket to the data folder

In [None]:
# Uncomment to copy one run's profiler logs from the project bucket into ./data/
# (-m copies in parallel, -r recurses into the run folder). The folder name below
# is one specific run; replace it with the run you want to inspect.
#!gsutil -m cp -r "gs://dat-project-bucket/dat-outputs/profiler_logs/n1-standard-4_3_NVIDIA_TESLA_K80_1_cifar_1024_05-30-22-59" ./data/