# Creating & uploading the Python source distribution
https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

In [5]:
# Build a source distribution of the trainer package with setuptools' `sdist`
# command; the resulting archive under dist/ is what the upload cell copies to GCS.
!python setup.py sdist

running sdist
running egg_info
writing dat_profiler_trainer.egg-info\PKG-INFO
writing dependency_links to dat_profiler_trainer.egg-info\dependency_links.txt
writing requirements to dat_profiler_trainer.egg-info\requires.txt
writing top-level names to dat_profiler_trainer.egg-info\top_level.txt
reading manifest file 'dat_profiler_trainer.egg-info\SOURCES.txt'
writing manifest file 'dat_profiler_trainer.egg-info\SOURCES.txt'
running check
creating dat-profiler-trainer-0.1
creating dat-profiler-trainer-0.1\dat_profiler_trainer.egg-info
creating dat-profiler-trainer-0.1\trainer
creating dat-profiler-trainer-0.1\trainer\dat
creating dat-profiler-trainer-0.1\trainer\dat\compression
copying files to dat-profiler-trainer-0.1...
copying README.md -> dat-profiler-trainer-0.1
copying setup.py -> dat-profiler-trainer-0.1
copying dat_profiler_trainer.egg-info\PKG-INFO -> dat-profiler-trainer-0.1\dat_profiler_trainer.egg-info
copying dat_profiler_trainer.egg-info\SOURCES.txt -> dat-profiler-trainer-

In [6]:
# Upload the built sdist to the GCS path that `package_uri` in the job-submission cell refers to.
# Alternative with the newer CLI (same source and destination):
#   !gcloud storage cp dist/dat-profiler-trainer-0.1.tar.gz gs://dat-project-bucket/dat-package/
!gsutil cp dist/dat-profiler-trainer-0.1.tar.gz gs://dat-project-bucket/dat-package/

Copying file://dist\dat-profiler-trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [0 files][    0.0 B/ 17.2 KiB]                                                
/ [1 files][ 17.2 KiB/ 17.2 KiB]                                                

Operation completed over 1 objects/17.2 KiB.                                     


# Creating CustomJob
https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-gcloud

# Operationalize Distributed Training with PyTorch on Google Cloud
https://www.youtube.com/watch?v=kXbDEmn2XOw

Pre-built containers:
https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch

GPUs:
https://cloud.google.com/compute/docs/gpus
https://cloud.google.com/compute/gpus-pricing
https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table

In [32]:
# %pip install google-cloud-aiplatform

In [1]:
# %pip install python-dotenv==1.0.0

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [None]:
import os
import datetime

from google.cloud import aiplatform
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment so
# the API keys below can be read with os.getenv().
load_dotenv()

# --- Project / storage configuration ---------------------------------------
project_id = 'sdml-dat'
region = 'us-central1'
bucket_name = 'dat-project-bucket'
# Trainer source distribution; this is the object written by the `gsutil cp` cell.
package_uri = f'gs://{bucket_name}/dat-package/dat-profiler-trainer-0.1.tar.gz'

# Initialize Vertex AI SDK. The staging bucket is passed as a full gs:// URI,
# which is the form the SDK documents for this argument.
aiplatform.init(project=project_id, location=region, staging_bucket=f'gs://{bucket_name}')

# Define job name
job_name = 'dat-custom-profiler-pytorch-job-chip'

# --- Compute configuration --------------------------------------------------
machine_type = 'n1-standard-4'
# NOTE(review): confirm this accelerator type is still offered in `region` for
# `machine_type` (see the Vertex AI GPU compatibility table) before submitting.
accelerator_type = 'NVIDIA_TESLA_K80'
accelerator_count = 1
# Replicas in the second (worker) pool; the first (master) pool always has 1,
# so the job runs 1 + replica_count replicas in total.
replica_count = 2
# Pre-built PyTorch training image. Pulled from the Artifact Registry host that
# matches `region` (us-central1 -> us-docker.pkg.dev) instead of the European
# mirror, so the image is not fetched across continents.
executor_image_uri = 'us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest'

# --- Trainer arguments ------------------------------------------------------
dataset = 'cifar'
batch_size = 1024
# Launch timestamp (month-day-hour-minute) forwarded through --group_surfix.
# The flag spelling is left untouched because the trainer's argument parser is
# not visible from this notebook.
group_surfix = datetime.datetime.now().strftime("%m-%d-%H-%M")

# --- Secrets forwarded to the training containers ---------------------------
TELEGRAM_API_KEY = os.getenv('TELEGRAM_API_KEY')
TELEGRAM_CHAT_ID = os.getenv('TELEGRAM_CHAT_ID')
WANDB_API_KEY = os.getenv('WANDB_API_KEY')

# NOTE(review): these values are placed verbatim in the job spec below;
# consider a secret manager if the job definition is visible to other users.
environment_variables = {
    'TELEGRAM_API_KEY': TELEGRAM_API_KEY,
    'TELEGRAM_CHAT_ID': TELEGRAM_CHAT_ID,
    'WANDB_API_KEY': WANDB_API_KEY,
}

# Fail fast, and say exactly which variable is unset or empty, before any
# cloud resources are requested.
missing_variables = [name for name, value in environment_variables.items() if not value]
if missing_variables:
    raise ValueError(
        'Missing required environment variable(s): ' + ', '.join(missing_variables)
    )


def _build_worker_pool_spec(pool_replica_count):
    """Return one worker-pool spec dict for the custom job.

    Every pool uses the same machine shape, container image, trainer package,
    command-line arguments and environment variables; only the number of
    replicas differs between pools.

    Args:
        pool_replica_count: Number of replicas to request for this pool.

    Returns:
        A dict with ``machine_spec``, ``replica_count`` and
        ``python_package_spec`` keys. Fresh list objects are created on each
        call, so the returned specs do not share mutable state.
    """
    return {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": pool_replica_count,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": "trainer.task",
            "args": [
                "--gcloud=True",
                # The bucket is addressed through the /gcs/<bucket>/ path prefix.
                f"--dataset-path=/gcs/{bucket_name}/datasets/",
                f"--output-dir=/gcs/{bucket_name}/dat-outputs/",
                f"--batch-size={batch_size}",
                f"--group_surfix={group_surfix}",
                f"--dataset={dataset}",
                f"--machine_type={machine_type}",
                f"--accelerator_type={accelerator_type}",
                "--dist-backend=nccl",
                "--warmup-epochs=1",
                "--num-epochs=2",
                "--eval-epochs=1",
            ],
            "env": [
                {'name': name, 'value': value}
                for name, value in environment_variables.items()
            ],
        },
    }


# Define worker pool specs: pool 0 is the single master, pool 1 the workers.
worker_pool_specs = [
    _build_worker_pool_spec(1),              # Number of masters
    _build_worker_pool_spec(replica_count),  # Number of workers
]


# Create a custom job
job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

# Run the job. With sync=True this call blocks until the job has finished, so
# the message below is only reached once the run is over.
job.run(sync=True)

print(f"Job {job_name} has finished.")

In [None]:
import glob

# Collect every path under ./data; with recursive=True the '**' component
# also matches nested subdirectories. The full list is then printed.
search_pattern = './data/**'
file_path = glob.glob(search_pattern, recursive=True)
print(file_path)


['./trainer\\', './trainer\\dat', './trainer\\dat\\attack.py', './trainer\\dat\\cifar_resnet18.py', './trainer\\dat\\compression', './trainer\\dat\\compression\\sparsification.py', './trainer\\dat\\compression\\__init__.py', './trainer\\dat\\dataset.py', './trainer\\dat\\eval.py', './trainer\\dat\\helpers.py', './trainer\\dat\\lamb.py', './trainer\\dat\\main.py', './trainer\\dat\\models.py', './trainer\\dat\\quantization.py', './trainer\\dat\\utils.py', './trainer\\dat\\wide_resnet.py', './trainer\\dat\\__init__.py', './trainer\\dat\\__pycache__', './trainer\\dat\\__pycache__\\attack.cpython-310.pyc', './trainer\\dat\\__pycache__\\dataset.cpython-310.pyc', './trainer\\dat\\__pycache__\\lamb.cpython-310.pyc', './trainer\\dat\\__pycache__\\models.cpython-310.pyc', './trainer\\dat\\__pycache__\\quantization.cpython-310.pyc', './trainer\\dat\\__pycache__\\utils.cpython-310.pyc', './trainer\\dat\\__pycache__\\__init__.cpython-310.pyc', './trainer\\task.py', './trainer\\__init__.py', './trai