# Creating & Uploading the Python Source Distribution
https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

In [40]:
# Build a source distribution (sdist) of the trainer package; the archive is
# written under dist/ and is what the upload cell copies to Cloud Storage.
# NOTE(review): invoking setup.py directly is deprecated by setuptools;
# consider `python -m build --sdist` if the `build` package is available.
!python setup.py sdist

running sdist
running egg_info
writing dat_trainer.egg-info\PKG-INFO
writing dependency_links to dat_trainer.egg-info\dependency_links.txt
writing requirements to dat_trainer.egg-info\requires.txt
writing top-level names to dat_trainer.egg-info\top_level.txt
reading manifest file 'dat_trainer.egg-info\SOURCES.txt'
writing manifest file 'dat_trainer.egg-info\SOURCES.txt'
running check
creating dat-trainer-0.1
creating dat-trainer-0.1\dat_trainer.egg-info
creating dat-trainer-0.1\trainer
creating dat-trainer-0.1\trainer\dat
copying files to dat-trainer-0.1...
copying README.md -> dat-trainer-0.1
copying setup.py -> dat-trainer-0.1
copying dat_trainer.egg-info\PKG-INFO -> dat-trainer-0.1\dat_trainer.egg-info
copying dat_trainer.egg-info\SOURCES.txt -> dat-trainer-0.1\dat_trainer.egg-info
copying dat_trainer.egg-info\dependency_links.txt -> dat-trainer-0.1\dat_trainer.egg-info
copying dat_trainer.egg-info\requires.txt -> dat-trainer-0.1\dat_trainer.egg-info
copying dat_trainer.egg-info\top

In [41]:
# Upload the built sdist archive to the dat-package/ prefix of the project bucket.
# Alternative command left commented out (note: it targets a different archive
# name and a different bucket than the active command below):
#!gcloud storage cp dist/dat_package-0.1.tar.gz gs://dat-package-bucket
!gsutil cp dist/dat-trainer-0.1.tar.gz gs://dat-project-bucket/dat-package/

Copying file://dist\dat-trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [0 files][    0.0 B/ 15.7 KiB]                                                
/ [1 files][ 15.7 KiB/ 15.7 KiB]                                                

Operation completed over 1 objects/15.7 KiB.                                     


# Creating CustomJob
https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-gcloud

# Operationalize Distributed Training with PyTorch on Google Cloud
https://www.youtube.com/watch?v=kXbDEmn2XOw

Pre-built containers:
https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch

GPUs:
https://cloud.google.com/compute/docs/gpus
https://cloud.google.com/compute/gpus-pricing
https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table

In [None]:
# One-time setup: uncomment to install the Vertex AI SDK used by the next cell.
# NOTE(review): prefer `%pip install` over `!pip install` so the package is
# installed into the environment of the running kernel.
#!pip install google-cloud-aiplatform

In [None]:
import datetime
from google.cloud import aiplatform

# --- Project / storage configuration ----------------------------------------
project_id = 'sdml-dat'
region = 'us-central1'
bucket_name = 'dat-project-bucket'
package_uri = f'gs://{bucket_name}/dat-package/dat-trainer-0.1.tar.gz'

# Initialize Vertex AI SDK.
# The SDK documents `staging_bucket` as a Cloud Storage URI of the form
# "gs://...", so pass the full URI rather than the bare bucket name.
aiplatform.init(
    project=project_id,
    location=region,
    staging_bucket=f'gs://{bucket_name}',
)

# --- Compute configuration ---------------------------------------------------
machine_type = 'n1-standard-4'
# NOTE(review): verify NVIDIA_TESLA_K80 is still offered for `region` and is
# compatible with the chosen machine type / container image — TODO confirm
# against the Vertex AI GPU compatibility table.
accelerator_type = 'NVIDIA_TESLA_K80'
accelerator_count = 1
replica_count = 2  # replicas in the second (worker) pool

# --- Training configuration --------------------------------------------------
dataset = 'cifar'
batch_size = 1024
# Minute-resolution timestamp (local time) forwarded to the trainer as
# --group_surfix; the flag spelling is kept as-is because trainer.task parses it.
group_surfix = datetime.datetime.now().strftime("%m-%d-%H-%M")

# NOTE(review): the image is pulled from the europe-docker.pkg.dev registry
# while the job runs in us-central1; a registry closer to `region` may reduce
# image pull time — confirm against the pre-built containers list.
executor_image_uri = "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest"

# Command-line arguments passed to `trainer.task` on every replica.
# /gcs/<bucket>/... paths are derived from `bucket_name` so the bucket is
# configured in exactly one place.
trainer_args = [
    "--gcloud=True",
    f"--dataset-path=/gcs/{bucket_name}/datasets/",
    f"--output-dir=/gcs/{bucket_name}/dat-outputs/",
    f"--batch-size={batch_size}",
    f"--group_surfix={group_surfix}",
    f"--dataset={dataset}",
    f"--machine_type={machine_type}",
    f"--accelerator_type={accelerator_type}",
]


def build_worker_pool_spec(replicas):
    """Build one worker-pool spec that runs ``trainer.task`` from the uploaded package.

    All pools share the same machine shape, container image, package and
    arguments; only the number of replicas differs.

    Args:
        replicas: Number of replicas to request for this pool.

    Returns:
        A dict in the worker-pool-spec layout accepted by
        ``aiplatform.CustomJob(worker_pool_specs=...)``.
    """
    return {
        "machine_spec": {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        },
        "replica_count": replicas,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": "trainer.task",
            # Copy so each pool owns its own list.
            "args": list(trainer_args),
        },
    }


# Define worker pool specs: the first pool is the single master replica,
# the second pool holds the `replica_count` workers.
worker_pool_specs = [
    build_worker_pool_spec(1),              # Number of masters
    build_worker_pool_spec(replica_count),  # Number of workers
]

# Define job name
job_name = 'dat-custom-gpu-pytorch-job'

# Create a custom job
job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

# Run the job. With sync=True this call blocks until the job has ended,
# so the message below is only printed after the run is over.
job.run(sync=True)

print(f"Job {job_name} has finished.")