# Creating & Uploading a Python source distribution
https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container

In [131]:
# Build the trainer source distribution from setup.py in the current directory;
# the archive produced here is what the upload cell copies from dist/.
!python setup.py sdist

running sdist
running egg_info
creating trainer.egg-info
writing trainer.egg-info\PKG-INFO
writing dependency_links to trainer.egg-info\dependency_links.txt
writing requirements to trainer.egg-info\requires.txt
writing top-level names to trainer.egg-info\top_level.txt
writing manifest file 'trainer.egg-info\SOURCES.txt'
reading manifest file 'trainer.egg-info\SOURCES.txt'
writing manifest file 'trainer.egg-info\SOURCES.txt'
running check
creating trainer-0.1
creating trainer-0.1\trainer
creating trainer-0.1\trainer.egg-info
creating trainer-0.1\trainer\dat
copying files to trainer-0.1...
copying README.md -> trainer-0.1
copying setup.py -> trainer-0.1
copying trainer\__init__.py -> trainer-0.1\trainer
copying trainer\task.py -> trainer-0.1\trainer
copying trainer.egg-info\PKG-INFO -> trainer-0.1\trainer.egg-info
copying trainer.egg-info\SOURCES.txt -> trainer-0.1\trainer.egg-info
copying trainer.egg-info\dependency_links.txt -> trainer-0.1\trainer.egg-info
copying trainer.egg-info\requ

In [132]:
# Upload the built package to the GCS location that the CustomJob cell
# references as its package URI.
# Earlier variant kept for reference (uses a different package and bucket name):
#!gcloud storage cp dist/dat_package-0.1.tar.gz gs://dat-package-bucket
!gsutil cp dist/trainer-0.1.tar.gz gs://dat-project-bucket/dat-package/

Copying file://dist\trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [0 files][    0.0 B/ 15.1 KiB]                                                
/ [1 files][ 15.1 KiB/ 15.1 KiB]                                                

Operation completed over 1 objects/15.1 KiB.                                     


# Creating a CustomJob
https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-gcloud

# Operationalize Distributed Training with PyTorch on Google Cloud
https://www.youtube.com/watch?v=kXbDEmn2XOw

Pre-built containers:
https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#pytorch

GPUs:
https://cloud.google.com/compute/docs/gpus
https://cloud.google.com/compute/gpus-pricing
https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table

In [None]:
# Uncomment to install the Vertex AI SDK into the kernel's environment:
# %pip install google-cloud-aiplatform

In [None]:
from google.cloud import aiplatform

# --- Configuration -----------------------------------------------------------
project_id = 'sdml-dat'
region = 'us-central1' #TODO: change to better region
bucket_name = 'dat-project-bucket'
package_uri = f'gs://{bucket_name}/dat-package/trainer-0.1.tar.gz'

# NOTE(review): the image is pulled from the europe-docker registry while the
# job runs in `region` above; consider aligning the two when the region TODO is
# resolved. It is also the GPU image although no accelerator is attached.
executor_image_uri = "europe-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest"
machine_type = "n1-standard-4"
python_module = "trainer.task"

# Arguments forwarded to trainer.task on every replica. The /gcs/<bucket>/ paths
# address the bucket through the Cloud Storage mount inside the container.
trainer_args = [
    "--gcloud=True",
    f"--dataset-path=/gcs/{bucket_name}/datasets/",
    f"--output-dir=/gcs/{bucket_name}/dat-outputs/",
]


def _build_worker_pool_spec(replica_count, accelerator_type=None, accelerator_count=0):
    """Return one worker-pool spec dict for the CustomJob.

    Args:
        replica_count: Number of replicas in this pool.
        accelerator_type: Optional accelerator name (e.g. "NVIDIA_TESLA_K80").
            When None, the pool is CPU-only.
        accelerator_count: Number of accelerators per replica; only used when
            accelerator_type is given.

    Returns:
        A dict with "machine_spec", "replica_count" and "python_package_spec"
        keys, built from the configuration values defined above.
    """
    machine_spec = {"machine_type": machine_type}
    if accelerator_type is not None:
        machine_spec["accelerator_type"] = accelerator_type
        machine_spec["accelerator_count"] = accelerator_count

    return {
        "machine_spec": machine_spec,
        "replica_count": replica_count,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": python_module,
            # Copy so that each pool owns an independent argument list.
            "args": list(trainer_args),
        },
    }


# Initialize Vertex AI SDK. The staging bucket must be given as a gs:// URI;
# the SDK derives the job's base output directory from it.
aiplatform.init(
    project=project_id,
    location=region,
    staging_bucket=f'gs://{bucket_name}',
)

# Define worker pool specs: pool 0 is the single primary replica, pool 1 holds
# the additional workers.
worker_pool_specs = [
    _build_worker_pool_spec(replica_count=1),  # primary (master)
    _build_worker_pool_spec(replica_count=2),  # workers
]

# Define job name
job_name = 'dat-custom-cpu-pytorch-job'

# Create a custom job
job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

# Run the job; sync=True blocks this cell until the job has ended.
job.run(sync=True)

print(f"Job {job_name} has completed.")