## Prerequesites

In [21]:
import torch as tch
import numpy as np
import plotly.graph_objects as go
from pathlib import Path

import os

In [22]:
project_id = "semantic-segmentation-on-kitti" # @param {type:"string"}
# Set the project id
! gcloud config set project {project_id}

Updated property [core/project].


In [23]:
region = "us-central1"  # @param {type: "string"}

In [4]:
bucket_name = "bucket-name-placeholder"  # @param {type:"string"}
bucket_uri = f"gs://{bucket_name}"

bucket_name = "semantic-segmentation-on-kitti-aip-20240328170341"
bucket_uri = "gs://semantic-segmentation-on-kitti-aip-20240328170341"

from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

if bucket_name == "" or bucket_name is None or bucket_name == "bucket-name-placeholder":
    bucket_name = project_id + "-aip-" + timestamp
    bucket_uri = "gs://" + bucket_name
! echo $bucket_uri

gs://semantic-segmentation-on-kitti-aip-20240328170341


In [4]:
from google.cloud import storage
client = storage.Client(project=project_id)

# Create a bucket
bucket = client.get_bucket(bucket_name) # client.create_bucket(bucket_name, location=region)

NameError: name 'bucket_name' is not defined

In [6]:
print("Bucket {} created.".format(bucket.name))

Bucket semantic-segmentation-on-kitti-aip-20240328170341 created.


In [9]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK
aiplatform.init(project=project_id, location=region, staging_bucket=bucket_uri)

## Training script

In [27]:
%%writefile task_unet_model.py


import torch as tch
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
import os
import argparse

# Importing the data 

parser = argparse.ArgumentParser()
parser.add_argument('--lr_rate', default=0.001, type=int)
parser.add_argument('--n_epoch', default=100, type=int)
parser.add_argument('--batch_size', default=1, type=int)
args = parser.parse_args()


# Add saving and loading models for big epochs' training

from google.cloud import storage


# Initialiser le client GCS

#project_number = os.environ["CLOUD_ML_PROJECT_ID"]
project_id = "semantic-segmentation-on-kitti"
client = storage.Client(project=project_id)


bucket_name = 'data_kitti_driv_seg'
bucket = client.get_bucket(bucket_name)

# Liste des objets dans le bucket
blobs = bucket.list_blobs()

    
from io import BytesIO

# Récupérer l'objet depuis le bucket
object_path = 'data/training_tensor.pt'
blob = bucket.blob(object_path)
# Télécharger les données de l'objet en mémoire
data = BytesIO(blob.download_as_string())

training_tensor = tch.load(data)
training_tensor.shape


# Constructing the dataset objects

from torch.utils.data import Dataset
class ImageMaskDataset(Dataset):
    def __init__(self, data_tensor):
        self.data = data_tensor

    def __len__(self):
        return self.data.shape[1]  # Nombre d'exemples dans le tensor data

    def __getitem__(self, index):
        # Extraire l'image et le masque correspondant à l'index donné
        image = self.data[0, index]  # Première dimension pour les images
        mask = self.data[1, index]   # Deuxième dimension pour les masques
        
        return image, mask
    
# Splitting data into training/test datasets

training_data, test_data = ImageMaskDataset(training_tensor[:,:10]), ImageMaskDataset(training_tensor[:,10:])

# Création du DataLoader
data_loader = tch.utils.data.DataLoader(training_data, batch_size=args.batch_size, shuffle=True)


# Building the CNN (U-Net)

import torch.nn as nn

class DoubleConv(nn.Module): # Creating a class merging the double conv
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size = 3, padding = 1),          # X_out=X_in cf formula applied with these parameters' values
            nn.BatchNorm2d(out_channels),                                                # keeps size
            nn.ReLU(inplace=True),                                                       # keeps size 
            nn.Conv2d(out_channels, out_channels, kernel_size = 3, padding = 1),         
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )                                                                                # Keeps the same image size of the input

    def forward(self, x):
        return self.conv(x)

class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(UNet, self).__init__()
        self.dconv_down1 = DoubleConv(in_channels, 64)        # keeps image size 
        self.dconv_down2 = DoubleConv(64, 128)                # keeps image size 
        self.dconv_down3 = DoubleConv(128, 256)               # keeps image size 
        self.dconv_down4 = DoubleConv(256, 512)               # keeps image size 
        
        self.maxpool = nn.MaxPool2d(kernel_size = 2)          # X_out=int((X_in/2) + 1)   # Caution : default stride is equal to kernel-size here
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)  # X_out=int(X_in*2)       # Fasten the process as it hasn't to learn weights unlike the convtranspose (which is so )
        
        self.dconv_up3 = DoubleConv(256 + 512, 256)          # keeps image size 
        self.dconv_up2 = DoubleConv(128 + 256, 128)          # keeps image size
        self.dconv_up1 = DoubleConv(128 + 64, 64)            # keeps image size

        self.conv_last = nn.Conv2d(64, out_channels, 1)      # keeps image size

    def forward(self, x): 
        conv1 = self.dconv_down1(x)          
        x = self.maxpool(conv1)     

        conv2 = self.dconv_down2(x)          
        x = self.maxpool(conv2)     

        conv3 = self.dconv_down3(x)          
        x = self.maxpool(conv3)     

        x = self.dconv_down4(x)    
        x = self.upsample(x)        
        # print('La taille de x est ', x.shape, 'et la taille de conv3 est ', conv3.shape)
        x = tch.cat([x, conv3], dim=1) 

        x = self.dconv_up3(x)
        x = self.upsample(x)
        # print('La taille de x est ', x.shape, 'et la taille de conv2 est ', conv2.shape)
        x = tch.cat([x, conv2], dim=1)

        x = self.dconv_up2(x)
        x = self.upsample(x)
        #  print('La taille de x est ', x.shape, 'et la taille de conv1 est ', conv3.shape)
        x = tch.cat([x, conv1], dim=1)

        x = self.dconv_up1(x)
        out = self.conv_last(x)
        return out
    

unet_model = UNet(in_channels = 3, out_channels = 3)


# Training the model

# Définir la fonction de perte (criterion) et l'optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = tch.optim.Adam(unet_model.parameters(), lr=args.lr_rate)

unet_model.train()


for epoch in range(args.n_epoch) :
    running_loss = 0.0
    for image, mask in data_loader :
        # Remettre à zéro les gradients
        optimizer.zero_grad()

        pred = unet_model(image)

        # Calculate the loss
        loss = criterion(pred, mask)

        # Backpropagation and update of the weights
        loss.backward()
        optimizer.step()

        # Calculate the whole loss of the epoch
        running_loss += loss.item()

    # Afficher la perte moyenne de l'époque
    print(f"Epoch [{epoch+1}/{args.n_epoch}], Loss: {running_loss/len(data_loader)}")

    
# Model saving
# Check https://pytorch.org/tutorials/beginner/saving_loading_models.html to save correctly the model
local_model_path = "unet_model.pt"
tch.save(unet_model.state_dict(), local_model_path)

object_path = 'model/' + local_model_path
blob = bucket.blob(object_path)
blob.upload_from_filename(local_model_path)

Writing task_unet_model.py


In [10]:
JOB_NAME = "custom_job_unet"

## Training the model

### Training Pipeline

In [11]:
ob# Modify this for the training
job = aiplatform.CustomTrainingJob(
    display_name=JOB_NAME,
    script_path="task_unet_model.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest", # here
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest", # and here
)

### Creating and training the model 

In [12]:
MODEL_DISPLAY_NAME = "unet_model"

# Start the training and create your model
model = job.run(
    model_display_name=MODEL_DISPLAY_NAME,
)

Training script copied to:
gs://semantic-segmentation-on-kitti-aip-20240328170341/aiplatform-2024-04-02-15:17:34.288-aiplatform_custom_trainer_script-0.1.tar.gz.
Training Output directory:
gs://semantic-segmentation-on-kitti-aip-20240328170341/aiplatform-custom-training-2024-04-02-15:17:34.454 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4010323561137831936?project=549378803954
CustomTrainingJob projects/549378803954/locations/us-central1/trainingPipelines/4010323561137831936 current state:
PipelineState.PIPELINE_STATE_PENDING
CustomTrainingJob projects/549378803954/locations/us-central1/trainingPipelines/4010323561137831936 current state:
PipelineState.PIPELINE_STATE_PENDING
CustomTrainingJob projects/549378803954/locations/us-central1/trainingPipelines/4010323561137831936 current state:
PipelineState.PIPELINE_STATE_PENDING
CustomTrainingJob projects/549378803954/locations/us-central1/trainingPipelines/4010323561137831936 current state:
P

RuntimeError: Training failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. \nTraceback (most recent call last):\n  File \"/opt/conda/lib/python3.7/runpy.py\", line 193, in _run_module_as_main\n    \"__main__\", mod_spec)\n  File \"/opt/conda/lib/python3.7/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"/root/.local/lib/python3.7/site-packages/aiplatform_custom_trainer_script/task.py\", line 2, in <module>\n    import torch as tch\nModuleNotFoundError: No module named \'torch\'\n\nTo find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=549378803954&resource=ml_job%2Fjob_id%2F8163486842503561216&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%228163486842503561216%22"


### Deploying model

In [16]:
DEPLOYED_NAME = "penguins_deployed_unique"

endpoint = model.deploy(deployed_model_display_name=DEPLOYED_NAME)

Creating Endpoint
Create Endpoint backing LRO: projects/358435915614/locations/us-central1/endpoints/4200646790522863616/operations/37557599217909760
Endpoint created. Resource name: projects/358435915614/locations/us-central1/endpoints/4200646790522863616
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/358435915614/locations/us-central1/endpoints/4200646790522863616')
Deploying model to Endpoint : projects/358435915614/locations/us-central1/endpoints/4200646790522863616
Using default machine_type: n1-standard-2
Deploy Endpoint model backing LRO: projects/358435915614/locations/us-central1/endpoints/4200646790522863616/operations/2952512458033463296
Endpoint model deployed. Resource name: projects/358435915614/locations/us-central1/endpoints/4200646790522863616


## Make a prediction

### Prepare prediction test data

In [17]:
# Remove the species column
df_for_prediction.pop(LABEL_COLUMN)

# Convert data to a Python list
test_data_list = df_for_prediction.values.tolist()

### Viewing test data

In [18]:
test_data_list

[[0.0, 36.6, 18.4, 184.0, 3475.0, 0.0],
 [0.0, 40.9, 18.9, 184.0, 3900.0, 1.0],
 [0.0, 37.3, 16.8, 192.0, 3000.0, 0.0],
 [0.0, 49.5, 19.0, 200.0, 3800.0, 1.0],
 [0.0, 47.0, 17.3, 185.0, 3700.0, 0.0],
 [0.0, 34.0, 17.1, 185.0, 3400.0, 0.0],
 [0.0, 50.6, 19.4, 193.0, 3800.0, 1.0],
 [0.0, 50.8, 18.5, 201.0, 4450.0, 1.0],
 [0.0, 39.6, 18.1, 186.0, 4450.0, 1.0],
 [0.0, 41.3, 20.3, 194.0, 3550.0, 1.0],
 [0.0, 42.5, 16.7, 187.0, 3350.0, 0.0],
 [0.0, 40.6, 17.2, 187.0, 3475.0, 1.0],
 [0.0, 45.7, 17.0, 195.0, 3650.0, 0.0],
 [0.0, 36.0, 17.8, 195.0, 3450.0, 0.0],
 [0.0, 50.7, 19.7, 203.0, 4050.0, 1.0],
 [0.0, 42.2, 18.5, 180.0, 3550.0, 0.0],
 [0.0, 32.1, 15.5, 188.0, 3050.0, 0.0],
 [0.0, 50.9, 19.1, 196.0, 3550.0, 1.0],
 [0.0, 49.0, 19.6, 212.0, 4300.0, 1.0],
 [0.0, 52.2, 18.8, 197.0, 3450.0, 1.0],
 [0.0, 53.5, 19.9, 205.0, 4500.0, 1.0],
 [0.0, 52.8, 20.0, 205.0, 4550.0, 1.0],
 [0.0, 38.9, 18.8, 190.0, 3600.0, 0.0],
 [0.0, 38.8, 20.0, 190.0, 3950.0, 1.0],
 [0.0, 46.4, 18.6, 190.0, 3450.0, 0.0],


### Send the prediction request

In [19]:
# Get your predictions.
predictions = endpoint.predict(instances=test_data_list)

# View the predictions
predictions.predictions

[[0.380821019, 0.225905389, 0.393273532],
 [0.349367857, 0.209143579, 0.441488594],
 [0.411382914, 0.242850572, 0.345766455],
 [0.365590334, 0.216427699, 0.417981952],
 [0.359883547, 0.219310671, 0.420805812],
 [0.386932313, 0.227012664, 0.386055022],
 [0.359957278, 0.217440203, 0.422602445],
 [0.317476153, 0.187540948, 0.494982898],
 [0.310005456, 0.183340982, 0.506653547],
 [0.382173508, 0.224923551, 0.392902941],
 [0.386478394, 0.231975809, 0.381545722],
 [0.380858064, 0.226311743, 0.392830223],
 [0.371661782, 0.220337793, 0.40800038],
 [0.39031291, 0.225384504, 0.384302586],
 [0.34987697, 0.206286147, 0.443836898],
 [0.369881153, 0.225210309, 0.404908508],
 [0.409266233, 0.238573581, 0.352160156],
 [0.377004445, 0.227536932, 0.395458639],
 [0.338987589, 0.194405153, 0.466607243],
 [0.382138848, 0.231711939, 0.386149198],
 [0.317140281, 0.187296942, 0.495562822],
 [0.313786119, 0.184895352, 0.501318514],
 [0.376017094, 0.221398488, 0.402584374],
 [0.352369934, 0.206819162, 0.4408109

In [20]:
# Get the prediction for each set of input data.
species_predictions = np.argmax(predictions.predictions, axis=1)

# View the best prediction for the penguin characteristics in each row.
species_predictions

array([2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0,
       2])

In [21]:
import os

# Delete the training job
job.delete()

# Delete the endpoint and undeploy the model from it
endpoint.delete(force=True)

# Delete the model
model.delete()

# Delete the storage bucket and its contents
bucket.delete(force=True)

Deleting CustomTrainingJob : projects/358435915614/locations/us-central1/trainingPipelines/4620196032267943936
Delete CustomTrainingJob  backing LRO: projects/358435915614/locations/us-central1/operations/8096928352792739840
CustomTrainingJob deleted. . Resource name: projects/358435915614/locations/us-central1/trainingPipelines/4620196032267943936
Undeploying Endpoint model: projects/358435915614/locations/us-central1/endpoints/4200646790522863616
Undeploy Endpoint model backing LRO: projects/358435915614/locations/us-central1/endpoints/4200646790522863616/operations/7745647581857841152
Endpoint model undeployed. Resource name: projects/358435915614/locations/us-central1/endpoints/4200646790522863616
Deleting Endpoint : projects/358435915614/locations/us-central1/endpoints/4200646790522863616
Delete Endpoint  backing LRO: projects/358435915614/locations/us-central1/operations/3642868321323319296
Endpoint deleted. . Resource name: projects/358435915614/locations/us-central1/endpoints/4