## Prerequesites

In [1]:
import torch as tch
import numpy as np
import plotly.graph_objects as go
from pathlib import Path

import os

In [2]:
project_id = "semantic-segmentation-on-kitti" # @param {type:"string"}
# Set the project id
! gcloud config set project {project_id}

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey



In [3]:
region = "us-central1"  # @param {type: "string"}

In [4]:
bucket_name = 'data_kitti_driv_seg' # @param {type:"string"}
bucket_uri = f"gs://{bucket_name}"

# bucket_name = "semantic-segmentation-on-kitti-aip-20240328170341"
# bucket_uri = "gs://semantic-segmentation-on-kitti-aip-20240328170341"

# from datetime import datetime
# timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# if bucket_name == "" or bucket_name is None or bucket_name == "bucket-name-placeholder":
#     bucket_name = project_id + "-aip-" + timestamp
#     bucket_uri = "gs://" + bucket_name
! echo $bucket_uri

gs://data_kitti_driv_seg


In [5]:
from google.cloud import storage
client = storage.Client(project=project_id)

# Create a bucket
bucket = client.get_bucket(bucket_name) # client.create_bucket(bucket_name, location=region)

In [6]:
print("Bucket {} loaded.".format(bucket.name))

Bucket data_kitti_driv_seg loaded.


In [7]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK
aiplatform.init(project=project_id, location=region, staging_bucket=bucket_uri)

## Training script

In [27]:
%%writefile task_unet_model.py

import torch as tch
import numpy as np
import plotly.graph_objects as go
from pathlib import Path
import os
import argparse

# Importing the data 

parser = argparse.ArgumentParser()
parser.add_argument('--lr_rate', default=0.001, type=int)
parser.add_argument('--n_epoch', default=50, type=int)
parser.add_argument('--batch_size', default=1, type=int)
args = parser.parse_args()


# Add saving and loading models for big epochs' training

from google.cloud import storage


# Initialiser le client GCS

#project_number = os.environ["CLOUD_ML_PROJECT_ID"]
project_id = "semantic-segmentation-on-kitti"
client = storage.Client(project=project_id)


bucket_name = 'data_kitti_driv_seg'
bucket = client.get_bucket(bucket_name)

    
from io import BytesIO

# Récupérer l'objet depuis le bucket
object_path = 'data/training_tensor.pt'
blob = bucket.blob(object_path)
# Télécharger les données de l'objet en mémoire
data = BytesIO(blob.download_as_string())

training_tensor = tch.load(data)
training_tensor.shape


# Constructing the dataset objects

from torch.utils.data import Dataset
class ImageMaskDataset(Dataset):
    def __init__(self, data_tensor):
        self.data = data_tensor

    def __len__(self):
        return self.data.shape[1]  # Nombre d'exemples dans le tensor data

    def __getitem__(self, index):
        # Extraire l'image et le masque correspondant à l'index donné
        image = self.data[0, index]  # Première dimension pour les images
        mask = self.data[1, index]   # Deuxième dimension pour les masques
        
        return image, mask
    
# Splitting data into training/test datasets

training_data, test_data = ImageMaskDataset(training_tensor[:,:160]), ImageMaskDataset(training_tensor[:,160:])

# Création du DataLoader
data_loader = tch.utils.data.DataLoader(training_data, batch_size=args.batch_size, shuffle=True)


# Building the CNN (U-Net)

import torch.nn as nn

class DoubleConv(nn.Module): # Creating a class merging the double conv
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size = 3, padding = 1),          # X_out=X_in cf formula applied with these parameters' values
            nn.BatchNorm2d(out_channels),                                                # keeps size
            nn.ReLU(inplace=True),                                                       # keeps size 
            nn.Conv2d(out_channels, out_channels, kernel_size = 3, padding = 1),         
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )                                                                                # Keeps the same image size of the input

    def forward(self, x):
        return self.conv(x)

class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(UNet, self).__init__()
        self.dconv_down1 = DoubleConv(in_channels, 64)        # keeps image size 
        self.dconv_down2 = DoubleConv(64, 128)                # keeps image size 
        self.dconv_down3 = DoubleConv(128, 256)               # keeps image size 
        self.dconv_down4 = DoubleConv(256, 512)               # keeps image size 
        
        self.maxpool = nn.MaxPool2d(kernel_size = 2)          # X_out=int((X_in/2) + 1)   # Caution : default stride is equal to kernel-size here
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)  # X_out=int(X_in*2)       # Fasten the process as it hasn't to learn weights unlike the convtranspose (which is so )
        
        self.dconv_up3 = DoubleConv(256 + 512, 256)          # keeps image size 
        self.dconv_up2 = DoubleConv(128 + 256, 128)          # keeps image size
        self.dconv_up1 = DoubleConv(128 + 64, 64)            # keeps image size

        self.conv_last = nn.Conv2d(64, out_channels, 1)      # keeps image size

    def forward(self, x): 
        conv1 = self.dconv_down1(x)          
        x = self.maxpool(conv1)     

        conv2 = self.dconv_down2(x)          
        x = self.maxpool(conv2)     

        conv3 = self.dconv_down3(x)          
        x = self.maxpool(conv3)     

        x = self.dconv_down4(x)    
        x = self.upsample(x)        
        # print('La taille de x est ', x.shape, 'et la taille de conv3 est ', conv3.shape)
        x = tch.cat([x, conv3], dim=1) 

        x = self.dconv_up3(x)
        x = self.upsample(x)
        # print('La taille de x est ', x.shape, 'et la taille de conv2 est ', conv2.shape)
        x = tch.cat([x, conv2], dim=1)

        x = self.dconv_up2(x)
        x = self.upsample(x)
        #  print('La taille de x est ', x.shape, 'et la taille de conv1 est ', conv3.shape)
        x = tch.cat([x, conv1], dim=1)

        x = self.dconv_up1(x)
        out = self.conv_last(x)
        return out
    

unet_model = UNet(in_channels = 3, out_channels = 3)


# Training the model

# Définir la fonction de perte (criterion) et l'optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = tch.optim.Adam(unet_model.parameters(), lr=args.lr_rate)

# Add saving method for optimisizer and epoch as said here https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html

unet_model.train()


for epoch in range(args.n_epoch) :
    running_loss = 0.0 
    
    if epoch % 5 == 0 :
        local_model_path = "unet_model.pt"
        object_path = 'model/' + local_model_path
        blob = bucket.blob(object_path)
        
        # Download the model locally
        blob.download_to_filename(local_model_path)

        # Charger les poids du modèle depuis le fichier
        state_dict = tch.load(local_model_path)

        # Mettre à jour les paramètres du modèle avec les poids chargés
        unet_model.load_state_dict(state_dict)
        unet_model.train()
        
    for image, mask in data_loader :
        # Remettre à zéro les gradients
        optimizer.zero_grad()

        pred = unet_model(image)

        # Calculate the loss
        loss = criterion(pred, mask)

        # Backpropagation and update of the weights
        loss.backward()
        optimizer.step()

        # Calculate the whole loss of the epoch
        running_loss += loss.item()
        
    if epoch % 5 == 0 :
        local_model_path = "unet_model.pt"
        tch.save(unet_model.state_dict(), local_model_path)

        object_path = 'model/' + local_model_path
        blob = bucket.blob(object_path)
        blob.upload_from_filename(local_model_path)

    # Afficher la perte moyenne de l'époque
    print(f"Epoch [{epoch+1}/{args.n_epoch}], Loss: {running_loss/len(data_loader)}")

    
# Model saving

local_model_path = "unet_model.pt"
tch.save(unet_model.state_dict(), local_model_path)

object_path = 'model/' + local_model_path
blob = bucket.blob(object_path)
blob.upload_from_filename(local_model_path)

Writing task_unet_model.py


In [10]:
JOB_NAME = "custom_job_unet"
container_uri = "us-central1-docker.pkg.dev/semantic-segmentation-on-kitti/unet-model-train-job/unet-model:tag1"

## Training the model

### Training Pipeline

In [11]:
job = aiplatform.CustomContainerTrainingJob(
    display_name=JOB_NAME,
    container_uri=container_uri,
)

In [12]:
MODEL_DISPLAY_NAME = "unet_model"
args = ["--epochs", "1"]

In [13]:
model = job.run(
    model_display_name=MODEL_DISPLAY_NAME,
    args=args,
    # tensorboard=tensorboard_resource_name,
)

model.wait()

print(model.display_name)
print(model.uri)

RuntimeError: model_display_name was provided but
                model_serving_container_image_uri was not provided when this
                custom pipeline was constructed.
                

In [16]:
DEPLOYED_NAME = "penguins_deployed_unique"

endpoint = model.deploy(deployed_model_display_name=DEPLOYED_NAME)

Creating Endpoint
Create Endpoint backing LRO: projects/358435915614/locations/us-central1/endpoints/4200646790522863616/operations/37557599217909760
Endpoint created. Resource name: projects/358435915614/locations/us-central1/endpoints/4200646790522863616
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/358435915614/locations/us-central1/endpoints/4200646790522863616')
Deploying model to Endpoint : projects/358435915614/locations/us-central1/endpoints/4200646790522863616
Using default machine_type: n1-standard-2
Deploy Endpoint model backing LRO: projects/358435915614/locations/us-central1/endpoints/4200646790522863616/operations/2952512458033463296
Endpoint model deployed. Resource name: projects/358435915614/locations/us-central1/endpoints/4200646790522863616
