# Fully convolutional networks for image segmentation

### This notebook is part of a final bachelor's thesis for an academic degree.

## 1. Set configuration


### Train configuration

Define primary values to train and test our model.

In [1538]:
# --- Define values to execute a SINGLE EXPERIMENT ---
single_experiment = True
architecture_name = 'FCN-8'
#Accepted values: FCN-8, UNET
model_name= 'VGG-16'
#Accepted values: VGG-16, Resnet-50, MobileNet
#--------------------------------------------------------------

n_classes = 3
image_size = 224
# Accepted values: 224, 256, 384, 512

# Training settings for each supported image size.
# 224, 256 and 384 share the same settings; 512 uses smaller batches and more steps.
TRAIN_CONFIGURATIONS = {
    224: dict(batch_size=40, steps_per_epoch=28,
              val_batch_size=70, val_steps_per_epoch=2, epochs=100),
    256: dict(batch_size=40, steps_per_epoch=28,
              val_batch_size=70, val_steps_per_epoch=2, epochs=100),
    384: dict(batch_size=40, steps_per_epoch=28,
              val_batch_size=70, val_steps_per_epoch=2, epochs=100),
    512: dict(batch_size=28, steps_per_epoch=40,
              val_batch_size=28, val_steps_per_epoch=5, epochs=100),
}

# Small settings used for any other image size, meant for a quick smoke test.
QUICK_TEST_CONFIGURATION = dict(batch_size=10, steps_per_epoch=5,
                                val_batch_size=10, val_steps_per_epoch=5, epochs=25)


def get_train_configuration(size):
    """Return the training settings and the log message for an image size.

    Args:
        size: Side length of the square input images.

    Returns:
        A ``(settings, message)`` tuple. ``settings`` is a new dict with the
        keys ``batch_size``, ``steps_per_epoch``, ``val_batch_size``,
        ``val_steps_per_epoch`` and ``epochs``. ``message`` names the size that
        was actually loaded (previously 256 was reported as 224x224), or the
        quick-test message when the size is not in the table.
    """
    if size in TRAIN_CONFIGURATIONS:
        return (dict(TRAIN_CONFIGURATIONS[size]),
                'Loaded {0}x{0} configuration'.format(size))
    return dict(QUICK_TEST_CONFIGURATION), 'Loaded default configuration to quick test'


train_configuration, configuration_message = get_train_configuration(image_size)

# Keep the flat variable names: the following cells read them directly.
batch_size = train_configuration['batch_size']
steps_per_epoch = train_configuration['steps_per_epoch']
val_batch_size = train_configuration['val_batch_size']
val_steps_per_epoch = train_configuration['val_steps_per_epoch']
epochs = train_configuration['epochs']
print(configuration_message)


Loaded 224x224 configuration


### Connection & execution params
Define connection and execution parameters.

In [1539]:
# Name of the local script folder; also used as the prefix of the experiment name
experiment_folder_name = "TFG"

# Prefix of the input dataset names.
# Dataset names follow "<size>_<step>_<type>", for example "384_train_img" or "384_train_ann"
storage_iput_prefix = str(image_size)

# experiment_name - example: 'TFG-384x384'
experiment_name = '{0}-{1}x{1}'.format(
    experiment_folder_name, storage_iput_prefix)

# Name of the compute cluster used to run the experiment
gpu_instance = "k80-lp"

# Azure workspace identifiers
# NOTE(review): these are hard-coded in the notebook; consider reading them
# from environment variables or config.json before sharing it.
subscription_id = '628b0476-bda5-4357-ae22-3c9caf0a760b'
resource_group = 'azure-ml-resource'
workspace_name = 'tfg-workspace'


## 2. Create data & environment 

### Import packages

Import Python packages you need in this session. Also display the Azure Machine Learning SDK version.

In [1540]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import azureml.core
from azureml.core import Workspace

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.6.0


### Connect to workspace

Create a workspace object from the existing workspace. `Workspace.from_config()` reads the file **config.json** and loads the details into an object named `ws`.

In [1541]:
# Load the workspace configuration from the config.json file in the current folder
# and print its name, location and resource group as a quick sanity check.
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep='\t')

tfg-workspace	westus2	azure-ml-resource


### Create experiment

Create an experiment to track the runs in your workspace. A workspace can have multiple experiments. 

In [1542]:
from azureml.core import Experiment
# Experiment that groups every run submitted from this notebook; its name is
# built from the folder name and the image size (see `experiment_name`).
exp = Experiment(workspace=ws, name=experiment_name)

### Create or Attach existing compute resource
By using Azure Machine Learning Compute, a managed service, data scientists can train machine learning models on clusters of Azure virtual machines. Examples include VMs with GPU support. In this tutorial, you create Azure Machine Learning Compute as your training environment. You will submit Python code to run on this VM later in the tutorial. 
The code below creates the compute clusters for you if they don't already exist in your workspace.

**Creation of compute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace the code will skip the creation process.

In [1543]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings; each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", gpu_instance)
# Environment variables are always strings, so cast the node counts to int
# before handing them to the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1))

# STANDARD_NC6 is the GPU VM size used for training
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6")

if compute_name in ws.compute_targets:
    # Reuse the existing target instead of provisioning a new one
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print("found compute target: " + compute_name)
    else:
        # Do not fail here (the target may still be usable), but make the
        # unexpected type visible instead of continuing silently.
        print("warning: compute target " + compute_name +
              " exists but is not an AmlCompute cluster")
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())


found compute target: k80-lp


### Download TFG dataset

In [1544]:
from azureml.core import Workspace, Dataset

# Workspace handle built from the explicit identifiers defined in the
# configuration cell (the `ws` handle from config.json is used for registration).
workspace = Workspace(subscription_id, resource_group, workspace_name)

_overwrite = True
load_data_in_local = False

# Generic dataset names used by the rest of the notebook, "<type>_<step>".
dataset_names = ['img_train', 'ann_train',
                 'img_val', 'ann_val',
                 'img_test', 'ann_test']

# Fetch the size-specific datasets first so that BOTH branches below can use
# them (the local branch used to reference them before they were assigned).
# 'img_train' is stored as '<size>_train_img', e.g. '384_train_img'.
datasets = {}
for dataset_name in dataset_names:
    data_type, step = dataset_name.split('_')
    datasets[dataset_name] = Dataset.get_by_name(
        workspace, name='{0}_{1}_{2}'.format(storage_iput_prefix, step, data_type))

if load_data_in_local:
    # Download every dataset into ./dataset/<size>x<size>/<dataset name>.
    # The folder follows the selected image size instead of a fixed 384x384.
    data_folder = os.path.join(
        os.getcwd(), 'dataset', '{0}x{0}'.format(storage_iput_prefix))
    os.makedirs(data_folder, exist_ok=True)
    for dataset_name in dataset_names:
        datasets[dataset_name].download(
            target_path=os.path.join(data_folder, dataset_name),
            overwrite=_overwrite)
    print('Loaded data in local')

else:
    # Re-register each size-specific dataset under its generic name, creating
    # a new version, so the experiment always reads the same dataset names.
    for dataset_name in dataset_names:
        datasets[dataset_name] = datasets[dataset_name].register(
            workspace=ws,
            name=dataset_name,
            description=dataset_name,
            create_new_version=True)
    print('Merged specified data into generic experiment dataset container')

# Keep the flat variable names: the script-params cell reads them directly.
img_train = datasets['img_train']
ann_train = datasets['ann_train']
img_val = datasets['img_val']
ann_val = datasets['ann_val']
img_test = datasets['img_test']
ann_test = datasets['ann_test']


Merged specified data into generic experiment dataset container


In [1545]:
import os
# Local folder that receives the generated scripts (train.py, utils.py, ...);
# it is created under the current working directory if it does not exist yet.
script_folder = os.path.join(os.getcwd(), experiment_folder_name)
os.makedirs(script_folder, exist_ok=True)

### Create a training script

To submit the job to the cluster, first create a training script. Run the following code to create the training script called `train.py` in the directory you just created. 

In [1546]:
%%writefile $script_folder/train.py

import numpy as np
import os
import joblib
import utils as my_utils
from azureml.core import Run
from keras_segmentation.models.unet import *
from keras_segmentation.models.fcn import *
from keras_segmentation.data_utils.data_loader import image_segmentation_generator
from keras_segmentation.predict import evaluate


# get hold of the current run
run = Run.get_context()

# get experiment inputs
args = my_utils.azure_get_experiment_inputs()

architecture_name = args.architecture_name
model_name = args.model_name

n_classes = args.n_classes
img_train = args.img_train
ann_train = args.ann_train
img_val = args.img_val
ann_val = args.ann_val
img_test = args.img_test
ann_test = args.ann_test
batch_size = args.batch_size
steps_per_epoch = args.steps_per_epoch
val_batch_size = args.val_batch_size
val_steps_per_epoch = args.val_steps_per_epoch
epochs = args.epochs
image_size = args.image_size
optimizer_name = args.optimizer_name

# define path to store checkpoints
os.makedirs('outputs/checkpoint', exist_ok=True)
checkpoint_path = 'outputs/checkpoint/'+model_name+'_1'


def build_model(architecture_name, model_name, n_classes, image_size):
    """Build the segmentation model for an architecture / backbone pair.

    Args:
        architecture_name: 'FCN-8' or 'UNET'.
        model_name: 'VGG-16', 'Resnet-50' or 'MobileNet'.
        n_classes: Number of segmentation classes.
        image_size: Side length of the square input images.

    Returns:
        The model returned by the matching keras_segmentation builder.

    Raises:
        ValueError: If the architecture is unknown, or if the model name is
            not available for that architecture.
    """
    if architecture_name == 'FCN-8':
        if model_name == 'VGG-16':
            builder = fcn_8_vgg
        elif model_name == 'Resnet-50':
            builder = fcn_8_resnet50
        elif model_name == 'MobileNet':
            builder = fcn_8_mobilenet
        else:
            # Format the message itself: calling .format() on the exception
            # object raised AttributeError and hid the real problem.
            raise ValueError('Sorry, architecture name: {} no contains model name: {}'.format(
                architecture_name, model_name))
    elif architecture_name == 'UNET':
        if model_name == 'VGG-16':
            builder = vgg_unet
        elif model_name == 'Resnet-50':
            builder = resnet50_unet
        elif model_name == 'MobileNet':
            builder = mobilenet_unet
        else:
            raise ValueError('Sorry, architecture name: {} no contains model name: {}'.format(
                architecture_name, model_name))
    else:
        raise ValueError('Sorry, architecture name: {} not found'.format(
            architecture_name))

    return builder(n_classes=n_classes,
                   input_height=image_size, input_width=image_size)


# TRAIN
model = build_model(architecture_name, model_name, n_classes, image_size)

print('Selected train model is '+model_name +' & architecture is '+architecture_name)
model.summary()

model.train(
    train_images=img_train,
    train_annotations=ann_train,
    val_images=img_val,
    val_annotations=ann_val,
    checkpoints_path=checkpoint_path,
    validate=True,
    batch_size=batch_size,
    val_batch_size=val_batch_size,
    steps_per_epoch=steps_per_epoch,
    val_steps_per_epoch=val_steps_per_epoch,
    epochs=epochs,
    optimizer_name=optimizer_name
)

# Archive the checkpoint folder(s) written under ./outputs
my_utils.save_checkpoints_in_zip()

# Log the training history (raw values and plots) to the run
my_utils.azure_log_train_accuracy(model, run)
my_utils.azure_log_plot_train_accuracy(model, run)


# TEST
evaluation = evaluate(model=model, inp_images_dir=img_test,
                      annotations_dir=ann_test)

my_utils.azure_log_test_data(evaluation, run)

# Save model, the outputs folder is automatically uploaded into experiment record by AML Compute
model.save('./outputs/model.h5')
# NOTE(review): pickling a Keras model with joblib may not work with every
# Keras version - confirm; the HDF5 file saved above is the portable artifact.
joblib.dump(value=model, filename='outputs/execution_model.pkl')
# register model 
model_save = run.register_model(model_name='execution_'+model_name+'_'+architecture_name+'_'+str(image_size), model_path='outputs/execution_model.pkl')
print(model_save.name, model_save.id, model_save.version, sep='\t')


Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/basic-cpu/code/users/histonzy/TFG/train.py


### Create a utils script

Create a utils script, `utils.py`, with helper functions used by the training script.

In [1547]:
%%writefile $script_folder/utils.py

import glob
import joblib
import zipfile
import os
import argparse
import numpy as np
import matplotlib.pyplot as plt

def azure_get_experiment_inputs():
    """Parse the experiment inputs passed to the script on the command line.

    Returns:
        argparse.Namespace with one attribute per option: the six dataset
        locations, the architecture/model names and the training settings.
        Options that are not supplied take the defaults listed below
        (``None`` where no default is given).
    """
    # (option name, value type, default, help text)
    option_table = [
        # dataset options
        ('img_train', str, None, 'img_train'),
        ('ann_train', str, None, 'ann_train'),
        ('img_val', str, None, 'img_val'),
        ('ann_val', str, None, 'ann_val'),
        ('img_test', str, None, 'img_test'),
        ('ann_test', str, None, 'ann_test'),
        # training options
        ('architecture_name', str, None, 'architecture_name'),
        ('model_name', str, None, 'model_name'),
        ('n_classes', int, 3, 'n_classes'),
        ('batch_size', int, 10, 'batch size'),
        ('steps_per_epoch', int, 1, 'steps per epoch'),
        ('val_batch_size', int, 10, 'val batch size'),
        ('val_steps_per_epoch', int, 1, 'val steps per epoch'),
        ('epochs', int, 5, 'epochs'),
        ('image_size', int, 224, 'image size'),
        ('optimizer_name', str, "adadelta", 'optimizer name'),
    ]

    cli = argparse.ArgumentParser()
    for option, value_type, fallback, description in option_table:
        cli.add_argument('--' + option, type=value_type, dest=option,
                         default=fallback, help=description)

    return cli.parse_args()


def save_checkpoints_in_zip():
    """Create one ZIP archive per sub-folder of ``./outputs``.

    For every directory ``outputs/<name>`` an archive ``outputs/<name>.zip``
    is written next to it, containing all files found below that directory.
    Entries are stored with their path relative to the archived folder, so
    files with the same name in different sub-folders do not collide.
    Plain files directly inside ``outputs`` are skipped (they used to produce
    empty ``<file>.zip`` archives).

    Raises:
        OSError: If the ``outputs`` folder does not exist or cannot be read.
    """
    outputs_dir = os.path.join(os.getcwd(), 'outputs')
    outputs_dir = os.path.abspath(os.path.normpath(os.path.expanduser(outputs_dir)))

    for entry in os.listdir(outputs_dir):
        folder_path = os.path.join(outputs_dir, entry)
        if not os.path.isdir(folder_path):
            # Only folders are archived; skip models, logs and earlier zips
            continue

        # The context manager closes the archive even if a write fails
        with zipfile.ZipFile('{0}.zip'.format(folder_path), 'w',
                             zipfile.ZIP_DEFLATED) as archive:
            for root, _dirs, files in os.walk(folder_path):
                for filename in files:
                    file_path = os.path.abspath(os.path.join(root, filename))
                    archive.write(
                        file_path,
                        arcname=os.path.relpath(file_path, folder_path))

def azure_log_train_accuracy(model, run, chunk_size=25):
    """Log the training history of ``model`` to the run as backup metrics.

    Each history series is logged with ``run.log_list`` in consecutive chunks
    of ``chunk_size`` values under the same metric name. The chunks follow the
    real number of recorded epochs, so short runs no longer log empty lists
    and runs longer than 100 epochs no longer get one oversized last chunk.
    With 100 epochs the calls are the same as with the former fixed slices.

    Args:
        model: Trained model; ``model.history.history`` must contain the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        run: Run object providing ``log_list`` and ``log``.
        chunk_size: Maximum number of values per ``log_list`` call.

    Raises:
        KeyError: If one of the history keys is missing.
        IndexError: If the 'val_loss' history is empty.
    """
    history = model.history.history

    # (metric name on the run, history key, description)
    series = [
        ('Train accuracy -backup', 'accuracy', 'TRAIN ACCURACY'),
        ('Validation accuracy -backup', 'val_accuracy', 'VALIDATION ACCURACY'),
        ('Train loss -backup', 'loss', 'TRAIN LOSS'),
        ('Validation loss -backup', 'val_loss', 'VALIDATION LOSS'),
    ]

    for metric_name, history_key, description in series:
        values = history[history_key]
        for start in range(0, len(values), chunk_size):
            run.log_list(metric_name, values[start:start + chunk_size],
                         description=description)

    # Scalar summary: validation loss of the last epoch
    run.log("final_val_loss", history["val_loss"][-1])


def azure_log_plot_train_accuracy(model, run):
    """Plot the training history and log both figures to the run.

    Figure 0 shows training vs validation accuracy, figure 1 shows training
    vs validation loss. The x-axis ticks follow the number of recorded epochs
    instead of a fixed 0..100 range; for a 100-epoch run the ticks are the
    same as before (every 5 epochs from 0 to 100).

    Args:
        model: Trained model; ``model.history.history`` must contain the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        run: Run object providing ``log_image``.
    """
    history = model.history.history

    # One tick every 5 epochs up to 100 epochs; wider steps for longer runs
    # so the axis never carries more than ~20 labels.
    epoch_count = len(history['accuracy'])
    tick_step = max(5, epoch_count // 20)
    epoch_ticks = np.arange(0, epoch_count + 1, tick_step)

    plt.figure(0)
    plt.plot(history['accuracy'], 'b', label="accuracy")
    plt.plot(history['val_accuracy'], 'g', label="val_accuracy")
    plt.xticks(epoch_ticks)
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xlabel("Num of epochs")
    plt.ylabel("Accuracy")
    plt.title("Training accuracy vs Validation accuracy")
    plt.legend(['train', 'validation'])
    plt.grid(True)
    run.log_image("accuracy vs val_accuracy", plot=plt)

    plt.figure(1)
    plt.plot(history['loss'], 'b')
    plt.plot(history['val_loss'], 'g')
    plt.xticks(epoch_ticks)
    # y ticks are left to matplotlib: the loss range is not known in advance
    plt.xlabel("Num of epochs")
    plt.ylabel("Loss")
    plt.title("Training loss vs Validation loss")
    plt.legend(['train', 'validation'])
    plt.grid(True)
    run.log_image("training loss vs validation loss", plot=plt)


def azure_log_test_data(evaluation, run):
    """Log the test-set evaluation metrics to the run.

    Args:
        evaluation: Mapping with the keys 'frequency_weighted_IU', 'mean_IU'
            and 'class_wise_IU' (a sequence with at least three entries,
            logged as ground, tree and grass in that order).
        run: Run object providing ``log`` and ``log_list``.
    """
    # Global scores
    for metric in ('frequency_weighted_IU', 'mean_IU'):
        run.log(metric, evaluation[metric])

    # Per-class scores; the metric names keep their trailing space
    per_class_names = ('Ground mean_IoU ', 'Tree mean_IoU ', 'Grass mean_IoU ')
    for class_index, metric in enumerate(per_class_names):
        run.log(metric, evaluation['class_wise_IU'][class_index])

    run.log_list('class_wise_IU', evaluation['class_wise_IU'])


Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/basic-cpu/code/users/histonzy/TFG/utils.py


In [1548]:
%%writefile $script_folder/evaluation.py

import joblib
from keras_segmentation.predict import evaluate


Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/basic-cpu/code/users/histonzy/TFG/evaluation.py


### Define script params

Next, we define the script parameters that pass the training inputs defined above to `train.py`.

In [1549]:
# Command-line arguments handed to train.py; the keys match the options
# declared in utils.azure_get_experiment_inputs().
# The six datasets are passed as mounted named inputs, the rest as plain values.
# '--optimizer_name' is not set here, so train.py falls back to the parser default.
script_params = {
    '--n_classes': n_classes,
    '--architecture_name': architecture_name,
    '--model_name': model_name,
    '--img_train': img_train.as_named_input('img_train').as_mount(),
    '--ann_train': ann_train.as_named_input('ann_train').as_mount(),
    '--img_val': img_val.as_named_input('img_val').as_mount(),
    '--ann_val': ann_val.as_named_input('ann_val').as_mount(),
    '--img_test': img_test.as_named_input('img_test').as_mount(),
    '--ann_test': ann_test.as_named_input('ann_test').as_mount(),
    '--batch_size': batch_size,
    '--val_batch_size': val_batch_size,
    '--steps_per_epoch': steps_per_epoch,
    '--val_steps_per_epoch': val_steps_per_epoch,
    '--epochs': epochs,
    '--image_size': image_size
}


### Define Tensorflow estimator & add dependencies

Define a Tensorflow context to execute our experiment (`train.py`) over GPU using train input specifications (`script_params`) and defined dependencies.

In [1550]:
from azureml.train.dnn import TensorFlow
# Estimator that runs train.py from the script folder on the compute target,
# with GPU support enabled and the script params defined above.
# NOTE(review): the pip packages are listed without version pins, so the
# environment is not reproducible across runs - consider pinning them.
est = TensorFlow(source_directory=script_folder,
                 entry_script='train.py',
                 script_params=script_params,
                 compute_target=compute_target,
                 pip_packages=[
                     "keras",
                     "imageio",
                     "imgaug",
                     "opencv-python",
                     "tqdm",
                     "joblib",
                     "matplotlib"],
                 use_gpu=True)




### Execute individual experiment

In [1551]:
# Submit one run with the values from the configuration cell; skipped when
# only the hyperparameter search below should be executed.
if(single_experiment):
    run = exp.submit(est)


## 3. Improve experiment executions

###  Hyperparameter tuning

Define the hyperparameter search space to find which combination gives the best performance.
In this case, it is used to run a sequence of experiments, using a different model for each iteration.

Also, we can use `Grid Parameter Sampling` to find the best `batch_size` for each model and architecture.

In [None]:
from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive import choice

# True: exhaustive grid over batch size x architecture x model.
# False: random sampling over architecture x model only.
find_best_batch_size = True

if (find_best_batch_size):
    ps = GridParameterSampling({
        '--batch_size': choice(35,28,20,16,14,10,8,7,5,4),
        '--architecture_name': choice('FCN-8', 'UNET'),
        '--model_name': choice('Resnet-50', 'VGG-16'),
        #   560,280,140,112,80,70,56,40,35,28,20,16,14,10,8,7,5,4
    })
else:
    ps = RandomParameterSampling(
        {
            # 'MobileNet' is a model (backbone) name: train.py only accepts
            # 'FCN-8' and 'UNET' as architectures, so it belongs here.
            '--model_name': choice('VGG-16', 'Resnet-50', 'MobileNet'),
            '--architecture_name': choice('FCN-8', 'UNET')
            # '--optimizer_name': choice("SGD", "RMSprop", "Adam", "Adadelta", "Adagrad", "Adamax", "Nadam")
        }
    )


### Stopper policies

Define an early-termination (stopping) policy to reduce execution time and cost.

`Median stopping` is an early termination policy based on running averages of primary metrics reported by the runs. This policy computes running averages across all training runs and terminates runs whose performance is worse than the median of the running averages.

In [None]:
from azureml.train.hyperdrive import BanditPolicy, MedianStoppingPolicy

# Alternative policy kept for reference:
#policy = BanditPolicy(evaluation_interval=50, slack_factor=0.1)
# Median stopping, applied at every evaluation interval once the first
# 20 intervals have passed (delay_evaluation).
policy = MedianStoppingPolicy(evaluation_interval=1, delay_evaluation=20)



## 4. Release and validate result

### Execute experiment

Finally, we define a HyperDrive configuration and submit it.

In [None]:
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

# HyperDrive configuration: runs the estimator once per sampled parameter
# combination, one run at a time, and maximizes the primary metric.
# NOTE(review): the primary metric name must match a metric logged by the
# training run. The utils.py written by this notebook only logs
# '... -backup' lists, 'final_val_loss' and the IoU metrics - confirm that
# 'Validation accuracy' is logged elsewhere (e.g. by the training library).
hdc = HyperDriveConfig(estimator=est,
                       hyperparameter_sampling=ps, 
                       policy=policy,
                       primary_metric_name='Validation accuracy', 
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                       max_total_runs=200,
                       max_concurrent_runs=1)


# Submit the hyperparameter search to the experiment
hdr = exp.submit(config=hdc)

### Display run results

You now have a model trained on a remote cluster.  Retrieve all the metrics logged during the run, including the accuracy of the model:

In [1285]:
from azureml.widgets import RunDetails
# Show the live widget for the HyperDrive run submitted above
RunDetails(hdr).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

### Check best result and best params combination

In [None]:
# Block until the hyperparameter search has finished
hdr.wait_for_completion(show_output=True)
assert(hdr.get_status() == "Completed")

# The best run is None when no child run logged the primary metric; fail with
# a clear message instead of an AttributeError on the next line.
best_run = hdr.get_best_run_by_primary_metric()
if best_run is None:
    raise RuntimeError(
        'No child run reported the primary metric; '
        'check that the training script logs it under the configured name')

# Command-line arguments (hyperparameters) of the best run
print(best_run.get_details()['runDefinition']['arguments'])