# 03 - Convolutional Neural Network Model Training Notebook
Author: George Gorospe, george.gorospe@nmaia.net (updated 1/19/2024)

# In this third notebook, we'll use the data we previously collected to train a Convolutional Neural Network (CNN).

Training a neural network results in a machine learning model. In this case the resulting model will serve as our pilot for our self-driving car.

In [1]:
# Importing required libraries

### Machine Learning Libraries
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from torchvision.transforms import Compose, ToTensor, Resize
from torch.utils.data import DataLoader

# Custom dataset object library
from xy_dataset import XYDataset

# General Libraries 
import cv2, glob, os, fnmatch
from datetime import datetime
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from ipyfilechooser import FileChooser
from matplotlib.patches import Circle


# Jupyter Laboratory Libraries
import ipywidgets
import traitlets
from IPython.display import display

# Nvidia library for images
from jetcam.utils import bgr8_to_jpeg


### Selecting a Dataset for Training
Use the following folder chooser to select the folder where your dataset is located.

In [2]:
# Build the folder picker rooted at the datasets directory and render it
fc = FileChooser('/home/racer_core/Datasets')
display(fc)

# Title displayed on the widget (use '' to hide)
fc.title = '<b>Choose Dataset for Training</b>'
# List directories only
fc.show_only_dirs = True


def change_title(chooser):
    """Callback for the chooser: replace its title with a 'Directory Selected.' notice."""
    chooser.title = '<b>Directory Selected.</b>'


# Hand the callback to the chooser
fc.register_callback(change_title)

FileChooser(path='/home/racer_core/Datasets', filename='', title='', show_hidden=False, select_desc='Select', …

In [3]:
# Inspecting Dataset

# Output from file chooser (None until a folder has been picked in the widget above)
DATASET_DIR = fc.selected_path
if DATASET_DIR is None:
    raise ValueError("No dataset directory selected. Pick a folder in the chooser above, then re-run this cell.")

# normpath drops any trailing slash, so the folder name is never an empty string
dataset_folder_name = os.path.basename(os.path.normpath(DATASET_DIR))

# Information about the dataset, number of data points and a listing of the data points.
num_files = len(glob.glob(os.path.join(DATASET_DIR, '*.jpg')))
file_list = fnmatch.filter(os.listdir(DATASET_DIR), '*.jpg')
if num_files > 0:
    print("Dataset found!")
    print("Number of files found in datadset: " + str(num_files))
else:
    print("No data in selected directory, choose again?")

Dataset found!
Number of files found in datadset: 5012


In [4]:
# Build the dataset object (per the original note, XYDataset parses the file names
# to get the labels for each data point)

# Image transforms applied before training to promote robust performance.
# Note: the commented-out entries are examples of transforms that could be used in the future.
TRANSFORMS = transforms.Compose([
    #transforms.ColorJitter(0.2, 0.2, 0.2, 0.2),  # Color Jitter #1
    #transforms.ColorJitter(brightness=1.0, hue=.3), # Color Jitter #2
    #transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)), # Gaussian Blur #1
    transforms.GaussianBlur(kernel_size=7, sigma=0.8),  # Gaussian Blur #2

    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

Sample_Dataset = XYDataset(DATASET_DIR, TRANSFORMS)

Dataset directory exists.
Number of files in datadset: 5012


In [5]:
# Using sklearn to split dataset into training and evaluation subsets

def train_val_dataset(dataset, val_split=0.25, random_state=None):
    """Split a dataset into a training subset and an evaluation subset.

    Args:
        dataset: Object supporting len(); its indices are split and wrapped in Subset.
        val_split: Forwarded to sklearn's train_test_split as test_size.
        random_state: Optional seed forwarded to train_test_split so the same split
            can be reproduced on a re-run. The default (None) keeps the split random.

    Returns:
        dict with the keys 'train' and 'evaluate', each holding a Subset of dataset.
    """
    all_indices = list(range(len(dataset)))
    train_idx, val_idx = train_test_split(
        all_indices, test_size=val_split, random_state=random_state
    )
    return {
        'train': Subset(dataset, train_idx),
        'evaluate': Subset(dataset, val_idx),
    }

# The returned dictionary holds both the 'train' and the 'evaluate' subsets
datasets = train_val_dataset(Sample_Dataset)
print(
    "Number of Data Points in Training Dataset: " + str(len(datasets['train'])),
    "Number of Data Points in Evaluate Dataset: " + str(len(datasets['evaluate'])),
    sep="\n",
)

Number of Data Points in Training Dataset: 3759
Number of Data Points in Evaluate Dataset: 1253


In [6]:
# Wrap both subsets ('train' and 'evaluate') in DataLoaders.
# A DataLoader hands the data to the training algorithm on request,
# in minibatches of 64 data points, reshuffled for each epoch.
train_dataloader = DataLoader(dataset=datasets['train'], shuffle=True, batch_size=64)
test_dataloader = DataLoader(dataset=datasets['evaluate'], shuffle=True, batch_size=64)

In [None]:
# Plotting an example of the data from the train loader

# Create a figure for both images (raw on the left, transformed on the right)
fig = plt.figure(figsize=(10, 10))

# Get a batch of images, annotations and labels from the train dataloader
train_image, ann, train_labels = next(iter(train_dataloader))

# Getting the label for the first image of the batch.
# The formula below maps a label in [-1, 1] onto a pixel column of the
# 224-pixel-wide transformed image.
x_normalized = float(train_labels[0].numpy()[0])
x = int(224 * (x_normalized / 2.0 + 0.5))

# Reading the raw image from file
file_path = ann['image_path'][0]
print(f"Selected File: {file_path}")
print(f"Label value (x): {x}")
img = cv2.imread(file_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read image file: {file_path}")
# OpenCV loads images in BGR channel order, matplotlib expects RGB
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# The raw image is not necessarily 224x224, so place the marker relative to its own size
raw_height, raw_width = img.shape[:2]
raw_x = int(raw_width * (x_normalized / 2.0 + 0.5))

# Plotting the raw image w/ label
ax = fig.add_subplot(1, 2, 1)
circ = Circle((raw_x, raw_height // 2), 15)
ax.add_patch(circ)
ax.imshow(img)

# Getting the transformed image from the dataloader and moving the channel axis
# last (C, H, W -> H, W, C) as matplotlib expects.
# NOTE: the image is still normalized by TRANSFORMS, so matplotlib clips values outside [0, 1].
train_image = train_image.numpy()[0]
train_image = np.moveaxis(train_image, 0, -1)

# Plotting the transformed image w/ label
ax = fig.add_subplot(1, 2, 2)
circ = Circle((x, 112), 15)
ax.add_patch(circ)
ax.imshow(train_image)

## Training the CNN on the Selected Dataset
Next, we'll setup the training algorithm for our machine learning model.
As we prepare to train our model we need to make choices about the way we'll train it.
These choices can impact how long it takes to train the model and the overall accuracy of the model.

The user-set parameters of the training algorithm are often called "hyperparameters".
You can set your hyperparameters below; make sure to keep track of which settings you used for each training run!

In [None]:
# Set Training Hyper Parameters:

########## [ACTION REQUIRED] Set name for new machine learning model #################
model_file_name = "large_model2.pth"
training_notes = "A study of large models, resnet50 w/ frozen layers."


# Number of training epochs:
epochs = 2

# Data loader batch size:
# NOTE(review): the DataLoaders created earlier in this notebook hard-code batch_size=64,
# so changing this value does not change the batch size actually used for training.
BATCH_SIZE = 128

# Model Output
output_dim = 2

# Set to False to train every layer instead of only the new output layer
FREEZE_PRETRAINED_LAYERS = True

######### Select a Machine learning model structure (Neural Network) ###########
######### Uncomment every line of ONE option: the model, its new output layer
######### ("output_layer") and the line that installs that layer into the model.

# Resnet 18
#model = torchvision.models.resnet18(pretrained=True)
#output_layer = torch.nn.Linear(512, output_dim)
#model.fc = output_layer

# Resnet 34
#model = torchvision.models.resnet34(pretrained=True)
#output_layer = torch.nn.Linear(512, output_dim)
#model.fc = output_layer

# Resnet 50
#model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")
#output_layer = torch.nn.Linear(2048, output_dim)
#model.fc = output_layer

# MobileNet V2
# MobileNetV2 has no "fc" attribute: its output layer is the last entry of "classifier".
# Assigning model.fc would leave the original 1000-class output in place.
model = torchvision.models.mobilenet_v2(pretrained=True)
output_layer = torch.nn.Linear(model.classifier[-1].in_features, output_dim)
model.classifier[-1] = output_layer

# MobileNet V3
#model = torchvision.models.mobilenet_v3_large(pretrained=True)
#output_layer = torch.nn.Linear(model.classifier[-1].in_features, output_dim)
#model.classifier[-1] = output_layer

# ALEXNET
# model = torchvision.models.alexnet(pretrained=True)
# output_layer = torch.nn.Linear(4096, output_dim)
# model.classifier[-1] = output_layer

# SQUEEZENET
# model = torchvision.models.squeezenet1_1(pretrained=True)
# output_layer = torch.nn.Conv2d(512, output_dim, kernel_size=1)
# model.classifier[1] = output_layer
# model.num_classes = output_dim

# DENSENET 121
# model = torchvision.models.densenet121(pretrained=True)
# output_layer = torch.nn.Linear(model.classifier.in_features, output_dim)
# model.classifier = output_layer


# Train fewer of the layers: freeze the pretrained weights, then re-enable gradients
# for the new output layer. Freezing every parameter (the new layer included) would
# leave nothing to train and make loss.backward() fail.
if FREEZE_PRETRAINED_LAYERS:
    for param in model.parameters():
        param.requires_grad = False
    for param in output_layer.parameters():
        param.requires_grad = True

#Loading a GPU if available and otherwise a CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Model optimizer: only handed the parameters that are still trainable
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad])

In [None]:
def training_and_evaluation(epochs):
    """Train the notebook-level model, evaluate it after every epoch and log the run.

    Uses these notebook-level names: model, optimizer, device, train_dataloader and
    test_dataloader for the training itself, and model_file_name, dataset_folder_name,
    num_files, training_notes and TRANSFORMS for the training log entry.

    Args:
        epochs: Number of passes over the training data (int).

    Returns:
        The trained model (the same notebook-level model object).
    """
    # Training Timing
    start_time = datetime.now()

    # Stays NaN when no evaluation could be computed (zero epochs or empty evaluate set)
    final_evaluation_mse = float("nan")

    print("Starting training process ...")
    for epoch in range(1, epochs + 1):
        print("######### Epoch: " + str(epoch) + " #########")

        ############# Training Loop ##############
        # Switch back to train mode every epoch: the evaluation phase below
        # leaves the model in eval mode.
        model.train()
        samples_seen = 0
        sum_loss = 0.0

        # Process each batch of data points in the train loader
        for images, category_idx, xy in train_dataloader:
            # send data to device
            images = images.to(device)
            xy = xy.to(device)

            # zero gradients of parameters
            optimizer.zero_grad()

            # execute model to get outputs
            outputs = model(images)

            # compute MSE loss over the x, y coordinates
            loss = torch.mean((outputs - xy) ** 2)

            # run backpropagation to accumulate gradients
            loss.backward()

            # step optimizer to adjust parameters
            optimizer.step()

            # The loss is a per-batch mean, so weight it by the batch size before
            # dividing by the number of samples seen (no accuracy for regression, only loss).
            batch_count = len(xy)
            samples_seen += batch_count
            sum_loss += float(loss) * batch_count

            print("Mean Square Error (MSE): " + str(sum_loss / samples_seen))

        ############# Evaluation Loop ##############
        model.eval()
        eval_samples = 0
        evaluation_loss = 0.0
        # No gradients are needed for evaluation
        with torch.no_grad():
            for images, category_idx, xy in test_dataloader:
                # send data to device
                images = images.to(device)
                xy = xy.to(device)

                # execute model to get outputs
                outputs = model(images)

                # compute MSE loss over the x, y coordinates
                loss = torch.mean((outputs - xy) ** 2)
                batch_count = len(xy)
                eval_samples += batch_count
                evaluation_loss += float(loss) * batch_count

        if eval_samples > 0:
            final_evaluation_mse = evaluation_loss / eval_samples
        else:
            final_evaluation_mse = float("nan")
        print(f"Validation Test Mean Square Error (MSE): {final_evaluation_mse}")

    end_time = datetime.now()

    # get the execution time
    elapsed_time = end_time - start_time
    training_duration_time_formatted = str(elapsed_time)
    print('Execution time:', training_duration_time_formatted)

    # Writing training details to training log ("with" closes the file even on error)
    dt_string = start_time.strftime("%m/%d/%Y %H:%M:%S")
    with open("/home/racer_core/Models/training_log.txt", "a") as f:
        f.write("\n")
        f.write(f"Training Report: {dt_string} \n")
        f.write(f"Output Model File: {model_file_name}\n")
        f.write(f"Selected Dataset: {dataset_folder_name}, Number Data Points: {num_files}\n")
        # Log the batch size the loader really uses, and the requested epoch count
        f.write(f"Model: {model.__class__.__name__}, Epochs: {epochs}, Batch Size: {train_dataloader.batch_size}\n")
        f.write(f"Training Notes: {training_notes}\n")
        f.write(f"Transforms used in this training: {TRANSFORMS}\n")
        f.write(f"Final model evaluation loss: {final_evaluation_mse}\n")
        f.write(f"Total training & evaluation time: {training_duration_time_formatted}\n")
        f.write("\n")

    return model

# Run the training and evaluation process
model = training_and_evaluation(epochs)

# Write the trained model's weights (state dict) to file
model_folder = "/home/racer_core/Models/"
model_file_path = f"{model_folder}{model_file_name}"
torch.save(model.state_dict(), model_file_path)
print("Saved new model as: " + model_file_path)


### Optimizing the Machine Learning Model to Run on the Robot
In this final step to the training process, we'll optimize the model.
We optimize the model so that it will run as fast as our camera collects data.

Optimization involves two steps:
1. Export the trained machine learning model from a PyTorch model (.pth) to a general and open format called ONNX (.onnx). ONNX stands for Open Neural Network Exchange.
2. Convert the ONNX model to a version designed for fast operation, a TensorRT model (.TRT)


In [None]:
#### STEP 1: Conversion of the Pytorch model to ONNX format

import torch.onnx

# Retrieve an example of the input, an image, from the data loader.
train_image, ann, train_labels = next(iter(train_dataloader))
# Keep a single image: the example's shape (batch size included) is recorded in the
# exported model, and exporting a whole training batch would fix that batch size.
# The example input also needs to be on the same device as the model.
train_image = train_image[:1].to(device)

# splitext only removes the final extension, so dots inside the name are kept
onnx_model_file_name = os.path.splitext(model_file_name)[0] + ".onnx"
onnx_model_folder = "/home/racer_core/Models/onnx/"
onnx_model_file_path = onnx_model_folder + onnx_model_file_name
os.makedirs(onnx_model_folder, exist_ok=True)

# Export in inference mode. The export writes the file itself, so there is no
# model object to keep from this call.
model.eval()
torch.onnx.export(model, train_image, onnx_model_file_path, verbose=False)
print(f"New ONNX model file saved to: {onnx_model_file_path}")

In [None]:
# Load the recently created ONNX file back from disk and run the ONNX checker on it
# (check_model raises an error when the model is not valid)
import onnx
onnx_model = onnx.load(onnx_model_file_path)
onnx.checker.check_model(onnx_model)

In [None]:
#import pycuda.driver as cuda
#import pycuda.autoinit
import numpy as np
import tensorrt as trt
 
# TensorRT logger handed to both the builder and the ONNX parser in build_engine below;
# it captures errors, warnings, and other information during the build and inference phases
TRT_LOGGER = trt.Logger()
 
def build_engine(onnx_file_path):
    # initialize TensorRT engine and parse ONNX model
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network()
    parser = trt.OnnxParser(network, TRT_LOGGER)
     
    # parse ONNX
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        parser.parse(model.read())
    print('Completed parsing of ONNX file')