In [1]:
# import libraries
from __future__ import print_function
from __future__ import division
from PIL import Image
from skimage import io, transform
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy
import json
from torch.utils.data.sampler import SubsetRandomSampler
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.13.0+cu116
Torchvision Version:  0.14.0+cu116


In [None]:
# mount at google drive
from google.colab import drive
drive.mount('/content/gdrive')
!ls

Mounted at /content/gdrive
gdrive	sample_data


In [None]:
# We use pretrained renset152 from torch for this task
model_ft = models.resnet152(pretrained=True)

In [3]:
# If only feature extraction, freeze the pretrained layers
# actually not used
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [4]:
# Finetune the whole model instead of feature extraction
feat_extract = False

# Initialize the model
# Change the first and last layer of resnet152 to fit out input and output size
def initialize_model(input_channel=3, output_size=12, feature_extract=True, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    #   variables is model specific.
    model_ft = None
    input_size = 0

    model_ft = models.resnet152(pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, output_size)
    input_size = 256
    conv1_out_channel = model_ft.conv1.out_channels
    model_ft.conv1 = nn.Conv2d(input_channel, conv1_out_channel, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    return model_ft, input_size

# Initialize the model for this run
model_ft, input_size = initialize_model(feature_extract=feat_extract)

# Print the model we just instantiated
print(model_ft)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# customized dataset 
class ProbeDataset(Dataset):

    def __init__(self, root_dir='./dataset_merged', transform=None, operation='train'):
        self.root_dir = root_dir
        self.matrix_dir = os.path.join(root_dir, 'matrix', operation)
        self.rgba_dir = os.path.join(root_dir, 'rgba', operation)
        self.transform = transform
        self.operation = operation

    def __len__(self):
        length = len([entry for entry in os.listdir(self.matrix_dir) if os.path.isfile(os.path.join(self.matrix_dir, entry))])
        return length

    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()

    # read the data from different folders according to operation
    if(self.operation=='train'):
        img_path = os.path.join(self.rgba_dir, "{0:05d}.png".format(idx))
    elif(self.operation=='val'):
        img_path = os.path.join(self.rgba_dir, "val_{0:05d}.png".format(idx))
    elif(self.operation=='test'):
        img_path = os.path.join(self.rgba_dir, "test_{0:05d}.png".format(idx))
    img = io.imread(img_path)
    img = img[:,:,0:3]
    if(self.operation=='train'):
        matrix_path = os.path.join(self.matrix_dir, "{0:05d}.json".format(idx))
    elif(self.operation=='val'):
        matrix_path = os.path.join(self.matrix_dir, "val_{0:05d}.json".format(idx))
    elif(self.operation=='test'):
        matrix_path = os.path.join(self.matrix_dir, "test_{0:05d}.json".format(idx))
    f = open(matrix_path)
    matrix_json = json.load(f)
    matrix = np.array(matrix_json['matrix_probe_in_cam_coord'])
    matrix = matrix.flatten()
    matrix = matrix[0:12]

    sample = {'image': img, 'matrix': matrix}

    if self.transform:
        sample = self.transform(sample)

    return sample

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, matrix = sample['image'], sample['matrix']

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image),
                'matrix': torch.from_numpy(matrix)}


In [None]:
# read in the data
probe_dataset_train = ProbeDataset(root_dir='./gdrive/MyDrive/ML_dataset_new',
                transform=transforms.Compose([ToTensor()]), operation="train")
probe_dataset_val = ProbeDataset(root_dir='./gdrive/MyDrive/ML_dataset_new',
                transform=transforms.Compose([ToTensor()]), operation="val")
probe_dataset_test = ProbeDataset(root_dir='./gdrive/MyDrive/ML_dataset_new',
                transform=transforms.Compose([ToTensor()]), operation="test")
# check the size
for i in range(len(probe_dataset_train)):
    sample = probe_dataset_train[i]
    print(i, sample['image'].size(), sample['matrix'].size())
    if i == 3:
        break

0 torch.Size([3, 256, 256]) torch.Size([12])
1 torch.Size([3, 256, 256]) torch.Size([12])
2 torch.Size([3, 256, 256]) torch.Size([12])
3 torch.Size([3, 256, 256]) torch.Size([12])


In [None]:
# dataloaders for train, val and test
batch_size = 32
shuffle_dataset = True
random_seed= 100
train_loader = DataLoader(probe_dataset_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(probe_dataset_val, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(probe_dataset_test, batch_size=1, shuffle=True)

In [6]:
# use gpu to train
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
# Send the model to GPU
model_ft = model_ft.to(device)
# Gather the parameters to be optimized/updated in this run. If we are
#  finetuning we will be updating all parameters. However, if we are
#  doing feature extract method, we will only update the parameters
#  that we have just initialized, i.e. the parameters with requires_grad
#  is True.
params_to_update = model_ft.parameters()
print("Params to learn:")
if feat_extract:
    params_to_update = []
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
else:
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            print("\t",name)

Params to learn:
	 conv1.weight
	 bn1.weight
	 bn1.bias
	 layer1.0.conv1.weight
	 layer1.0.bn1.weight
	 layer1.0.bn1.bias
	 layer1.0.conv2.weight
	 layer1.0.bn2.weight
	 layer1.0.bn2.bias
	 layer1.0.conv3.weight
	 layer1.0.bn3.weight
	 layer1.0.bn3.bias
	 layer1.0.downsample.0.weight
	 layer1.0.downsample.1.weight
	 layer1.0.downsample.1.bias
	 layer1.1.conv1.weight
	 layer1.1.bn1.weight
	 layer1.1.bn1.bias
	 layer1.1.conv2.weight
	 layer1.1.bn2.weight
	 layer1.1.bn2.bias
	 layer1.1.conv3.weight
	 layer1.1.bn3.weight
	 layer1.1.bn3.bias
	 layer1.2.conv1.weight
	 layer1.2.bn1.weight
	 layer1.2.bn1.bias
	 layer1.2.conv2.weight
	 layer1.2.bn2.weight
	 layer1.2.bn2.bias
	 layer1.2.conv3.weight
	 layer1.2.bn3.weight
	 layer1.2.bn3.bias
	 layer2.0.conv1.weight
	 layer2.0.bn1.weight
	 layer2.0.bn1.bias
	 layer2.0.conv2.weight
	 layer2.0.bn2.weight
	 layer2.0.bn2.bias
	 layer2.0.conv3.weight
	 layer2.0.bn3.weight
	 layer2.0.bn3.bias
	 layer2.0.downsample.0.weight
	 layer2.0.downsample.1.weight

In [None]:
# some hyperparameter here
num_epoch = 100
lambda1 = 1.2
lambda2 = 0.7

def train(model, dataloader, criterion, optimizer, epoch, lambda_1, lambda_2):
"""
Train function
Input:
    model: pretrained resnet152
    dataloader: dataloader defined previously
    criterion: loss function is built with three parts, two of them use L2 loss
    optimizer: optimizer to update the model
    epoch: current epoch number
    lambda_1 and lambda_2: weight for different part of the loss function
"""
  #print(epoch)
    model.train()
    running_loss = 0.0
    for i_batch, sample_batch in enumerate(dataloader):
        image = sample_batch['image'].to(device)
        matrix = sample_batch['matrix'].to(device)
        image = image.float()
        matrix = matrix.float()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward
        with torch.set_grad_enabled(True):
            prediction = model(image)
            batchsize = prediction.shape[0]

        cor_gt = matrix[:,[3,7,11]]
        cor_pred = prediction[:,[3,7,11]]
        rot_gt = matrix[:,[0,1,2,4,5,6,8,9,10]] # N*9
        rot_pred = prediction[:,[0,1,2,4,5,6,8,9,10]]

        # first part of the loss
        loss_cor = criterion(prediction, matrix)

        # second part of the loss
        unflatten = nn.Unflatten(1, (3,3))
        rot_gt = unflatten(rot_gt)
        rot_pred = unflatten(rot_pred)
        rot_gt_t = torch.transpose(rot_gt,1,2)
        rot_gt_pred = torch.bmm(rot_gt_t, rot_pred)
        identity_mtx = torch.eye(3)
        identity_mtx = identity_mtx.reshape((1, 3, 3))
        identity_mtx = identity_mtx.repeat(batchsize, 1, 1)
        identity_mtx = identity_mtx.to(device)
        loss_whether_rot = criterion(rot_gt_pred, identity_mtx)

        # third part of the loss
        U, S, Vh = torch.linalg.svd(rot_pred)
        rot_pred_corrected = torch.bmm(U, identity_mtx)
        rot_pred_corrected = torch.bmm(rot_pred_corrected, Vh)
        rot_pred_corrected_mul = torch.bmm(rot_gt_t, rot_pred_corrected)
        loss_rot_dist = torch.sum(-rot_pred_corrected_mul.diagonal(offset=0, dim1=-2, dim2=-1).sum(dim=-1))/batchsize

        # the total loss
        loss = loss_cor + lambda_1*loss_whether_rot + lambda_2*loss_rot_dist

        # backward + optimize
        loss.backward()
        optimizer.step()

        # statistics
        running_loss += loss.item() * image.size(0)
    epoch_loss = running_loss / len(dataloader.dataset)
    if(epoch%5==0):
        print('Train Loss: {:.4f}'.format(epoch_loss))

        
def validation(model, dataloader, criterion, optimizer, epoch, best_val_loss, best_model_wts, lambda_1, lambda_2):
"""
validation function
Input:
    model: pretrained resnet152
    dataloader: dataloader defined previously
    criterion: loss function is built with three parts, two of them use L2 loss
    optimizer: optimizer to update the model
    epoch: current epoch number
    best_val_loss: best validation loss so far
    best_model_wts: corresponding model weight of best validation loss
    lambda_1 and lambda_2: weight for different part of the loss function
Output:
    best_val_loss: updated best validation loss so far
    best_model_wts: updated corresponding model weight of best validation loss
"""
    model.eval()
    running_loss = 0.0
    for i_batch, sample_batch in enumerate(dataloader):
        image = sample_batch['image'].to(device)
        matrix = sample_batch['matrix'].to(device)
        image = image.float()
        matrix = matrix.float()
        # zero the parameter gradients
        optimizer.zero_grad()
        # do not update gradient
        with torch.set_grad_enabled(False):
            prediction = model(image)
            batchsize = prediction.shape[0]

            cor_gt = matrix[:,[3,7,11]]
            cor_pred = prediction[:,[3,7,11]]
            rot_gt = matrix[:,[0,1,2,4,5,6,8,9,10]] # N*9
            rot_pred = prediction[:,[0,1,2,4,5,6,8,9,10]]

            # first part of the loss
            loss_cor = criterion(prediction, matrix)

            # second part of the loss
            unflatten = nn.Unflatten(1, (3,3))
            rot_gt = unflatten(rot_gt)
            rot_pred = unflatten(rot_pred)
            rot_gt_t = torch.transpose(rot_gt,1,2)
            rot_gt_pred = torch.bmm(rot_gt_t, rot_pred)
            identity_mtx = torch.eye(3)
            identity_mtx = identity_mtx.reshape((1, 3, 3))
            identity_mtx = identity_mtx.repeat(batchsize, 1, 1)
            identity_mtx = identity_mtx.to(device)
            loss_whether_rot = criterion(rot_gt_pred, identity_mtx)

            # third part of the loss
            U, S, Vh = torch.linalg.svd(rot_pred)
            rot_pred_corrected = torch.bmm(U, identity_mtx)
            rot_pred_corrected = torch.bmm(rot_pred_corrected, Vh)
            rot_pred_corrected_mul = torch.bmm(rot_gt_t, rot_pred_corrected)
            loss_rot_dist = torch.sum(-rot_pred_corrected_mul.diagonal(offset=0, dim1=-2, dim2=-1).sum(dim=-1))/batchsize

            # the total loss
            loss = loss_cor + lambda_1*loss_whether_rot + lambda_2*loss_rot_dist
            
        # statistics
        running_loss += loss.item() * image.size(0)
    epoch_loss = running_loss / len(dataloader.dataset)
    if(epoch%5==0):
        print('Validation Loss: {:.4f}'.format(epoch_loss))
    # save the model every 10 epoch after 40 epochs
    if(epoch%10==0 and epoch>40):
        md_wt = copy.deepcopy(model.state_dict())
        torch.save(best_model_wts, './gdrive/MyDrive/ML_dataset_new/model_1219_epoch{0:03d}.pt'.format(epoch))
    # save the best model
    if(epoch_loss < best_val_loss and epoch > (num_epoch-20)):
        best_val_loss = epoch_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(best_model_wts, './gdrive/MyDrive/ML_dataset_new/model_1219_best.pt')
    return best_val_loss, best_model_wts

In [None]:
# Define optimizer and criterion
optimizer_ft = optim.Adam(params_to_update, lr=0.0001)
loss_func = nn.MSELoss()

In [None]:
# training and validating
low_val_loss = 100000
model_weights = None
for epoch in range(num_epoch):
    if(epoch%5==0):
        print('Epoch {}/{}'.format(epoch, num_epoch-1))
        print('-' * 10)
    train(model_ft, train_loader, loss_func, optimizer_ft, epoch, lambda1, lambda2)
    low_val_loss, model_weights = validation(model_ft, val_loader, loss_func, optimizer_ft, epoch, low_val_loss, model_weights, lambda1, lambda2)

Epoch 0/99
----------
Train Loss: 9.3408
Validation Loss: 3.1670
Epoch 5/99
----------
Train Loss: -1.6492
Validation Loss: -0.9215
Epoch 10/99
----------
Train Loss: -1.9793
Validation Loss: -1.2003
Epoch 15/99
----------
Train Loss: -2.0092
Validation Loss: -1.2215
Epoch 20/99
----------
Train Loss: -2.0002
Validation Loss: -1.2483
Epoch 25/99
----------
Train Loss: -2.0157
Validation Loss: -1.2267
Epoch 30/99
----------
Train Loss: -2.0136
Validation Loss: -1.3000
Epoch 35/99
----------
Train Loss: -2.0338
Validation Loss: -1.3336
Epoch 40/99
----------
Train Loss: -2.0273
Validation Loss: -1.3047
Epoch 45/99
----------
Train Loss: -2.0354
Validation Loss: -1.3191
Epoch 50/99
----------
Train Loss: -2.0365
Validation Loss: -1.3049
Epoch 55/99
----------
Train Loss: -2.0386
Validation Loss: -1.3479
Epoch 60/99
----------
Train Loss: -2.0399
Validation Loss: -1.3627
Epoch 65/99
----------
Train Loss: -2.0323
Validation Loss: -1.3074
Epoch 70/99
----------
Train Loss: -2.0505
Validatio

In [None]:
# load the best trained model and make prediction on test set

#model_ft.load_state_dict(torch.load('./gdrive/MyDrive/ML_dataset_new/model_1.pt', map_location=torch.device('cpu')))
model_ft.load_state_dict(torch.load('./gdrive/MyDrive/ML_dataset_new/model_1219_best.pt'))
model_ft.eval()
test_loader = DataLoader(probe_dataset_test, batch_size=1, shuffle=False)
for i_batch, sample_batch in enumerate(test_loader):
    image = sample_batch['image'].to(device)
    matrix = sample_batch['matrix'].to(device)
    image = image.float()
    matrix = matrix.float()
    prediction = model_ft(image)
    prediction = prediction.cpu().detach().numpy()
    prediction = prediction.reshape(3,4)
    print("prediction: ", prediction)
    prediction = prediction.tolist()
    matrix = matrix.cpu().detach().numpy()
    matrix = matrix.reshape(3,4)
    print("ground truth: ", matrix)
    matrix = matrix.tolist()
    
    # save the output as json
    prediction_json = json.dumps(prediction, indent=4)
    matrix_json = json.dumps(matrix, indent=4)
    with open("./gdrive/MyDrive/ML_dataset_new/output_1219/{0:05d}.json".format(i_batch), "w") as outfile:
      outfile.write(prediction_json)
      outfile.close()
    

prediction:  [[  0.16529085  -0.5000112   -0.1792647    0.291608  ]
 [  0.03704821  -0.07174214   1.0406579   -0.743917  ]
 [ -0.39734486  -0.08900318  -0.0195951  -12.643178  ]]
ground truth:  [[  0.47519928  -0.8767089   -0.07461314   0.09554597]
 [ -0.0411941   -0.10687413   0.9934188   -0.79092014]
 [ -0.87891334  -0.46899828  -0.08690174 -12.9248905 ]]
prediction:  [[  0.8142393   -0.09036738  -0.29346707  -0.46425202]
 [  0.29441166   0.0357576    0.93272096  -1.3982877 ]
 [ -0.04551942  -0.95854896   0.06729166 -12.914788  ]]
ground truth:  [[  0.9173543    0.2473458   -0.31189933  -0.56946224]
 [  0.34572807  -0.10667948   0.9322508   -1.4042674 ]
 [  0.19731507  -0.9630366   -0.18337727 -12.914607  ]]
prediction:  [[  0.3138624    0.5206963    0.0792347    1.7003194 ]
 [ -0.24226098   0.05048085   0.9378547   -0.8306866 ]
 [  0.61867744  -0.252385     0.16916813 -12.046947  ]]
ground truth:  [[  0.11027987   0.9909955    0.07593541   1.5245284 ]
 [ -0.17254488  -0.05615225   0