# Intro to ML Final Project
Author: Daniel Sun  
Date: 12/07/2022

## Method

### Import Packages

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision import transforms
import numpy as np
import os
import cv2
import pickle as pkl
from torchvision.models import resnet50, ResNet50_Weights

### Define Device to use

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load and Normalize Data

In [3]:
class LazyLoadDataset(Dataset):
  """
    Class for lazy loading 
    
    Constructor:
      path(String): the path in which lazy data are located
      train(Boolean): the data is for train or not
      transform(function): the function for us to perform transform on images
  """
  def __init__(self, path, train=True, transform=None):
    self.transform = transform
    path = path + ("train/" if train else "test/")

    self.pathX = path + "X/"
    self.pathY = path + "Y/"

    self.data = os.listdir(self.pathX)
    self.train = train
  
  def __getitem__(self, idx):
    f = self.data[idx]

    img0 = cv2.imread(self.pathX + f + "/rgb/0.png")
    img1 = cv2.imread(self.pathX + f + "/rgb/1.png")
    img2 = cv2.imread(self.pathX + f + "/rgb/2.png")

    if self.transform is not None:
      img0 = self.transform(img0)
      img1 = self.transform(img1)
      img2 = self.transform(img2)
    
    depth = np.load(self.pathX + f + "/depth.npy")
    # depth = self.transform(depth)

    field_id = pkl.load(open(self.pathX + f + "/field_id.pkl", "rb"))

    if self.train:
      Y = np.load(self.pathY + f + ".npy") * 1000
      return (img0, img1, img2, depth), Y
    else:
      return (img0, img1, img2, depth)

  def __len__(self):
    return len(self.data)

In [4]:
# function for normalization and transformation
data_transforms = {
    'train': transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomRotation(45),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        #transforms.Resize((224, 224)),#attention 
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], 
                            [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomRotation(45),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        #transforms.Resize((224, 224)),#attention 
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], 
                            [0.229, 0.224, 0.225])
    ]),
}

In [5]:
# define train and test dataset and dataloader
train_dataset = LazyLoadDataset("./lazydata/", transform=data_transforms['train'])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=30, shuffle=False)

### Training the model

In [6]:
def train(epoch, model, optimizer, permute_pixels=None, permutation_order=None):
    """
    Train the model for one epoch

    Args:
        epoch (int): current epoch
        model (nn.Module): model to train
        optimizer (torch.optim): optimizer to use
        permute_pixels (function): function to permute the pixels (default: None)
        permutation_order (1D torch array): order of the permutation (default: None)
    """
    model.train()
    total_loss = 0
    for batch_idx, ((img0, img1, img2, depth), target) in enumerate(train_loader):
        # send to device
        concate = torch.cat((img0, img1, img2, depth), dim=1)
        data, target = concate.to(device), target.to(device)
        
        # permute pixels
        if permute_pixels is not None:
            data = permute_pixels(data, permutation_order)

        optimizer.zero_grad()
        model = model.to(device)
        output = model(data)
        lossFn = nn.MSELoss()
        loss = lossFn(output.float(), target.float())
        total_loss += loss
        loss.backward()
        optimizer.step()
    
    print('Train Epoch: {} \tAvg Loss: {:.6f}'.format(
                epoch, total_loss/len(train_loader.dataset)))


In [7]:
# define the model for training
# use resnet50 as a base
model = resnet50(weights=ResNet50_Weights.DEFAULT)
model = model.to(device)
model.eval()
model.float()
model.fc = nn.Linear(2048, 12)
weight = model.conv1.weight.clone()
model.conv1 = nn.Conv2d(12, 64, kernel_size=7, stride=2, padding=3, bias=False)
with torch.no_grad():
    model.conv1.weight[:, :3] = weight
    model.conv1.weight[:, 3] = model.conv1.weight[:, 0]

In [8]:
# define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# train the model for 40 epoches
for epoch in range(0, 40):
    train(epoch, model, optimizer)

Train Epoch: 0 	Avg Loss: 29.560263
Train Epoch: 1 	Avg Loss: 16.185776
Train Epoch: 2 	Avg Loss: 14.481716
Train Epoch: 3 	Avg Loss: 13.596376
Train Epoch: 4 	Avg Loss: 12.802649
Train Epoch: 5 	Avg Loss: 11.612081
Train Epoch: 6 	Avg Loss: 10.484567
Train Epoch: 7 	Avg Loss: 9.260120
Train Epoch: 8 	Avg Loss: 7.155323
Train Epoch: 9 	Avg Loss: 4.184078
Train Epoch: 10 	Avg Loss: 2.684620
Train Epoch: 11 	Avg Loss: 2.191308
Train Epoch: 12 	Avg Loss: 1.861231
Train Epoch: 13 	Avg Loss: 1.591500
Train Epoch: 14 	Avg Loss: 1.384787
Train Epoch: 15 	Avg Loss: 1.233111
Train Epoch: 16 	Avg Loss: 1.129283
Train Epoch: 17 	Avg Loss: 1.041603
Train Epoch: 18 	Avg Loss: 0.966110
Train Epoch: 19 	Avg Loss: 0.906252
Train Epoch: 20 	Avg Loss: 0.852666
Train Epoch: 21 	Avg Loss: 0.805656
Train Epoch: 22 	Avg Loss: 0.766693
Train Epoch: 23 	Avg Loss: 0.731094
Train Epoch: 24 	Avg Loss: 0.692568
Train Epoch: 25 	Avg Loss: 0.661462
Train Epoch: 26 	Avg Loss: 0.629905
Train Epoch: 27 	Avg Loss: 0.60

### Export Test Prediction

In [9]:
import pandas as pd

outfile = 'submission.csv'

output_file = open(outfile, 'w')

titles = ['ID', 'FINGER_POS_1', 'FINGER_POS_2', 'FINGER_POS_3', 'FINGER_POS_4', 'FINGER_POS_5', 'FINGER_POS_6',
         'FINGER_POS_7', 'FINGER_POS_8', 'FINGER_POS_9', 'FINGER_POS_10', 'FINGER_POS_11', 'FINGER_POS_12']
preds = []

test_data = torch.load('./test/test/testX.pt')
file_ids = test_data[-1]
rgb_data = test_data[0]
depth = test_data[1]
model.eval()

for i, (img0, img1, img2) in enumerate(rgb_data):

    img0 = data_transforms['test'](img0)
    img1 = data_transforms['test'](img1)
    img2 = data_transforms['test'](img2)
    
    data = torch.cat((img0, img1, img2, depth[i]), dim=0)
    data = torch.unsqueeze(data, 0)
    output = model(data.to(device)) / 1000
    preds.append(output[0].cpu().detach().numpy())

df = pd.concat([pd.DataFrame(file_ids), pd.DataFrame.from_records(preds)], axis = 1, names = titles)
df.columns = titles
df.to_csv(outfile, index = False)
print("Written to csv file {}".format(outfile))

Written to csv file submission.csv


## Experiment Result

The public score of the prediction on Kaggle is 0.00551

## Discussion

### Loading

- Lazy loading can save a lot of time
- It is also memory friendly
- Large batch size can result in memory shortage so decreases batch size to save GPU

### Normalization

- Normalization on RGB images can improve the performance of the model tremendously
- Possibly because some of the data are very large compare to the other ones
- It is very useful to multiple the Y by 1000 when loading the data as the Y is smaller than X
- Transform depth image doesn't improve the performance much

### Model

- Using pre-trained model(like resnet50 in this case) can provide a good start
- Need to modify the fc layer of the model to accomadate the data
- SGD optimizer in my case perform better than Adam optimizer
- Running multiple epoches can improve the model tremendously

## Future Work

- Trying different normalization methods to see if we can achieve a better result
- Trying more optimizers or modifying optimizers' parameters like learning rate
- Using other pre-trained or self-defined models 
- Running the model for more epoches