<a href="https://colab.research.google.com/github/58191554/PointNet-Project/blob/main/modelassert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

# Fixed RNG seed: every check cell below calls torch.manual_seed(seed) before
# building a model or sampling an input, so the hard-coded reference tensors
# stay reproducible.
seed = 42

#### T-Net
The T-Net module is a type of Spatial Transformer Network (STN) that learns a k x k transformation matrix for a given point cloud, which is then used to transform the point cloud to a canonical pose. It consists of two parts: a convolutional network and a fully connected network. The convolutional network maps the input point cloud to a feature space; it is a series of 1-D convolutional layers, each followed by batch normalization and a ReLU activation. The per-point features are max pooled over the points and flattened, and the fully connected network (fully connected layers with batch normalization and ReLU activation) maps the pooled feature to the k x k entries of the transformation matrix, to which the identity matrix is added. Finally, the transformation matrix is applied to the input point cloud to transform it to a canonical pose; in this notebook the T-Net returns the matrix and the multiplication is carried out by the `Transform` class below.
![T-net](https://github.com/58191554/PointNet-Project/blob/main/img/T-net_pipeline.drawio.png?raw=true)

In [None]:
class Tnet(nn.Module):
    """
    T-Net: a spatial transformer network (STN) that predicts one k x k
    transformation matrix per sample of a batch of point clouds.

    It consists of two parts: a convolutional network that maps the input to a
    per-point feature space, and a fully connected network that maps the
    max-pooled feature to the k*k matrix entries. The identity matrix is added
    to the prediction before it is returned. The module only *returns* the
    matrix; multiplying it with the points is left to the caller.

    Args:
        hidden_sizes_conv: output channels of the successive 1x1 Conv1d layers.
        hidden_sizes_fc: widths of the hidden fully connected layers.
        k: number of input channels and side length of the predicted matrix.
    """
    def __init__(self, hidden_sizes_conv=(64, 128, 1024), hidden_sizes_fc=(512, 256), k=3):
        super().__init__()
        self.k = k
        # Store private copies: immutable defaults plus list() mean no two
        # instances (and no caller) ever share one mutable size list.
        self.hidden_sizes_conv = list(hidden_sizes_conv)
        self.hidden_sizes_fc = list(hidden_sizes_fc)

        self.conv = self._build_conv()
        self.fc = self._build_fc()

    def _build_conv(self):
        """
        Build the convolutional feature extractor.

        Structure: [Conv1d(kernel_size=1)] -> [BatchNorm1d] -> [ReLU], repeated
        once per entry of ``hidden_sizes_conv``; the first layer takes ``k``
        input channels.
        """
        layers = []
        prev_size = self.k
        for size in self.hidden_sizes_conv:
            layers.append(nn.Conv1d(prev_size, size, 1))
            layers.append(nn.BatchNorm1d(size))
            layers.append(nn.ReLU())
            prev_size = size
        return nn.Sequential(*layers)

    def _build_fc(self):
        """
        Build the fully connected regressor for the matrix entries.

        Structure: [Linear] -> [BatchNorm1d] -> [ReLU], repeated once per entry
        of ``hidden_sizes_fc``, followed by a final Linear layer with ``k**2``
        outputs (no normalization or activation after it).
        """
        layers = []
        # Width of the pooled feature; falls back to k when there are no conv
        # layers, because self.conv is then an empty (identity) Sequential.
        prev_size = self.hidden_sizes_conv[-1] if self.hidden_sizes_conv else self.k
        for size in self.hidden_sizes_fc:
            layers.append(nn.Linear(prev_size, size))
            layers.append(nn.BatchNorm1d(size))
            layers.append(nn.ReLU())
            prev_size = size
        # Use the running width rather than hidden_sizes_fc[-1] so that an
        # empty hidden_sizes_fc still yields a valid single-layer head.
        layers.append(nn.Linear(prev_size, self.k ** 2))
        return nn.Sequential(*layers)

    def forward(self, input):
        """
        Predict the transformation matrices for a batch.

        Pipeline: [ConvLayers] -> [MaxPooling over the last dimension] ->
        [Flatten] -> [Fully Connected Layers] -> [theta_matrix + identity].

        Args:
            input: tensor of shape (batch_size, k, n_points); the channel
                dimension comes second because it is fed to ``nn.Conv1d``.

        Returns:
            Tensor of shape (batch_size, k, k).
        """
        features = self.conv(input)                                   # (bs, C, n)
        # Global max pool over the whole last dimension, then drop that axis.
        pooled = nn.functional.max_pool1d(features, kernel_size=features.size(-1)).flatten(1)  # (bs, C)
        theta = self.fc(pooled).view(-1, self.k, self.k)              # (bs, k, k)

        # Create the identity directly on theta's device/dtype instead of on
        # the CPU followed by a bare .cuda(), which would always pick the
        # default GPU. requires_grad=True is kept from the exercise hint
        # ("the identity require gradient"). Broadcasting replaces .repeat().
        identity = torch.eye(self.k, dtype=theta.dtype, device=theta.device, requires_grad=True)
        return theta + identity

In [None]:
def count_parameters(model):
    """Return the total number of scalar elements over all of ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Build the reference T-Net from a fixed seed and verify its size.
torch.manual_seed(seed)
test_t_net = Tnet()

# Count once and reuse the value for both the diagnostic print and the assert.
test_t_net_param_num = count_parameters(test_t_net)
if not test_t_net_param_num == 803081:
    print("Error")
    print("test_t_net parameters number = ", test_t_net_param_num)

assert test_t_net_param_num == 803081

In [None]:
# Reference output: the matrices test_t_net must produce for the seeded input
# sampled below (compared with a 1e-3 relative/absolute tolerance).
y1 = torch.tensor([
    [[ 1.4712e+00,  1.1447e+00,  6.5780e-02],
     [ 2.6862e-01,  1.5355e+00, -7.9635e-01],
     [-3.1744e-01,  4.8485e-01,  1.2669e+00]],

    [[ 1.0652e+00, -2.9729e-02, -9.1289e-04],
     [-2.0753e-01,  1.6646e+00,  5.0989e-01],
     [-2.5312e-01,  7.1402e-01,  8.2575e-01]],

    [[ 1.3445e+00,  6.7090e-01, -4.4554e-01],
     [ 2.4452e-01,  1.1833e+00, -5.8614e-01],
     [-5.3094e-02, -1.3413e-01,  9.4217e-01]],
])

# Re-seed so the random input is the same on every run.
torch.manual_seed(seed)
x1 = torch.randn(3, 3, 5)
print(x1)

pred_y1 = test_t_net(x1)
print(pred_y1)
assert torch.allclose(y1, pred_y1, rtol=1e-03, atol=1e-03),  "different y_pred and y"

tensor([[[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784],
         [-1.2345, -0.0431, -1.6047, -0.7521,  1.6487],
         [-0.3925, -1.4036, -0.7279, -0.5594, -0.7688]],

        [[ 0.7624,  1.6423, -0.1596, -0.4974,  0.4396],
         [-0.7581,  1.0783,  0.8008,  1.6806,  1.2791],
         [ 1.2964,  0.6105,  1.3347, -0.2316,  0.6872]],

        [[-1.0892, -0.3553, -0.9138, -0.6581,  0.0780],
         [ 0.5258, -0.4880, -0.4345, -1.3864, -1.2862],
         [-1.4032,  0.0360, -0.0635,  0.6756, -0.0978]]])
tensor([[[ 1.4712e+00,  1.1447e+00,  6.5780e-02],
         [ 2.6862e-01,  1.5355e+00, -7.9635e-01],
         [-3.1744e-01,  4.8485e-01,  1.2669e+00]],

        [[ 1.0652e+00, -2.9729e-02, -9.1289e-04],
         [-2.0753e-01,  1.6646e+00,  5.0989e-01],
         [-2.5312e-01,  7.1402e-01,  8.2575e-01]],

        [[ 1.3445e+00,  6.7090e-01, -4.4554e-01],
         [ 2.4452e-01,  1.1833e+00, -5.8614e-01],
         [-5.3094e-02, -1.3413e-01,  9.4217e-01]]], grad_fn=<AddBackward0>)


### Transform Class
The Transform class is everything that comes before the last MLPs in PointNet. It is a neural network architecture that uses two pairs of spatial transformer network (STN) and shared MLP layers to extract a global feature from point cloud data of shape (n x 3). The STN is implemented by the T-Net: it computes the 3x3 transform matrix, which is then multiplied with the input point cloud to get a transformed point cloud of the same shape. The transformed point cloud is passed through the first shared MLP; a second T-Net computes the feature transform matrix from the resulting features, the features are multiplied by that matrix, and the result is passed through the second shared MLP. The output of the second shared MLP is max pooled over the points to get a global feature vector. The output also includes the point and feature transform matrices.
![TransformNet](https://github.com/58191554/PointNet-Project/blob/main/img/PointNetStructureFromPaper.png?raw=true)

In [None]:
class Transform(nn.Module):
    """
    Everything in PointNet up to (and including) the global feature.

        x --> [input transform] --> canonical point cloud --> [shared MLP 1]
          --> feature --> [feature transform] --> [shared MLP 2]
          --> max pooling over the points --> z

    The network goes through two pairs of spatial transformer net (the T-Net
    defined in this file) and shared MLP. A shared MLP is implemented as a
    stack of one-dimensional convolutions with kernel size 1.

    Args:
        input_size: channels per input point; also the size of the input
            transform matrix.
        feature_size: channels entering the second shared MLP; also the size
            of the feature transform matrix.
        sharedMLP1_layers: output channels of the first shared MLP's layers.
        sharedMLP2_layers: output channels of the second shared MLP's layers.
        batch_norm: whether each shared-MLP convolution is followed by
            BatchNorm1d. The T-Nets do not take this flag.

    Raises:
        ValueError: if the first shared MLP does not output ``feature_size``
            channels, which the feature transform and second MLP require.
    """
    def __init__(self, input_size=3, feature_size=64, sharedMLP1_layers=(64, 64), sharedMLP2_layers=(64, 128, 1024), batch_norm = True):
        super().__init__()
        # Honour the caller's choice (this used to be hard-coded to True).
        self.batch_norm = batch_norm

        sharedMLP1_layers = list(sharedMLP1_layers)
        sharedMLP2_layers = list(sharedMLP2_layers)

        # Fail at construction time rather than with a shape error in forward.
        mlp1_out = sharedMLP1_layers[-1] if sharedMLP1_layers else input_size
        if mlp1_out != feature_size:
            raise ValueError(
                f"sharedMLP1 outputs {mlp1_out} channels but feature_size is {feature_size}"
            )

        # Size the T-Nets from the arguments instead of the literals 3 / 64;
        # with the default arguments this builds exactly the same modules.
        self.input_transform = Tnet(k=input_size)
        self.feature_transform = Tnet(k=feature_size)

        self.sharedMLP1 = self._build_sharedMLP(input_size, sharedMLP1_layers, last_activate=True)
        self.sharedMLP2 = self._build_sharedMLP(feature_size, sharedMLP2_layers, last_activate=False)

    def _build_sharedMLP(self, input_dim, sharedMLP_layers, last_activate = True):
        """
        Build one shared MLP as [Conv1d(kernel_size=1)] -> [BatchNorm1d] -> [ReLU] blocks.

        Args:
            input_dim: number of input channels of the first convolution.
            sharedMLP_layers: output channels of each successive convolution.
            last_activate: when False, the ReLU after the final block is
                omitted (BatchNorm1d, if enabled, is still applied).

        Returns:
            nn.Sequential containing the layers in order.
        """
        layers = []
        prev_size = input_dim
        last_id = len(sharedMLP_layers) - 1
        for layer_id, size in enumerate(sharedMLP_layers):
            layers.append(nn.Conv1d(prev_size, size, 1))

            if self.batch_norm:
                layers.append(nn.BatchNorm1d(size))

            if layer_id < last_id or last_activate:
                layers.append(nn.ReLU())

            prev_size = size
        return nn.Sequential(*layers)

    def forward(self, input):
        """
        Compute the global feature and both transform matrices.

        Args:
            input: tensor of shape (batch_size, input_size, n_points),
                e.g. [batch_size, 3, 1024].

        Returns:
            Tuple ``(output, matrix3x3, matrix64x64)``: the global feature of
            shape (batch_size, C) where C is the last width of the second
            shared MLP, the input transform (batch_size, input_size,
            input_size) and the feature transform (batch_size, feature_size,
            feature_size). The matrix names reflect the default sizes.
        """
        matrix3x3 = self.input_transform(input)     #[batch_size, 3, 3]
        # bmm needs points as rows: (bs, n, 3) @ (bs, 3, 3), then back to (bs, 3, n).
        xb = torch.bmm(input.transpose(1, 2), matrix3x3).transpose(1, 2)
        xb = self.sharedMLP1(xb)

        matrix64x64 = self.feature_transform(xb)     #[batch_size, 64, 64]
        xb = torch.bmm(xb.transpose(1, 2), matrix64x64).transpose(1, 2)     #[batch_size, 64, n]
        xb = self.sharedMLP2(xb)

        # Global max pool over the points (last dimension), then drop that axis.
        output = nn.functional.max_pool1d(xb, kernel_size=xb.size(-1)).flatten(1)     #[batch_size, 1024]
        return output, matrix3x3, matrix64x64

In [None]:
# Sanity check for Transform: first the parameter count, then the predicted
# 3x3 input-transform matrices for a seeded random input.
torch.manual_seed(seed)
test_transfrom_net = Transform()

test_tranform_net_param_num = count_parameters(test_transfrom_net)
if test_tranform_net_param_num!=2812105:
    print("Error")
    print("test_transfrom_net parameters number = ", test_tranform_net_param_num)
    # Report the size of the mismatch. The previous code passed the Python
    # bool `count != expected` to torch.absolute, which only accepts tensors
    # and therefore raised TypeError exactly when this diagnostic was needed.
    print("Difference = ", torch.absolute(torch.tensor(test_tranform_net_param_num-2812105)))

assert test_tranform_net_param_num==2812105


# Re-seed so the random input is the same on every run.
torch.manual_seed(seed)
x2 = torch.randn(2, 3, 5)
pred_y2, pred_mat1, pred_mat2 = test_transfrom_net(x2)

# Reference input-transform matrices for the seeded input above.
mat1 = torch.tensor([[[ 1.0655,  0.4169,  0.1452],
         [ 0.0240,  1.7885, -0.6011],
         [-0.5582,  0.6857,  1.0256]],

        [[ 1.5976,  0.8703, -0.4317],
         [ 0.2260,  1.2361,  0.0457],
         [ 0.0986,  0.0844,  1.0267]]])

if not torch.allclose(mat1, pred_mat1, rtol=1e-03, atol=1e-03):
    print("Error")
    print("The answer mat1 is \n", mat1)
    print("The pred_mat1 is \n", pred_mat1)
    print("Difference = ", torch.norm(pred_mat1- mat1))
assert torch.allclose(mat1, pred_mat1, rtol=1e-03, atol=1e-03),  "different pred_mat1 and mat1"

### PointNet Classifier
The code defines a PyTorch module called PointNet for classifying point cloud data. The PointNet module includes a Transform class, which takes in 3D point cloud data as input and generates global features and transformation matrices. The global features are then passed through a multi-layer perceptron (MLP) to generate scores for classification. The MLP consists of linear layers, batch normalization, ReLU activation, and dropout layers. The PointNet module outputs the logsoftmax of the scores along with the 3x3 and 64x64 transformation matrices generated by the Transform class. The PointNet module can be customized with different layer configurations, batch normalization, and dropout rates.

In [None]:
class PointNet(nn.Module):
    """
    PointNet: the whole neural network for classifying point cloud data.

                    _________                         ___
        input x--->|Transform|---> global feature--->|MLP|---> scores
                   |_________|                       |___|

    The classification head applied in ``forward`` is
        [fc1] -> [bn1] -> [ReLU] -> [fc2] -> [dropout] -> [bn2] -> [ReLU] -> [fc3]
    followed by a log-softmax over the classes.

    Args:
        sharedMLP1_layers: layer widths of the first shared MLP in Transform.
        sharedMLP2_layers: layer widths of the second shared MLP in Transform;
            its last entry is the width of the global feature fed to ``fc1``.
        classes: number of output classes.
        batch_norm: whether the head uses batch normalization (``bn1``/``bn2``
            become identity modules when False). The flag is also forwarded
            to Transform.
        dropout_rate: dropout probability used in the head.
    """
    def __init__(self, sharedMLP1_layers=(64, 64), sharedMLP2_layers=(64, 128, 1024), classes = 10, batch_norm = True, dropout_rate = 0.3):
        super().__init__()
        sharedMLP1_layers = list(sharedMLP1_layers)
        sharedMLP2_layers = list(sharedMLP2_layers)

        self.transform = Transform(input_size=3, feature_size=64, sharedMLP1_layers=sharedMLP1_layers, sharedMLP2_layers=sharedMLP2_layers, batch_norm=batch_norm)
        self.batch_norm = batch_norm

        # Width of the global feature returned by Transform: the last width of
        # the second shared MLP (1024 by default), or the feature size 64 if
        # that MLP is empty.
        global_dim = sharedMLP2_layers[-1] if sharedMLP2_layers else 64
        self.fc1 = nn.Linear(global_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, classes)

        # With batch_norm=True these are exactly the original BatchNorm1d
        # layers; with False they are pass-throughs so the flag takes effect.
        self.bn1 = nn.BatchNorm1d(512) if batch_norm else nn.Identity()
        self.bn2 = nn.BatchNorm1d(256) if batch_norm else nn.Identity()
        self.dropout = nn.Dropout(dropout_rate)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def _build_fc(self, input_dim, fc_layers, cls_num, dropout_rate):
        """
        Build a generic classification head as an ``nn.Sequential``.

        Structure:
            [Linear] -> [Batch Norm] -> [ReLU] -> [Dropout] -> [Linear] -> ...
            -> [Linear] -> [Batch Norm] -> [ReLU] -> [Linear of class size]
        Batch Norm layers are included only when ``self.batch_norm`` is true,
        and Dropout follows every hidden block except the last one.

        This helper is not used by ``__init__``, which keeps the explicit
        ``fc1``/``fc2``/``fc3`` attributes.

        Args:
            input_dim: width of the input feature.
            fc_layers: widths of the hidden Linear layers.
            cls_num: number of classes (width of the final Linear layer).
            dropout_rate: dropout probability.
        """
        layers = []
        prev_size = input_dim
        last_id = len(fc_layers) - 1
        for layer_id, size in enumerate(fc_layers):
            # nn.Linear's third positional argument is `bias`, so the stray
            # `1` that used to be passed here has been dropped.
            layers.append(nn.Linear(prev_size, size))

            if self.batch_norm:
                layers.append(nn.BatchNorm1d(size))

            layers.append(nn.ReLU())

            if layer_id < last_id:
                layers.append(nn.Dropout(dropout_rate))
            prev_size = size

        # The final projection needs both in- and out-features; the previous
        # nn.Linear(cls_num) raised TypeError on every call.
        layers.append(nn.Linear(prev_size, cls_num))
        return nn.Sequential(*layers)

    def forward(self, input):
        """
        Classify a batch of point clouds.

        Args:
            input: tensor passed unchanged to ``self.transform``; with the
                sizes fixed in ``__init__`` that is (batch_size, 3, n_points).

        Returns:
            Tuple ``(log_probs, matrix3x3, matrix64x64)``: the log-softmax of
            the class scores, shape (batch_size, classes), plus the 3x3 and
            64x64 transform matrices produced by ``self.transform``.
        """
        xb, matrix3x3, matrix64x64 = self.transform(input)
        xb = F.relu(self.bn1(self.fc1(xb)))
        # Dropout is applied before bn2, following the prescribed pipeline.
        xb = F.relu(self.bn2(self.dropout(self.fc2(xb))))
        output = self.fc3(xb)
        return self.logsoftmax(output), matrix3x3, matrix64x64

In [None]:
# Sanity check for PointNet: first the parameter count, then the predicted
# 3x3 input-transform matrices for a seeded random input.
torch.manual_seed(seed)
test_point_net = PointNet()

test_point_net_param_num = count_parameters(test_point_net)
if test_point_net_param_num!=3472339:
    print("Error")
    print("test_point_net parameters number = ", test_point_net_param_num)
    print("Difference = ", torch.absolute(torch.tensor(test_point_net_param_num-3472339)))

assert test_point_net_param_num==3472339


# Re-seed so the random input is the same on every run.
torch.manual_seed(seed)
x3 = torch.randn(3, 3, 5)
w, pred_mat3x3, pred_matfxf = test_point_net(x3)

# Reference input-transform matrices for the seeded input above.
mat3x3 = torch.tensor([[[ 1.4712e+00,  1.1447e+00,  6.5780e-02],
         [ 2.6862e-01,  1.5355e+00, -7.9635e-01],
         [-3.1744e-01,  4.8485e-01,  1.2669e+00]],

        [[ 1.0652e+00, -2.9729e-02, -9.1289e-04],
         [-2.0753e-01,  1.6646e+00,  5.0989e-01],
         [-2.5312e-01,  7.1402e-01,  8.2575e-01]],

        [[ 1.3445e+00,  6.7090e-01, -4.4554e-01],
         [ 2.4452e-01,  1.1833e+00, -5.8614e-01],
         [-5.3094e-02, -1.3413e-01,  9.4217e-01]]])
# Same argument order as the assert below (allclose is not symmetric), and the
# diagnostics print this cell's own tensors rather than mat1/pred_mat1 from
# the Transform cell.
if not torch.allclose(pred_mat3x3, mat3x3, rtol=1e-03, atol=1e-03):
    print("Error")
    print("The answer mat3x3 is \n", mat3x3)
    print("The pred_mat3x3 is \n", pred_mat3x3)
    print("Difference = ", torch.norm(pred_mat3x3- mat3x3))
assert torch.allclose(pred_mat3x3, mat3x3, rtol=1e-03, atol=1e-03),  "different pred_mat3x3 and mat3x3"