In [2]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.models import VGG16_Weights, vgg16
from torchvision.transforms import ToTensor

In [None]:
#### prepare data
# Single-image metric depth inference with Metric3D, followed by back-projection to a point cloud.
# NOTE(review): `cv2` and `depth_to_pointcloud` are used below but are not imported or defined
# anywhere in the visible notebook — make sure they are in scope before running this cell.
rgb_file = 'data/train_images/frame_105.png'
#depth_file = 'data/kitti_demo/depth/0000000050.png'
# Intrinsics of the ORIGINAL image (presumably [fx, fy, cx, cy] in pixels — TODO confirm).
intrinsic_origin = [212.010, 212.010, 213.846, 121.795] #[707.0493, 707.0493, 604.0814, 180.5066]

# cv2.imread returns None (it does not raise) when the file is missing or unreadable,
# so fail early with a clear message instead of a cryptic TypeError on the slice below.
rgb_origin = cv2.imread(rgb_file)
if rgb_origin is None:
    raise FileNotFoundError(f"Could not read image file: {rgb_file}")
rgb_origin = rgb_origin[:, :, ::-1]  # reverse the channel axis (OpenCV loads BGR)

# Adjust input size to fit model requirements (aspect ratio is preserved)
input_size = (616, 1064)  # For ViT model; use (544, 1216) for ConvNeXt model
h, w = rgb_origin.shape[:2]
scale = min(input_size[0] / h, input_size[1] / w)
rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)

# Intrinsics of the RESIZED image; only these are valid at the model's input resolution.
intrinsic = [value * scale for value in intrinsic_origin]

# Pad the image to match model input size (border is filled with the per-channel mean)
padding = [123.675, 116.28, 103.53]
h, w = rgb.shape[:2]
pad_h = input_size[0] - h
pad_w = input_size[1] - w
pad_h_half = pad_h // 2
pad_w_half = pad_w // 2
rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
# [top, bottom, left, right] padding, kept so it can be cropped off the prediction again.
pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]

# Normalize the image: HWC -> CHW, subtract mean, divide by std, add a batch axis
mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
rgb = torch.div((rgb - mean), std)
rgb = rgb[None, :, :, :].cuda()

# Load pre-trained model and perform inference
model = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_large', pretrain=True) # was 'metric3d_vit_small'
model.cuda().eval()
with torch.no_grad():
    pred_depth, confidence, output_dict = model.inference({'input': rgb})

# Remove padding
pred_depth = pred_depth.squeeze()
pred_depth = pred_depth[pad_info[0]: pred_depth.shape[0] - pad_info[1], pad_info[2]: pred_depth.shape[1] - pad_info[3]]

# Upsample to original size
pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], rgb_origin.shape[:2], mode='bilinear').squeeze()
print(pred_depth.size())
# Convert depth to metric space (if needed).
# This uses the focal length of the RESIZED image, i.e. the one the network actually saw.
canonical_to_real_scale = intrinsic[0] / 1000.0  # Adjust based on focal length of canonical camera
pred_depth = pred_depth * canonical_to_real_scale
pred_depth = torch.clamp(pred_depth, 0, 300)  # Clamping depth values for visualization
print(pred_depth.size())

# pred_depth is back at the ORIGINAL resolution here, so back-projection must use the original
# (unscaled) intrinsics; the rescaled ones describe the resized model input instead.
pointcloud = depth_to_pointcloud(pred_depth, intrinsic_origin)
print("Point cloud shape:", pointcloud.shape)  # Expected shape [H*W, 3]

# Save or use predicted depth (disabled; needs `import numpy as np` if re-enabled):
# pred_depth_np = pred_depth.cpu().numpy()
# cv2.imwrite('predicted_depth.png', (pred_depth_np * 255 / pred_depth_np.max()).astype(np.uint8))

In [None]:
import os
import pandas as pd
from torchvision.io import read_image

class CustomImageDataset(Dataset):
    """Map-style dataset backed by an annotations CSV and a directory of image files.

    The CSV is loaded with ``pandas.read_csv``; for every row, column 0 is joined onto
    ``img_dir`` to form the image path and column 1 is returned as the label.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        """Read the annotation table and remember where images live.

        Args:
            annotations_file: Path of the CSV file handed to ``pd.read_csv``.
            img_dir: Directory that the file names in column 0 are relative to.
            transform: Optional callable applied to each loaded image.
            target_transform: Optional callable applied to each label.
        """
        self.img_dir = img_dir
        self.img_labels = pd.read_csv(annotations_file)
        self.transform, self.target_transform = transform, target_transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair for row ``idx``, with transforms applied if set."""
        file_name = self.img_labels.iloc[idx, 0]
        image = read_image(os.path.join(self.img_dir, file_name))
        label = self.img_labels.iloc[idx, 1]
        # Transforms are skipped when unset (or otherwise falsy), leaving the raw values.
        image = self.transform(image) if self.transform else image
        label = self.target_transform(label) if self.target_transform else label
        return image, label

In [None]:
class SpatialBlock(nn.Module):
    """Fuse per-frame RGB features (VGG16) with point-cloud features (PNet).

    The RGB branch is VGG16 with its final classification layer removed, so it emits the
    4096-dimensional activations of the penultimate fully connected stage. The point-cloud
    branch is ``PNet``. Both feature vectors are concatenated and passed through a linear
    layer followed by ReLU.

    Args:
        pc_feature_dim: Width of the point-cloud feature vector. The default of 512 is what
            the PNet example cell in this notebook printed for ``output[0]``
            (NOTE(review): that run used ``PNet(normal_channel=False)`` without ``k=40`` —
            confirm the width is the same here).
        out_dim: Width of the fused output. The default 4608 (= 4096 + 512) matches the
            ``input_dim`` used for ``TemporalBlock`` in this notebook.
    """

    # Output width of VGG16's classifier once its last Linear layer is dropped.
    RGB_FEATURE_DIM = 4096

    def __init__(self, pc_feature_dim=512, out_dim=4608):
        super().__init__()

        vgg = vgg16(weights=VGG16_Weights.DEFAULT)
        # Mirror torchvision's VGG.forward: features -> avgpool -> flatten -> classifier.
        # avgpool is an identity for 224x224 inputs and makes other input sizes work too.
        self.rgb_feature_extractor = nn.Sequential(
            *vgg.features.children(),
            vgg.avgpool,
            nn.Flatten(),
            *list(vgg.classifier.children())[:-1],
        )

        self.pc_feature_extractor = PNet(k=40, normal_channel=False)

        # Fusion layer used by forward(); it was referenced there but never created before.
        self.fc = nn.Linear(self.RGB_FEATURE_DIM + pc_feature_dim, out_dim)

    def forward(self, rgb_input, pc_input):
        """Return the fused features for a batch of images and point clouds.

        Args:
            rgb_input: Image batch fed to the VGG16 branch.
            pc_input: Point-cloud batch fed to the PNet branch.

        Returns:
            Tensor of shape ``[batch, out_dim]`` (non-negative, after ReLU).
        """
        rgb_features = self.rgb_feature_extractor(rgb_input)

        pc_output = self.pc_feature_extractor(pc_input)
        # PNet's result is indexed with [0] in this notebook's example cell, i.e. it returns
        # a sequence whose first entry is the feature tensor; unwrap it before concatenating.
        pc_features = pc_output[0] if isinstance(pc_output, (tuple, list)) else pc_output

        combined_features = torch.cat([rgb_features, pc_features], dim=1)

        return F.relu(self.fc(combined_features))

# Instantiate the spatial block, move it to the selected device and print its layer summary.
# NOTE(review): `device` is assigned in the FashionMNIST cell further down this notebook;
# on a fresh kernel that cell must run first or this line raises NameError.
SB = SpatialBlock().to(device)
print(SB)

In [None]:
class TemporalBlock(nn.Module):
    """Stack of dilated 1-D convolutions that regresses a value from a feature sequence.

    Each layer is ``Conv1d -> ReLU -> BatchNorm1d`` with the channel count kept at
    ``input_dim``. The dilation starts at 1 and is multiplied by ``dilation_base`` after
    every layer. The features at the middle time step are fed to a linear head.

    Args:
        input_dim: Number of channels of the input sequence (and of every conv layer).
        num_layers: Number of conv/ReLU/BatchNorm stages.
        kernel_size: Kernel width shared by all convolutions.
        dilation_base: Factor by which the dilation grows from one layer to the next.
        output_dim: Width of the prediction produced by the linear head.
    """

    def __init__(self, input_dim, num_layers=3, kernel_size=3, dilation_base=2, output_dim=1):
        super().__init__()

        layers = []
        current_dilation = 1

        for _ in range(num_layers):
            # padding="same" keeps the sequence length for ANY kernel size. The previous
            # padding=current_dilation was only length-preserving for kernel_size == 3 and
            # silently shrank the sequence otherwise. For kernel_size == 3 both are identical.
            layers.append(
                nn.Conv1d(
                    input_dim,
                    input_dim,
                    kernel_size=kernel_size,
                    padding="same",
                    dilation=current_dilation,
                )
            )
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(input_dim))
            current_dilation *= dilation_base  # Dilating each layer

        self.tcn = nn.Sequential(*layers)

        # Fully connected layer to output force prediction
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Predict from the middle time step of the convolved sequence.

        Args:
            x: Tensor of shape ``[batch_size, input_dim, sequence_length]``.

        Returns:
            Tensor of shape ``[batch_size, output_dim]``.
        """
        tcn_output = self.tcn(x)  # Temporal convolution over the sequence
        middle_step = tcn_output.size(2) // 2
        tcn_output = tcn_output[:, :, middle_step]  # Take the middle time step features
        return self.fc(tcn_output)  # Predict force from the middle step

# Example usage:
# Suppose we have a sequence of spatial features with shape [batch_size, input_dim, sequence_length]
input_dim = 4608  # From the spatial block's output
sequence_length = 15  # Number of frames in the temporal window

temporal_block = TemporalBlock(input_dim=input_dim, num_layers=3, kernel_size=3)
input_data = torch.randn(15, input_dim, sequence_length)  # Example input with batch size 15
output = temporal_block(input_data)
print(output.shape)  # Expected output shape: [15, 1]

# Second instance, moved to the selected device, printed for inspection.
# NOTE(review): `device` is assigned in the FashionMNIST cell further down this notebook;
# on a fresh kernel that cell must run first or this line raises NameError.
TB = TemporalBlock(input_dim=input_dim).to(device)
print(TB)

In [4]:
# FashionMNIST training split, stored under ./data (download=True) and converted to tensors.
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())

# Matching test split, same storage location and transform.
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

batch_size = 64

# Batched loaders over both splits.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Look at the first test batch only, to report tensor shapes and the label dtype.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break


# Device preference order: CUDA, then MPS, then CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 784 -> 512 -> 512 -> 10 with ReLU between the layers.

    Inputs are flattened from dimension 1 onwards, so any tensor whose trailing
    dimensions multiply to 28*28 is accepted. The output is a tensor of 10 raw logits per sample.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        stack_layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*stack_layers)

    def forward(self, x):
        """Flatten ``x`` and return the logits produced by the linear/ReLU stack."""
        return self.linear_relu_stack(self.flatten(x))

# Build the classifier, place it on the selected device and print its layers.
model = NeuralNetwork()
model = model.to(device)
print(model)

# Cross-entropy objective, optimised with plain SGD at a learning rate of 1e-3.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Batches are moved to the module-level ``device`` before the forward pass. The loss is
    printed for the first batch and then for every 100th batch after it.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches exposing a ``dataset`` attribute.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable computing the loss from ``(predictions, targets)``.
        optimizer: Optimizer holding the model's parameters.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass, parameter update, then reset gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # step is 1-based, so (step - 1) is the 0-based batch index.
        if (step - 1) % 100 == 0:
            seen = step * len(inputs)
            print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{total_samples:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# Train for a fixed number of epochs, evaluating on the test split after each one.
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

# Persist only the parameters (state_dict), not the whole pickled module.
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

# Rebuild a fresh model and restore the saved parameters into it.
model = NeuralNetwork().to(device)
# weights_only=True restricts unpickling to tensors/primitive containers, which avoids
# arbitrary code execution from a tampered checkpoint and silences torch.load's FutureWarning.
# map_location loads the tensors straight onto the device selected for this session.
model.load_state_dict(torch.load("model.pth", map_location=device, weights_only=True))

# Human-readable names for the 10 label indices.
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Classify the first test sample and compare the prediction with its label.
model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26421880/26421880 [00:02<00:00, 10187010.17it/s]


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29515/29515 [00:00<00:00, 451346.56it/s]


Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4422102/4422102 [00:01<00:00, 3806350.15it/s]


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5148/5148 [00:00<00:00, 4627577.58it/s]


Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64
Using cuda device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
Epoch 1
-------------------------------
loss: 2.297380  [   64/60000]
loss: 2.296994  [ 6464/60000]
loss: 2.272621  [12864/60000]
loss: 2.265488  [19264/60000]
loss: 2.267900  [25664/60000]
loss: 2.210917  [32064/60000]
loss: 2.234223  [38464/60000]
loss: 2.188991  [44864/60000]
loss: 2.191795  [51264/60000]
loss: 2.160941  [57664/60000]
Test Error: 
 Accuracy: 32.8%, Avg loss: 2.160184 

Epoch 2
-------------------------------
loss: 2.164182  [   64/60000]
loss: 2.165017  [ 6464

  model.load_state_dict(torch.load("model.pth"))


In [3]:
from pointnet import PNet

# Example usage
# NOTE(review): `batch_size`, `pointcloud` and `model` are also assigned by other cells of this
# notebook; running this cell rebinds them.
batch_size = 15
num_points = 2048
pointcloud = torch.rand(batch_size, 3, num_points)  # Random point cloud data, shape (batch_size, 3, num_points)

# Create PointNet model and run forward pass
model = PNet(normal_channel=False)
# The result is indexable; the recorded run of this cell printed torch.Size([15, 512]) for
# output[0], i.e. (batch_size, 512) rather than (batch_size, num_classes).
output = model(pointcloud)
print(output[0].shape)

torch.Size([15, 512])


In [None]:
from pointnet2 import PNetPP

# Example usage
# NOTE(review): `batch_size`, `pointcloud` and `model` are also assigned by other cells of this
# notebook; running this cell rebinds them.
batch_size = 15
num_points = 2048
pointcloud = torch.rand(batch_size, 3, num_points)  # Random point cloud data, shape (batch_size, 3, num_points)

# Create the PNetPP model (imported from the `pointnet2` module) and run a forward pass
model = PNetPP(num_class=40, normal_channel=False)
# The result is indexed below, so it is a sequence rather than a single tensor.
# NOTE(review): the shape of output[0] has not been recorded for this cell — confirm whether
# it is (batch_size, num_classes) or a feature vector.
output = model(pointcloud)
print(output[0].shape)