In [12]:
import torch
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import re
from torch import nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision.transforms import v2
from torchvision.io import read_image, ImageReadMode
from torchvision.models import vgg16, VGG16_Weights
from pytorch_tcn import TCN, TemporalConv1d, TemporalConvTranspose1d
import torch.nn.functional as F
from pointnet2_ssg import get_model as PNet2
#import optuna

In [2]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(device)
print(torch.cuda.is_available())

cuda
True


In [3]:
def depth_to_pointcloud(depth, intrinsics):
    # Belső paraméterek
    fx, fy, cx, cy = intrinsics
    H, W = depth.shape

    # Pixel-háló generálása
    u, v = torch.meshgrid(
        torch.arange(0, W, device=depth.device),
        torch.arange(0, H, device=depth.device),
        indexing='xy'
    )
    # Vektorok kilapítása
    u, v = u.flatten(), v.flatten() 

    # Mélységkép konvertálása 3D pontokra
    z = depth.flatten()
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy

    # Pontfelhő megalkotása a pontok halmozásával
    points = torch.stack((x, y, z), dim=1)
    
    return points

def extract_image_number(filename):
    match = re.search(r'frame_(\d+)\.png', filename)
    return int(match.group(1)) if match else -1

In [4]:
def mono(rgb_file, model, intrinsic):
    # Kép betöltése és átméretezése a modell bemeneti méretére
    rgb_origin = cv2.imread(rgb_file)[:, :, ::-1]
    input_size = (616, 1064)
    h, w = rgb_origin.shape[:2]
    scale = min(input_size[0] / h, input_size[1] / w)
    rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
    
    # Kamera belső paramétereinek skálázása
    intrinsic = [intrinsic[0] * scale, intrinsic[1] * scale, intrinsic[2] * scale, intrinsic[3] * scale]
    
    # A kép kitöltése
    padding = [123.675, 116.28, 103.53]
    h, w = rgb.shape[:2]
    pad_h = input_size[0] - h
    pad_w = input_size[1] - w
    pad_h_half = pad_h // 2
    pad_w_half = pad_w // 2
    rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
    pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
    
    # Kép normalizálása
    mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
    std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
    rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
    rgb = torch.div((rgb - mean), std)
    rgb = rgb[None, :, :, :].cuda()
    
    # Elő-tanított modell betöltése inferenciára
    model.cuda().eval()
    with torch.no_grad():
        pred_depth, confidence, output_dict = model.inference({'input': rgb})
    
    # Kitöltés eltávolítása
    pred_depth = pred_depth.squeeze()
    pred_depth = pred_depth[pad_info[0]: pred_depth.shape[0] - pad_info[1], pad_info[2]: pred_depth.shape[1] - pad_info[3]]
    
    # Interpoláció 224x224-es méretre
    pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], size=(224, 224), mode='bilinear').squeeze()
    
    # Mélység konvertálása metrikus térbe
    canonical_to_real_scale = intrinsic[0] / 1000.0
    pred_depth = pred_depth * canonical_to_real_scale
    pred_depth = torch.clamp(pred_depth, 0, 300)

    return pred_depth

In [5]:
class RPC_Dataset(Dataset):
    def __init__(self, image_dir, forces, intrinsics, image_transform=None, pointcloud_transform=None):
        self.image_dir = image_dir
        self.labels = pd.read_csv(forces, header=None)
        self.intrinsics = intrinsics
        self.metric3d = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_small', pretrain=True)

        vgg = vgg16(weights=VGG16_Weights.DEFAULT)
        vgg.eval()
        vgg.to(device)
        self.rgb_feature_extractor = nn.Sequential(
            *list(vgg.features.children()),   # Konvolúciós rétegek
            nn.AdaptiveAvgPool2d((7, 7)),     # 7x7-es kimenet
            nn.Flatten(),                     # Kilapítás a következő rétegek miatt
            *list(vgg.classifier.children())[:-1]  # Az utolsó előtti réteggel bezárólag (4096-os kimenet)
        )

        # PointNet++ betöltése elő-tanított súlyokkal
        pointnet = PNet2(40, normal_channel=False)
        checkpoint = torch.load("pnet2_weights_ssg.pth")
        pointnet.load_state_dict(checkpoint['model_state_dict'], strict=False)
        pointnet.eval()
        pointnet.to(device)
        self.pc_feature_extractor = pointnet

        rgb_transform = v2.Compose([
            v2.Resize((224, 224)),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True)
        ])

        self.image_transform = rgb_transform
        self.image_augment = image_transform
        
        self.seq_length = 15

        self.image_filenames = sorted(os.listdir(image_dir), key=extract_image_number)
        

    def __len__(self):
        #print(len(self.image_filenames))
        return int(len(self.image_filenames) / self.seq_length)

    def __getitem__(self, idx):
        images = []
        pointclouds = []

        # Ciklus a szekvencia hosszán
        for i in range(self.seq_length):
            # Kép betöltése
            img_path = os.path.join(self.image_dir, self.image_filenames[idx * self.seq_length + i])
            image = read_image(img_path, ImageReadMode.RGB)
            
            # Mélység térkép becslése
            pred_depth = mono(img_path, self.metric3d, self.intrinsics)
            
            # Alap transzformációk alkalmazása a képen
            image = self.image_transform(image)

            # Mélység térkép pontfelhővé konvertálása
            pointcloud_tensor = depth_to_pointcloud(pred_depth, self.intrinsics)
            
            pointcloud_tensor = pointcloud_tensor.transpose(0, 1)
            image, pointcloud_tensor = image.to(device), pointcloud_tensor.to(device)

            # Tulajdonság kinyerés
            rgb_features = self.rgb_feature_extractor(image.unsqueeze(0))
            pc_features = self.pc_feature_extractor(pointcloud_tensor.unsqueeze(0))[0]
    
            # Virtuális "batch" dimenzió eltávolítása
            rgb_features = rgb_features.squeeze(0)
            pc_features = pc_features.squeeze(0)
            
            # Kinyert adatok listába rendezése
            images.append(rgb_features)
            pointclouds.append(pc_features)
        
        images = torch.stack(images, dim=0)
        pointclouds = torch.stack(pointclouds, dim=0)
        
        # Annotáció megszerzése erre a szekvenciára
        label = self.labels.iloc[idx * self.seq_length + self.seq_length - 1, 2]
        
        return images, pointclouds, torch.tensor(label, dtype=torch.float32)

# Define transform for the images (if needed)
rgb_augment = v2.Compose([
    v2.Resize((224, 224)),
    v2.ToImage(), 
    v2.ToDtype(torch.float32, scale=True),
    v2.AutoAugment()
])

# Initialize the dataset
'''
train_image_dir = 'data/frame/ECM_frames/left'
train_forces = 'data/frame/ECM_frames/labels.csv'

test_image_dir = 'data/frame/ECM_frames/left_test'
test_forces = 'data/frame/ECM_frames/test.csv'
'''
# Debugging dataset
train_image_dir = 'data/frame/ECM_frames/right'
train_forces = 'data/frame/ECM_frames/debug.csv'

test_image_dir = 'data/frame/ECM_frames/right_test'
test_forces = 'data/frame/ECM_frames/debug_test.csv'


### Belső paraméterek: [fx, fy, cx, cy]
# ECM bal kamera: [833.170, 909.161, 275.833, 297.017]
# ECM jobb kamera: [832.659, 908.230, 407.354, 304.965]
intrinsics = [833.170, 909.161, 275.833, 297.017]

train_dataset = RPC_Dataset(train_image_dir, train_forces, intrinsics)
test_dataset = RPC_Dataset(test_image_dir, test_forces, intrinsics)
#augmented_test_dataset = RPC_Dataset(test_image_dir, test_pointcloud_dir, test_forces, image_transform=rgb_augment, pointcloud_transform=pc_augment)
#augmented_test_dataset = ConcatDataset([test_dataset, augmented_test_dataset])

'''
data = dataset.__getitem__(5)
image, pointcloud = data[0], data[1]
visualize_rgb(image)
visualize_point_cloud(pointcloud)
'''
# Use DataLoader to batch
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

'''
for batch in train_dataloader:
    cats, labels = batch  # Unpack all three: images, pointclouds, and labels
    print("Cats batch size:", cats.size())
    #print("Pointcloud batch size:", pointclouds.size())
    print("Labels:", labels)
    break
'''

Using cache found in C:\Users\Martin/.cache\torch\hub\yvanyin_metric3d_main
A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "C:\Users\Martin\AppData\Local\Programs\Python\Python311\Lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
    ^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'triton'
  from xformers.components.attention import ScaledDotProduct
  checkpoint = torch.load("pnet2_weights_ssg.pth")
Using cache found in C:\Users\Martin/.cache\torch\hub\yvanyin_metric3d_main
  checkpoint = torch.load("pnet2_weights_ssg.pth")


'\nfor batch in train_dataloader:\n    cats, labels = batch  # Unpack all three: images, pointclouds, and labels\n    print("Cats batch size:", cats.size())\n    #print("Pointcloud batch size:", pointclouds.size())\n    print("Labels:", labels)\n    break\n'

In [10]:
class RPC_TCN(nn.Module):
    def __init__(self, input_size=4608, output_size=1, num_channels=[64, 128, 256], kernel_size=3, dropout=0.1):
        super(RPC_TCN, self).__init__()
        
        self.sequence_length = 15

        # Temporális Blokk
        
        self.tcn = TCN(input_size, num_channels, kernel_size=kernel_size, dropout=dropout, use_norm='layer_norm', output_projection=1)

    def forward(self, images, pointclouds):
        cat_tensors = []
    
        # Ciklus a képkockákon át
        for k in range(self.sequence_length):
            images = images.squeeze(0)
            pointclouds = pointclouds.squeeze(0)
            rgb_features = images[k]  # Shape: (3, H, W)
            pc_features = pointclouds[k]  # Shape: (N, 3)
            #rgb_features = rgb_features.unsqueeze(0)
            #pc_features = pc_features.unsqueeze(0)
            #print(rgb_features.size())
            #print(pc_features.size())
            # Concatenate RGB and point cloud features
            cat = torch.cat((rgb_features, pc_features), dim=0)  # Shape: (rgb_feature_dim + pc_feature_dim)
    
            # Add the concatenated features to the list
            cat_tensors.append(cat)
    
        # Stack features to form a sequence tensor for TCN input
        cat_input = torch.stack(cat_tensors, dim=0)  # Shape: (sequence_length, rgb_feature_dim + pc_feature_dim)
        #print(cat_input.size())
        # TB forward prop
        cat_input = cat_input.unsqueeze(0)
        cat_input = cat_input.transpose(1, 2)
        #print(cat_input.size())
        force = self.tcn(cat_input)  # Apply TCN layer
        #print(force)
        #print(force.size())
        #force = self.fc(force[:, :, -1])  # Take the last time step and pass through fully connected layer
        #print(force.size())
        #print(force[:, :, -1])
        #force = F.relu(force)
        force = F.softplus(force)
        
        return force[:, :, -1]

model = RPC_TCN().to(device)
#print(model)

In [7]:
# Mean Squared Error
#loss_fn = nn.MSELoss()
loss_fn = nn.SmoothL1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)  # T_max is the number of epochs
#accumulation_steps = 8  # Effective batch size of 8


def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (rgbs, pcs, f) in enumerate(dataloader):
        #torch.cuda.empty_cache()
        rgbs, pcs, f = rgbs.float(), pcs.float(), f.float()
        rgbs, pcs, f, = rgbs.to(device), pcs.to(device), f.to(device)
        #f = f * 0.001
        #print(f)
        
        # Compute prediction error
        pred = model(rgbs, pcs)
        pred = pred.squeeze(0)
        #print(pred)
        loss = loss_fn(pred, f)
        #loss = loss_fn(pred, f) / accumulation_steps
        
        # Backpropagation
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
        '''
        # Accumulated Backpropogation
        if (batch + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()
        '''
        #if batch % 1 == 0:
        loss, current = loss.item(), (batch + 1) * len(rgbs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    total_absolute_error, test_loss = 0, 0

    with torch.no_grad():
        for rgbs, pcs, f in dataloader:
            rgbs, pcs, f = rgbs.float(), pcs.float(), f.float()
            rgbs, pcs, f, = rgbs.to(device), pcs.to(device), f.to(device)
            
            # Forward pass: Compute predictions
            pred = model(rgbs, pcs)
            pred = pred.squeeze(0)
            
            # Accumulate Mean Squared Error loss (MSE) as used in training
            test_loss += loss_fn(pred, f).item()
            
            # Calculate Mean Absolute Error (MAE) for this batch
            total_absolute_error += torch.sum(torch.abs(pred - f)).item()

    # Calculate average losses
    avg_mse_loss = test_loss / num_batches
    mae = total_absolute_error / size

    print(f"Test Results: \n MAE: {mae:.5f}, Avg MAE Loss: {avg_mse_loss:.5f} \n")

In [8]:
epochs = 50
for t in range(epochs):
    torch.cuda.empty_cache()
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
    if (t + 1) % 5 == 0:
        scheduler.step()
print("Done!")

torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")


Epoch 1
-------------------------------
tensor([[[ 1.3788, -0.4018,  0.9407,  0.5803,  0.3185,  0.7398,  0.3081,
           0.3362,  0.2900,  0.6859,  0.0645,  1.0378,  1.0018,  1.4391,
           0.9255]]], device='cuda:0', grad_fn=<ConvolutionBackward0>)
tensor([[0.9255]], device='cuda:0', grad_fn=<SelectBackward0>)
loss: 0.423627  [    1/   80]
tensor([[[-0.2691,  0.1405,  0.8941,  0.2826,  0.4070,  0.6103,  0.3528,
           0.5133,  0.7365,  0.4311, -0.0205,  0.6192,  0.1572,  0.1086,
           0.6356]]], device='cuda:0', grad_fn=<ConvolutionBackward0>)
tensor([[0.6356]], device='cuda:0', grad_fn=<SelectBackward0>)
loss: 0.200725  [    2/   80]
tensor([[[0.4786, 0.2351, 0.7367, 3.1994, 0.7689, 0.6085, 0.2584, 0.2766,
          1.0744, 0.8367, 0.7148, 1.0440, 2.0869, 1.1267, 0.8789]]],
       device='cuda:0', grad_fn=<ConvolutionBackward0>)
tensor([[0.8789]], device='cuda:0', grad_fn=<SelectBackward0>)
loss: 0.384448  [    3/   80]
tensor([[[0.4474, 0.5426, 0.8281, 1.4129, 2.4503

KeyboardInterrupt: 