In [None]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchvision import transforms
from scipy.spatial.transform import Rotation as R

from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F

# -------------------------
# Dataset
# -------------------------
class StepwiseDeepVODataset(Dataset):
    """Sliding-window dataset of stacked image pairs and their poses.

    Each ``.npz`` file in ``data_dir`` describes one step: two image paths
    (``img1``, ``img2``) and a ``pose`` array. Sample ``idx`` is the window of
    ``seq_len`` consecutive steps that starts at file ``idx``, with files taken
    in sorted-name order.

    Args:
        data_dir: Directory holding the per-step ``.npz`` files. Entries that
            do not end in ``.npz`` are ignored.
        img_size: Size handed to ``transforms.Resize`` by the default transform.
        transform: Optional callable applied to every image returned by
            ``cv2.imread``; its results must be tensors that ``torch.cat`` can
            join along dim 0. When ``None``, the default pipeline
            ToPILImage -> Resize(img_size) -> ToTensor is used.
        seq_len: Number of consecutive steps per sample.
    """

    def __init__(self, data_dir, img_size=(128, 384), transform=None, seq_len=5):
        # Only .npz entries are samples; stray files or folders in the directory
        # (notes, notebook checkpoints, ...) would otherwise break np.load.
        self.files = sorted(f for f in os.listdir(data_dir) if f.endswith('.npz'))
        self.data_dir = data_dir
        self.img_size = img_size
        self.seq_len = seq_len

        # Compare against None explicitly: a valid but "falsy" callable (for
        # example an empty nn.Sequential) must not be silently replaced.
        if transform is None:
            transform = transforms.Compose([
                transforms.ToPILImage(),
                transforms.Resize(self.img_size),
                transforms.ToTensor()
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of complete windows (0 when there are too few files)."""
        # Clamp at zero: a negative value makes len() raise ValueError.
        return max(0, len(self.files) - self.seq_len + 1)

    def __getitem__(self, idx):
        """Return ``(images, poses)`` for the window starting at file ``idx``.

        Returns:
            images: Tensor of the ``seq_len`` stacked pairs; with the default
                transform its shape is (seq_len, 6, H, W).
            poses: Float tensor with one row per step of the window.

        Raises:
            ValueError: If either image of a step cannot be read.
        """
        images = []
        poses = []
        for offset in range(self.seq_len):
            item_path = os.path.join(self.data_dir, self.files[idx + offset])
            # NOTE(security): allow_pickle=True can execute code embedded in a
            # crafted file; only load .npz files from a trusted source.
            # The context manager closes the underlying zip file handle.
            with np.load(item_path, allow_pickle=True) as item:
                img1_path = item['img1'].item()
                img2_path = item['img2'].item()
                pose = item['pose']

            # cv2.imread signals failure by returning None instead of raising.
            img1 = cv2.imread(img1_path)
            img2 = cv2.imread(img2_path)
            if img1 is None or img2 is None:
                raise ValueError(f"Image read failed at index {idx + offset}: {img1_path} or {img2_path}")

            # Stack the two frames along the channel axis.
            pair = torch.cat([self.transform(img1), self.transform(img2)], dim=0)

            images.append(pair)
            poses.append(pose)

        images = torch.stack(images)
        poses = torch.from_numpy(np.array(poses)).float()

        return images, poses

    
# -------------------------
# Model
# -------------------------
class DeepVO(nn.Module):
    """Four strided convolutions followed by an LSTM and a 6-value linear head.

    NOTE: a second class with the same name is defined further down in this
    file; once that definition runs, it replaces this one.

    Args:
        input_size: (height, width) of the frames; only used to size the LSTM
            input by probing the convolutional stack once.
        hidden_size: Width of the LSTM hidden state.
        rnn_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=(128, 384), hidden_size=1000, rnn_layers=1):
        super().__init__()
        self.conv1 = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2, padding=2)
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        self.relu = nn.ReLU()

        # Push one all-zero frame pair through the encoder to learn how many
        # features a single time step produces once flattened.
        with torch.no_grad():
            probe = self._encode(torch.zeros(1, 6, *input_size))
        conv_out_size = probe.view(1, -1).shape[1]

        self.rnn = nn.LSTM(input_size=conv_out_size,
                           hidden_size=hidden_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.fc = nn.Linear(hidden_size, 6)

    def _encode(self, frames):
        """Apply conv1..conv4, each followed by ReLU."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            frames = self.relu(conv(frames))
        return frames

    def forward(self, x):
        """Map a (B, T, C, H, W) batch to per-step outputs of shape (B, T, 6)."""
        batch_size, seq_len, c, h, w = x.size()

        # Fold time into the batch axis so the 2-D convolutions see plain images.
        features = self._encode(x.view(batch_size * seq_len, c, h, w))

        # Unfold again and flatten each step's feature map for the LSTM.
        features = features.reshape(batch_size, seq_len, -1)

        rnn_out, _ = self.rnn(features)
        return self.fc(rnn_out)

class DeepVO(nn.Module):
    """Nine-layer convolutional encoder, dropout, LSTM and a 6-value linear head.

    Args:
        input_size: (height, width) of the frames; only used to size the LSTM
            input by probing the convolutional stack once.
        hidden_size: Width of the LSTM hidden state.
        rnn_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=(128, 384), hidden_size=1000, rnn_layers=2):
        super().__init__()
        self.leaky_relu = nn.LeakyReLU(0.1)

        self.conv1 = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2, padding=2)
        self.conv3_1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        self.conv4_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1)
        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1)

        self.dropout = nn.Dropout(p=0.5)

        # Push one all-zero frame pair through the encoder to learn how many
        # features a single time step produces once flattened.
        dummy_input = torch.zeros(1, 6, *input_size)
        with torch.no_grad():
            conv_out_size = self.forward_cnn(dummy_input).flatten(1).shape[1]

        self.rnn = nn.LSTM(input_size=conv_out_size,
                           hidden_size=hidden_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.fc = nn.Linear(hidden_size, 6)

    def forward_cnn(self, x):
        """Apply every convolution in order, each followed by LeakyReLU(0.1)."""
        x = self.leaky_relu(self.conv1(x))
        x = self.leaky_relu(self.conv2(x))
        x = self.leaky_relu(self.conv3(x))
        x = self.leaky_relu(self.conv3_1(x))
        x = self.leaky_relu(self.conv4(x))
        x = self.leaky_relu(self.conv4_1(x))
        x = self.leaky_relu(self.conv5(x))
        x = self.leaky_relu(self.conv5_1(x))
        x = self.leaky_relu(self.conv6(x))
        return x

    def forward(self, x):
        """Map a (B, T, C, H, W) batch to per-step outputs of shape (B, T, 6)."""
        batch_size, seq_len, c, h, w = x.size()

        # reshape (not view) so non-contiguous inputs, e.g. a time-major batch
        # transposed to batch-first, are accepted; it only copies when needed.
        x = x.reshape(batch_size * seq_len, c, h, w)

        x = self.forward_cnn(x)  # (B*T, C', H', W')
        x = x.reshape(batch_size, seq_len, -1)  # (B, T, FeatureDim)

        # Dropout is applied to the flattened features before the recurrent layers.
        x = self.dropout(x)

        rnn_out, _ = self.rnn(x)
        output = self.fc(rnn_out)
        return output



# -------------------------
# Training and Testing
# -------------------------
scaler = torch.amp.GradScaler()

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        with torch.amp.autocast(device_type=device):
            pred = model(X)
            loss = criterion(pred, y)

        scaler.scale(loss).backward()

        # Clip gradients here
        scaler.unscale_(optimizer)  # Unscale first before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        scaler.step(optimizer)
        scaler.update()

        # pred = model(X)
        # loss = criterion(pred, y)

        # loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        # optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)


device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Directory of per-step .npz files used for training.
# Other directories tried earlier: "data/train/08", "data/train/isep".
train_dataset = StepwiseDeepVODataset("data/train/00_short")

# Keep only the leading fraction of the windows (1.0 keeps all of them).
subset_size = int(len(train_dataset) * 1.0)
subset_indices = [*range(subset_size)]
train_subset = Subset(train_dataset, subset_indices)

# Batches of 10 windows, drawn in a new random order every epoch.
train_loader = DataLoader(train_subset, shuffle=True, batch_size=10)

In [3]:
# -------------------------
# Main
# -------------------------
import csv

loss_log_path = "checkpoints2_lstm/training_loss_mse_kitti_00_short_5seq.csv"

# open(..., 'w') does not create missing parent directories, so make sure the
# log directory exists before the CSV is touched.
os.makedirs(os.path.dirname(loss_log_path), exist_ok=True)

# Write the header only when the log does not exist yet, so resumed runs keep
# appending to the same history.
if not os.path.exists(loss_log_path):
    with open(loss_log_path, mode='w', newline='') as f:
        loss_writer = csv.writer(f)
        loss_writer.writerow(["Epoch", "Loss"])

model = DeepVO().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

def deepvo_loss(pred, target, alpha=1, beta=100):
    """Weighted sum of two MSE terms over (B, T, 6) pose tensors.

    The first three entries of the last axis form one term (weighted by
    ``alpha``) and the remaining entries the other (weighted by ``beta``).

    Args:
        pred: Predicted poses, shape (B, T, 6).
        target: Reference poses, same shape as ``pred``.
        alpha: Weight of the term over entries 0..2.
        beta: Weight of the term over entries 3 onwards.

    Returns:
        Scalar tensor ``alpha * mse(first part) + beta * mse(second part)``.
    """
    translation_term = F.mse_loss(pred[:, :, :3], target[:, :, :3])
    rotation_term = F.mse_loss(pred[:, :, 3:], target[:, :, 3:])
    return translation_term * alpha + rotation_term * beta

criterion = nn.MSELoss()
# Alternative criterion that weights the two halves of the pose separately:
#criterion = lambda pred, target: deepvo_loss(pred, target)

# -------------------------------
# Load checkpoint if it exists
# -------------------------------
os.makedirs("checkpoints2_lstm", exist_ok=True)
ckpt_path = "checkpoints2_lstm/deepvo_checkpoint_lstm2_kitti_00_short_5seq.pt"
start_epoch = 0
if os.path.exists(ckpt_path):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine can be resumed on a CPU-only one.
    checkpoint = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    # The checkpoint stores the last finished epoch; continue with the next one.
    start_epoch = checkpoint["epoch"] + 1
    print(f"Loaded checkpoint from epoch {start_epoch}")
else:
    print("Starting training from scratch")

# Train for 10 more epochs, numbering them from start_epoch onwards.
for epoch in range(10):
    loss = train(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {start_epoch+epoch}, Loss: {loss:.4f}")

    # Append this epoch's loss to the CSV log.
    with open(loss_log_path, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([start_epoch+epoch, loss])

    # Overwrite the checkpoint after every epoch so an interrupted run loses
    # at most the epoch in progress.
    torch.save({
        "epoch": start_epoch+epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict()
    }, ckpt_path)


Loaded checkpoint from epoch 3
Epoch 3, Loss: 0.0024
Epoch 4, Loss: 0.0009
Epoch 5, Loss: 0.0005
Epoch 6, Loss: 0.0002
Epoch 7, Loss: 0.0002
Epoch 8, Loss: 0.0001
Epoch 9, Loss: 0.0001
Epoch 10, Loss: 0.0001


KeyboardInterrupt: 

In [None]:

# Fresh model and optimizer that receive the saved state.
model = DeepVO().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

ckpt_path = "checkpoints2_lstm/deepvo_checkpoint_lstm2_kitti_00_short_5seq.pt"

# Without a checkpoint there is nothing to evaluate, so stop right away.
if not os.path.exists(ckpt_path):
    raise FileNotFoundError("Model not found!")

# map_location places the stored tensors on whatever device this session uses.
checkpoint = torch.load(ckpt_path, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
print(f"Loaded model from epoch {checkpoint['epoch']+1}")

# Switch to inference mode (this disables the model's dropout layer).
model.eval()

FileNotFoundError: Model not found!

In [4]:

# Directory of per-step .npz files used for evaluation.
# Other directories tried earlier: "data/train/05", "data/test/isep".
test_dataset = StepwiseDeepVODataset("data/train/05_cut_500")

# Keep only the leading fraction of the windows (1.0 keeps all of them).
subset_size = int(len(test_dataset) * 1.0)
subset_indices = [*range(subset_size)]
test_subset = Subset(test_dataset, subset_indices)

# One window per batch, in file order.
test_loader = DataLoader(test_subset, shuffle=False, batch_size=1)

def test_old(model, dataloader, device):
    model.eval()
    preds = []
    gts = []
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            pred = model(X)
            preds.append(pred.cpu())
            gts.append(y.cpu())
    return torch.cat(preds), torch.cat(gts)

def test(model, dataloader, device):
    model.eval()
    preds = []
    gts = []
    with torch.no_grad():
        for X, y in dataloader:  # get both input and ground truth poses
            X = X.to(device)
            pred = model(X)  # (B, T, 6)
            preds.append(pred[:, -1, :].cpu())  # only last predicted pose per sequence
            gts.append(y[:, -1, :].cpu())       # only last ground truth pose per sequence
    preds = torch.cat(preds)  # (num_sequences, 6)
    gts = torch.cat(gts)      # (num_sequences, 6)
    return preds, gts


# Run inference over the test windows; test() keeps only the last step of each
# window, so both results hold one row per window.
pred_outputs, gt_outputs = test(model, test_loader, device)

In [5]:
import plotly.graph_objs as go


# -------------------------
# Integrate and Plot
# -------------------------
def integrate_poses(y_output, start_pos=np.zeros(3), start_rot=R.identity()):
    """Chain relative pose steps into a sequence of absolute positions.

    Each step holds six values: a translation (first three) expressed in the
    current orientation, followed by a rotation vector (last three) that is
    applied after the translation.

    Args:
        y_output: Steps as a NumPy array, nested list or torch tensor. Any
            shape whose last axis has length 6 is accepted; leading axes are
            flattened in order.
        start_pos: Initial position (3 values). It is copied, never modified.
        start_rot: Initial orientation as a SciPy ``Rotation``.

    Returns:
        np.ndarray of shape (N + 1, 3): the start position followed by the
        position after each of the N steps.
    """
    # Torch tensors may live on the GPU or track gradients; bring them to host
    # memory first (duck-typed so torch does not have to be imported here).
    if hasattr(y_output, "detach"):
        y_output = y_output.detach().cpu().numpy()
    steps = np.asarray(y_output, dtype=float).reshape(-1, 6)

    # A float copy keeps the in-place addition below valid for integer start
    # positions and leaves the caller's array (and the default) untouched.
    curr_pos = np.array(start_pos, dtype=float)
    curr_rot = start_rot
    positions = [curr_pos.copy()]
    for step in steps:
        # Rotate the local displacement into the global frame, then advance.
        curr_pos += curr_rot.apply(step[:3])
        curr_rot = curr_rot * R.from_rotvec(step[3:])
        positions.append(curr_pos.copy())
    return np.array(positions)



def plot_pred_vs_gt(pred_positions, gt_positions):
    """Show predicted and ground-truth trajectories in one interactive 3-D figure.

    Args:
        pred_positions: Array indexed as ``[:, 0..2]`` for x, y, z; drawn in red.
        gt_positions: Array with the same layout; drawn in green.
    """
    def _line_trace(points, label, colour):
        # One 3-D polyline through the rows of `points`.
        return go.Scatter3d(
            x=points[:, 0],
            y=points[:, 1],
            z=points[:, 2],
            mode='lines',
            name=label,
            line=dict(color=colour)
        )

    traces = [
        _line_trace(gt_positions, 'Ground Truth', 'green'),
        _line_trace(pred_positions, 'Predicted', 'red'),
    ]

    base_layout = go.Layout(
        scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'),
        legend=dict(x=0, y=1),
        margin=dict(l=0, r=0, b=0, t=0)
    )
    fig = go.Figure(data=traces, layout=base_layout)

    # Final presentation: title, axis labels in metres, data-proportional axes
    # and a top margin that leaves room for the title.
    fig.update_layout(
        title=f'Trajectory',
        scene=dict(
            xaxis_title='X (m)',
            yaxis_title='Y (m)',
            zaxis_title='Z (m)',
            aspectmode='data'
        ),
        margin=dict(l=0, r=0, b=0, t=30)
    )

    fig.show()



# test() returns CPU torch tensors with one 6-value row per window. Convert them
# to NumPy explicitly instead of relying on implicit tensor-to-array conversion
# inside the NumPy/SciPy code of integrate_poses.
gt_positions = integrate_poses(np.asarray(gt_outputs))
# Start the predicted trajectory at the same point as the ground truth.
pred_positions = integrate_poses(np.asarray(pred_outputs), start_pos=gt_positions[0])

plot_pred_vs_gt(pred_positions, gt_positions)

In [5]:
import numpy as np
import plotly.graph_objs as go

pose_file = "/home/ros_ws/noetic/data/kitti/data_odometry_poses/00_cut.txt"

# Fraction of the poses that is drawn (0.3 = 30 %).
sample_fraction = 0.3

# Read poses: every usable line holds 12 floats, reshaped row by row into a
# 3x4 matrix. Lines with any other number of values are skipped.
poses = []
with open(pose_file, 'r') as f:
    for line in f:
        vals = list(map(float, line.strip().split()))
        if len(vals) != 12:
            continue
        pose = np.array(vals).reshape(3, 4)
        poses.append(pose)

# Without this check an empty or unparsable file only fails later with an
# IndexError while sampling.
if not poses:
    raise ValueError(f"No valid 12-value pose lines found in {pose_file}")

# The last column of each 3x4 matrix is used as the position.
positions = np.array([pose[:, 3] for pose in poses])

# Pick evenly spaced indices covering sample_fraction of the points (at least 1 point)
num_points = len(positions)
sample_size = max(1, int(num_points * sample_fraction))
sample_indices = np.linspace(0, num_points - 1, sample_size).astype(int)
sampled_positions = positions[sample_indices]

x = sampled_positions[:, 0]
y = sampled_positions[:, 1]
z = sampled_positions[:, 2]

fig = go.Figure(data=[go.Scatter3d(
    x=x, y=y, z=z,
    mode='lines+markers',
    marker=dict(size=4, color='red'),  # bigger & red so sparse points stand out
    line=dict(width=2, color='red')
)])

fig.update_layout(
    title=f'Trajectory from KITTI Poses (sampled points = {sample_size})',
    scene=dict(
        xaxis_title='X (m)',
        yaxis_title='Y (m)',
        zaxis_title='Z (m)',
        aspectmode='data'
    ),
    margin=dict(l=0, r=0, b=0, t=30)
)

fig.show()


Create npz files for training

In [1]:
import os
import cv2
import numpy as np
from tqdm import tqdm

def load_poses(pose_file):
    """Read a text file of 3x4 pose matrices and return them as 4x4 matrices.

    Every non-blank line must hold 12 whitespace-separated numbers, which are
    reshaped row by row into a 3x4 matrix and placed in the top three rows of
    a 4x4 identity matrix. Blank lines are ignored.

    Args:
        pose_file: Path of the text file to read.

    Returns:
        list of np.ndarray, each of shape (4, 4), in file order.

    Raises:
        ValueError: If a non-blank line does not hold exactly 12 numbers.
    """
    poses = []
    with open(pose_file, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # e.g. a trailing newline at the end of the file
                continue
            if len(fields) != 12:
                # Skipping silently would shift every later pose against its
                # image, so report the offending line instead.
                raise ValueError(
                    f"{pose_file}:{line_no}: expected 12 values, got {len(fields)}"
                )
            pose = np.eye(4)
            pose[:3, :] = np.array([float(v) for v in fields]).reshape(3, 4)
            poses.append(pose)
    return poses

def compute_rel_poses(poses):
    """Convert consecutive absolute poses into 6-value relative motions.

    For every neighbouring pair the relative transform is
    ``inv(previous) @ current``. Its translation column and the
    ``cv2.Rodrigues`` vector of its 3x3 rotation block are concatenated into
    one row.

    Args:
        poses: Sequence of 4x4 pose matrices.

    Returns:
        np.ndarray with one row ``[tx, ty, tz, rx, ry, rz]`` per consecutive
        pair (empty when fewer than two poses are given).
    """
    rel_poses = []
    for prev_pose, curr_pose in zip(poses[:-1], poses[1:]):
        rel = np.linalg.inv(prev_pose) @ curr_pose
        # cv2.Rodrigues returns (vector, jacobian); only the vector is needed.
        rotvec, _ = cv2.Rodrigues(rel[:3, :3])
        rel_poses.append(np.hstack([rel[:3, 3], rotvec.flatten()]))
    return np.array(rel_poses)

def preprocess_lightweight(seq, save_dir, img_size=(128, 384), img_dir=None, pose_file=None):
    """Write one small ``.npz`` per consecutive frame pair of a sequence.

    Each output file ``{i:06d}.npz`` stores the paths of image ``i`` and image
    ``i + 1`` (``img1``, ``img2``) plus the relative pose between them
    (``pose``). Images are referenced by path only; no pixel data is copied.

    Args:
        seq: Sequence name, used to build the default input paths.
        save_dir: Output directory; created if missing.
        img_size: Not used by this function; kept so existing calls still work.
        img_dir: Directory with the sequence images. Defaults to
            ``/home/ros_ws/noetic/data/kitti/sequences/{seq}/image_0``.
        pose_file: Text file with one pose per image. Defaults to
            ``/home/ros_ws/noetic/data/kitti/data_odometry_poses/{seq}.txt``.

    Raises:
        ValueError: If there are fewer images than the poses require.
    """
    os.makedirs(save_dir, exist_ok=True)
    if img_dir is None:
        img_dir = f"/home/ros_ws/noetic/data/kitti/sequences/{seq}/image_0"
    if pose_file is None:
        pose_file = f"/home/ros_ws/noetic/data/kitti/data_odometry_poses/{seq}.txt"

    images = sorted(os.listdir(img_dir))
    poses = load_poses(pose_file)
    rel_poses = compute_rel_poses(poses)

    # Relative pose i links image i and image i + 1, so N relative poses need
    # N + 1 images. Check up front rather than hitting an IndexError after
    # part of the output has already been written.
    required_images = len(rel_poses) + 1
    if required_images > len(images):
        raise ValueError(
            f"{img_dir} has {len(images)} images but {pose_file} needs {required_images}"
        )

    for i in tqdm(range(len(rel_poses))):
        data = {
            'img1': os.path.join(img_dir, images[i]),
            'img2': os.path.join(img_dir, images[i+1]),
            'pose': rel_poses[i]
        }
        np.savez(os.path.join(save_dir, f"{i:06d}.npz"), **data)

# Example usage: write the per-pair .npz files for sequence "08" into data/train/08.
# NOTE(review): img_size is passed here, but preprocess_lightweight never reads it.
preprocess_lightweight("08", "data/train/08", img_size=(128, 384))


100%|██████████| 4070/4070 [00:00<00:00, 9132.69it/s]


In [None]:
import os
import numpy as np

def plot_image_paths(data_dir, num_samples=5):
    """Print the image-pair paths stored in the first files of ``data_dir``.

    Despite its name this function draws nothing; it prints two lines per file
    (``img1`` and ``img2``) for the first ``num_samples`` entries in sorted
    order. If the directory holds fewer entries, all of them are printed.

    Args:
        data_dir: Directory of ``.npz`` files with ``img1`` / ``img2`` entries.
        num_samples: Maximum number of files to report.
    """
    files = sorted(os.listdir(data_dir))[:num_samples]

    img1_paths = []
    img2_paths = []

    for file in files:
        # NOTE(security): allow_pickle=True can execute code embedded in a
        # crafted file; only inspect .npz files from a trusted source.
        # The context manager closes the underlying zip file handle.
        with np.load(os.path.join(data_dir, file), allow_pickle=True) as data:
            img1_paths.append(data['img1'].item())
            img2_paths.append(data['img2'].item())

    # Iterate over what was actually loaded rather than range(num_samples),
    # which overruns the lists when the directory has fewer files.
    for i, (img1_path, img2_path) in enumerate(zip(img1_paths, img2_paths)):
        print(f"{i}: img1 = {img1_path}")
        print(f"   img2 = {img2_path}")

# Example usage: print the image paths stored in the first five files of data/train/00_cut.
plot_image_paths("data/train/00_cut", num_samples=5)


0: img1 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003000.png
   img2 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003001.png
1: img1 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003001.png
   img2 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003002.png
2: img1 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003002.png
   img2 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003003.png
3: img1 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003003.png
   img2 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003004.png
4: img1 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003004.png
   img2 = /home/ros_ws/noetic/data/kitti/sequences/00_cut/image_0/003005.png


In [1]:
import cv2

im_path = '/home/ros_ws/noetic/data/kitti/sequences/05/image_0/000000.png'
im = cv2.imread(im_path)

# cv2.imread returns None instead of raising when the file is missing or
# unreadable; fail here with a clear message rather than on im.shape below.
if im is None:
    raise FileNotFoundError(f"cv2.imread could not read {im_path}")

# Inspect what OpenCV hands back: the container type and the array shape.
print(type(im))
print(im.shape)
print(type(im.shape))

<class 'numpy.ndarray'>
(370, 1226, 3)
<class 'tuple'>
