In [1]:
import os
from scipy.io import loadmat
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
def init_vid_extr(session_id, H_map="RGB_videos/video_data_n3"):
    h_path = os.path.join(H_map, f"H{session_id}.mat")
    # load homograph
    H = loadmat(h_path)['H']
    return H

def world_to_pixel(wrld_cords, H):
    pts = wrld_cords.reshape(-1, 1, 2)
    pixel_pts = cv2.perspectiveTransform(pts, H)
    return pixel_pts.reshape(-1, 2)

In [4]:
def testing_it(session_id, timesteps=10):
    label_file = os.path.join("Pedestrian_labels", f"{session_id}_frame.txt")
    print(label_file)

    sess_id = session_id
    t = timesteps
    data = pd.read_csv(label_file, header=None, names=['frame', 'person_id', 'x', 'y', 'z'])
    data = data.sort_values(by=['person_id', 'frame']).reset_index(drop=True)
    samples = []
    print(str(len(data.groupby('person_id'))) + " person_ids")
    for pid, group in data.groupby('person_id'):
        group = group.sort_values('frame').reset_index(drop=True)
        if len(group) < timesteps + 1:
            continue
        for i in range(len(group) - (2*timesteps)):
            seq = group.iloc[i:i+timesteps]
            target = group.iloc[i+timesteps:i+(2*timesteps)]
            samples.append((seq, target))
    return samples
a = testing_it(1)

Pedestrian_labels\1_frame.txt
151 person_ids


In [None]:
class TrajectoryVideoLinearDataset(Dataset):
    def __init__(self, session_id, device, max_x=1280, max_y=1024, timesteps=10):
        label_file = os.path.join("Pedestrian_labels", f"{session_id}_frame.txt")

        self.sess_id = session_id
        self.t = timesteps
        self.data = pd.read_csv(label_file, header=None, names=['frame', 'person_id', 'x', 'y', 'z'])
        self.data = self.data.sort_values(by=['person_id', 'frame']).reset_index(drop=True)
        samples_stepA = []
        print(str(len(self.data.groupby('person_id'))) + " person_ids")
        for pid, group in self.data.groupby('person_id'):
            group = group.sort_values('frame').reset_index(drop=True)
            if len(group) < timesteps + 1:
                continue
            for i in range(len(group) - (2*timesteps)):
                seq = group.iloc[i:i+timesteps]
                target = group.iloc[i+timesteps:i+(2*timesteps)]
                samples_stepA.append((seq, target))
        
        #Convert samples to 2d coordinates, normalized, flattened
        print("Converting samples to 2d coordinates")
        self.samples = []
        H = init_vid_extr(self.sess_id)
        for i in range(len(samples_stepA)):
            seq_samples, target_samples = samples_stepA[i]
            # Get the homography matrix from the video extractor.
            H = init_vid_extr(self.sess_id)
            
            # Convert target positions from world to pixel coordinates.
            target_samples_pos = target_samples[['x','y','z']].values.astype(np.float32)
            pixel_target = world_to_pixel(target_samples_pos[:, :2], H)
            target_samples_pos[:, :2] = pixel_target

            # Convert the input sequence positions.
            pos_seq_samples = seq_samples[['x','y','z']].values.astype(np.float32)
            pixel_coords = world_to_pixel(pos_seq_samples[:, :2], H)
            pos_seq_samples[:, :2] = pixel_coords
        
            self.samples.append((torch.flatten(torch.Tensor(pos_seq_samples[:, :2] / [max_x, max_y])).to(device), torch.flatten(torch.Tensor(target_samples_pos[:, :2] / [max_x, max_y])).to(device)))

    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        return self.samples[idx]

In [6]:
class TrajVideoBaselineModel(nn.Module):
    def __init__(self, num_pos=10, pos_embed_dim=16):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(2*num_pos, pos_embed_dim),
            nn.ReLU(),
            nn.Linear(pos_embed_dim, pos_embed_dim),
            nn.ReLU(),
            nn.Linear(pos_embed_dim, 2*num_pos)
        )
        
    def forward(self, pos_seq):
        # Process trajectory positions
        pred = self.model(pos_seq)  # shape: (batch, T, pos_embed_dim)
        return pred

In [61]:
def calculate_metrics(predicted_temp, true_temp):
    predicted = np.array(predicted_temp)
    true = np.array(true_temp)

    mse = np.mean((predicted - true) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(predicted - true))
    return mse, rmse, mae

def initialize_datasets(session_ids, device):
    datasets = []
    for session_id in session_ids:
        datasets.extend(TrajectoryVideoLinearDataset(session_id, device))
    return datasets

In [23]:
def execute_training(model, datasets, optimizer, criterion, epochs, device):
    #Set up the datasets
    dataloader_thing = DataLoader(datasets, 128, shuffle=True, num_workers=0)
    start_epoch = 1

    #Load the checkpoint
    checkpoint_path = "checkpoint_baseline_epoch_0.pth"
    if os.path.isfile(checkpoint_path):
        print("Loading checkpoint...")
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"resuming training from epoch {start_epoch}")
        epoch = start_epoch

    #Set up train mode and train the model
    model.train()
    for epoch in range(epochs):
        current_epoch = start_epoch + epoch
        checkpoint_path = f"checkpoint_baseline_epoch_{current_epoch}.pth"
        epoch_loss = 0.0
        for pos_seq, target in tqdm(dataloader_thing):
            pos_seq = pos_seq
            target = target
            optimizer.zero_grad()
            output = model(pos_seq)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * pos_seq.size(0)
        epoch_loss_avg = epoch_loss / len(dataloader_thing)
        print("Avg loss = " + str(epoch_loss_avg))

        if current_epoch % 20 == 0:
            torch.save({
                'epoch': current_epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss_avg,
            }, checkpoint_path)
            print(f"Checkpoint saved at epoch {current_epoch}")

In [14]:
datasets = initialize_datasets([1,2,3,5,6,7], device)

151 person_ids
Converting samples to 2d coordinates
51 person_ids
Converting samples to 2d coordinates
369 person_ids
Converting samples to 2d coordinates
127 person_ids
Converting samples to 2d coordinates
147 person_ids
Converting samples to 2d coordinates
102 person_ids
Converting samples to 2d coordinates


In [24]:
##Learn to predict the next 10 frames
model = TrajVideoBaselineModel(10, 32).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0008)
criterion = nn.MSELoss().to(device)
epochs=20
execute_training(model, datasets, optimizer, criterion, 20, device)

100%|██████████| 5852/5852 [00:15<00:00, 375.38it/s]


Avg loss = 0.2915184468791874


100%|██████████| 5852/5852 [00:18<00:00, 324.61it/s]


Avg loss = 0.003527761938486949


100%|██████████| 5852/5852 [00:15<00:00, 385.90it/s]


Avg loss = 0.002024288124960817


100%|██████████| 5852/5852 [00:13<00:00, 422.81it/s]


Avg loss = 0.0018456838725387787


100%|██████████| 5852/5852 [00:11<00:00, 511.04it/s]


Avg loss = 0.0017750400244625673


100%|██████████| 5852/5852 [00:11<00:00, 523.46it/s]


Avg loss = 0.0016992762980628552


100%|██████████| 5852/5852 [00:11<00:00, 506.13it/s]


Avg loss = 0.0016149851967974746


100%|██████████| 5852/5852 [00:11<00:00, 511.47it/s]


Avg loss = 0.0015386118650158737


100%|██████████| 5852/5852 [00:13<00:00, 437.23it/s]


Avg loss = 0.0014903410367795374


100%|██████████| 5852/5852 [00:14<00:00, 406.97it/s]


Avg loss = 0.0014440632860444945


100%|██████████| 5852/5852 [00:12<00:00, 466.25it/s]


Avg loss = 0.0014113741706381902


100%|██████████| 5852/5852 [00:13<00:00, 439.11it/s]


Avg loss = 0.001389249775793167


100%|██████████| 5852/5852 [00:13<00:00, 425.21it/s]


Avg loss = 0.0013774777423156978


100%|██████████| 5852/5852 [00:13<00:00, 439.94it/s]


Avg loss = 0.0013695190558890315


100%|██████████| 5852/5852 [00:13<00:00, 423.59it/s]


Avg loss = 0.0013571848149480357


100%|██████████| 5852/5852 [00:13<00:00, 426.24it/s]


Avg loss = 0.0013439302021287815


100%|██████████| 5852/5852 [00:13<00:00, 430.26it/s]


Avg loss = 0.0013461761300218372


100%|██████████| 5852/5852 [00:13<00:00, 420.65it/s]


Avg loss = 0.001339483452805761


100%|██████████| 5852/5852 [00:13<00:00, 424.22it/s]


Avg loss = 0.0013322554043263125


100%|██████████| 5852/5852 [00:14<00:00, 416.93it/s]


Avg loss = 0.0013221782093714876
Checkpoint saved at epoch 20


 56%|█████▌    | 3281/5852 [00:07<00:06, 419.31it/s]


KeyboardInterrupt: 

In [67]:
def execute_testing(model):
    checkpoint_path = "checkpoint_baseline_epoch_20.pth"
    if os.path.isfile(checkpoint_path):
        print("Loading checkpoint...")
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
        model.load_state_dict(checkpoint['model_state_dict'])

    model.eval()
    dataset = TrajectoryVideoLinearDataset(0, device)

    random_ints = torch.randint(0, len(dataset), (500,))
    for index in random_ints:
        metrics_all = []
        pos_seq, target = dataset[index]
        prediction = model(pos_seq)
        pos_seq = pos_seq.reshape([10,2]).cpu() * torch.tensor([1280,1024])
        target = target.reshape([10,2]).cpu() * torch.tensor([1280,1024])
        prediction = prediction.reshape([10,2]).detach().cpu() * torch.tensor([1280,1024])
        print("Original = " + str(pos_seq))
        print("Target = " + str(target))
        print("Prediction = " + str(prediction))
        mse, rmse, mae = calculate_metrics(prediction, target)
        metrics_all.append((mse, rmse, mae))
        print("MSE = " + str(mse) + ", RMSE = " + str(rmse) + ", MAE = " + str(mae))

        if metrics_all:
            metrics_all = np.array(metrics_all)
            avg_mse = np.mean(metrics_all[:, 0])
            avg_rmse = np.mean(metrics_all[:, 1])
            avg_mae = np.mean(metrics_all[:, 2])
            print("Overall Metrics on Holdout Data:")
            print(f"Avg MSE: {avg_mse:.4f}, Avg RMSE: {avg_rmse:.4f}, Avg MAE: {avg_mae:.4f}")

execute_testing(model)

Loading checkpoint...
18 person_ids
Converting samples to 2d coordinates
Original = tensor([[897.5421, 565.9280],
        [895.1476, 565.5262],
        [891.7461, 565.0585],
        [890.3522, 564.7388],
        [887.9596, 564.3373],
        [887.3820, 564.5048],
        [882.5609, 563.5139],
        [879.3543, 564.1769],
        [876.0241, 563.7006],
        [873.0754, 563.4777]])
Target = tensor([[869.7733, 563.0129],
        [867.4366, 562.6344],
        [865.8667, 562.7172],
        [864.9108, 562.6447],
        [862.5050, 562.0529],
        [862.4088, 561.8575],
        [860.9025, 561.1400],
        [858.7953, 559.8744],
        [857.7466, 559.6075],
        [857.1910, 558.9525]])
Prediction = tensor([[869.9476, 562.1740],
        [867.4946, 562.9068],
        [865.6269, 560.7441],
        [863.5435, 562.1092],
        [862.2394, 561.1168],
        [860.5850, 560.7055],
        [857.4655, 560.4050],
        [855.1880, 560.3051],
        [853.7705, 559.8934],
        [852.4353, 559