In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Running TBD Pedestrian: image-only angle prediction

In [None]:
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, models
from PIL import Image
import numpy as np
from tqdm import tqdm

###############################################################################
# 1. CONFIGURATION
###############################################################################
# Pickle file holding the samples; it is loaded with pickle.load in the main cell.
DATA_PKL     = "/kaggle/input/image-angle-pred-uhh-temp-yay-maybe/AngleOfPerson_20250331_042254.pkl"  # <-- Replace with your path
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE   = 100     # samples per DataLoader batch
EPOCHS       = 20      # passes over the training split
LEARNING_RATE = 0.004  # Adam step size passed to train_model
TRAIN_SPLIT  = 0.8     # fraction of samples used for training; the rest is the test split
# Basic image transforms: resize to 224x224, convert to a tensor, then normalize per channel.
IMAGE_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor(),
    # If using a pretrained ResNet, you typically want normalization:
    # (per-channel mean, then per-channel std)
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

print(f"Using device: {DEVICE}")

In [None]:
###############################################################################
# 2. DATASET
###############################################################################
class ImageAngleDataset(Dataset):
    """
    Dataset of (image, angle) samples for angle regression.

    Each entry of ``data_list`` is an ``(image_array, angle)`` pair. The first
    item is an in-memory pixel array handed to ``PIL.Image.fromarray`` (it is
    NOT a file path); the second is a single numeric angle.

    ``__getitem__`` returns:
      - the image, converted to RGB and passed through ``transform`` if given
      - the angle as a float32 tensor of shape (1,)
    """
    def __init__(self, data_list, transform=None):
        """
        data_list: sequence of (image_array, angle) pairs
        transform: optional callable applied to the PIL image (e.g. torchvision transforms)
        """
        self.data_list = data_list
        self.transform = transform

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        image_array, angle = self.data_list[idx]
        # Force 3 channels: the 3-value Normalize and the ResNet stem used in this
        # notebook both reject grayscale / RGBA input.
        image = Image.fromarray(image_array).convert("RGB")
        if self.transform:
            image = self.transform(image)
        # Shape (1,) so a batch of labels stacks to (batch, 1), matching the model output.
        angle_tensor = torch.tensor(angle, dtype=torch.float32).unsqueeze(0)
        return image, angle_tensor

In [None]:
###############################################################################
# 3. MODEL: Simple ResNet-based regressor
###############################################################################
class ImageRegressor(nn.Module):
    """ResNet18 feature extractor followed by two linear layers that emit one angle value."""

    def __init__(self, pretrained=True):
        """
        pretrained=True  -> the backbone starts from torchvision's default ResNet18 weights.
        pretrained=False -> the backbone is randomly initialized.
        """
        super().__init__()
        weights = models.ResNet18_Weights.DEFAULT if pretrained else None
        resnet = models.resnet18(weights=weights)

        # Swap the classifier for a pass-through so the backbone yields raw features.
        feature_dim = resnet.fc.in_features
        resnet.fc = nn.Identity()
        self.backbone = resnet

        # Regression head: feature_dim -> 32 -> 1.
        # NOTE(review): the two linear layers are applied back-to-back with no
        # activation in between.
        self.fc_before = nn.Linear(feature_dim, 32)
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of images [batch_size, 3, H, W] to predictions [batch_size, 1]."""
        hidden = self.fc_before(self.backbone(x))
        return self.fc(hidden)

In [None]:
###############################################################################
# 4. TRAINING & TESTING
###############################################################################
def circular_error(pred, actual):
    """
    Computes the circular error (in degrees) between predicted and actual angles.

    The raw difference is first wrapped into [0, 360), so predictions that lie
    outside one full turn (e.g. 725 or -10) are compared correctly. The result
    is the shorter way around the circle and is always in [0, 180].
    """
    diff = abs(pred - actual) % 360
    return min(diff, 360 - diff)

def train_model(model, train_loader, epochs=10, lr=1e-3):
    """
    Train ``model`` on (images, angles) batches with Adam and a wrap-around L1 loss.

    model: module mapping an image batch to predictions of the same shape as ``angles``
    train_loader: iterable of (images, angles) batches that supports ``len()``
    epochs: number of passes over ``train_loader``
    lr: Adam learning rate

    Prints the per-epoch loss (batch-summed loss averaged over the number of
    batches) and saves the model's state_dict every 5th epoch to
    ``checkpoint_angle_pred_images_TBD_epoch_<n>.pth`` in the working directory.
    """
    model.to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for images, angles in tqdm(train_loader):
            images = images.to(DEVICE)
            angles = angles.to(DEVICE)

            optimizer.zero_grad()
            preds = model(images)

            # Circular distance in degrees, summed over the batch: wrap the absolute
            # difference into [0, 360) and fold it so 350 vs 10 costs 20, not 340.
            # (An MSE loss used to be computed here and then overwritten; it is removed.)
            wrapped = torch.abs(angles - preds) % 360
            loss = torch.sum(torch.abs(180 - torch.abs(wrapped - 180)))
            loss.backward()

            optimizer.step()
            total_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch [{epoch+1}/{epochs}]  Train Loss: {avg_loss:.4f}")
        if (epoch+1) % 5 == 0:
            checkpoint_path = f"checkpoint_angle_pred_images_TBD_epoch_{epoch+1}.pth"
            torch.save(model.state_dict(), checkpoint_path)

def test_model(model, test_loader):
    model.to(DEVICE)
    model.eval()
    criterion = nn.MSELoss()
    total_loss = 0.0

    # Optional: store predictions for further analysis
    all_preds = []
    all_labels = []
    total_circular_error = 0.0
    count = 0

    with torch.no_grad():
        for images, angles in tqdm(test_loader):
            images = images.to(DEVICE)
            angles = angles.to(DEVICE)

            preds = model(images)
            loss = criterion(preds, angles)
            total_loss += loss.item()

            preds_list = preds.cpu().view(-1).tolist()
            angles_list = angles.cpu().view(-1).tolist()
            all_preds.extend(preds_list)
            all_labels.extend(angles_list)

            for p, a in zip(preds_list, angles_list):
                err = circular_error(p, a)
                total_circular_error += err
                count += 1

    avg_loss = total_loss / len(test_loader)
    avg_circular_error = total_circular_error / count if count > 0 else 0.0

    print(f"Test Loss: {avg_loss:.4f}")
    print("Sample Predictions vs Actual with Circular Error:")
    # for i in range(len(all_preds)):
    #     err = circular_error(all_preds[i], all_labels[i])
    #     print(f"  Pred: {all_preds[i]:.2f}, Actual: {all_labels[i]:.2f}, Circular Error: {err:.2f}")
    print(f"Average Circular Error: {avg_circular_error:.2f}")
    return all_preds, all_labels

In [None]:
# 1) Load data_list from .pkl
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted data.
with open(DATA_PKL, "rb") as f:
    data_list = pickle.load(f)
print(f"Loaded {len(data_list)} samples from {DATA_PKL}")

# 2) Create dataset
dataset = ImageAngleDataset(data_list, transform=IMAGE_TRANSFORM)

# 3) Split into train/test
train_size = int(TRAIN_SPLIT * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so a restarted kernel gets the same train/test partition.
# Otherwise a checkpoint reloaded later would be scored on images it was trained on.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

# 4) Initialize model
model = ImageRegressor(pretrained=True)
print(model)

# 5) Train
print("Starting Training ...")
train_model(model, train_loader, epochs=EPOCHS, lr=LEARNING_RATE)

# 6) Test
print("Starting Testing ...")
test_model(model, test_loader)

print("Done!")

In [None]:
# Reload the weights saved after epoch 20 and re-run the evaluation.
# Relies on `model`, `test_loader`, `test_model` and DEVICE from the cells above.
checkpoint_path = "/kaggle/working/checkpoint_angle_pred_images_TBD_epoch_20.pth"

# Nothing happens (and nothing is printed) when the checkpoint file is missing.
if os.path.isfile(checkpoint_path):
    print("Loading checkpoint...")
    # The file holds a bare state_dict (train_model saves model.state_dict()).
    checkpoint = torch.load(checkpoint_path, map_location=DEVICE, weights_only=True)
    model.load_state_dict(checkpoint)

    all_preds, all_labels = test_model(model, test_loader)

# Creating keypoints annotations

In [None]:
import os
from scipy.io import loadmat
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from tqdm import tqdm
import pickle
import datetime
from PIL import Image
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the raw samples; later cells read item[0] as the image array and item[1] as the angle.
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted data.
with open('/kaggle/input/image-angle-pred-uhh-temp-yay-maybe/AngleOfPerson_20250331_042254.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

In [None]:
def obtain_all_pictures(full_data, path, name):
    """
    Save the image of every sample in ``full_data`` as a JPEG file.

    Files are written to ``<path>/temp_image/image_<i>.jpg`` where ``i`` is the
    sample's index in ``full_data``. The directory is created if needed. No
    archive is produced.

    full_data: indexable collection; each item holds an image array at position 0
               (it is cast to uint8 and converted to RGB before saving)
    path: parent directory for the ``temp_image/`` folder
    name: currently unused; kept so existing calls keep working
    """
    temp_path = os.path.join(path, 'temp_image/')
    # exist_ok avoids the check-then-create race and makes the cell re-runnable.
    os.makedirs(temp_path, exist_ok=True)

    for i in range(len(full_data)):
        image = Image.fromarray(full_data[i][0].astype('uint8')).convert('RGB')
        image.save(os.path.join(temp_path, 'image_' + str(i) + '.jpg'))

# Write every loaded image to /kaggle/working/annotation_data/images/temp_image/ as image_<i>.jpg.
obtain_all_pictures(loaded_data, "/kaggle/working/annotation_data/images/", 'images')

In [None]:
# Re-save the loaded samples next to the exported images so later cells can read
# everything from /kaggle/working/annotation_data/.
with open('/kaggle/working/annotation_data/AngleOfPerson_20250331_042254.pkl', 'wb') as f:
    pickle.dump(loaded_data, f)

In [None]:
# Sanity check: show what has been written to the annotation directory so far.
os.listdir("/kaggle/working/annotation_data")

In [None]:
# Sanity check: shape of the first sample's image array.
loaded_data[0][0].shape

In [None]:
!pip install ultralytics
from ultralytics import YOLO
import os
import pickle
from tqdm import tqdm

# Load the pretrained YOLO11m pose-estimation checkpoint ("yolo11m-pose.pt").
# NOTE: this rebinds the notebook-level name `model`, which the image-only section
# above uses for its regressor.
model = YOLO("yolo11m-pose.pt")  # load an official model

def single_run(path_input, path_output):
    """
    Run the notebook-level pose ``model`` on ``path_input`` and write keypoints to a text file.

    The normalized keypoints (``.keypoints.xyn``) of the LAST result returned by
    the model are written to ``path_output``, one ``str()`` of each entry per line.

    Raises:
        ValueError: if the model returns no results (previously an unbound-variable error).
    """
    results = model(path_input, save=True, show=True, show_conf=False, show_labels=False, max_det = 1, verbose=False)  # predict on an image

    keypoints = None
    for result in results:
        keypoints = result.keypoints  # Keypoints object for pose outputs

    if keypoints is None:
        raise ValueError(f"No prediction results returned for {path_input}")

    xyn = keypoints.xyn

    # The context manager closes the file even if a write fails.
    with open(path_output, "w") as f:
        for i in range(len(xyn)):
            f.write(str(xyn[i]) + "\n")


# Run the pose model over every exported .jpg and collect its normalized keypoints
# in a dict keyed by file name, then pickle that dict.
path_input = "/kaggle/working/annotation_data/images/temp_image/"
path_output = "/kaggle/working/annotation_data/KeypointsData.pkl"
images_dir = path_input
output_pickle = path_output
keypoints_data = {}
print(len(os.listdir(images_dir)))
# Process all images in the directory
for filename in tqdm(os.listdir(images_dir)):
    if filename.endswith(".jpg"):
        # max_det=1 limits the model to at most one detection per image;
        # DEVICE comes from the earlier cell of this section.
        results = model(
            os.path.join(images_dir, filename),
            save=False,
            show_conf=False,
            show_labels=False,
            verbose=False,
            max_det=1,
            device=DEVICE
        )  # predict on an image

        # If several results come back, the last one overwrites the earlier entries.
        for result in results:
            keypoints = result.keypoints  # Keypoints object for pose outputs
            xyn = keypoints.xyn.tolist()  # Convert normalized coordinates to a list
            keypoints_data[filename] = xyn  # Save keypoints for the image
print(len(keypoints_data))

# Save the keypoints data to a pickle file
with open(output_pickle, "wb") as f:
    pickle.dump(keypoints_data, f)

print(f"Keypoints data saved to {output_pickle}")

# total_run("/kaggle/working/annotation_data/images/temp_image/", "kaggle/working/annotation_data")

In [None]:
# Sanity check: number of images that received a keypoints entry.
len(keypoints_data)

In [None]:
# Sanity check: the annotation directory should now also contain KeypointsData.pkl.
os.listdir("/kaggle/working/annotation_data")

In [None]:
# Sanity check: number of files in the exported image folder.
len(os.listdir("/kaggle/working/annotation_data/images/temp_image"))

In [None]:
# Pair each image's keypoints with its angle label and pickle the list of
# (keypoints, angle) tuples as CombinedData.pkl.
path_dir = "/kaggle/working/annotation_data/"
output_pickle = "/kaggle/working/annotation_data/CombinedData.pkl"
img_dir = "/kaggle/working/annotation_data/images/temp_image/"
directory = path_dir
result = []

with open(directory + 'AngleOfPerson_20250331_042254.pkl', 'rb') as f:
    loaded_data = pickle.load(f)
with open(directory + 'KeypointsData.pkl', 'rb') as f:
    loaded_keypoints = pickle.load(f)

print(len(loaded_keypoints))
# Iterate over the sample indices (image_<i>.jpg belongs to loaded_data[i]).
# The bound is len(loaded_data), not len(loaded_keypoints): the loop indexes
# loaded_data, so a keypoints dict of a different size must neither drop
# trailing samples nor run past the end of loaded_data.
# NOTE(review): samples without a keypoints entry are skipped, so positions in
# `result` stop matching image indices after the first skipped sample.
for i in tqdm(range(len(loaded_data))):
    image_key = f"image_{i}.jpg"
    if image_key in loaded_keypoints:
        keypoints_value = loaded_keypoints[image_key]
        angle_value = loaded_data[i][1]  # Get the second value from loaded_data[i]
        result.append((keypoints_value, angle_value))  # Create the tuple and add to the list

with open(output_pickle, "wb") as f:
    pickle.dump(result, f)

print(f"Combined data saved to {output_pickle}")

In [None]:
# Filter CombinedData.pkl down to samples with usable keypoints and an image on
# disk, and save them as (image_path, keypoints_tensor, angle_tensor) tuples.
# Adjust paths as needed
pkl_path = "/kaggle/working/annotation_data/CombinedData.pkl"
images_dir = "/kaggle/working/annotation_data/images/temp_image/"
output_pkl_path = "/kaggle/working/annotation_data/FinalKeypointSet35K.pkl"

# Load your CombinedData.pkl
with open(pkl_path, 'rb') as f:
    data_dict = pickle.load(f)  # Despite the name this is a list of (keypoints, angle) tuples
    
#print(f"Loaded {len(data_dict)} samples from {pkl_path}")
#print(f"Example data: {data_dict[0][0]}")
#print(f"Example data: {data_dict[0][1]}")


filtered_data_list = []
print(len(data_dict))

for i, item in enumerate(data_dict):
    keypoints = item[0]
    # unsqueeze(0) adds a leading dimension to the angle before it is stored
    angle = torch.tensor(item[1], dtype=torch.float32).unsqueeze(0)   # Labels are the angle (single value)

    # Convert to torch tensors for checking shape (or you can just check Python lists)
    sequence = torch.tensor(keypoints, dtype=torch.float32)
    if sequence.ndimension() == 3 and sequence.shape[0] == 1:
        sequence = sequence.squeeze(0)  # Remove the first singleton dimension

    # Check if sequence is empty or has invalid shape
    # Example: we want sequence of shape (seq_len, 2) and seq_len>0
    if sequence.shape[0] == 0 or sequence.shape[1] != 2:
        # Skip this row
        continue
    
    # If we reach here, the row is valid
    # The image path is built from this row's position i in the combined list.
    # NOTE(review): this assumes position i is also the original image index — confirm
    # that the combining step skipped no samples.
    image_path = os.path.join(images_dir, f"image_{i}.jpg")

    # (Optionally) check if the image actually exists on disk
    if not os.path.isfile(image_path):
        print(f"Warning: Image not found at {image_path}, skipping.")
        continue

    # Keep the data
    filtered_data_list.append((image_path, sequence, angle))

print(len(filtered_data_list))

#If desired, save the filtered data to a new pkl
with open(output_pkl_path, 'wb') as f:
    pickle.dump(filtered_data_list, f)


print(f"Filtered data saved to: {output_pkl_path}")

# Running TBD Pedestrian, keypoints angle prediction

In [None]:
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset from .pkl file
directory = "/kaggle/working/annotation_data/"  # Update this to the correct path
with open(directory + 'FinalKeypointSet35K.pkl', 'rb') as f:
    data_dict = pickle.load(f)  # Despite the name: a list of (image_path, keypoints, angle) tuples

# Collect the keypoint sequences (item[1]) and angle labels (item[2]) into parallel lists
data = []
labels = []
for item in data_dict:
    # Convert each sequence into a tensor of shape (seq_len, 2) and each label as a float tensor
    sequence = torch.tensor(item[1], dtype=torch.float32)  # Ensure this is a tensor of shape (seq_len, 2)
    # NOTE(review): the filtering cell already stored the angle with unsqueeze(0), so this
    # second unsqueeze(0) presumably yields shape (1, 1) per label — confirm downstream shapes.
    label = torch.tensor(item[2], dtype=torch.float32).unsqueeze(0)   # Labels are the angle (single value)
    data.append(sequence)
    labels.append(label)

# Padding function for variable-length sequences
def pad_batch(batch):
    """
    Collate (sequence, label) samples into one batch.

    Sequences are zero-padded to the longest one in the batch (batch dimension
    first); labels are stacked along a new leading dimension.
    """
    columns = tuple(zip(*batch))
    seq_column, label_column = columns
    padded = pad_sequence(seq_column, batch_first=True, padding_value=0.0)
    stacked_labels = torch.stack(label_column)
    return padded, stacked_labels

# Dataset Class
class CoordDataset(Dataset):
    """
    Dataset of keypoint sequences with one angle label each.

    data: iterable of sequences (tensors or nested lists), each convertible to a
          float32 tensor of shape (seq_len, 2)
    labels: sequence of single-valued labels (Python numbers or one-element
            tensors of any shape)

    All tensors are moved to the notebook-level ``device`` at construction time.
    ``__getitem__`` returns (sequence, label) where label has shape (1,), so a
    stacked batch of labels is (batch, 1) and lines up with the model output.
    """
    def __init__(self, data, labels):
        # detach().clone() copies tensor inputs without the copy-construct warning
        # that torch.tensor(tensor) emits.
        self.data = [
            torch.as_tensor(seq, dtype=torch.float32).detach().clone().to(device)
            for seq in data
        ]
        # Flatten every label to exactly one value. Labels that arrive as (1,) or
        # (1, 1) tensors would otherwise keep extra singleton dimensions that
        # broadcast silently inside the loss.
        if len(labels) == 0:
            self.labels = torch.empty((0, 1), dtype=torch.float32).to(device)
        else:
            self.labels = torch.stack(
                [torch.as_tensor(label, dtype=torch.float32).reshape(1) for label in labels]
            ).to(device)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]  # Tensor of shape (seq_len, 2)
        y = self.labels[idx]  # Shape (1,): the raw angle (not sin/cos)
        return x, y

# Split dataset into training and testing (80% training, 20% testing)
dataset = CoordDataset(data, labels)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so re-running the notebook yields the same train/test partition.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders for training and testing
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True, collate_fn=pad_batch)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=False, collate_fn=pad_batch)

# Transformer Model
class TransformerRegressor(nn.Module):
    """
    Transformer encoder over a keypoint sequence, mean-pooled to one angle value.

    Input:  (batch_size, seq_len, input_dim)
    Output: (batch_size, 1)
    """

    def __init__(self, input_dim=2, model_dim=64, num_heads=4, num_layers=2, ff_dim=128, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        layer_options = dict(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True,
        )
        # Stored as an attribute, so the template layer's parameters are part of
        # this module's state_dict alongside the encoder's own layers.
        self.encoder_layer = nn.TransformerEncoderLayer(**layer_options)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(model_dim, 1)  # single angle value

    def forward(self, x):
        embedded = self.embedding(x)                   # (batch_size, seq_len, model_dim)
        encoded = self.transformer_encoder(embedded)   # (batch_size, seq_len, model_dim)
        pooled = encoded.mean(dim=1)                   # average over the sequence positions
        return self.fc_out(pooled)                     # (batch_size, 1)

# Circular error function
def circular_error(pred, actual):
    """
    Computes the circular error (in degrees) between predicted and actual angles.

    The raw difference is first wrapped into [0, 360), so predictions that lie
    outside one full turn (e.g. 725 or -10) are compared correctly. The result
    is the shorter way around the circle and is always in [0, 180].
    """
    diff = abs(pred - actual) % 360
    return min(diff, 360 - diff)

# Training Function
def train_model(model, train_dataloader, epochs=200, lr=0.0005):
    """
    Train ``model`` on (sequence, label) batches with Adam and an MSE loss.

    model: module mapping a padded keypoint batch to predictions
    train_dataloader: iterable of (x, y) batches that supports ``len()``
    epochs: number of passes over ``train_dataloader``
    lr: Adam learning rate

    Prints the mean per-batch loss each epoch and saves the state_dict every
    40th epoch to ``checkpoint_angle_pred_keypoints_TBD_epoch_<n>.pth``.

    NOTE(review): plain MSE does not account for the 0/360 wrap-around of angles.
    """
    model.to(device)  # Move the model to GPU (if available)
    criterion = nn.MSELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y in train_dataloader:
            # Actually move the batch (the previous `x, y = x, y` was a no-op);
            # this is free when the tensors already live on `device`.
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch+1) % 40 == 0:
            checkpoint_path = f"checkpoint_angle_pred_keypoints_TBD_epoch_{epoch+1}.pth"
            torch.save(model.state_dict(), checkpoint_path)
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/max(len(train_dataloader), 1):.4f}")

# Testing Function
def test_model(model, test_dataloader):
    model.to(device)  # Ensure the model is on the correct device
    model.eval()
    total_loss = 0
    total_circular_error = 0
    criterion = nn.MSELoss()
    count = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            x, y = batch
            x, y = x.to(device), y.to(device)  # Move data to GPU
            predictions = model(x)
            loss = criterion(predictions, y)
            total_loss += loss.item()
            
            # Compute and print circular error for each sample in the batch
            preds = predictions.squeeze()
            actuals = y.squeeze()
            # Ensure both preds and actuals are iterable
            if preds.dim() == 0:
                preds = preds.unsqueeze(0)
            if actuals.dim() == 0:
                actuals = actuals.unsqueeze(0)
            for pred, actual in zip(preds.tolist(), actuals.tolist()):
                err = circular_error(pred, actual)
                total_circular_error += err
                count += 1
                # print(f"Actual: {actual:.2f}, Predicted: {pred:.2f}, Circular Error: {err:.2f}")
    
    avg_loss = total_loss / len(test_dataloader)
    avg_circular_error = total_circular_error / count if count > 0 else 0
    print(f"\nTest Loss: {avg_loss:.4f}")
    print(f"Average Circular Error: {avg_circular_error:.2f}")

# Train the model (train_model defaults: epochs=200, lr=0.0005)
# NOTE: rebinds the notebook-level name `model` to the keypoint regressor.
model = TransformerRegressor()
train_model(model, train_dataloader)

# Test the model with detailed circular error analysis
test_model(model, test_dataloader)


# Combined: image + keypoints angle prediction

In [None]:
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

from torchvision import transforms, models
from PIL import Image

###############################################################################
# 1. CONFIGURATION
###############################################################################
# Pickle written by the filtering cell: a list of (image_path, keypoints, angle) tuples.
FILTERED_DATA_PKL = "/kaggle/working/annotation_data/FinalKeypointSet35K.pkl"
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE            = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Example transforms for images (resize to 224x224, convert to tensor)
IMAGE_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # If using pretrained networks, you often do:
    # (per-channel mean, then per-channel std)
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

BATCH_SIZE    = 100    # samples per DataLoader batch
TRAIN_SPLIT   = 0.8    # fraction of samples used for training
EPOCHS        = 20     # passes over the training split
LEARNING_RATE = 0.001  # Adam step size passed to train_model

# NOTE: the device is printed a second time here (same message as above).
print(f"Using device: {DEVICE}")

###############################################################################
# 2. DATASET & DATALOADER
###############################################################################

class ImageKeypointDataset(Dataset):
    """
    Dataset that provides, per sample:
      1) the image (loaded from disk, converted to RGB, optionally transformed),
      2) the raw keypoint coordinates as a float32 tensor,
      3) the label (angle of movement) as a float32 tensor of shape (1,).
    """
    def __init__(self, data_list, transform=None):
        """
        data_list: a list of tuples -> (image_path, keypoints, angle)
        transform: torchvision transforms for images
        """
        self.data_list = data_list
        self.transform = transform

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        image_path, keypoints, angle = self.data_list[idx]

        # Images are read lazily, one file per requested sample.
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Keypoints may be a nested list or an already-built tensor;
        # detach().clone() copies a tensor without the copy-construct warning
        # that torch.tensor(tensor) emits.
        kp_tensor = torch.as_tensor(keypoints, dtype=torch.float32).detach().clone()

        # Normalize the label to float32 with shape (1,) whether it was stored as a
        # plain number or as a one-element tensor, so the collate step can stack it
        # to (batch_size, 1).
        angle_tensor = torch.as_tensor(angle, dtype=torch.float32).reshape(1)

        return image, kp_tensor, angle_tensor


def multimodal_collate_fn(batch):
    """
    Collate (image, keypoints, label) samples into batch tensors.

    Returns a tuple of:
      - images stacked along a new leading dimension,
      - keypoints zero-padded to the longest sequence in the batch (batch first),
      - labels stacked along a new leading dimension.
    """
    columns = tuple(zip(*batch))
    image_column, keypoint_column, label_column = columns

    image_batch = torch.stack(image_column, dim=0)
    keypoint_batch = pad_sequence(keypoint_column, batch_first=True, padding_value=0.0)
    label_batch = torch.stack(label_column, dim=0)

    return image_batch, keypoint_batch, label_batch

###############################################################################
# 3. MULTI-MODAL MODEL
###############################################################################

class ImageEncoder(nn.Module):
    """ResNet18 backbone followed by a linear projection to ``out_features`` dimensions."""

    def __init__(self, pretrained=True, out_features=128):
        """
        pretrained: start from the ImageNet ResNet18 weights when True, random init otherwise
        out_features: size of the projected image embedding
        """
        super().__init__()
        # `weights=` replaces the deprecated `pretrained=` flag and matches how the
        # image-only regressor in this notebook builds its backbone.
        # IMAGENET1K_V1 is the weight set that `pretrained=True` used to select.
        weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = models.resnet18(weights=weights)
        # Remove final classification layer
        num_feats = backbone.fc.in_features
        backbone.fc = nn.Identity()

        self.backbone = backbone
        self.projection = nn.Linear(num_feats, out_features)

    def forward(self, x):
        """
        x shape: (batch_size, 3, H, W)
        returns: (batch_size, out_features)
        """
        features = self.backbone(x)         # (batch_size, 512) for ResNet18
        out = self.projection(features)     # (batch_size, out_features)
        return out


class KeypointEncoder(nn.Module):
    """
    Transformer encoder that turns a keypoint sequence into one feature vector.

    Input:  [batch_size, seq_len, input_dim]
    Output: [batch_size, model_dim]
    """

    def __init__(self, input_dim=2, model_dim=64, num_heads=4, num_layers=2, ff_dim=128, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, model_dim)

        # The layer is built with the default layout (batch_first is not set),
        # so forward() moves the sequence dimension to the front itself.
        sequence_first_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(sequence_first_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(model_dim, model_dim)

    def forward(self, x):
        """
        x shape: [batch_size, seq_len, 2]
        """
        embedded = self.embedding(x)                       # [batch_size, seq_len, model_dim]
        seq_first = embedded.permute(1, 0, 2)              # [seq_len, batch_size, model_dim]
        encoded = self.transformer_encoder(seq_first)      # [seq_len, batch_size, model_dim]
        batch_first = encoded.permute(1, 0, 2)             # [batch_size, seq_len, model_dim]
        pooled = batch_first.mean(dim=1)                   # [batch_size, model_dim]
        return self.fc_out(pooled)                         # [batch_size, model_dim]


class MultiModalRegressor(nn.Module):
    """
    Late-fusion regressor: image and keypoint embeddings are concatenated and
    fed to a small MLP that outputs a single angle per sample.
    """

    def __init__(self, img_out_features=128, keypoint_dim=64, hidden_dim=128):
        super().__init__()
        self.img_encoder = ImageEncoder(pretrained=True, out_features=img_out_features)
        self.kp_encoder  = KeypointEncoder(input_dim=2, model_dim=keypoint_dim)

        # The fused vector is the two embeddings side by side.
        head_layers = [
            nn.Linear(img_out_features + keypoint_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # single angle
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, images, keypoints):
        """Return predictions of shape (batch_size, 1) for a batch of images and keypoints."""
        embeddings = [
            self.img_encoder(images),     # (batch_size, img_out_features)
            self.kp_encoder(keypoints),   # (batch_size, keypoint_dim)
        ]
        fused = torch.cat(embeddings, dim=1)
        return self.regressor(fused)


###############################################################################
# 4. TRAINING & TESTING FUNCTIONS
###############################################################################

def train_model(model, train_loader, epochs=10, lr=1e-3):
    """
    Train the multi-modal ``model`` on (images, keypoints, labels) batches.

    model: module called as ``model(images, kpoints)``
    train_loader: iterable of (images, kpoints, labels) batches that supports ``len()``
    epochs: number of passes over ``train_loader``
    lr: Adam learning rate

    Uses Adam with an MSE loss, prints the mean per-batch loss each epoch and
    saves the state_dict every 5th epoch to
    ``checkpoint_angle_pred_keypoints_image_TBD_epoch_<n>.pth``.

    NOTE(review): plain MSE does not account for the 0/360 wrap-around of angles.
    """
    model.to(DEVICE)
    criterion = nn.MSELoss().to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for images, kpoints, labels in tqdm(train_loader):
            images = images.to(DEVICE)
            kpoints = kpoints.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            predictions = model(images, kpoints)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if (epoch+1) % 5 == 0:
            # Built here rather than inside the batch loop, so the name is always
            # bound when a checkpoint is due (even if the loader yielded no batch).
            checkpoint_path = f"checkpoint_angle_pred_keypoints_image_TBD_epoch_{epoch+1}.pth"
            torch.save(model.state_dict(), checkpoint_path)

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")


def test_model(model, test_loader):
    model.eval()
    model.to(DEVICE)
    criterion = nn.MSELoss()
    total_loss = 0.0

    with torch.no_grad():
        for images, kpoints, labels in test_loader:
            images = images.to(DEVICE)
            kpoints = kpoints.to(DEVICE)
            labels = labels.to(DEVICE)

            preds = model(images, kpoints)
            loss = criterion(preds, labels)
            total_loss += loss.item()

            # If you want to visualize some predictions:
            # print("Predicted:", preds.squeeze().tolist())
            # print("Actual:   ", labels.squeeze().tolist())

    avg_loss = total_loss / len(test_loader)
    print(f"Test Loss: {avg_loss:.4f}")


###############################################################################
# 5. MAIN EXECUTION
###############################################################################

# 1) Load filtered data
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted data.
with open(FILTERED_DATA_PKL, 'rb') as f:
    filtered_data_list = pickle.load(f)

print(f"Loaded filtered data list, total samples: {len(filtered_data_list)}")

# 2) Create dataset
dataset = ImageKeypointDataset(filtered_data_list, transform=IMAGE_TRANSFORM)

# 3) Train/Test split
train_size = int(TRAIN_SPLIT * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so re-running the notebook yields the same train/test partition.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# 4) Dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=multimodal_collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=multimodal_collate_fn
)

# 5) Initialize multi-modal model
model = MultiModalRegressor(
    img_out_features=128,
    keypoint_dim=64,
    hidden_dim=32
)
print(model)

# 6) Train
print("Starting Training ...")
train_model(model, train_loader, epochs=EPOCHS, lr=LEARNING_RATE)

# 7) Test
print("Starting Testing ...")
test_model(model, test_loader)

In [None]:
# Circular error function
def circular_error(pred, actual):
    """
    Computes the circular error (in degrees) between predicted and actual angles.

    The raw difference is first wrapped into [0, 360), so predictions that lie
    outside one full turn (e.g. 725 or -10) are compared correctly. The result
    is the shorter way around the circle and is always in [0, 180].
    """
    diff = abs(pred - actual) % 360
    return min(diff, 360 - diff)
def test_model(model, test_loader):
    model.eval()
    model.to(DEVICE)
    criterion = nn.MSELoss()
    total_loss = 0.0

    # Optional: store predictions for further analysis
    all_preds = []
    all_labels = []
    total_circular_error = 0.0
    count = 0

    with torch.no_grad():
        for images, kpoints, labels in tqdm(test_loader):
            images = images.to(DEVICE)
            kpoints = kpoints.to(DEVICE)
            labels = labels.to(DEVICE)

            preds = model(images, kpoints)
            loss = criterion(preds, labels)
            total_loss += loss.item()

            preds_list = preds.cpu().view(-1).tolist()
            angles_list = labels.cpu().view(-1).tolist()
            all_preds.extend(preds_list)
            all_labels.extend(angles_list)
            
            for p, a in zip(preds_list, angles_list):
                err = circular_error(p, a)
                total_circular_error += err
                count += 1

    avg_loss = total_loss / len(test_loader)
    print(f"Test Loss: {avg_loss:.4f}")
    ace = total_circular_error / count
    print(f"ACE: {ace:.4f}")
# Re-evaluate the trained multi-modal model, this time also reporting the circular error.
test_model(model, test_loader)