In [1]:
# pip install -U "transformers>=4.44" "accelerate>=0.33" trl peft bitsandbytes datasets pillow

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForVision2Seq, AutoProcessor

import torch
from transformers import Qwen2_5_VLProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
from trl import SFTConfig
from peft import LoraConfig
from PIL import Image
from datasets import load_dataset
from trl import SFTTrainer
from peft import PeftModel
import json


MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"  # or "Qwen/Qwen2-VL-2B-Instruct"
OUTPUT_DIR = "qwen-vl-lora"

# (Optional) 4-bit quantization to save VRAM
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# BitsAndBytesConfig int-4 config
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)

# Load model and tokenizer

model_id = "/Users/eddie/Telementoring/train_vlm/qwen_mse7"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, **model_kwargs)
processor = Qwen2_5_VLProcessor.from_pretrained(MODEL_ID)

# LoRA config (tweak r/alpha/targets as needed)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.38s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [2]:
args = SFTConfig(
    output_dir="object-detection",     # directory to save and repository id
    num_train_epochs=3,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=5,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    metric_for_best_model="eval_loss",  # Metric to evaluate the best model
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    push_to_hub=False,             # push model to hub
    report_to="tensorboard",       
    # use_gradient_checkpointing=False,# report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use reentrant checkpointing
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator

)
args.remove_unused_columns = False # important for collator

In [14]:

from datasets import load_dataset

# Load a local CSV file
dataset = load_dataset("csv", data_files="/Users/eddie/Telementoring/preprocessing/P02_38_50_good.csv", usecols = ["video_id", "right_hand","text"])
# dataset = dataset.shuffle()
# View the first row
dataset = dataset.rename_column("video_id", "image")
dataset = dataset.rename_column("right_hand", "answer")
dataset = dataset.rename_column("text", "question")
print(dataset["train"][0])

Generating train split: 1855 examples [00:00, 33120.35 examples/s]

{'answer': '[(703.6103820800781, 518.3125915527344), (703.8306274414062, 518.5762329101562), (704.1045532226562, 519.0954284667969), (704.3289184570312, 519.3901977539062), (704.8181762695312, 519.8074951171875), (704.8181762695312, 519.8074951171875), (704.6837158203125, 519.9284515380859), (704.8555297851562, 519.8917388916016), (705.4415893554688, 520.666259765625), (705.5805358886719, 520.6741027832031), (704.9831237792969, 520.5289459228516), (704.9831237792969, 520.5289459228516), (705.1738891601562, 520.7425079345703), (705.184814453125, 521.0217895507812), (704.9457702636719, 521.0758209228516), (704.3121032714844, 521.3522491455078), (704.4117431640625, 521.9154052734375), (704.4117431640625, 521.9154052734375), (704.566162109375, 521.8283081054688), (703.8301086425781, 522.2403259277344), (703.7429504394531, 522.2540130615234), (703.636474609375, 522.3168182373047), (703.1723327636719, 522.2032470703125), (703.1723327636719, 522.2032470703125), (702.7111206054688, 522.1723480




In [15]:

# Example dataset: replace with your own
# expected columns: image (path or PIL), question (str), answer (str)

def to_messages(row):
    # If row["image"] is a path, keep it; collator will open it.
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": row["image"]},
            {"type": "text", "text": "Where should I next move my hand after these steps:" + row["question"] +"? Output the next 50 trajectory points as an entire array"},
        ]},
        {"role": "assistant", "content": [
            {"type": "text", "text": row["answer"]}
        ]},
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"text": text, "image": row["image"]}

# keep_cols = ["left_hand", "text"]
ds = dataset.map(to_messages)

Map: 100%|██████████| 1855/1855 [00:00<00:00, 9382.47 examples/s] 


In [16]:
print(ds["train"])

Dataset({
    features: ['answer', 'question', 'image', 'text'],
    num_rows: 1855
})


In [17]:

# Collator: builds batch from (text + image) using the same processor
from PIL import Image
root_dir = "/Users/eddie/Downloads/temp/"
def collate_fn(examples):
    # print(examples[0])
    texts  = [ex["text"] for ex in examples]
    # labels = [ex["label"] for ex in examples]
    # print(texts)
    imgs   = []
    for ex in examples:
        img = ex["image"]
        if isinstance(img, str):
            img = root_dir+img+".jpg"
            img = Image.open(img).convert("RGB")
            # img.show(title=f"Image {1}")
        imgs.append(img)
    inputs = processor(text=texts, images=imgs, return_tensors="pt", padding=True)
    inputs["labels"] = inputs["input_ids"].clone()
    # inputs["labels"] = inputs["input_ids"]
    # move tensors to device in the trainer step; leaving on CPU is fine here
    # print("Labels IDs:", inputs["labels"][0])
    # print("Labels text:", processor.tokenizer.decode(inputs["labels"][0]))
    return inputs

In [None]:
import torch.nn as nn
import numpy as np
from ast import literal_eval
import re
# Select token IDs based on softmax probabilities
def clean_string(text: str) -> str:
    # # Step 1: Remove everything except digits, (), [], ., and ,
    # text = re.sub(r"[^0-9()\[\].,]", "", text)
    
    # # Step 2: Remove periods that are NOT between digits
    # text = re.sub(r"(?<!\d)\.|\.{2,}|(?!\d)\.", "", text)
    
    return text
class CustomSFTTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super(CustomSFTTrainer, self).__init__(*args, **kwargs)
        
    def extract_last_assistant(text: str) -> str | None:
        sep = "\nassistant\n"
        if sep not in text:
            return None
        return text.rsplit(sep, 1)[-1].strip()


    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # img = Image.open("/Users/eddie/Downloads/temp/P02_38/frame_000001.jpg").convert("RGB")
        # messages = [
        #     {"role": "system", "content": "You are a helpful assistant."},
        #     {"role": "user", "content": [
        #         {"type": "image", "image": img},
        #         {"type": "text", "text": "Where should I next move my hand after these steps: incise skin? Output the next 10 trajectory points as an entire array"}
        #     ]},
        #     # {"role": "assistant", "content": [
        #     #         {"type": "text", "text": "what"}
        #     # ]},
        # ]
        # # model_id = "/Users/eddie/Telementoring/train_vlm/object-detection/checkpoint-849"
        # print(type(model))
        # outputs = model(**inputs)
        model.eval()
        inputs_no_labels = {k: v for k, v in inputs.items() if k != "labels"}
        # print(inputs_no_labels)
        # # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **model_kwargs).to("cuda")
        # # processor = Qwen2_5_VLProcessor.from_pretrained(model_id)
        # # Tokenize
        # texts = processor.apply_chat_template(
        #     messages, tokenize=False, add_generation_prompt=False, return_tensors="pt"
        # )
        # print("MODEL")
        # print(type(model))
        # print(texts)
        # inputs = processor(text=texts, images=img, return_tensors="pt", padding=True).to("cuda")
        model.to("cuda")
        gen_ids = model.generate(**inputs_no_labels, max_new_tokens=2048, do_sample=True, temperature=0.7, top_p=0.9)
        # print(gen_ids)
        out = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
        # print("Predicted:" +out)
       # get label and prediction tokens
        labels = inputs.get("labels")
        # print(inputs)
        # # print(outputs)
        # # print(type(inputs))
        # inputs_no_labels = {k: v for k, v in inputs.items() if k != "labels"}
        # # print(inputs_no_labels)
        # print(inputs)
        # print(inputs_no_labels)
        # # with torch.amp.autocast(device_type="cpu",dtype=torch.bfloat16):
        # #     # model.eval()
        # #     print(inputs_no_labels["input_ids"])
        # #     predictions = model.generate(**inputs_no_labels, max_new_tokens=2048, do_sample=True, temperature=0.7, top_p=0.9)
        # predictions = outputs.get("logits")
        # last_ten = predictions.flatten()[-10:]
        # print(last_ten)
        # print(labels.flatten()[-10:])
        # # decode predictions and labels
        # # predicted_token_ids = torch.argmax(predictions, dim=-1)
        # print(labels)
        
        decoded_predictions = processor.batch_decode(gen_ids, skip_special_tokens=True)
        # decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

        # function to output quantities to a list    
        # print(decoded_predictions)
        # print(decoded_labels)
        predicted_quantities, actual_quantities = (decoded_predictions[0], labels[0])
        # print("predicted: ")
        # print(predicted_quantities)
        # print("label")
        # print(actual_quantities)
        # print(type(predicted_quantities),type(actual_quantities))
        prefix, sep, predicted_after = predicted_quantities.partition("\nassistant\n")
        # prefix, sep, actual_after = actual_quantities.partition("\nassistant\n")
        # predicted_quant = [extract_last_assistant(s) for s in predicted_quantities]

        # print("predicted: ")
        print(clean_string(predicted_after))
        # print("actual:")
        print(clean_string(actual_quantities))
        # print(type(predicted_after))
        try:
            predicted_after = literal_eval(clean_string(predicted_after))
        except Exception:
            # return 10000
            return nn.MSELoss()(torch.tensor([0.], device='cuda:0', dtype=torch.float16, requires_grad=True), torch.tensor([10000.], device='cuda:0', dtype=torch.float16))
        try:
            actual_after = literal_eval(clean_string(actual_quantities))
        except Exception:
            # return 10000
            return nn.MSELoss()(torch.tensor([0.], device='cuda:0', dtype=torch.float16, requires_grad=True), torch.tensor([10000.], device='cuda:0', dtype=torch.float16))
        # print("type")
        # print(type(predicted_after))
        alt_predicted_quantities = []
        for val in predicted_after:
            if val==10000:
                alt_predicted_quantities.append((10000,10000))
            else:
                try:
                    alt_predicted_quantities.append(val[0:2])
                except Exception:
                    # return 10000
                    return nn.MSELoss()(torch.tensor([0.], device='cuda:0', dtype=torch.float16, requires_grad=True), torch.tensor([10000.], device='cuda:0', dtype=torch.float16))
        predicted_after = alt_predicted_quantities
        # print(predicted_after)
        
        predicted_tensor = torch.tensor(predicted_after, device=model.device, dtype=torch.float16)
        actual_tensor = torch.tensor(actual_after, device=model.device,dtype=torch.float16)

        predicted_tensor.requires_grad_()
        
        # Compute MSE loss
        loss_function = nn.MSELoss()
        try: 
            loss = loss_function(predicted_tensor, actual_tensor)
        except Exception:
            # return 10000
            return nn.MSELoss()(torch.tensor([0.], device='cuda:0', dtype=torch.float16, requires_grad=True), torch.tensor([10000.], device='cuda:0', dtype=torch.float16))
        print(loss)
        return (loss, outputs) if return_outputs else loss

In [18]:
from peft import LoraConfig, get_peft_model, TaskType
# model = get_peft_model(model, peft_config)
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
)

# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

# Save the final model again to the Hugging Face Hub
# trainer.save_model()

# # free the memory again

  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
5,36.2543
10,33.453
15,31.2284
20,26.3706
25,22.034
30,20.806
35,19.4367
40,19.0336
45,18.3426
50,17.9314


KeyboardInterrupt: 

In [None]:
# del model
# del trainer
# torch.cuda.empty_cache()


In [19]:
trainer.save_model("qwen_mse9")

In [None]:
import numpy as np

def _l2(a, b, axis=-1, eps=1e-9):
    """Euclidean distance with tiny floor for stability."""
    return np.sqrt(np.maximum(np.sum((a - b) ** 2, axis=axis), eps))

def ade_single(y_true: np.ndarray,
               y_pred: np.ndarray,
               mask: np.ndarray | None = None) -> float:
    """
    Average Displacement Error for a single predicted path.

    Args:
        y_true: (T, D) or (N, T, D)
        y_pred: (T, D) or (N, T, D) — same shape as y_true
        mask:   optional (T,) or (N, T) boolean/0-1 (valid timesteps)

    Returns:
        Scalar ADE (float).
    """
    # Normalize shapes to (N, T, D)
    if y_true.ndim == 2:
        y_true = y_true[None, ...]
        y_pred = y_pred[None, ...]
        if mask is not None and mask.ndim == 1:
            mask = mask[None, ...]
    elif y_true.ndim != 3:
        raise ValueError("y_true must be (T, D) or (N, T, D)")

    if y_pred.shape != y_true.shape:
        raise ValueError("y_pred must have the same shape as y_true")

    N, T, _ = y_true.shape
    d = _l2(y_pred, y_true, axis=-1)  # (N, T)

    if mask is not None:
        if mask.shape == (T,):
            mask = mask[None, :]
        if mask.shape != (N, T):
            raise ValueError("mask must be (T,) or (N, T)")
        d = d * mask
        denom = np.clip(mask.sum(axis=1), 1e-9, None)  # per-sample valid steps
    else:
        denom = np.full(N, T, dtype=float)

    per_sample = d.sum(axis=1) / denom
    return float(per_sample.mean())

def fde_single(y_true: np.ndarray,
               y_pred: np.ndarray) -> float:
    """
    Final Displacement Error for a single predicted path.

    Args:
        y_true: (T, D) or (N, T, D)
        y_pred: same shape as y_true

    Returns:
        Scalar FDE (float).
    """
    # Normalize to (N, T, D)
    if y_true.ndim == 2:
        y_true = y_true[None, ...]
        y_pred = y_pred[None, ...]
    elif y_true.ndim != 3:
        raise ValueError("y_true must be (T, D) or (N, T, D)")

    gt_last = y_true[:, -1, :]  # (N, D)
    pr_last = y_pred[:, -1, :]  # (N, D)
    per_sample = _l2(pr_last, gt_last, axis=-1)  # (N,)
    return float(per_sample.mean())


In [10]:

model_id = "/Users/eddie/Telementoring/train_vlm/qwen_mse8"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **model_kwargs).to("cuda")
processor = Qwen2_5_VLProcessor.from_pretrained(model_id)
model.eval()


Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.78s/it]


Qwen2_5_VLForConditionalGeneration(
  (model): Qwen2_5_VLModel(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-31): 32 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((1280,), eps=1e-06)
          (norm2): Qwen2RMSNorm((1280,), eps=1e-06)
          (attn): Qwen2_5_VLVisionAttention(
            (qkv): Linear4bit(in_features=1280, out_features=3840, bias=True)
            (proj): Linear4bit(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear4bit(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear4bit(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear4bit(in_features=3420, out_features=1280, bias=True)
        

In [13]:
i = 0
ades = []
best_paths = []
frames = []
fdes = []
from ast import literal_eval
# while i<800:
while i < (len(dataset["train"])):
    if i not in [0, 399, 799, 1199, 1599]:
        i=i+1
        continue
    root_dir = "/Users/eddie/Downloads/temp/"
    img = Image.open(root_dir+dataset["train"][i]["image"]+".jpg").convert("RGB")
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": "Where should I next move my hand after these steps: " + dataset["train"][i]["question"] + "? Output the next 10 trajectory points as an entire array"}
        ]},
        # {"role": "assistant", "content": [
        #         {"type": "text", "text": dataset["train"][i]["answer"]}
        # ]},
    ]

    # Tokenize
    texts = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False, return_tensors="pt"
    )

    inputs = processor(text=texts, images=img, return_tensors="pt", padding=True).to("cuda")

    print(inputs)

    gen_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, temperature=0.7, top_p=0.9)
    print(gen_ids)
    out = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
    print("Predicted:" +out)
    T, D = 10, 2
    y_true = dataset["train"][i]["answer"]
    y_true = literal_eval(y_true)
    prefix, sep, out = out.partition("\nassistant\n")
    # y_pred = literal_eval(out)
    try: 
        y_pred = literal_eval(out)
    except Exception:
        i=i+1
        continue
    y_pred_new = []
    for val in y_pred:
        if val==10000:
            y_pred_new.append((10000,10000))
        else:
            y_pred_new.append(val[0:2])
    y_pred = y_pred_new
    print(type(y_true), y_true)
    print(type(y_pred), y_pred)
    y_true = np.array(y_true)
    try:
        y_pred = np.array(y_pred)
    except Exception:
        i=i+1
        continue
    # Optionally ignore the last 2 timesteps:
    # mask = np.ones(T, dtype=float); mask[-2:] = 0.0
    try:
        ade = ade_single(y_true, y_pred)
        fde = fde_single(y_true,y_pred)
    except Exception:
        # i=i+1
        continue
    if ade<100:
        ades.append(ade)
        fdes.append(fde)
        best_paths.append(y_pred)
        frames.append(root_dir+dataset["train"][i]["image"]+".jpg")
        print(ade,y_pred,root_dir+dataset["train"][i]["image"]+".jpg")
        i = i+1
    print("ADE:", ade)
    print("FDE:", fde)


{'input_ids': tensor([[151644,   8948,    198,  ...,   1334, 151645,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802],
        [-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802],
        [-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802],
        ...,
        [-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802],
        [-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802],
        [-1.7923, -1.7923, -1.7923,  ..., -1.4802, -1.4802, -1.4802]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 66, 92]], device='cuda:0')}
tensor([[151644,   8948,    198,  ...,     19,   7252, 151645]],
       device='cuda:0')
Predicted:system
You are a helpful assistant.
user
Where should I next move my hand after these steps: incise skin? Output the next 10 trajectory points as an entire array
assistant
[(739.2485302734375,

In [None]:
print(len(dataset["train"][:36]))

3


In [None]:
print(len(frames), frames)

3 ['/Users/eddie/Downloads/temp/P02_38/frame_000001.jpg', '/Users/eddie/Downloads/temp/P02_38/frame_000800.jpg', '/Users/eddie/Downloads/temp/P02_38/frame_001600.jpg']


In [None]:
import pandas as pd
df = pd.DataFrame({
    "frame": frames,
    "path": best_paths,
    "ade": ades,
    "fde" : fdes
})
df.to_csv("res4.csv", index=False)

In [None]:
import numpy as np

def _l2(a, b, axis=-1, eps=1e-9):
    """Euclidean distance with tiny floor for stability."""
    return np.sqrt(np.maximum(np.sum((a - b) ** 2, axis=axis), eps))

def ade_single(y_true: np.ndarray,
               y_pred: np.ndarray,
               mask: np.ndarray | None = None) -> float:
    """
    Average Displacement Error for a single predicted path.

    Args:
        y_true: (T, D) or (N, T, D)
        y_pred: (T, D) or (N, T, D) — same shape as y_true
        mask:   optional (T,) or (N, T) boolean/0-1 (valid timesteps)

    Returns:
        Scalar ADE (float).
    """
    # Normalize shapes to (N, T, D)
    if y_true.ndim == 2:
        y_true = y_true[None, ...]
        y_pred = y_pred[None, ...]
        if mask is not None and mask.ndim == 1:
            mask = mask[None, ...]
    elif y_true.ndim != 3:
        raise ValueError("y_true must be (T, D) or (N, T, D)")

    if y_pred.shape != y_true.shape:
        raise ValueError("y_pred must have the same shape as y_true")

    N, T, _ = y_true.shape
    d = _l2(y_pred, y_true, axis=-1)  # (N, T)

    if mask is not None:
        if mask.shape == (T,):
            mask = mask[None, :]
        if mask.shape != (N, T):
            raise ValueError("mask must be (T,) or (N, T)")
        d = d * mask
        denom = np.clip(mask.sum(axis=1), 1e-9, None)  # per-sample valid steps
    else:
        denom = np.full(N, T, dtype=float)

    per_sample = d.sum(axis=1) / denom
    return float(per_sample.mean())

def fde_single(y_true: np.ndarray,
               y_pred: np.ndarray) -> float:
    """
    Final Displacement Error for a single predicted path.

    Args:
        y_true: (T, D) or (N, T, D)
        y_pred: same shape as y_true

    Returns:
        Scalar FDE (float).
    """
    # Normalize to (N, T, D)
    if y_true.ndim == 2:
        y_true = y_true[None, ...]
        y_pred = y_pred[None, ...]
    elif y_true.ndim != 3:
        raise ValueError("y_true must be (T, D) or (N, T, D)")

    gt_last = y_true[:, -1, :]  # (N, D)
    pr_last = y_pred[:, -1, :]  # (N, D)
    per_sample = _l2(pr_last, gt_last, axis=-1)  # (N,)
    return float(per_sample.mean())


In [None]:
# Example with one path (T,D)
T, D = 10, 2
y_true = [(469.1567077636719, 411.8211669921875), (469.4229278564453, 412.4986877441406), (469.78053283691406, 412.5917053222656), (470.10475158691406, 413.09332275390625), (470.45166015625, 413.42640686035156), (470.45166015625, 413.42640686035156), (470.95152282714844, 413.11997985839844), (471.043212890625, 414.3573303222656), (471.07945251464844, 414.34645080566406), (471.6600799560547, 414.26133728027344)]
       # a straight line
out = [(386.7945824848633, 433.70822369628906), (386.7945824848633, 433.70822369628906), (386.3790470654297, 433.63887890625), (386.3790470654297, 433.63887890625), (386.1921204003906, 433.48274365234375), (385.90791701953125, 432.94184847998046), (385.3577286669922, 432.326904296875), (385.3577286669922, 432.326904296875), (384.5634478876953, 431.5358971191406), (385.2377613769531, 431.63368359375)]

y_true = np.array(y_true)
y_pred = np.array(out)

# Optionally ignore the last 2 timesteps:
# mask = np.ones(T, dtype=float); mask[-2:] = 0.0

print("ADE:", ade_single(y_true, y_pred, ))
print("FDE:", fde_single(y_true, y_pred))


ADE: 86.77641859928823
FDE: 88.15109508691934
