# Environment Setup

In [None]:
# Upgrade IPython (-U), then install the extra third-party packages.
# Presumably these are needed by the project's loader/evaluation modules — confirm.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime
# may resolve to different releases than the ones behind the saved outputs.
!uv pip install -U Ipython
!uv pip install webvtt-py evaluate sacrebleu rouge_score pycocoevalcap
# BLEURT is installed directly from its GitHub repository.
!pip install git+https://github.com/google-research/bleurt.git
# Enable autoreload in mode 2 so edited modules are re-imported before each
# cell executes (no kernel restart needed while iterating on project code).
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive and switch into the project checkout stored there.
# Later cells import `loader`, `pdvc`, `evaluation` and `config`; presumably
# they resolve from this working directory — confirm.
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/e2e-slt-streaming
# Optional sync with the remote (disabled). The hard reset would discard any
# local changes in the checkout.
# !git fetch origin && git reset --hard origin/main # Don't care about local changes and overwrite everything
# !git pull
# List the working directory as a quick check that the %cd succeeded.
!ls

In [None]:
# Takes around 14-16 min on an A100 and about 30 min on a T4.
# Remove leftovers from a previous run first; presumably these are the
# directories the three archives below unpack into — confirm.
!rm -rf /tmp/auto_sat_aligned
!rm -rf /tmp/manual_sat_aligned
!rm -rf /tmp/bobsl_dwpose
# Extract each archive from Drive into /tmp (`-o` sets the output directory).
!7z x /content/drive/MyDrive/e2e-slt-streaming/dataset/auto_sat_aligned.zip -o/tmp
!7z x /content/drive/MyDrive/e2e-slt-streaming/dataset/manual_sat_aligned.zip -o/tmp
!7z x /content/drive/MyDrive/e2e-slt-streaming/dataset/BOBSL/bobsl_dwpose.zip -o/tmp

In [4]:
import gc
import torch
from functools import partial
from transformers import (
    AutoTokenizer, DeformableDetrConfig,
    TrainingArguments, Trainer,
    EarlyStoppingCallback,
)
from loader import DVCDataset, trainer_collate_fn
from pdvc import DeformableDetrForObjectDetection
from evaluation import preprocess_logits_for_metrics, compute_metrics
from config import *

# Token-length cap; forwarded as `max_tokens_len` to the datasets and the model.
MAX_TOKENS_LEN = 32
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/5.81M [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Data Loading

In [5]:
# Fast BART-base tokenizer; the same instance is handed to every dataset split.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base', use_fast=True)

# Training split: configured with `max_tries` (no stride).
train_dataset = DVCDataset(
    split='train',
    load_by='window',
    min_events=1,
    max_tries=20,
    max_tokens_len=MAX_TOKENS_LEN,
    tokenizer=tokenizer,
    seed=2025,
)

# Validation split: configured with `stride_ratio` instead of `max_tries`.
val_dataset = DVCDataset(
    split='val',
    load_by='window',
    min_events=1,
    stride_ratio=0.9,
    max_tokens_len=MAX_TOKENS_LEN,
    tokenizer=tokenizer,
    seed=2025,
)

# Report how many samples each split exposes.
print(f'Train dataset: {len(train_dataset)} samples')
print(f'Val dataset: {len(val_dataset)} samples')

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Found 1658 videos in the train split.


Building video metadata for train split: 100%|██████████| 1658/1658 [00:28<00:00, 58.20it/s]


Dataset initialized for train: 1658 videos
Window size: 15s (187 frames @ 12.5 fps)
Found 32 videos in the val split.


Building video metadata for val split: 100%|██████████| 32/32 [00:02<00:00, 14.76it/s]

Dataset initialized for val: 32 videos
Window size: 15s (187 frames @ 12.5 fps)
Train dataset: 1658 samples
Val dataset: 5788 samples





# Training Setup

In [6]:
# Per-term loss weights. The whole dict is handed to the model, and the
# ce / bbox / giou entries are reused as the Hungarian matching costs below.
weight_dict = dict(
    loss_ce=2,
    loss_bbox=0,
    loss_giou=4,
    loss_counter=2,
    loss_caption=2,
)

config = DeformableDetrConfig(
    # --- Transformer size ---
    d_model=512,
    encoder_layers=2,
    encoder_attention_heads=8,
    encoder_n_points=4,
    decoder_layers=2,
    decoder_attention_heads=8,
    decoder_n_points=4,
    # --- Inputs / queries ---
    num_feature_levels=4,  # Number of input feature levels
    num_queries=20,        # 2-3x the maximum number of events a window can have
    num_labels=1,          # Single foreground class for caption
    # --- Losses ---
    # With auxiliary losses on, a training step may spend extra time on
    # per-layer caption alignment and Hungarian matching.
    auxiliary_loss=True,
    focal_alpha=0.25,
    # Hungarian matching costs (relative weights)
    class_cost=weight_dict['loss_ce'],    # Classification error
    bbox_cost=weight_dict['loss_bbox'],   # L1 error of the bounding box coordinates
    giou_cost=weight_dict['loss_giou'],   # Generalized IoU loss of the bounding box
    # Learnt proposals (True) or ground-truth proposals (False; all losses
    # except the caption loss will be disabled)
    with_box_refine=False,
)

In [7]:
# Build the captioning detector. Vocabulary size and special-token ids are
# taken from the tokenizer so the caption head matches its token space.
model = DeformableDetrForObjectDetection(
    config=config,
    weight_dict=weight_dict,
    max_tokens_len=MAX_TOKENS_LEN,
    rnn_num_layers=1,
    cap_dropout_rate=0.1,
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
model = model.to(device)

# Count every parameter element and report the total in millions.
total_params = sum(param.numel() for param in model.parameters())
print(f'Model initialized with {total_params / 1e6:.2f}M parameters')

Model initialized with 72.73M parameters


In [8]:
# Trainer hyper-parameters. bf16 is requested only when the visible GPU
# reports bf16 support: the notebook also targets T4 runtimes, and
# `torch.cuda.is_available()` alone would ask for bf16 on any CUDA device.
training_args = TrainingArguments(      # Find out more at https://huggingface.co/docs/transformers/en/main_classes/trainer
    output_dir='/tmp',                  # Directory for checkpoints and logs
    num_train_epochs=100,               # Total number of training epochs
    save_safetensors=False,             # Disable safe serialization to avoid the error
    # Data processing
    per_device_train_batch_size=32,     # Effective batch size = per_device_batch_size x gradient_accumulation_steps x num_devices
    per_device_eval_batch_size=64,      # Faster evaluation during training
    dataloader_num_workers=4,           # Number of subprocesses to use for data loading
    # Precision & optimization
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),  # Mixed precision only on GPUs that report bf16 support
    optim='adamw_torch_fused',
    weight_decay=1e-4,                  # Low since random windows already provide regularization
    learning_rate=5e-4,                 # Linear decay learning rate
    # Reporting and saving
    report_to='wandb',                  # Whether to use trackio/wandb/tensorboard for experiment tracking
    logging_strategy='epoch',
    eval_strategy='epoch',              # Evaluate after each epoch
    save_strategy='epoch',
    save_total_limit=1,
    metric_for_best_model='eval_loss',  # Use validation loss/Bleu for early stopping
    greater_is_better=False,            # Lower loss / higher Bleu is better
    load_best_model_at_end=True,        # Load the best model based on validation loss/Bleu
)

# Train the Model

In [None]:
%%time
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=trainer_collate_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)
trainer.train()
trainer.save_model(CHECKPOINT_DIR)

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m18520339[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,29.8651,27.425827
2,26.6821,26.469093
3,25.4861,24.886063
4,24.6139,24.332266
5,24.2196,23.958889
6,23.8607,23.574055
7,23.5732,23.135437
8,23.1966,22.794626
9,23.0583,22.488495
10,22.8881,22.244268


=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fallback: pick a window that guarantees >= min_events if possible, otherwise the densest window (max contained events) within window size.
=> Fal

In [10]:
def evaluate_with_metrics(model, eval_dataset, metric_key_prefix):
    """Evaluate `model` on `eval_dataset` and return the metrics dict.

    A fresh `Trainer` is created for the evaluation. It reuses the
    notebook-level `training_args`, `trainer_collate_fn`, `tokenizer` and
    `config`, so those must already be defined when this is called.

    Args:
        model: The model to evaluate.
        eval_dataset: Dataset passed to the `Trainer` as `eval_dataset`.
        metric_key_prefix: Prefix for the returned metric names
            (e.g. 'val' or 'test').

    Returns:
        Whatever `Trainer.evaluate` returns for this prefix.
    """
    metrics_fn = partial(
        compute_metrics,
        tokenizer=tokenizer,
        temporal_iou_thresholds=(0.3, 0.5, 0.7, 0.9),
        # Ranking policy:
        #   joint_score = alpha * (caption_score / len(tokens)^T) + (1 - alpha) * det_score
        alpha=0.3,
        ranking_temperature=2.0,   # Exponent T in the caption-score length normalization
        top_k=config.num_queries,  # Should be num_queries during evaluation
        # soda_recursion_limit=2000,  # 0 to disable for faster calculations
    )
    eval_trainer = Trainer(
        args=training_args,
        model=model,
        data_collator=trainer_collate_fn,
        eval_dataset=eval_dataset,
        compute_metrics=metrics_fn,
        # Reduce the size of the logits stored on the GPU
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    return eval_trainer.evaluate(metric_key_prefix=metric_key_prefix)

# Score the in-memory model on the validation split.
evaluate_with_metrics(model, val_dataset, 'val')

{'val_loss': 19.604089736938477,
 'val_model_preparation_time': 0.0055,
 'val_loc_precision@30': 0.05952380952380953,
 'val_loc_recall@30': 0.07281030031842928,
 'val_loc_f1@30': 0.06550006574559251,
 'val_dense_bleu4@30': 0.3243391360128333,
 'val_dense_bleurt@30': -1.456469378772291,
 'val_dense_rougeL@30': 0.0826318924032099,
 'val_dense_meteor@30': 0.08320168579386872,
 'val_dense_cider@30': 0.030739543753126233,
 'val_loc_precision@50': 0.05952380952380953,
 'val_loc_recall@50': 0.03329284363084299,
 'val_loc_f1@50': 0.04270175265824996,
 'val_dense_bleu4@50': 0.22087262997021576,
 'val_dense_bleurt@50': -1.463103468307821,
 'val_dense_rougeL@50': 0.08244029649133837,
 'val_dense_meteor@50': 0.07445687803702865,
 'val_dense_cider@50': 0.029962062515243453,
 'val_loc_precision@70': 0.05952380952380953,
 'val_loc_recall@70': 0.011404766911105453,
 'val_loc_f1@70': 0.0191419370696947,
 'val_dense_bleu4@70': 0.2712545624386585,
 'val_dense_bleurt@70': -1.46104795504839,
 'val_dense_ro

In [None]:
# Move the weights off the GPU, then drop every reference created during
# training so the garbage collector and the CUDA allocator can reclaim memory.
model.cpu()
del train_dataset
del val_dataset
del model
del trainer
gc.collect()
torch.cuda.empty_cache()
# !rm -rf {CHECKPOINT_DIR}

In [None]:
# Stand-alone cleanup pass: repeats the collection and CUDA cache release
# from the previous cell, so it can be re-run on its own whenever memory
# needs to be reclaimed.
gc.collect()
torch.cuda.empty_cache()

# Evaluation on the Test Set

In [None]:
# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the weights saved by `trainer.save_model(CHECKPOINT_DIR)`.
model = DeformableDetrForObjectDetection(
    config=config,
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    rnn_num_layers=1,
    cap_dropout_rate=0.1,
    max_tokens_len=MAX_TOKENS_LEN,
    weight_dict=weight_dict
).to(device)
# `map_location=device` remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only runtime.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict should contain.
model.load_state_dict(
    torch.load(
        CHECKPOINT_DIR / 'pytorch_model.bin',
        map_location=device,
        weights_only=True,
    )
)

In [11]:
# Test split, built with the same settings as the validation split.
test_dataset = DVCDataset(
    split='test',
    load_by='window',
    min_events=1,
    stride_ratio=0.9,
    max_tokens_len=MAX_TOKENS_LEN,
    tokenizer=tokenizer,
    seed=2025,
)
print(f'Test dataset: {len(test_dataset)} samples')

# Metrics are reported under the 'test' prefix.
evaluate_with_metrics(model, test_dataset, 'test')

Found 250 videos in the test split.


Building video metadata for test split: 100%|██████████| 250/250 [00:12<00:00, 20.72it/s]

Dataset initialized for test: 250 videos
Window size: 15s (187 frames @ 12.5 fps)
Test dataset: 43763 samples





{'test_loss': 19.333114624023438,
 'test_model_preparation_time': 0.0055,
 'test_loc_precision@30': 0.41176470588235287,
 'test_loc_recall@30': 0.2952941460618881,
 'test_loc_f1@30': 0.3439365955679816,
 'test_dense_bleu4@30': 0.2537302916961953,
 'test_dense_bleurt@30': -1.4501712120202583,
 'test_dense_rougeL@30': 0.08932592469013528,
 'test_dense_meteor@30': 0.10589603530316359,
 'test_dense_cider@30': 0.036109024908359336,
 'test_loc_precision@50': 0.41176470588235287,
 'test_loc_recall@50': 0.13395517570011267,
 'test_loc_f1@50': 0.2021477149178802,
 'test_dense_bleu4@50': 0.2728071451055303,
 'test_dense_bleurt@50': -1.4441926719475413,
 'test_dense_rougeL@50': 0.09569768752989098,
 'test_dense_meteor@50': 0.10882297367522317,
 'test_dense_cider@50': 0.037764951274307734,
 'test_loc_precision@70': 0.41176470588235287,
 'test_loc_recall@70': 0.04276502307968719,
 'test_loc_f1@70': 0.0774828400803242,
 'test_dense_bleu4@70': 0.2779540516633275,
 'test_dense_bleurt@70': -1.449935387