In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# archive from it and write the fine-tuned model back. Colab-only.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip -q "/content/drive/MyDrive/action-video.zip" -d "/content/"

In [None]:
!pip install -q transformers evaluate decord

In [None]:
import inspect
import os
import random
from pathlib import Path

import torch
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from tqdm.auto import tqdm
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
from transformers import TrainingArguments, Trainer
import evaluate
import decord
from decord import VideoReader, cpu
from PIL import Image

# Configure decord's bridge to 'torch'. NOTE(review): the cells visible here
# read frames from JPEG files via PIL and never call decord's VideoReader.
decord.bridge.set_bridge('torch')

# Config: everything a reader might want to tune lives here.
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"
BATCH_SIZE = 4
EPOCHS = 10
NUM_FRAMES = 16
LEARNING_RATE = 5e-5
SEED = 42

# Relative paths: resolved against the notebook's working directory.
TRAIN_DIR = Path('action-video/data/data_train')
TEST_DIR = Path('action-video/data/test')

# Seed every RNG up front so that anything random that runs before the Trainer
# is constructed (notably the re-initialised classification head created when
# the checkpoint is loaded with a different number of labels) is reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
class VideoMAEDataset(Dataset):
    """Dataset of videos stored as folders of JPEG frames.

    Expected layout: ``root_dir/<class_name>/<video_folder>/*.jpg``.
    Class ids are assigned by sorting the class folder names. Each item is a
    fixed-length clip of ``num_frames`` frames run through ``image_processor``.

    Args:
        root_dir: Directory containing one sub-directory per class
            (``str`` or ``pathlib.Path``).
        image_processor: Callable invoked as
            ``image_processor(list_of_HxWx3_uint8_arrays, return_tensors="pt")``
            that returns a mapping with a ``"pixel_values"`` tensor.
        num_frames: Number of frames sampled from every video.
        is_train: Stored on the instance; not read anywhere in this class.

    Attributes:
        classes: Sorted class folder names.
        class2id / id2class: Mappings between class name and integer id.
        samples: List of ``(sorted_frame_paths, class_id)`` tuples; video
            folders that contain no ``*.jpg`` file are left out.
    """

    # Shape of the black placeholder used when a frame cannot be decoded.
    _FALLBACK_FRAME_SHAPE = (224, 224, 3)

    def __init__(self, root_dir, image_processor, num_frames=16, is_train=True):
        # Accept plain strings as well as Path objects.
        self.root = Path(root_dir)
        self.image_processor = image_processor
        self.num_frames = num_frames
        self.is_train = is_train

        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.class2id = {c: i for i, c in enumerate(self.classes)}
        self.id2class = {i: c for c, i in self.class2id.items()}

        self.samples = []
        for cls in self.classes:
            cls_path = self.root / cls
            for vid_folder in sorted(d for d in cls_path.iterdir() if d.is_dir()):
                frames = sorted(str(p) for p in vid_folder.glob('*.jpg'))
                if frames:
                    self.samples.append((frames, self.class2id[cls]))

    def __len__(self):
        """Return the number of videos that have at least one frame."""
        return len(self.samples)

    def _get_indices(self, total_frames):
        """Pick ``self.num_frames`` frame indices out of ``total_frames``.

        Short videos (``total_frames <= num_frames``) use every frame once and
        then repeat the last frame as padding. Longer videos are sampled
        uniformly, taking the frame at the centre of each of ``num_frames``
        equal-width segments.

        Returns:
            1-D integer ``numpy`` array of length ``self.num_frames``.

        Raises:
            ValueError: If ``total_frames`` is not positive.
        """
        if total_frames <= 0:
            raise ValueError(f"total_frames must be positive, got {total_frames}")

        if total_frames <= self.num_frames:
            # 0, 1, ..., total_frames-1, then the last index repeated.
            indices = np.minimum(np.arange(self.num_frames), total_frames - 1)
        else:
            tick = total_frames / float(self.num_frames)
            indices = np.array(
                [int(tick / 2.0 + tick * x) for x in range(self.num_frames)]
            )

        return indices.astype(int)

    def _load_frame(self, img_path):
        """Decode one image file into an ``HxWx3`` uint8 RGB array."""
        # The context manager closes the file handle deterministically;
        # convert() forces the pixel data to be read before that happens.
        with Image.open(img_path) as img:
            return np.array(img.convert("RGB"))

    def __getitem__(self, idx):
        """Return the processed clip and label for sample ``idx``.

        Returns:
            dict with
              * ``"pixel_values"``: the processor output with its leading
                batch axis removed (assumes the processor returns a batch of
                one video, as VideoMAEImageProcessor does for a single list of
                frames - TODO confirm for other processors).
              * ``"labels"``: 0-dim tensor holding the class id.
        """
        frame_paths, label = self.samples[idx]
        indices = self._get_indices(len(frame_paths))

        video = []
        for i in indices:
            img_path = frame_paths[int(i)]
            try:
                video.append(self._load_frame(img_path))
            except Exception as e:
                # Deliberate best-effort: report the unreadable frame and
                # substitute a black one so a single bad file does not abort
                # the whole run.
                print(f"Lỗi đọc ảnh: {img_path} - {e}")
                video.append(np.zeros(self._FALLBACK_FRAME_SHAPE, dtype=np.uint8))

        # The processor handles resizing, cropping and normalisation.
        inputs = self.image_processor(video, return_tensors="pt")

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the frame axis whenever num_frames == 1.
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": torch.tensor(label),
        }

In [None]:
image_processor = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)

full_dataset = VideoMAEDataset(TRAIN_DIR, image_processor, num_frames=NUM_FRAMES)
# Fail here with a clear message rather than deep inside training.
assert len(full_dataset) > 0, f"No training videos found under {TRAIN_DIR}"

# 80/20 train/validation split; the seeded generator keeps it identical across runs.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_ds, val_ds = random_split(full_dataset, [train_size, val_size],
                                generator=torch.Generator().manual_seed(42))

print(f"Train: {len(train_ds)}, Val: {len(val_ds)}")
print(f"Classes: {len(full_dataset.classes)}")

# Load the VideoMAE model.
# ignore_mismatched_sizes=True allows the final classification head to be
# replaced: the checkpoint was fine-tuned on Kinetics (400 classes), while the
# head here is sized from this dataset's label maps (the original note says
# 51 HMDB classes).
model = VideoMAEForVideoClassification.from_pretrained(
    MODEL_CKPT,
    label2id=full_dataset.class2id,
    id2label=full_dataset.id2class,
    ignore_mismatched_sizes=True,
    num_frames=NUM_FRAMES
)

In [None]:
# Accuracy metric shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into an accuracy score.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

# Training configuration.
args = TrainingArguments(
    output_dir="videomae-hmdb51",
    # Keep every key the dataset returns; the items are plain dicts, not
    # named dataset columns.
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    # Reload the checkpoint with the best validation accuracy once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    # Mixed precision needs a CUDA device; fall back to fp32 instead of
    # failing at construction time on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # Effective batch size = BATCH_SIZE * gradient_accumulation_steps.
    gradient_accumulation_steps=4,
    dataloader_num_workers=2
)

# Build the Trainer.
# Newer transformers releases accept the processor as `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`. Pick
# whichever name the installed Trainer exposes so the cell runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processor_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    # None selects the Trainer's default collator.
    data_collator=None,
    **{_processor_kwarg: image_processor},
)

In [None]:
# Run fine-tuning with the Trainer configured above.
print("Bắt đầu Fine-tuning VideoMAE...")
trainer.train()

# Save the final model to Google Drive.
# NOTE(review): the TrainingArguments cell sets load_best_model_at_end=True, so
# the weights written here should be the best checkpoint rather than the last
# epoch - confirm against the training log.
trainer.save_model("/content/drive/MyDrive/videomae_best_model")
print("Đã lưu model tốt nhất")

In [None]:
# Reload the fine-tuned model from Drive and switch it to inference mode.
device = "cuda" if torch.cuda.is_available() else "cpu"
trained_model = VideoMAEForVideoClassification.from_pretrained(
    "/content/drive/MyDrive/videomae_best_model"
).to(device)
trained_model.eval()


def _numeric_id(path):
    """Return the folder name as an int, or None when it is not a number."""
    try:
        return int(path.name)
    except ValueError:
        return None


# Test set layout: TEST_DIR/<numeric video id>/*.jpg.
# Folders whose name is not a number (e.g. .ipynb_checkpoints) are ignored
# instead of crashing the int() sort key.
test_folders = sorted(
    (d for d in TEST_DIR.iterdir() if d.is_dir() and _numeric_id(d) is not None),
    key=_numeric_id,
)

predictions = []
skipped_ids = []  # videos without any frame; reported after the loop

print("Running Inference on Test Set...")
with torch.no_grad():
    for vid_dir in tqdm(test_folders):
        vid_id = int(vid_dir.name)
        frames = sorted(str(p) for p in vid_dir.glob('*.jpg'))

        if not frames:
            skipped_ids.append(vid_id)
            continue

        # 1. Preprocess: same frame sampling as training, then the processor.
        indices = full_dataset._get_indices(len(frames))
        video = []
        for i in indices:
            # Context manager closes each file handle promptly.
            with Image.open(frames[i]) as img:
                video.append(np.array(img.convert("RGB")))

        inputs = image_processor(video, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 2. Predict. The id -> class-name mapping is read from the checkpoint's
        # own config so it always matches the weights that were loaded.
        logits = trained_model(**inputs).logits
        pred_label_idx = logits.argmax(-1).item()
        pred_class = trained_model.config.id2label[pred_label_idx]

        predictions.append((vid_id, pred_class))

if skipped_ids:
    # These ids get no row in the submission; make that visible.
    print(f"Warning: {len(skipped_ids)} test video(s) had no .jpg frames and were skipped: {skipped_ids}")

with open('submission.csv', 'w') as f:
    f.write('id,class\n')
    for vid, cls in sorted(predictions, key=lambda x: x[0]):
        f.write(f'{vid},{cls}\n')

print("Submission saved!")