In [None]:
# ============================================================================
# Extract Visual Features with ViT for CausalVidQA
# ============================================================================
# Objectives:
# - Read video IDs from the pkl split files (train.pkl, valid.pkl, test.pkl)
# - Extract visual features with ViT `google/vit-large-patch16-224-in21k`
# - Save the features as .h5 (same format as the existing visual-feature files)
# - Upload them to HuggingFace for later reuse
#
# Output format (compatible with DataLoader.py):
# - vit_visual_feat.h5: contains "resnet_features" with shape (N_videos, T, D)
# - idx2vid.pkl: list of video_ids ordered by index
# ============================================================================

# ============================================================================
# 0. Install libraries
# ============================================================================
%pip install -q "transformers>=4.40.0" "huggingface_hub>=0.24.0" opencv-python tqdm h5py



In [None]:
import os
import pickle
from pathlib import Path

import cv2
import h5py
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoImageProcessor, AutoModel
from huggingface_hub import HfApi, login

# ============================================================================
# 1. CONFIGURATION - ADJUST TO MATCH YOUR DATASET
# ============================================================================

class Config:
    """Static settings for the ViT feature-extraction run.

    Every value is a class attribute; later cells read them through the
    module-level ``cfg`` instance.
    """
    # ==================== KAGGLE PATHS ====================
    # Directory holding the split files (train.pkl, valid.pkl, test.pkl)
    SPLIT_PATH = "/kaggle/input/casual-vid-data-split/split"

    # Directory holding the raw videos
    VIDEO_ROOT = "/kaggle/input/causalvid-videos"  # TODO: adjust to your dataset

    # ==================== OUTPUT ====================
    # Directory where the extracted features are written
    OUTPUT_DIR = "/kaggle/working/vit_visual_features"

    # ==================== MODEL ====================
    VIT_NAME = "google/vit-large-patch16-224-in21k"

    # Maximum number of frames taken per video (uniformly sampled)
    NUM_FRAMES = 16

    # Compute device
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # ==================== HUGGINGFACE ====================
    # Credentials are read from the environment so the token never has to be
    # typed into (and saved with) the notebook. Both default to "" which makes
    # the upload step skip itself.
    HF_TOKEN = os.environ.get("HF_TOKEN", "")      # token used for the upload
    HF_REPO_ID = os.environ.get("HF_REPO_ID", "")  # e.g. "your-username/causalvid-vit-features"

cfg = Config()

# Make sure the output directory exists before any later cell writes into it
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Echo the active settings as an aligned "NAME : value" table
rule = "=" * 60
print(rule)
print("CONFIGURATION")
print(rule)
for label, value in (
    ("SPLIT_PATH", cfg.SPLIT_PATH),
    ("VIDEO_ROOT", cfg.VIDEO_ROOT),
    ("OUTPUT_DIR", cfg.OUTPUT_DIR),
    ("VIT_NAME", cfg.VIT_NAME),
    ("NUM_FRAMES", cfg.NUM_FRAMES),
    ("DEVICE", cfg.DEVICE),
):
    print(f"{label:<10} : {value}")
print(rule)


In [None]:
# ============================================================================
# 2. LOAD VIDEO IDs FROM PKL FILES
# ============================================================================

def load_pkl(path):
    """Return the object unpickled from the file at `path`.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files from a trusted source.
    """
    with open(path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

# Gather the video IDs of every split (train, valid, test) into one set
all_vids = set()

for split_name in ('train', 'valid', 'test'):
    # The primary file name is tried first; only the validation split has a
    # second spelling (val.pkl) to fall back on.
    candidate_names = [f'{split_name}.pkl']
    if split_name == 'valid':
        candidate_names.append('val.pkl')

    for file_name in candidate_names:
        file_path = os.path.join(cfg.SPLIT_PATH, file_name)
        if not os.path.exists(file_path):
            continue
        vids = load_pkl(file_path)
        print(f"Loaded {len(vids)} videos from {file_name}")
        all_vids.update(vids)
        break
    else:
        # No candidate existed: report the primary file name as missing
        missing_file = os.path.join(cfg.SPLIT_PATH, candidate_names[0])
        print(f"[WARN] Not found: {missing_file}")

# Sorted list so downstream iteration order is reproducible
all_vids = sorted(all_vids)
print(f"\n‚úÖ Total unique videos: {len(all_vids)}")


In [None]:
# ============================================================================
# 3. LOAD THE ViT MODEL
# ============================================================================

print("Loading ViT model and processor ...")
processor = AutoImageProcessor.from_pretrained(cfg.VIT_NAME)

# Load the backbone, move it to the configured device and switch to eval mode
model = AutoModel.from_pretrained(cfg.VIT_NAME)
model = model.to(cfg.DEVICE)
model = model.eval()

vit_config = model.config
HIDDEN_DIM = vit_config.hidden_size  # 1024 for vit-large

print(f"‚úÖ Loaded {cfg.VIT_NAME}")
print(f"   Hidden size: {HIDDEN_DIM}")
print(f"   Image size : {getattr(vit_config, 'image_size', 224)}")
print(f"   Patch size : {getattr(vit_config, 'patch_size', 16)}")


In [None]:
# ============================================================================
# 4. UTILITY FUNCTIONS
# ============================================================================

# Extensions probed for each video id, in priority order.
_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv', '.mov', '.webm')

# str(video_root) -> {file name: shallowest path carrying that name}.
# Re-running this cell resets the cache.
_VIDEO_INDEX_CACHE = {}


def _build_video_index(video_root):
    """Walk `video_root` once and map each video file name to its shallowest path."""
    root_depth = len(video_root.parts)
    best = {}  # file name -> (depth below root, path)
    for dirpath, dirnames, filenames in os.walk(video_root):
        dirnames.sort()  # deterministic traversal, so ties resolve the same way every run
        folder = Path(dirpath)
        depth = len(folder.parts) - root_depth
        for name in sorted(filenames):
            if not name.endswith(_VIDEO_EXTENSIONS):
                continue
            # Prefer the copy closest to the root (a file directly in the root always wins)
            if name not in best or depth < best[name][0]:
                best[name] = (depth, folder / name)
    return {name: path for name, (_, path) in best.items()}


def find_video_path(video_id, video_root):
    """Locate the video file for `video_id` under `video_root`.

    The directory tree is scanned once per root and the result is cached, so
    looking up many ids no longer re-walks the tree for every id and extension.
    File names are matched exactly, so ids containing glob metacharacters
    (``[``, ``*``, ``?``) are found as well.

    Args:
        video_id: identifier of the video; the file name is ``f"{video_id}{ext}"``.
        video_root: directory (str or Path) searched recursively.

    Returns:
        Path of the match, or None when no file exists. Extensions are tried
        in the order of ``_VIDEO_EXTENSIONS``; for one extension, a file
        directly in `video_root` is preferred over files in subfolders.

    Note:
        Files added under an already-indexed root are not seen until this
        cell is re-run. An empty scan is not cached, so a root that was
        empty or missing is rescanned on the next call.
    """
    video_root = Path(video_root)
    cache_key = str(video_root)

    index = _VIDEO_INDEX_CACHE.get(cache_key)
    if index is None:
        index = _build_video_index(video_root)
        if index:
            _VIDEO_INDEX_CACHE[cache_key] = index

    for ext in _VIDEO_EXTENSIONS:
        match = index.get(f"{video_id}{ext}")
        if match is not None:
            return match

    return None


def load_video_frames(video_path, num_frames=16, target_size=(224, 224)):
    """Decode up to `num_frames` evenly spaced RGB frames from a video.

    Args:
        video_path: path of the video file (converted with ``str`` for OpenCV).
        num_frames: upper bound on the number of frames sampled.
        target_size: size forwarded to ``cv2.resize`` for every frame.

    Returns:
        List of resized RGB frames. It can be shorter than `num_frames` when
        the video has fewer frames or when individual reads fail.

    Raises:
        RuntimeError: the video cannot be opened, reports no frames, or none
            of the sampled frames could be decoded.
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")

        # Some containers report a negative count; treat anything <= 0 as
        # "no frames" instead of passing a negative sample count to np.linspace.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise RuntimeError(f"Video has 0 frames: {video_path}")

        # Evenly spaced frame indices across the whole clip
        num = min(num_frames, total_frames)
        indices = np.linspace(0, total_frames - 1, num=num, dtype=int)

        frames = []
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ok, frame_bgr = cap.read()
            if not ok:
                # Skip unreadable frames rather than failing the whole video
                continue
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            frame_rgb = cv2.resize(frame_rgb, target_size, interpolation=cv2.INTER_AREA)
            frames.append(frame_rgb)
    finally:
        # Always free the decoder handle, including on every error path above
        cap.release()

    if len(frames) == 0:
        raise RuntimeError(f"No frames decoded from: {video_path}")

    return frames


@torch.no_grad()
def extract_vit_features(video_path):
    """Extract one visual feature per sampled frame from the ViT [CLS] token.

    Uses the module-level ``processor``, ``model`` and ``cfg``.

    Args:
        video_path: path of the video to encode.

    Returns:
        frame_feat: float32 array of shape (T, D), one row per decoded frame.

    Raises:
        RuntimeError: propagated from ``load_video_frames`` when the video
            cannot be decoded.
    """
    # Derive the frame size from the processor configuration. Image processors
    # describe their size either as {"shortest_edge": s} or as
    # {"height": h, "width": w}; older versions may expose a bare int.
    size = getattr(processor, "size", None)
    target_size = (224, 224)
    if isinstance(size, dict):
        if "shortest_edge" in size:
            target_size = (size["shortest_edge"],) * 2
        elif "height" in size and "width" in size:
            # cv2.resize expects (width, height)
            target_size = (size["width"], size["height"])
    elif isinstance(size, int):
        target_size = (size, size)

    frames = load_video_frames(video_path, num_frames=cfg.NUM_FRAMES, target_size=target_size)

    # Build the batch tensor and move it to the model's device
    inputs = processor(images=frames, return_tensors="pt")
    inputs = {k: v.to(cfg.DEVICE) for k, v in inputs.items()}

    outputs = model(**inputs)
    hidden = outputs.last_hidden_state  # (T, 1 + P, D)

    # The [CLS] token is the first token -> used as the frame's visual feature
    cls_tok = hidden[:, 0]  # (T, D)
    frame_feat = cls_tok.cpu().numpy()

    return frame_feat.astype(np.float32)

print("‚úÖ Helper functions defined")


In [None]:
# ============================================================================
# 5. EXTRACT FEATURES FOR ALL VIDEOS
# ============================================================================

rule = "=" * 60
print(rule)
print("EXTRACTING ViT FEATURES")
print(rule)

# Accumulators filled by the loop below
features_dict = {}  # video_id -> (T, D)
idx2vid = []        # index -> video_id
failed_vids = []    # (video_id, reason) for videos that could not be extracted

for vid in tqdm(all_vids, desc="Extracting ViT features"):
    video_path = find_video_path(vid, cfg.VIDEO_ROOT)
    if video_path is None:
        failed_vids.append((vid, "Video file not found"))
        continue

    # Best effort: one bad video is recorded and skipped, it does not abort the run
    try:
        frame_feat = extract_vit_features(video_path)
    except Exception as e:
        failed_vids.append((vid, str(e)))
    else:
        features_dict[vid] = frame_feat
        idx2vid.append(vid)

print(f"\n‚úÖ Extracted features for {len(features_dict)} videos")
if failed_vids:
    print(f"‚ùå Failed: {len(failed_vids)} videos")
    # Only the first five failures are echoed here
    for vid, err in failed_vids[:5]:
        print(f"   - {vid}: {err}")


In [None]:
# ============================================================================
# 6. SAVE FEATURES AS H5 (compatible with DataLoader.py)
# ============================================================================

print("=" * 60)
print("SAVING FEATURES")
print("=" * 60)

if len(features_dict) == 0:
    print("‚ùå No features to save!")
else:
    # Longest sequence decides the padded time dimension
    max_frames = max(feat.shape[0] for feat in features_dict.values())
    feat_dim = HIDDEN_DIM
    n_videos = len(idx2vid)

    print(f"Number of videos: {n_videos}")
    print(f"Max frames: {max_frames}")
    print(f"Feature dim: {feat_dim}")

    # Build the (N, T, D) array; videos with fewer frames are zero-padded at the end
    all_features = np.zeros((n_videos, max_frames, feat_dim), dtype=np.float32)

    for i, vid in enumerate(idx2vid):
        feat = features_dict[vid]
        T = feat.shape[0]
        all_features[i, :T, :] = feat

    # Write the h5 file; the dataset key stays 'resnet_features' for DataLoader.py compatibility
    h5_path = os.path.join(cfg.OUTPUT_DIR, "vit_visual_feat.h5")
    with h5py.File(h5_path, 'w') as f:
        f.create_dataset('resnet_features', data=all_features, compression='gzip')
    print(f"‚úÖ Saved: {h5_path}")
    print(f"   Shape: {all_features.shape}")

    # Write idx2vid.pkl: row i of the h5 dataset belongs to idx2vid[i]
    idx2vid_path = os.path.join(cfg.OUTPUT_DIR, "idx2vid.pkl")
    with open(idx2vid_path, 'wb') as f:
        pickle.dump(idx2vid, f)
    print(f"‚úÖ Saved: {idx2vid_path}")
    print(f"   Videos: {len(idx2vid)}")

# Write the failure list for debugging. This runs even when no feature was
# extracted, because that is exactly when the reasons are needed.
if failed_vids:
    failed_path = os.path.join(cfg.OUTPUT_DIR, "failed_videos.txt")
    with open(failed_path, 'w', encoding='utf-8') as f:
        for vid, err in failed_vids:
            # Collapse whitespace so a multi-line error stays on one tab-separated line
            reason = " ".join(str(err).split())
            f.write(f"{vid}\t{reason}\n")
    print(f"‚ö†Ô∏è Saved failed list: {failed_path}")


In [None]:
# ============================================================================
# 7. UPLOAD TO HUGGINGFACE HUB
# ============================================================================

# The upload needs both a token and a target repo id
hub_configured = bool(cfg.HF_TOKEN and cfg.HF_REPO_ID)

if not hub_configured:
    print("\n[INFO] HF_TOKEN ho·∫∑c HF_REPO_ID ch∆∞a ƒë∆∞·ª£c c·∫•u h√¨nh.")
    print("       B·ªè qua b∆∞·ªõc upload. Features ƒë√£ ƒë∆∞·ª£c l∆∞u local t·∫°i:")
    print(f"       {cfg.OUTPUT_DIR}")
else:
    rule = "=" * 60
    print(rule)
    print("UPLOADING TO HUGGINGFACE")
    print(rule)

    login(token=cfg.HF_TOKEN)
    api = HfApi()

    # Create the dataset repo if it does not exist yet
    api.create_repo(cfg.HF_REPO_ID, repo_type="dataset", exist_ok=True)

    # Upload the whole output directory
    api.upload_folder(
        repo_id=cfg.HF_REPO_ID,
        folder_path=cfg.OUTPUT_DIR,
        repo_type="dataset",
    )

    print(f"‚úÖ Uploaded to: https://huggingface.co/datasets/{cfg.HF_REPO_ID}")


In [None]:
# ============================================================================
# 8. VERIFY - RE-CHECK THE SAVED FEATURES
# ============================================================================

print("=" * 60)
print("VERIFICATION")
print("=" * 60)

h5_path = os.path.join(cfg.OUTPUT_DIR, "vit_visual_feat.h5")
idx2vid_path = os.path.join(cfg.OUTPUT_DIR, "idx2vid.pkl")

if os.path.exists(h5_path) and os.path.exists(idx2vid_path):
    # Re-open the saved files and inspect them
    with h5py.File(h5_path, 'r') as f:
        feats = f['resnet_features']
        n_feature_rows = feats.shape[0]
        print(f"‚úÖ vit_visual_feat.h5")
        print(f"   Shape: {feats.shape}")
        print(f"   Dtype: {feats.dtype}")
        
        # Look at one sample video
        sample_feat = feats[0]
        print(f"   Sample video[0] shape: {sample_feat.shape}")
        print(f"   Sample video[0] mean: {sample_feat.mean():.4f}")
    
    with open(idx2vid_path, 'rb') as f:
        loaded_idx2vid = pickle.load(f)
    print(f"\n‚úÖ idx2vid.pkl")
    print(f"   Number of videos: {len(loaded_idx2vid)}")
    print(f"   First 5 videos: {loaded_idx2vid[:5]}")

    # Row i of the h5 dataset is looked up through idx2vid[i]; a length
    # mismatch means the two files come from different runs.
    assert n_feature_rows == len(loaded_idx2vid), (
        f"Feature rows ({n_feature_rows}) do not match idx2vid entries ({len(loaded_idx2vid)})"
    )
    
    print("\n" + "=" * 60)
    print("üìã S·ª¨ D·ª§NG TRONG DATALOADER:")
    print("=" * 60)
    print(f"""
# Trong DataLoader.py, thay ƒë·ªïi ƒë∆∞·ªùng d·∫´n:
video_feature_path = "{cfg.OUTPUT_DIR}"

# Ho·∫∑c sau khi upload l√™n HuggingFace:
# video_feature_path = "/path/to/downloaded/vit_visual_features"

# File structure:
#   {cfg.OUTPUT_DIR}/
#   ‚îú‚îÄ‚îÄ vit_visual_feat.h5   # (N, T, {HIDDEN_DIM})
#   ‚îî‚îÄ‚îÄ idx2vid.pkl          # list of video_ids

# L∆∞u √Ω: C·∫ßn s·ª≠a DataLoader.py ƒë·ªÉ load "vit_visual_feat.h5" 
# thay v√¨ "appearance_feat.h5" v√† "motion_feat.h5"
""")
else:
    print("‚ùå Feature files not found!")


In [None]:
# ============================================================================
# 9. SUMMARY
# ============================================================================

rule = "=" * 60
print(rule)
print("‚úÖ EXTRACTION COMPLETE!")
print(rule)

# Closing recap: output locations, feature settings and follow-up steps
summary_text = f"""
üìÇ Output files:
   - {cfg.OUTPUT_DIR}/vit_visual_feat.h5
   - {cfg.OUTPUT_DIR}/idx2vid.pkl

üìä Feature info:
   - Model: {cfg.VIT_NAME}
   - Feature dim: {HIDDEN_DIM}
   - Frames per video: {cfg.NUM_FRAMES}

üîÑ ƒê·ªÉ s·ª≠ d·ª•ng features n√†y trong training:
   1. Upload l√™n HuggingFace (ƒë√£ l√†m n·∫øu HF_TOKEN ƒë∆∞·ª£c c·∫•u h√¨nh)
   2. S·ª≠a DataLoader.py ƒë·ªÉ load vit_visual_feat.h5
   3. Ho·∫∑c t·∫°o DataLoader m·ªõi cho ViT features

üéØ Next steps:
   - Download features t·ª´ HuggingFace
   - C·∫≠p nh·∫≠t DataLoader.py
   - Ch·∫°y training v·ªõi ViT features
"""
print(summary_text)
