In [3]:
import kagglehub

# Fetch the TVSum50 dataset through kagglehub; the call's return value is kept
# in `path` and echoed below as the location of the dataset files.
path = kagglehub.dataset_download("georgelifinrell/tvsum50-video-summarization")

# Print the resolved location so the path constants configured later in the
# notebook can be checked against it.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/tvsum50-video-summarization


In [4]:
import os
import glob
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, applications
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, LSTM, Bidirectional, Dense, Layer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import requests 

In [5]:
# Dataset root; this is the same directory the kagglehub download cell prints.
NEW_KAGGLE_ROOT = "/kaggle/input/tvsum50-video-summarization"
# Video whose keyframes are extracted and displayed after training.
TEST_VIDEO_ID = "-esJrBWj2d8"

SKIP_FRAMES = 15   # keep one frame out of every SKIP_FRAMES when decoding a video
TOP_K = 5          # number of highest-scoring frames shown as keyframes
EPOCHS = 10        # passes over the video set during training
LR = 1e-4          # Adam learning rate

# Both locations are derived from the dataset root instead of being left empty /
# relative: an empty VIDEO_DIR can never pass the os.path.isdir() check in
# setup_paths(), and a bare file name only resolves if the working directory
# happens to contain the annotations.
# NOTE(review): the "video" and "data" sub-directories are taken from the paths
# reported in a previous run's output — confirm against the mounted dataset.
VIDEO_DIR = os.path.join(NEW_KAGGLE_ROOT, "video")
LABEL_FILE = os.path.join(NEW_KAGGLE_ROOT, "data", "ydata-tvsum50-anno.tsv")

print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.19.0


In [None]:
def setup_paths():
    """Validate the configured input locations and report each check.

    Checks, in order, that VIDEO_DIR is an existing directory, that it holds
    at least one ``*.mp4`` file, and that LABEL_FILE is an existing file.
    The first failing check prints an error and stops the validation.

    Returns:
        bool: True when all three checks pass, False otherwise.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("PATH VALIDATION")
    print(rule)

    # 1) the video directory itself
    directory_ok = os.path.isdir(VIDEO_DIR)
    if not directory_ok:
        print(f"ERROR: VIDEO_DIR not found: {VIDEO_DIR}")
        return False
    print(f"Video directory: {VIDEO_DIR}")

    # 2) at least one .mp4 inside it
    mp4_pattern = os.path.join(VIDEO_DIR, "*.mp4")
    mp4_paths = glob.glob(mp4_pattern)
    if len(mp4_paths) == 0:
        print(f"ERROR: No .mp4 files in {VIDEO_DIR}")
        return False
    print(f"Found {len(mp4_paths)} video files")

    # 3) the annotation file
    annotations_ok = os.path.isfile(LABEL_FILE)
    if not annotations_ok:
        print(f"ERROR: Label file not found: {LABEL_FILE}")
        return False
    print(f"Annotation file: {os.path.basename(LABEL_FILE)}")

    print(rule)
    return True

In [None]:
def load_all_tsv_ids():
    """Collect the distinct video IDs listed in the annotation TSV.

    Every line of LABEL_FILE is stripped and split on tabs; lines with at
    least three fields contribute their first field. Any exception while
    reading is printed rather than raised, and whatever IDs were gathered up
    to that point are returned.

    Returns:
        set: The video IDs found (empty if the file could not be read).
    """
    ids = set()
    try:
        with open(LABEL_FILE, 'r') as tsv:
            for raw_row in tsv:
                fields = raw_row.strip().split('\t')
                if len(fields) < 3:
                    # Not an annotation row (too few columns).
                    continue
                ids.add(fields[0])
    except Exception as e:
        print(f"Error reading TSV: {e}")
    return ids

def get_manual_tvsum_labels(video_id, total_frames, valid_ids):
    """Build a per-frame target score vector for one video from LABEL_FILE.

    Rows whose first tab-separated field equals ``video_id`` each supply a
    comma-separated list of floats in their third field. Those lists are
    averaged element-wise, stretched to ``total_frames`` values with
    nearest-neighbour resizing, and min-max scaled (1e-6 is added to the
    denominator so a constant score vector does not divide by zero).

    Args:
        video_id: ID to look up in the first TSV column.
        total_frames: Number of values the returned vector must have.
        valid_ids: Collection of known IDs; unknown IDs short-circuit to None.

    Returns:
        A flattened NumPy array of ``total_frames`` scores, or None when the
        ID is unknown, no row matches, or any exception occurs (the exception
        is printed, not raised).
    """
    if video_id not in valid_ids:
        return None

    try:
        with open(LABEL_FILE, 'r') as tsv:
            rows = [raw.strip().split('\t') for raw in tsv.readlines()]

        # One score array per matching annotation row.
        per_annotator = [
            np.array([float(token) for token in fields[2].split(',')])
            for fields in rows
            if len(fields) >= 3 and fields[0] == video_id
        ]
        if len(per_annotator) == 0:
            return None

        mean_scores = np.mean(per_annotator, axis=0)
        # Treat the scores as a 1-row image so cv2 can resample them to the
        # requested number of frames.
        resampled = cv2.resize(
            mean_scores.reshape(1, -1),
            (total_frames, 1),
            interpolation=cv2.INTER_NEAREST,
        )
        lowest = resampled.min()
        highest = resampled.max()
        normalized = (resampled - lowest) / (highest - lowest + 1e-6)
        return normalized.flatten()
    except Exception as e:
        print(f"Error getting labels for {video_id}: {e}")
        return None

def preprocess_video(video_path):
    """Decode a video and keep every SKIP_FRAMES-th frame, converted to RGB.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        tuple: ``(frames, original_frames)`` as NumPy arrays built from the
        kept frames — ``frames`` holds them resized to 224x224, and
        ``original_frames`` holds the same frames at their decoded size.
        Both arrays are empty when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    original_frames = []
    count = 0

    # try/finally guarantees the capture handle is released even if colour
    # conversion or resizing raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % SKIP_FRAMES == 0:
                # OpenCV decodes to BGR; everything downstream works in RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                original_frames.append(frame_rgb)
                frames.append(cv2.resize(frame_rgb, (224, 224)))
            count += 1
    finally:
        cap.release()

    return np.array(frames), np.array(original_frames)

In [None]:
def build_feature_extractor():
    """Create the ResNet50 backbone used to turn frames into feature vectors.

    The network is built without its classification head, with ImageNet
    weights and average pooling.

    Returns:
        The model object returned by ``applications.ResNet50``.
    """
    backbone_options = {
        "include_top": False,
        "weights": 'imagenet',
        "pooling": 'avg',
    }
    backbone = applications.ResNet50(**backbone_options)
    return backbone

def extract_features(frames, model):
    """Run a batch of frames through the feature-extraction model.

    Args:
        frames: Array of frames; it is cast to float32 and passed through
            the ResNet50 ``preprocess_input`` before prediction.
        model: Object exposing ``predict`` (e.g. the backbone returned by
            ``build_feature_extractor``).

    Returns:
        Whatever ``model.predict`` returns for the preprocessed frames, or an
        empty NumPy array when ``frames`` is empty.
    """
    if not len(frames):
        return np.array([])

    as_float = frames.astype('float32')
    prepared = applications.resnet50.preprocess_input(as_float)
    features = model.predict(prepared, batch_size=32, verbose=0)
    return features

class SelfAttention(Layer):
    """Additive attention that yields one weight per position along axis 1.

    The input is projected to ``units`` dimensions, squashed with tanh,
    projected to a single logit, and the logits are softmax-normalised over
    axis 1.
    """

    def __init__(self, units):
        super().__init__()
        # Attribute names are kept as-is: they identify the sub-layers.
        self.W1 = Dense(units)
        self.V = Dense(1)

    def call(self, features):
        """Return softmax weights over axis 1 for ``features``.

        ``features`` is presumably (batch, time, dim) — TODO confirm; the
        result has a trailing dimension of 1 from the final Dense(1).
        """
        hidden = tf.nn.tanh(self.W1(features))
        logits = self.V(hidden)
        weights = tf.nn.softmax(logits, axis=1)
        return weights

class BiLSTMSummarizer(tf.keras.Model):
    """Bidirectional-LSTM frame scorer with attention-reweighted features.

    A bidirectional LSTM encodes the frame-feature sequence, SelfAttention
    produces one weight per time step, and a sigmoid Dense(1) head emits one
    score per frame.

    Args:
        hidden_dim: Units of each LSTM direction (default 256).
    """

    def __init__(self, hidden_dim=256):
        super().__init__()
        self.lstm = Bidirectional(LSTM(hidden_dim, return_sequences=True))
        self.attention = SelfAttention(64)
        self.regressor = Dense(1, activation='sigmoid')

    def call(self, x):
        """Score every frame of the input sequence.

        Args:
            x: Frame features, presumably (batch, time, feature_dim) — the
                training loop passes a single-video batch built with
                ``np.expand_dims(feats, axis=0)``.

        Returns:
            Tensor with one sigmoid score in (0, 1) per time step (trailing
            dimension of 1).
        """
        lstm_out = self.lstm(x)
        att_weights = self.attention(lstm_out)

        # The attention weights are a softmax over the time axis, so they sum
        # to 1 across all frames. Multiplying the sigmoid scores by them (the
        # previous behaviour) capped the *sum* of all outputs at 1, which makes
        # it impossible to regress per-frame targets that are min-max scaled to
        # [0, 1]. Instead, use the weights to re-weight the LSTM features and
        # let the sigmoid head produce the final per-frame score. Scaling by
        # the sequence length keeps the average weight at 1 regardless of how
        # many frames the video has.
        seq_len = tf.cast(tf.shape(lstm_out)[1], lstm_out.dtype)
        attended = lstm_out * (att_weights * seq_len)
        return self.regressor(attended)

In [None]:

def _prepare_training_example(v_path, valid_ids, feat_model):
    """Decode one video and return its (features, labels) batch, or None if unusable."""
    v_id = os.path.splitext(os.path.basename(v_path))[0]
    if v_id not in valid_ids:
        return None

    frames, _ = preprocess_video(v_path)
    if len(frames) == 0:
        return None

    labels = get_manual_tvsum_labels(v_id, len(frames), valid_ids)
    if labels is None:
        return None

    feats = extract_features(frames, feat_model)
    # Add the batch axis (one video per batch) and a trailing score axis.
    x = np.expand_dims(feats, axis=0)
    y = np.expand_dims(labels, axis=0)[..., np.newaxis]
    return x, y


def train_supervised_tf(video_dir):
    """Train the Bi-LSTM summarizer on every annotated ``*.mp4`` in a directory.

    Each video is one training step per epoch: its sampled frames are turned
    into ResNet50 features, the summarizer predicts per-frame scores, and the
    mean-squared error against the annotation-derived labels is minimised
    with Adam (learning rate ``LR``) for ``EPOCHS`` epochs.

    The feature extractor is only used through ``predict`` and its variables
    are never passed to the optimizer, so a video's features and labels are
    identical in every epoch. They are therefore computed once and cached in
    memory instead of re-decoding each video and re-running the CNN per epoch.

    Args:
        video_dir: Directory searched (non-recursively) for ``*.mp4`` files.

    Returns:
        tuple: ``(model, feat_model)`` after training, with the summarizer's
        weights also written to ``model1.weights.h5``. ``(None, None)`` is
        returned when the annotation file yields no IDs or when no video
        could be used for training (nothing is saved in that case).
    """
    print("\n" + "="*70)
    print("TRAINING MODEL 1: Bi-LSTM (Supervised)")
    print("="*70)

    valid_ids = load_all_tsv_ids()
    print(f"Loaded {len(valid_ids)} video IDs from annotations")

    if len(valid_ids) == 0:
        print("❌ No valid IDs found")
        return None, None

    feat_model = build_feature_extractor()
    model = BiLSTMSummarizer()
    optimizer = optimizers.Adam(learning_rate=LR)
    mse = tf.keras.losses.MeanSquaredError()

    video_files = glob.glob(os.path.join(video_dir, "*.mp4"))
    print(f"Training on {len(video_files)} videos for {EPOCHS} epochs\n")

    # v_path -> (x, y), or None for videos that cannot be used. Filled lazily
    # during the first epoch; later epochs only run the lightweight Bi-LSTM.
    examples = {}
    trained_any = False

    for epoch in range(EPOCHS):
        total_loss = 0
        count = 0
        np.random.shuffle(video_files)

        for v_path in video_files:
            if v_path not in examples:
                examples[v_path] = _prepare_training_example(v_path, valid_ids, feat_model)
            example = examples[v_path]
            if example is None:
                continue
            x, y = example

            with tf.GradientTape() as tape:
                preds = model(x, training=True)
                # Defensive: compare only the overlapping time steps.
                min_len = min(preds.shape[1], y.shape[1])
                loss = mse(y[:, :min_len, :], preds[:, :min_len, :])

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            total_loss += loss.numpy()
            count += 1

        if count > 0:
            trained_any = True
            print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/count:.4f} | Videos: {count}")
        else:
            print("❌ No matching videos found")
            break

    if not trained_any:
        # The summarizer was never called, so there is nothing meaningful to
        # save or return; signal failure the same way as the missing-ID case.
        return None, None

    model.save_weights("model1.weights.h5")
    print(f"\nModel saved: model1.weights.h5")
    return model, feat_model


def visualize_keyframes(original_frames, indices, title):
    """Show the selected frames side by side in a single-row figure.

    Args:
        original_frames: Indexable collection of images.
        indices: Positions in ``original_frames`` to display; they are shown
            in ascending order regardless of the order given.
        title: Figure-level title.
    """
    ordered = sorted(indices)
    n_panels = len(ordered)

    plt.figure(figsize=(15, 4))
    plt.suptitle(title, fontsize=16, fontweight='bold', y=1.05)

    # One subplot per keyframe, numbered from 1 as matplotlib expects.
    for panel, frame_idx in enumerate(ordered, start=1):
        plt.subplot(1, n_panels, panel)
        plt.imshow(original_frames[frame_idx])
        plt.axis('off')
        plt.title(f"Frame {frame_idx}")

    plt.tight_layout()
    plt.show()
    

# Driver: validate paths, train the summarizer, then show the TOP_K
# highest-scoring sampled frames of TEST_VIDEO_ID.
if __name__ == "__main__":
    if setup_paths():
        # TRAIN
        # NOTE(review): training globs every .mp4 in VIDEO_DIR and the test
        # clip is looked up in that same directory, so the test video appears
        # to be part of the training data — confirm whether a held-out split
        # is intended.
        model1, feat_extractor = train_supervised_tf(VIDEO_DIR)

        # TEST
        print("\n" + "="*70)
        print(f"TESTING ON: {TEST_VIDEO_ID}")
        print("="*70)

        test_search = glob.glob(os.path.join(VIDEO_DIR, f"*{TEST_VIDEO_ID}*"))

        if test_search:
            test_path = test_search[0]
            print(f"Video: {os.path.basename(test_path)}")

            frames, orig_frames = preprocess_video(test_path)
            print(f"Extracted {len(frames)} frames (every {SKIP_FRAMES}th frame)\n")

            # Model 1
            if model1 is None:
                # Training returned no model; say so instead of silently
                # skipping inference.
                print("❌ Model 1 unavailable - training did not return a model")
            elif len(frames) > 0:
                print("Model 1 (BiLSTM):")
                feats = extract_features(frames, feat_extractor)
                x = np.expand_dims(feats, axis=0)
                scores = model1.predict(x, verbose=0).flatten()
                # Indices (into the sampled frames) of the TOP_K highest
                # scores, best first.
                idx1 = scores.argsort()[-TOP_K:][::-1]
                # .tolist() yields plain ints so the printed list reads
                # "[3, 17, ...]" rather than NumPy scalar reprs.
                print(f"  Keyframes: {sorted(idx1.tolist())}")
                visualize_keyframes(orig_frames, idx1, "Model 1: BiLSTM")

        else:
            print(f"❌ Video not found: {TEST_VIDEO_ID}")
            # List a few available IDs to help pick a valid TEST_VIDEO_ID.
            all_videos = glob.glob(os.path.join(VIDEO_DIR, "*.mp4"))
            print(f"\nAvailable videos ({len(all_videos)} total):")
            for v in all_videos[:10]:
                print(f"  - {os.path.splitext(os.path.basename(v))[0]}")
            if len(all_videos) > 10:
                print(f"  ... and {len(all_videos) - 10} more")
    else:
        print("\n❌ Setup failed - check paths above")

--- Configuring Paths ---
✅ Found Video Directory: /kaggle/input/tvsum50-video-summarization/video (50 videos)
✅ Found Local Annotation File: /kaggle/input/tvsum50-video-summarization/data/ydata-tvsum50-anno.tsv

=== Training Model 1 (Bi-LSTM) ===
DEBUG: Label file contains 50 valid IDs.
Train videos: 40, Val videos: 10
Epoch 1/10 - train_loss: 0.1816, val_loss: 0.1837 (used 40 train / 10 val videos)
✅ Best model updated (val_loss=0.1837)
Epoch 2/10 - train_loss: 0.1806, val_loss: 0.1835 (used 40 train / 10 val videos)
✅ Best model updated (val_loss=0.1835)
Epoch 3/10 - train_loss: 0.1801, val_loss: 0.1835 (used 40 train / 10 val videos)
Epoch 4/10 - train_loss: 0.1799, val_loss: 0.1836 (used 40 train / 10 val videos)
Epoch 5/10 - train_loss: 0.1797, val_loss: 0.1832 (used 40 train / 10 val videos)
✅ Best model updated (val_loss=0.1832)
Epoch 6/10 - train_loss: 0.1795, val_loss: 0.1838 (used 40 train / 10 val videos)
Epoch 7/10 - train_loss: 0.1793, val_loss: 0.1835 (used 40 train / 10