In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# AGS 070 - AVENet Demo on VGGSound

!pip install torchvision torchaudio opencv-python einops -q

import torch
import torchvision.transforms as T
import torchaudio
import cv2
import numpy as np
import matplotlib.pyplot as plt
from einops import rearrange
from PIL import Image

# 🎥 Load Sample Video + Audio
video_path = "vggsound_sample.mp4"  # use a VGGSound sample or your own

# 📥 Load Video Frames
def extract_video_frames(video_path, max_frames=8):
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while True:
        ret, frame = cap.read()
        if not ret or count >= max_frames:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, (224, 224))
        frames.append(frame)
        count += 1
    cap.release()
    return torch.tensor(np.stack(frames)).permute(0, 3, 1, 2).float() / 255.

# 🎧 Load Audio
def extract_audio_waveform(video_path):
    waveform, sample_rate = torchaudio.load(video_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform

video_frames = extract_video_frames(video_path)
audio_waveform = extract_audio_waveform(video_path)

# 🧠 Dummy AVENet-like Network (for demo – replace with trained model)
class DummyAVENet(torch.nn.Module):
    def forward(self, frames, audio):
        b, t, c, h, w = frames.shape
        saliency_map = torch.rand(h, w)  # Fake saliency
        return saliency_map

# Simulated Inference
frames = rearrange(video_frames, 't c h w -> 1 t c h w')
model = DummyAVENet()
saliency = model(frames, audio_waveform)

# 🖼️ Show Result
def show_saliency_map(frame, saliency_map):
    sal = saliency_map.cpu().numpy()
    sal = cv2.resize(sal, (224, 224))
    heatmap = cv2.applyColorMap((sal * 255).astype(np.uint8), cv2.COLORMAP_JET)
    overlay = cv2.addWeighted(frame, 0.6, heatmap, 0.4, 0)
    plt.imshow(overlay)
    plt.axis('off')
    plt.title("Sound Localization in Frame")
    plt.show()

show_saliency_map(video_frames[0].permute(1, 2, 0).numpy(), saliency)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

ValueError: need at least one array to stack