In [1]:
import cv2
import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Activation
from tensorflow.keras.activations import sigmoid

In [3]:
# Load pre-trained ResNet model without top classification layer
resnet = ResNet50(weights='imagenet', include_top=False, input_shape=(None, None, 3))

# Add layers for global average pooling and sigmoid activation
x = GlobalAveragePooling2D()(resnet.output)
x = Dense(512, activation='relu')(x)
x = Dense(11)(x)  # K is the desired number of output features
x = Activation('sigmoid')(x)

# Define the model
model = Model(inputs=resnet.input, outputs=x)

In [4]:
# Path to the video file
video_path = r"F:\Sound_of_Pixels\Dataset\MUSIC_solo_videos\flute\14th Song-- D Gray-Man flute cover.mp4"  # Replace with the path to your video file

# Open the video file
cap = cv2.VideoCapture(video_path)

# Initialize an empty list to store per-frame features
per_frame_features = []

# Iterate over each frame of the video
while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Preprocess frame (e.g., resize, normalize) as required
    frame = cv2.resize(frame, (224, 224))  # Resize to match ResNet input size
    frame = frame / 255.0  # Normalize pixel values
    
    # Extract features using ResNet model
    features = model.predict(np.expand_dims(frame, axis=0))
    
    # Append the features to the list
    per_frame_features.append(features)

# Perform temporal pooling (e.g., average pooling) over the per-frame features
pooled_features = np.mean(per_frame_features, axis=0)

# Apply sigmoid activation to obtain visual features for each pixel
visual_features = sigmoid(pooled_features)

# Close the video file
cap.release()

# Visual features for each pixel are stored in the variable `visual_features`


In [5]:
print(visual_features)

tf.Tensor(
[[0.6057217  0.64887434 0.64728737 0.6704617  0.53219426 0.6707192
  0.60998183 0.5842934  0.6718442  0.56825125 0.67055124]], shape=(1, 11), dtype=float32)
