# Updated Pipeline

In [10]:
import numpy as np
import scipy.io

# Load .npy file (all frames)
features = np.load("/Users/mymac/Projects/MainProject/Dataset/SUMme/Air_Force_One.npy")  # Shape: (10000, 128, 128, 3)

# Load .mat file (annotations)
annotations = scipy.io.loadmat("/Users/mymac/Projects/MainProject/Dataset/SUMme/Air_Force_One.mat")
print("Features shape:", features.shape)
print("Annotations keys:", annotations.keys())

# Extract relevant annotations
gt_score = annotations["gt_score"]  # Ground truth scores
nFrames = annotations["nFrames"][0][0]  # Number of annotated frames
print("Ground truth scores shape:", gt_score.shape)
print("Number of annotated frames:", nFrames)

Features shape: (10000, 128, 128, 3)
Annotations keys: dict_keys(['__header__', '__version__', '__globals__', 'all_userIDs', 'segments', 'nFrames', 'video_duration', 'FPS', 'gt_score', 'user_score'])
Ground truth scores shape: (4494, 1)
Number of annotated frames: 4494


In [11]:
# Calculate the step size to select key frames
step_size = len(features) // nFrames

# Select key frames from the features
key_frames = features[::step_size][:nFrames]  # Shape: (4494, 128, 128, 3)
print("Key frames shape:", key_frames.shape)

# Verify alignment
if len(key_frames) == nFrames:
    print("Key frames and annotations are aligned.")
else:
    print("Mismatch in key frames and annotations.")

Key frames shape: (4494, 128, 128, 3)
Key frames and annotations are aligned.


In [12]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable

# Load pre-trained ResNet-50
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove the last fully connected layer
resnet.eval()

# Preprocessing function
def preprocess_frame(frame):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return transform(frame).unsqueeze(0)

# Extract features from key frames
def extract_features(frames):
    features = []
    for frame in frames:
        input_tensor = preprocess_frame(frame)
        with torch.no_grad():
            feature = resnet(input_tensor)
        features.append(feature.squeeze().numpy())
    return np.array(features)

# Extract features from the key frames
extracted_features = extract_features(key_frames)  # Shape: (4494, 2048)
print("Extracted features shape:", extracted_features.shape)

Extracted features shape: (4494, 2048)


In [13]:
# Prepare data
X = extracted_features  # Input features (4494, 2048)
y = gt_score  # Target labels (4494, 1)
print("Features shape:", X.shape)
print("Labels shape:", y.shape)

Features shape: (4494, 2048)
Labels shape: (4494, 1)


In [14]:
import torch.nn as nn

class VideoSummarizationModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(VideoSummarizationModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Output importance score for each frame

    def forward(self, x):
        h_lstm, _ = self.lstm(x)
        out = self.fc(h_lstm)
        return torch.sigmoid(out)

# Example usage
input_size = extracted_features.shape[1]  # Feature size (2048 for ResNet-50)
hidden_size = 128
model = VideoSummarizationModel(input_size, hidden_size)
print(model)

VideoSummarizationModel(
  (lstm): LSTM(2048, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [15]:
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)

# Training loop
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 0.7240
Epoch [2/10], Loss: 0.4345
Epoch [3/10], Loss: 0.4286
Epoch [4/10], Loss: 0.4356
Epoch [5/10], Loss: 0.4296
Epoch [6/10], Loss: 0.4184
Epoch [7/10], Loss: 0.4082
Epoch [8/10], Loss: 0.4020
Epoch [9/10], Loss: 0.4000
Epoch [10/10], Loss: 0.4009


In [16]:
model.eval()
with torch.no_grad():
    val_outputs = model(X_val)
    val_loss = criterion(val_outputs, y_val)
    print(f"Validation Loss: {val_loss.item():.4f}")

Validation Loss: 0.4002


In [19]:
# updated extract feature
import cv2
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms

# Load pre-trained ResNet-50
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove the last fully connected layer
resnet.eval()

# Preprocessing function
def preprocess_frame(frame):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return transform(frame).unsqueeze(0)

# Extract features from a video
def extract_features(video_path, frame_rate=1):
    cap = cv2.VideoCapture(video_path)
    frames = []
    features = []
    frame_id = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Extract frames at the specified frame rate
        if frame_id % frame_rate == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)

            # Preprocess and extract features
            input_tensor = preprocess_frame(frame)
            with torch.no_grad():
                feature = resnet(input_tensor)
            features.append(feature.squeeze().numpy())

        frame_id += 1

    cap.release()
    return np.array(features)



In [20]:
def generate_summary(video_path, model, frame_rate=1):
    # Extract features from the video
    features = extract_features(video_path, frame_rate)
    
    # Convert features to a PyTorch tensor
    features = torch.tensor(features, dtype=torch.float32)

    # Generate importance scores
    model.eval()
    with torch.no_grad():
        scores = model(features).squeeze().numpy()

    # Select top-k frames as the summary
    k = int(0.1 * len(scores))  # Summarize 10% of the video
    top_k_indices = np.argsort(scores)[-k:]
    summary = np.zeros_like(scores)
    summary[top_k_indices] = 1

    return summary

In [21]:
# Example usage
video_path = "/Users/mymac/Projects/CrossTask/video.mp4"
summary = generate_summary(video_path, model)
print("Generated summary:", summary)

Generated summary: [1. 0. 0. ... 0. 0. 0.]


In [None]:
#visualize
import cv2

def visualize_summary(video_path, summary):
    cap = cv2.VideoCapture(video_path)
    frame_id = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Check if the frame is part of the summary
        if frame_id < len(summary) and summary[frame_id] == 1:
            cv2.imshow("Summary Frame", frame)
            cv2.waitKey(0)  # Press any key to see the next frame

        frame_id += 1

    cap.release()
    cv2.destroyAllWindows()

# Example usage
video_path = "/Users/mymac/Projects/CrossTask/video.mp4"
visualize_summary(video_path, summary)