# Notebook to replicate the Shot boundaries using Popatov et als Procedure for the SumMe dataset

The procedure followed listed verbatim from the paper is as follows:

1. The video is sub-sampled by skipping every sth frame (s = 5),
2. MultiScale SIFT  features are extracted and PCA is applied for dimen-
sionality reduction, leading to a set of 64 features,
3. For all of the sub-sampled frame features, a Gaussian Mixture Model is
trained with 128 Gaussians,
4. Finally, a Fisher vector  is extracted for each frame.

This notebook should be run in environment 2 which uses python version 3.10. the full set of instructions to setup this environment can be seen in the Setup.md


In [1]:
import cv2
import numpy as np
from sklearn.decomposition import PCA
import time
from skimage.feature import fisher_vector, ORB, learn_gmm
import h5py
from Utils import calculate_metrics,kts
import json
import os
import skimage

ModuleNotFoundError: No module named 'torch'

In [2]:

skimage.__version__

'0.24.0'

In [3]:

def g(seq):
    # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
    #lambda version by Tony Veijalainen
    ''' This returns a list of indices sorted based on the values in the original array
    '''
    return [x for x,y in sorted(enumerate(seq), key = lambda x: x[1])]

def popatov_feat_extract(video_path):
    '''A description of the video feature extraction used by Popatov et al on Category Specific video summarization. Which is described as SIFT feature extraction, PCA and Fisher model'''
    cap = cv2.VideoCapture(video_path)

    all_descriptors = []
    frame_count = 0
    sift = cv2.SIFT_create() # Create the sift feature extracot
    pca = PCA(n_components=64)
    start = time.time()
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        if frame_count % 5 == 0:  # Process every 5th frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            _,descriptors = sift.detectAndCompute(frame,None)
            if descriptors is not None and descriptors.shape[0]>64:
                all_descriptors.append(pca.fit_transform(descriptors))# Apply PCA to the SIFT features

        frame_count += 1
    end = time.time()
    cap.release()
    print(all_descriptors[0].shape)
    print(f'Time to read video detect sift and normalize: {end-start}')
    k = 128
    start = time.time()

    gmm = learn_gmm(all_descriptors, n_modes=k)
    end = time.time()

    print(f'Time to learn GMM  : {end-start}')
    
    def normalize(fisher_vector):
        fisher_vector = (fisher_vector-np.mean(fisher_vector,axis=0))/(np.std(fisher_vector,axis=0))  #Mentioned 
        v = np.sqrt(abs(fisher_vector)) * np.sign(fisher_vector)
        return v / np.sqrt(np.dot(v, v))
    start = time.time()

    fisher_vectors_array = np.array([normalize(fisher_vector(descriptor,gmm)) for descriptor in all_descriptors])
    end = time.time()   
    print(f'Time to create fishers : {end-start}')


    return fisher_vectors_array

In [None]:
video_dir_path = "Videos"

## Optional 
Run the Fisher Vector extractor and test if the array dimensionality matches that from Popatov et al (D = 16512)



In [None]:
fisher_vector_arr = popatov_feat_extract(os.path.join(video_dir_path,'/summe/video_10.mp4'))

In [2]:
dataset_features = []
summe_dataset = h5py.File('Data/googlenet/googlenet_summe.h5')
lengths  = [(summe_dataset[key]['n_frames'][...].item()) for key in list(summe_dataset.keys())]
indices =g(lengths)
dataset_keys = list(summe_dataset.keys())

In [None]:

for i in range(10):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'summe/{dataset_keys[index]}.mp4')
    features = popatov_feat_extract(video_path)
    dataset_features.append(features)
np.save('Fishers_Features_summe.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

In [22]:
Fishers_features= np.load('Fishers_Features_summe.npy',allow_pickle = True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=1.0,frame_skip=5)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [4]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries.npy',allow_pickle=True)
googlenet_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(googlenet_f1_scores)}')


Fisher average f1 : 0.02408578287737998


In [None]:

Fishers_features= np.load('Fishers_Features_summe.npy',allow_pickle = True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8,frame_skip=5)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe_0.8.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6,frame_skip=5)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe_0.6.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4,frame_skip=5)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe_0.4.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [6]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries.npy',allow_pickle=True)
Fishcher_1_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_1_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_1_f1_scores)}')

gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries_summe_0.8.npy',allow_pickle=True)
Fishcher_0_8_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_8_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_8_f1_scores)}')
Fishcher_0_6_f1_scores = []
googlenet_shots = np.load('Fisher_shot_boundaries_summe_0.6.npy',allow_pickle=True)
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_6_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_6_f1_scores)}')

Fishcher_0_4_f1_scores = []
googlenet_shots = np.load('Fisher_shot_boundaries_summe_0.4.npy',allow_pickle=True)
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_4_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_4_f1_scores)}')



results_dict = {'Vmax 1.0 ':np.mean(Fishcher_1_f1_scores) , 'Vmax 0.8':np.mean(Fishcher_0_8_f1_scores),'Vmax 0.6':np.mean(Fishcher_0_6_f1_scores),'Vmax 0.4':np.mean(Fishcher_0_8_f1_scores)}

json.dump(results_dict,open('Results/Fisher_Shot_boundary_results.json','w'),indent=4)

Fisher average f1 : 0.02408578287737998
Fisher average f1 : 0.029868413386656946
Fisher average f1 : 0.029868413386656946
Fisher average f1 : 0.029868413386656946
