# Notebook to demonstrate the inconsistency in the Shot boundaries based on the feature representations

In [3]:
from PIL import Image
import cv2
import numpy as np
import torch
import torchvision
from tqdm import tqdm
import h5py
import os
from torchvision.models import resnet50, ResNet50_Weights,googlenet,GoogLeNet_Weights
from torchvision.models.video import r3d_18, R3D_18_Weights
from torchvision.models import densenet121, DenseNet121_Weights
import torchvision.transforms as transforms

import torch.nn as nn
import cv2
import numpy as np
from sklearn.decomposition import PCA
import json
from Utils import kts,calculate_metrics
from numpy import linalg as LA

In [4]:
video_dir_path = "Videos"

In [5]:

def g(seq):
    # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
    #lambda version by Tony Veijalainen
    return [x for x,y in sorted(enumerate(seq), key = lambda x: x[1])]

class THWC_to_CTHW(torch.nn.Module):
    def forward(self, data):
        # Do some transformations
        return data.permute(3, 0, 1, 2)
class PreProcessorVidSum(object):
    def __init__(self,feature_extractor,target_downsample=2,shot_aware = True):
        self.target_downsample = target_downsample
        self.feature_extractor = feature_extractor # TODO add support for GPU
        self.shot_aware = shot_aware
    def run(self,video_path,shot_boundaries = []):
        ''' This is using the shot boundaries from the h5 datasets to frames to pick the selected, so it returns all the frames features and the selected ones
        '''
        shot_boundaries = np.array(shot_boundaries).astype(int)
        cap = cv2.VideoCapture(video_path)
        frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(frame_rate)
        print(total_frames)
        downsample_target = frame_rate//self.target_downsample if self.target_downsample!=0 else 1
        picked_frames = np.arange(0,total_frames,downsample_target )
        selected_frames = np.union1d(shot_boundaries,picked_frames)
        print(len(selected_frames))
        if selected_frames[-1]>total_frames-1: selected_frames[-1]=total_frames-1
        print(selected_frames[-1],selected_frames[-2])
        all_frames = []
        for sub_frame in tqdm(selected_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES,sub_frame)
            ret,frame = cap.read()
            if not ret:
                print(f"Error reading frame at index {sub_frame}")
                continue
            all_frames.append(self.feature_extractor.run(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))).numpy())
        cap.release()
        return all_frames, selected_frames

class FeatureExtractor():
    def __init__(self,model,transforms):
        self.model = model
        self.transforms = transforms # Transforms should act like one function, otherwise, one should do this outside and pass identity through this transform

    def run(self,input):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # The model has to be in eval mode and on the GPU/CPU set outside.
        with torch.no_grad():
            return self.model(self.transforms(input).unsqueeze(0).to(device)).squeeze().to('cpu')


In [6]:
tvsum_dataset = h5py.File('Data/googlenet/googlenet_tvsum.h5')

lengths  = [(tvsum_dataset[key]['n_frames'][...].item()) for key in list(tvsum_dataset.keys())]
indices =g(lengths)
dataset_keys = list(tvsum_dataset.keys())




Googlenet

In [7]:
model = googlenet(weights = GoogLeNet_Weights.IMAGENET1K_V1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms()

submodel = nn.Sequential(*list(model.children())[:-2]).to(device).eval()
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(10):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'tvsum/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('GoogleNet_Features_tvsum.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

23
2500
2500
2499 2498


 18%|█▊        | 446/2500 [00:14<01:08, 29.82it/s]


KeyboardInterrupt: 

ResNet

In [None]:
hdf5file = tvsum_dataset


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

submodel = nn.Sequential(*list(model.children())[:-1])
submodel.eval().to(device)
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(10):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'tvsum/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('Resnet_Features_tvsum.npy',np.array(dataset_features, dtype=object), allow_pickle=True)



DenseNet

In [None]:
model = densenet121(weights =DenseNet121_Weights.IMAGENET1K_V1)
submodel = nn.Sequential(*list(model.children())[:-1],nn.AdaptiveAvgPool2d(1)).to('cuda')

processed_dataset = 'densnet'
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(10):
    index = indices[i]
    video_path =os.path.join(video_dir_path,f'tvsum/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('Densnet_Features_tvsum.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

Creation of Shot Boundaries

In [8]:
googlenet_features= np.load('GoogleNet_Features_tvsum.npy',allow_pickle = True)
resnet_features = np.load('Resnet_Features_tvsum.npy',allow_pickle = True)
densenet_features = np.load('Densnet_Features_tvsum.npy',allow_pickle = True)


In [9]:


shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [10]:


shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.8.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.8.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.8.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)



shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.6.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.6.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.6.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [11]:
shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.4.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.4.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.4.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [14]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_tvsum.h5')
googlenet_shots = np.load('googlenet_shot_boundaries.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries.npy',allow_pickle=True)

In [15]:
googlenet_results = {}
resnet_results ={}
densenet_results ={}

In [16]:
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []

In [17]:
densenet_shots[1]

array([[0, 2940]], dtype=object)

In [18]:
gt_shot_boundary[dataset_keys[indices[0]]]['change_points'][...]

array([[   0,  201],
       [ 202,  311],
       [ 312,  478],
       [ 479,  666],
       [ 667,  960],
       [ 961, 1048],
       [1049, 1124],
       [1125, 1228],
       [1229, 1350],
       [1351, 1487],
       [1488, 1624],
       [1625, 1706],
       [1707, 1794],
       [1795, 1958],
       [1959, 2241],
       [2242, 2467],
       [2468, 2499]], dtype=int64)

In [19]:
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)

googlenet_results['Vmax 1.0'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 1.0'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 1.0'] = np.mean(densenet_f1_scores)

In [20]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))

In [21]:

print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.4517452491313002
resnet average f1 : 0.35952287214641804
DenseNet average f1 : 0.08209442409442409


0.8 Vmax

In [19]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.8.npy',allow_pickle=True)[10:20]
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.8.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.8.npy',allow_pickle=True)[20:30]
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.39347434440659274
resnet average f1 : 0.3168795153733152
DenseNet average f1 : 0.08209442409442409


In [20]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.25253413184825124


In [21]:
googlenet_results['Vmax 0.8'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.8'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.8'] = np.mean(densenet_f1_scores)

In [22]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.6.npy',allow_pickle=True)[10:20]
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.6.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.6.npy',allow_pickle=True)[20:30]
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.32693164649341244
resnet average f1 : 0.2633254972947965
DenseNet average f1 : 0.08209442409442409


In [23]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.23142867232486375


In [24]:
googlenet_results['Vmax 0.6'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.6'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.6'] = np.mean(densenet_f1_scores)

In [25]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.4.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.4.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.4.npy',allow_pickle=True)
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.2372799281259132
resnet average f1 : 0.19722524250195686
DenseNet average f1 : 0.08209442409442409


In [26]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.21228872323701808


In [27]:
googlenet_results['Vmax 0.4'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.4'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.4'] = np.mean(densenet_f1_scores)

In [28]:
json.dump(googlenet_results,open('Results/Googlenet_Shot_boundary_results.json','w'),indent=4)
json.dump(resnet_results,open('Results/Resnet_Shot_boundary_results.json','w'),indent=4)
json.dump(densenet_results,open('Results/Densenet_Shot_boundary_results.json','w'),indent=4)