# Notebook to demonstrate the inconsistency in the Shot boundaries based on the CNN representations

In [1]:
from PIL import Image
import cv2
import numpy as np
import torch
import torchvision
from tqdm import tqdm
import h5py
import os 
from torchvision.models import resnet50, ResNet50_Weights,googlenet,GoogLeNet_Weights
from torchvision.models.video import r3d_18, R3D_18_Weights
from torchvision.models import densenet121, DenseNet121_Weights
import torchvision.transforms as transforms

import torch.nn as nn
import cv2
import numpy as np
from sklearn.decomposition import PCA
import json
from Utils import kts,calculate_metrics
from numpy import linalg as LA

In [2]:
video_dir_path = "Videos"

In [3]:

class THWC_to_CTHW(torch.nn.Module):
    def forward(self, data):
        # Do some transformations
        return data.permute(3, 0, 1, 2)
class PreProcessorVidSum(object):
    def __init__(self,feature_extractor,target_downsample=2,shot_aware = True):
        self.target_downsample = target_downsample
        self.feature_extractor = feature_extractor # TODO add support for GPU
        self.shot_aware = shot_aware
    def run(self,video_path,shot_boundaries = []):
        ''' This is using the shot boundaries from the h5 datasets to frames to pick the selected, so it returns all the frames features and the selected ones
        '''
        shot_boundaries = np.array(shot_boundaries).astype(int)
        cap = cv2.VideoCapture(video_path)
        frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(frame_rate)
        print(total_frames)
        downsample_target = frame_rate//self.target_downsample if self.target_downsample!=0 else 1
        picked_frames = np.arange(0,total_frames,downsample_target )
        selected_frames = np.union1d(shot_boundaries,picked_frames)
        print(len(selected_frames))
        if selected_frames[-1]>total_frames-1: selected_frames[-1]=total_frames-1
        print(selected_frames[-1],selected_frames[-2])
        all_frames = []
        for sub_frame in tqdm(selected_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES,sub_frame)
            ret,frame = cap.read()
            if not ret:
                print(f"Error reading frame at index {sub_frame}")
                continue
            all_frames.append(self.feature_extractor.run(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))).numpy())
        cap.release()
        return all_frames, selected_frames

class FeatureExtractor():
    def __init__(self,model,transforms):
        self.model = model
        self.transforms = transforms # Transforms should act like one function, otherwise, one should do this outside and pass identity through this transform

    def run(self,input):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # The model has to be in eval mode and on the GPU/CPU set outside.
        with torch.no_grad():
            return self.model(self.transforms(input).unsqueeze(0).to(device)).squeeze().to('cpu')
        
def g(seq):
    # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
    #lambda version by Tony Veijalainen
    return [x for x,y in sorted(enumerate(seq), key = lambda x: x[1])]


In [4]:
summe_dataset = h5py.File('Data/googlenet/googlenet_summe.h5')
lengths  = [(summe_dataset[key]['n_frames'][...].item()) for key in list(summe_dataset.keys())]
indices =g(lengths)
dataset_keys = list(summe_dataset.keys())

Googlenet

In [None]:
model = googlenet(weights = GoogLeNet_Weights.IMAGENET1K_V1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
preprocess = ResNet50_Weights.IMAGENET1K_V2.transforms()

submodel = nn.Sequential(*list(model.children())[:-2]).to(device).eval()
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(25):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'summe/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('GoogleNet_Features_summe.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

ResNet

In [None]:
hdf5file = summe_dataset


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

submodel = nn.Sequential(*list(model.children())[:-1])
submodel.eval().to(device)
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(25):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'summe/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('Resnet_Features_summe.npy',np.array(dataset_features, dtype=object), allow_pickle=True)



DenseNet

In [None]:
model = densenet121(weights =DenseNet121_Weights.IMAGENET1K_V1)
submodel = nn.Sequential(*list(model.children())[:-1],nn.AdaptiveAvgPool2d(1)).to('cuda')

processed_dataset = 'densnet'
feature_extractor = FeatureExtractor(submodel,preprocess)
preprocesser_sum = PreProcessorVidSum(feature_extractor,target_downsample=0)
dataset_features = []
for i in range(25):
    index = indices[i]
    video_path = os.path.join(video_dir_path,f'summe/{dataset_keys[index]}.mp4')
    features,_ = preprocesser_sum.run(video_path)
    dataset_features.append(features)



np.save('Densnet_Features_summe.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

Creation of Shot Boundaries

In [13]:
googlenet_features= np.load('GoogleNet_Features_summe.npy',allow_pickle = True)
resnet_features = np.load('Resnet_Features_summe.npy',allow_pickle = True)
densenet_features = np.load('Densnet_Features_summe.npy',allow_pickle = True)


In [None]:


shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature))
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [None]:


shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.8_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.8_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.8_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)



shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.6_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.6_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.6_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [None]:
shot_boundary = []

for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('resnet_shot_boundaries_vmax_0.4_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []

for feature in googlenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('googlenet_shot_boundaries_0.4_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for feature in densenet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('densenet_shot_boundaries_0.4_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [None]:
Fishers_features= np.load('Fishers_Features_summe.npy',allow_pickle = True)

shot_boundary = []
for feature in resnet_features:
    feature = [feat/LA.norm(feat) for feat in feature]
    n_frames = len(feature)
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=1.0)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)


In [34]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('googlenet_shot_boundaries_summe.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries_summe.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_summe.npy',allow_pickle=True)

In [35]:
googlenet_results = {}
resnet_results ={}
densenet_results ={}

In [36]:
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []

In [37]:
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


googlenet_results['Vmax 1.0'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 1.0'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 1.0'] = np.mean(densenet_f1_scores)

In [38]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))

In [39]:
print(np.mean(perfs_avg))

0.11500576460716297


In [40]:

print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.13774717986131765
resnet average f1 : 0.11034652746312465
DenseNet average f1 : 0.1215005227419415


0.8 Vmax

In [41]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.8_summe.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.8_summe.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.8_summe.npy',allow_pickle=True)
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.10201714940551831
resnet average f1 : 0.09089453203378416
DenseNet average f1 : 0.1215005227419415


In [42]:
googlenet_results['Vmax 0.8'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.8'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.8'] = np.mean(densenet_f1_scores)

In [43]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.11989786777833027


In [44]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.6_summe.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.6_summe.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.6_summe.npy',allow_pickle=True)
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.09233794170268375
resnet average f1 : 0.07743198392249974
DenseNet average f1 : 0.1215005227419415


In [45]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.11345499856805104


In [46]:
googlenet_results['Vmax 0.6'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.6'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.6'] = np.mean(densenet_f1_scores)

In [47]:
googlenet_shots = np.load('googlenet_shot_boundaries_0.4_summe.npy',allow_pickle=True)
resnet_shots = np.load('resnet_shot_boundaries_vmax_0.4_summe.npy',allow_pickle=True)
densenet_shots = np.load('densenet_shot_boundaries_0.4_summe.npy',allow_pickle=True)
googlenet_f1_scores = []
resnet_f1_scores = []
densenet_f1_scores = []
for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),resnet_shots[i].flatten())
    resnet_f1_scores.append(f1)
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),densenet_shots[i].flatten())
    densenet_f1_scores.append(f1)


print(f'Googlenet average f1 : {np.mean(googlenet_f1_scores)}')
print(f'resnet average f1 : {np.mean(resnet_f1_scores)}')
print(f'DenseNet average f1 : {np.mean(densenet_f1_scores)}')

Googlenet average f1 : 0.08213614840477737
resnet average f1 : 0.07061977140682282
DenseNet average f1 : 0.1215005227419415


In [48]:
perfs_avg = []
for i in range(10):
    _,_,f1_goog_res = calculate_metrics(resnet_shots[i].flatten(),googlenet_shots[i].flatten())
    _,_,f1_res_dense= calculate_metrics(densenet_shots[i].flatten(),resnet_shots[i].flatten())
    _,_,f1_dens_gog= calculate_metrics(densenet_shots[i].flatten(),googlenet_shots[i].flatten())
    perfs_avg.append( np.mean([f1_goog_res,f1_res_dense]))
print(np.mean(perfs_avg))

0.12695281923192953


In [49]:
googlenet_results['Vmax 0.4'] = np.mean(googlenet_f1_scores)
resnet_results['Vmax 0.4'] = np.mean(resnet_f1_scores)
densenet_results['Vmax 0.4'] = np.mean(densenet_f1_scores)

In [52]:
json.dump(googlenet_results,open('Results/Googlenet_Shot_boundary_results.json','w'),indent=4)
json.dump(resnet_results,open('Results/Resnet_Shot_boundary_results.json','w'),indent=4)
json.dump(densenet_results,open('Results/Densenet_Shot_boundary_results.json','w'),indent=4)