In [2]:
import cv2
import numpy as np
from sklearn.decomposition import PCA
import time
from skimage.feature import fisher_vector, ORB, learn_gmm
import h5py
def g(seq):
    # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
    #lambda version by Tony Veijalainen
    return [x for x,y in sorted(enumerate(seq), key = lambda x: x[1])]
import json

In [2]:
import skimage
skimage.__version__

'0.24.0'

In [3]:
def popatov_feat_extract(video_path):
    '''A description of the video feature extraction used by Popatov et al on Category Specific video summarization. Which is described as SIFT feature extraction, PCA and Fisher model'''
    cap = cv2.VideoCapture(video_path)

    all_descriptors = []
    frame_count = 0
    sift = cv2.SIFT_create() # Create the sift feature extracot
    pca = PCA(n_components=64)
    start = time.time()
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        if frame_count % 5 == 0:  # Process every 5th frame
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            _,descriptors = sift.detectAndCompute(frame,None)
            if descriptors is not None and descriptors.shape[0]>64:
                all_descriptors.append(pca.fit_transform(descriptors))# Apply PCA to the SIFT features

        frame_count += 1
    end = time.time()
    cap.release()
    print(all_descriptors[0].shape)
    print(f'Time to read video detect sift and normalize: {end-start}')
    k = 128
    start = time.time()

    gmm = learn_gmm(all_descriptors, n_modes=k)
    end = time.time()

    print(f'Time to learn GMM  : {end-start}')
    
    def normalize(fisher_vector):
        fisher_vector = (fisher_vector-np.mean(fisher_vector,axis=0))/(np.std(fisher_vector,axis=0))  #Mentioned 
        v = np.sqrt(abs(fisher_vector)) * np.sign(fisher_vector)
        return v / np.sqrt(np.dot(v, v))
    start = time.time()

    fisher_vectors_array = np.array([normalize(fisher_vector(descriptor,gmm)) for descriptor in all_descriptors])
    end = time.time()   
    print(f'Time to create fishers : {end-start}')


    return fisher_vectors_array

In [None]:
fisher_vector_arr = popatov_feat_extract(f'C:\\Users\\test\\Project-order\\Videos\\summe/video_10.mp4')

In [4]:
dataset_features = []
summe_dataset = h5py.File('Data/googlenet/googlenet_summe.h5')
lengths  = [(summe_dataset[key]['n_frames'][...].item()) for key in list(summe_dataset.keys())]
indices =g(lengths)
dataset_keys = list(summe_dataset.keys())
for i in range(10):
    index = indices[i]
    video_path = f'C:\\Users\\test\\Project-order\\Videos\\summe/{dataset_keys[index]}.mp4'
    features = popatov_feat_extract(video_path)
    dataset_features.append(features)
np.save('Fishers_Features_summe.npy',np.array(dataset_features, dtype=object), allow_pickle=True)

(361, 64)
Time to read video detect sift and normalize: 6.964510917663574
Time to learn GMM  : 37.45828866958618
Time to create fishers : 0.833204984664917
(652, 64)
Time to read video detect sift and normalize: 19.38700294494629
Time to learn GMM  : 61.492006063461304
Time to create fishers : 1.3966965675354004
(314, 64)
Time to read video detect sift and normalize: 14.765989065170288
Time to learn GMM  : 45.25356435775757
Time to create fishers : 0.7623374462127686
(376, 64)
Time to read video detect sift and normalize: 22.212496280670166
Time to learn GMM  : 90.1943371295929
Time to create fishers : 1.3633191585540771
(11646, 64)
Time to read video detect sift and normalize: 79.60613250732422
Time to learn GMM  : 4094.601256608963
Time to create fishers : 33.506054401397705
(9532, 64)
Time to read video detect sift and normalize: 35.70115327835083
Time to learn GMM  : 3017.1729414463043
Time to create fishers : 29.991528749465942
(549, 64)
Time to read video detect sift and normaliz

In [13]:
def cpd_auto(K, ncp, vmax, desc_rate=1, **kwargs):
    """Main interface

    Detect change points automatically selecting their number
        K       - kernel between each pair of frames in video
        ncp     - maximum ncp
        vmax    - special parameter
    Optional arguments:
        lmin     - minimum segment length
        lmax     - maximum segment length
        desc_rate - rate of descriptor sampling (vmax always corresponds to 1x)

    Note:
        - cps are always calculated in subsampled coordinates irrespective to
            desc_rate
        - lmin and m should be in agreement
    ---
    Returns: (cps, costs)
        cps   - best selected change-points
        costs - costs for 0,1,2,...,m change-points

    Memory requirement: ~ (3*N*N + N*ncp)*4 bytes ~= 16 * N^2 bytes
    That is 1,6 Gb for the N=10000.
    """
    m = ncp
    (_, scores) = cpd_nonlin(K, m, backtrack=False, **kwargs)

    N = K.shape[0]
    N2 = N*desc_rate  # length of the video before subsampling

    penalties = np.zeros(m+1)
    # Prevent division by zero (in case of 0 changes)
    ncp = np.arange(1, m+1)
    penalties[1:] = (vmax*ncp/(2.0*N2))*(np.log(float(N2)/ncp)+1)

    costs = scores/float(N) + penalties
    m_best = np.argmin(costs)
    (cps, scores2) = cpd_nonlin(K, m_best, **kwargs)

    return (cps, scores2)


#from scipy import weave

def calc_scatters(K):
    """
    Calculate scatter matrix:
    scatters[i,j] = {scatter of the sequence with starting frame i and ending frame j}
    """
    n = K.shape[0]
    K1 = np.cumsum([0] + list(np.diag(K)))
    K2 = np.zeros((n+1, n+1))
    K2[1:, 1:] = np.cumsum(np.cumsum(K, 0), 1) # TODO: use the fact that K - symmetric

    scatters = np.zeros((n, n))

    diagK2 = np.diag(K2)

    i = np.arange(n).reshape((-1,1))
    j = np.arange(n).reshape((1,-1))
    scatters = (K1[1:].reshape((1,-1))-K1[:-1].reshape((-1,1))
                - (diagK2[1:].reshape((1,-1)) + diagK2[:-1].reshape((-1,1)) - K2[1:,:-1].T - K2[:-1,1:]) / ((j-i+1).astype(float) + (j==i-1).astype(float)))
    scatters[j<i]=0
    #code = r"""
    #for (int i = 0; i < n; i++) {
    #    for (int j = i; j < n; j++) {
    #        scatters(i,j) = K1(j+1)-K1(i) - (K2(j+1,j+1)+K2(i,i)-K2(j+1,i)-K2(i,j+1))/(j-i+1);
    #    }
    #}
    #"""
    #weave.inline(code, ['K1','K2','scatters','n'], global_dict = \
    #    {'K1':K1, 'K2':K2, 'scatters':scatters, 'n':n}, type_converters=weave.converters.blitz)

    return scatters

def cpd_nonlin(K, ncp, lmin=1, lmax=100000, backtrack=True, verbose=True,
    out_scatters=None):
    """ Change point detection with dynamic programming
    K - square kernel matrix
    ncp - number of change points to detect (ncp >= 0)
    lmin - minimal length of a segment
    lmax - maximal length of a segment
    backtrack - when False - only evaluate objective scores (to save memory)

    Returns: (cps, obj)
        cps - detected array of change points: mean is thought to be constant on [ cps[i], cps[i+1] )
        obj_vals - values of the objective function for 0..m changepoints

    """
    m = int(ncp)  # prevent numpy.int64

    (n, n1) = K.shape
    assert(n == n1), "Kernel matrix awaited."

    assert(n >= (m + 1)*lmin)
    assert(n <= (m + 1)*lmax)
    assert(lmax >= lmin >= 1)

    if verbose:
        #print "n =", n
        print ("Precomputing scatters...")
    J = calc_scatters(K)

    if out_scatters != None:
        out_scatters[0] = J

    if verbose:
        print ("Inferring best change points...")
    # I[k, l] - value of the objective for k change-points and l first frames
    I = 1e101*np.ones((m+1, n+1))
    I[0, lmin:lmax] = J[0, lmin-1:lmax-1]

    if backtrack:
        # p[k, l] --- "previous change" --- best t[k] when t[k+1] equals l
        p = np.zeros((m+1, n+1), dtype=int)
    else:
        p = np.zeros((1,1), dtype=int)

    for k in range(1,m+1):
        for l in range((k+1)*lmin, n+1):
            tmin = max(k*lmin, l-lmax)
            tmax = l-lmin+1
            c = J[tmin:tmax,l-1].reshape(-1) + I[k-1, tmin:tmax].reshape(-1)
            I[k,l] = np.min(c)
            if backtrack:
                p[k,l] = np.argmin(c)+tmin

    #code = r"""
    ##define max(x,y) ((x)>(y)?(x):(y))
    #for (int k=1; k<m+1; k++) {
    #    for (int l=(k+1)*lmin; l<n+1; l++) {
    #        I(k, l) = 1e100; //nearly infinity
    #        for (int t=max(k*lmin,l-lmax); t<l-lmin+1; t++) {
    #            double c = I(k-1, t) + J(t, l-1);
    #            if (c < I(k, l)) {
    #                I(k, l) = c;
    #                if (backtrack == 1) {
    #                    p(k, l) = t;
    #                }
    #            }
    #        }
    #    }
    #}
    #"""

    #weave.inline(code, ['m','n','p','I', 'J', 'lmin', 'lmax', 'backtrack'], \
    #    global_dict={'m':m, 'n':n, 'p':p, 'I':I, 'J':J, \
    #    'lmin':lmin, 'lmax':lmax, 'backtrack': int(1) if backtrack else int(0)},
    #    type_converters=weave.converters.blitz)

    # Collect change points
    cps = np.zeros(m, dtype=int)

    if backtrack:
        cur = n
        for k in range(m, 0, -1):
            cps[k-1] = p[k, cur]
            cur = cps[k-1]

    scores = I[:, n].copy()
    scores[scores > 1e99] = np.inf
    return cps, scores

In [14]:
def kts(n_frames,features,vmax=1, frame_skip = 1):
      """ Receives the frame features from the CNN to do the Shot division based on KTS #TODO need to see how exactly this functions
      """
      seq_len = len(features)
      picks = np.arange(0, seq_len) * frame_skip

      # compute change points using KTS
      kernel = np.matmul(features, features.T)
      change_points, _ = cpd_auto(kernel, seq_len - 1, vmax, verbose=False)
      change_points *= frame_skip
      change_points = np.hstack((0, change_points, n_frames))
      begin_frames = change_points[:-1]
      end_frames = change_points[1:]
      change_points = np.vstack((begin_frames, end_frames - 1)).T

      n_frame_per_seg = end_frames - begin_frames
      return change_points, n_frame_per_seg, picks

def g(seq):
    # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
    #lambda version by Tony Veijalainen
    return [x for x,y in sorted(enumerate(seq), key = lambda x: x[1])]

summe_dataset = h5py.File('Data/googlenet/googlenet_summe.h5')

lengths  = [(summe_dataset[key]['n_frames'][...].item()) for key in list(summe_dataset.keys())]
indices =g(lengths)
dataset_keys = list(summe_dataset.keys())


fish_feat = np.load('Fishers_Features_summe.npy',allow_pickle = True)

In [None]:
kts(lengths[indices[0]],fish_feat[0],1,frame_skip=5)[0]

In [14]:
fish_feat[1].shape

(588, 16512)

In [8]:
Fishers_features= np.load('Fishers_Features_summe.npy',allow_pickle = True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=1.0)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

NameError: name 'kts' is not defined

In [6]:
def calculate_metrics(true_boundaries, predicted_boundaries):
    TP = len(set(true_boundaries) & set(predicted_boundaries))
    FP = len(set(predicted_boundaries) - set(true_boundaries))
    FN = len(set(true_boundaries) - set(predicted_boundaries))

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
  

    return precision, recall, f1_score

In [11]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries.npy',allow_pickle=True)
googlenet_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    googlenet_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(googlenet_f1_scores)}')


Fisher average f1 : 0.02408578287737998


In [15]:

Fishers_features= np.load('Fishers_Features_summe.npy',allow_pickle = True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.8)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe_0.8.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.6)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)
shot_boundary = []
for i,feature in enumerate(Fishers_features):
    n_frames = lengths[indices[i]]
    change_points,_ ,_= kts(n_frames,np.array(feature),vmax=0.4)
    shot_boundary.append(change_points)
np.save('Fisher_shot_boundaries_summe_0.4.npy',np.array(shot_boundary, dtype=object), allow_pickle=True)

In [17]:
gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries.npy',allow_pickle=True)
Fishcher_1_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_1_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_1_f1_scores)}')

gt_shot_boundary = h5py.File('Data/googlenet/googlenet_summe.h5')
googlenet_shots = np.load('Fisher_shot_boundaries.npy',allow_pickle=True)
Fishcher_0_8_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_8_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_8_f1_scores)}')
Fishcher_0_6_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_6_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_6_f1_scores)}')

Fishcher_0_4_f1_scores = []

for i,index in enumerate(indices[:10]):
    precison, recall, f1 = calculate_metrics(gt_shot_boundary[dataset_keys[index]]['change_points'][...].flatten(),googlenet_shots[i].flatten())
    Fishcher_0_4_f1_scores.append(f1)
print(f'Fisher average f1 : {np.mean(Fishcher_0_4_f1_scores)}')



results_dict = {'Vmax 1.0 ':np.mean(Fishcher_1_f1_scores) , 'Vmax 0.8':np.mean(Fishcher_0_8_f1_scores),'Vmax 0.6':np.mean(Fishcher_0_6_f1_scores),'Vmax 0.4':np.mean(Fishcher_0_8_f1_scores)}

json.dump(results_dict,open('Results/Fisher_Shot_boundary_results.json','w'),indent=4)

Fisher average f1 : 0.02408578287737998
Fisher average f1 : 0.02408578287737998
Fisher average f1 : 0.02408578287737998
Fisher average f1 : 0.02408578287737998
