# Notebook to demonstrate evaluation divergence of video summarization models 

This notebook demonstrates the differences in the each of the evaluation protocols followed in previous research. 

For this notebook, I'd like to credit the following repositories 

1. [DSNet](https://github.com/li-plus/DSNet)
2. [CSTASUM](https://github.com/thswodnjs3/CSTA)
3. [MSVA](https://github.com/TIBHannover/MSVA/tree/master)
4. [PGLSUM](https://github.com/e-apostolidis/PGL-SUM/tree/master)

# Helper functions and model loading:

In [1]:
import h5py 
import numpy as np 
import json 
from Utils import *
from Model import model_dict,params_dict
import os 
import torch
from Data import VideoData
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm
seeds = [12412,31235,123123,53216,123151] # Set the seeds to ensure the results are consistent NOTE: the results are within margin of error, but not exactly obtained on different machines

Helper Functions

In [2]:
 # Should do comparisons of any pred with avg/user annotations 
def correlation_single_pred(score,video_name,dataset,dataset_name='tvsum',downsample_gt=True):
    "This compares the scores with a downsampled version of the ground truth, Scenario 1"
    kendall_spearman_scores = []
    if dataset_name=="tvsum":
        data = load_tvsum_mat('Utils//ydata-tvsum50.mat')
        video_number = int(video_name.split('_')[1])
        all_user_summary = data[video_number-1]['user_anno'].T
        pick = dataset[video_name]['picks']
        all_correlations_tau = []
        all_correlations_spearman = []
        for user_summary in all_user_summary:
            if downsample_gt:
                down_sampled_summary = (user_summary/user_summary.max())[pick] # Change this to take the picks from which a certain frame was sampled from
            else:
                down_sampled_summary = (user_summary/user_summary.max())
        
            correlation_tau = kendalltau(-rankdata(down_sampled_summary),-rankdata(score))[0]
            correlation_spear = spearmanr(down_sampled_summary,score)[0]
            all_correlations_tau.append(correlation_tau)
            all_correlations_spearman.append(correlation_spear)
        kendall_spearman_scores.append(np.mean(all_correlations_tau))
        kendall_spearman_scores.append(np.mean(all_correlations_spearman))
    elif dataset_name =="summe":
        user_summarie = dataset[video_name]['user_summary']
        pick = dataset[video_name]['picks']
        if downsample_gt:
            averaged_downsampled_summary = np.average(user_summarie,axis=0)[::15]
        else:
            averaged_downsampled_summary = np.average(user_summarie,axis=0)
        kendall_score = kendalltau(rankdata(averaged_downsampled_summary),rankdata(score))[0]
        spearman_score = spearmanr(averaged_downsampled_summary,score)[0]
        kendall_spearman_scores.append(np.mean(kendall_score))
        kendall_spearman_scores.append(np.mean(spearman_score))
    
    return kendall_spearman_scores

# This should take an Upsampled score, or post knapsack score and then compare the correlation between them
def correlation_with_knapsack_scores(score,video_name,dataset):
    ''' This function first performs the knapsack processing'''
    kendall_spearman_scores = []
    avg_correlation_kendall = []
    avg_correlation_spearman = []
    user_summaries = dataset[video_name]['user_summary'][...]
    for user_summary in user_summaries:
        avg_correlation_kendall.append(kendalltau(-rankdata(user_summary),-rankdata(score))[0])
        avg_correlation_spearman.append(spearmanr(user_summary,score)[0])
    kendall_spearman_scores.append(np.mean(avg_correlation_kendall))
    kendall_spearman_scores.append(np.mean(avg_correlation_spearman))

    return kendall_spearman_scores

def correlation_with_average_gt(score,video_name,dataset):
    kendall_spearman_scores = []
    user_summary = dataset[video_name]['gtscore'][...]
    kendall_spearman_scores.append(kendalltau(-rankdata(user_summary),-rankdata(score))[0])
    kendall_spearman_scores.append(spearmanr(user_summary,score)[0])

    return kendall_spearman_scores


def upsample_prediction(score,picks,video_length):
    upsampled_pred = np.zeros(video_length)
    for i in range(len(picks)-1):
        upsampled_pred[picks[i]:picks[i+1]] = score[i]

    return upsampled_pred 
def knapsack_wrapper_with_rating(score,test_index,dataset,dataset_name):
    ''' This wrapper is used for scenario 2, Knapsack into evaluation of the correlation '''
    shot_boundaries = dataset[test_index]['change_points'][...]
    positions = dataset[test_index]['picks'][...]
    n_frames = dataset[test_index]['n_frames'][...]
    knapsack_pred = generate_summary_single(shot_boundaries,score,n_frames,positions)
    return correlation_single_pred(knapsack_pred,test_index,dataset,dataset_name,False)

    

def upsample_wrapper(score,test_index,dataset,dataset_name):
    '''This wrapper performs Scenario 3 post-processing, upsampling model prediction into evaluation'''
    positions = dataset[test_index]['picks'][...]
    n_frames = dataset[test_index]['n_frames'][...]
    upsampled_pred = upsample_prediction(score,positions,n_frames)
    return correlation_single_pred(upsampled_pred,test_index,dataset,dataset_name,False)



In [6]:
def train(run_number, config_path,save_path = 'weights'):
    with open(config_path,'r') as config_file:
        config = json.load(config_file)
    
    assert config['Model'] in model_dict.keys(), "Model is not available, modify dictionary to include them or check spelling"
    dataset_name = config['split'].split("_")[0]
    split_string = config['split'].strip(dataset_name).strip('.json')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    modelclass = model_dict[config['Model']]
    criterion = loss_dict[config['loss_function']]()
    num_epochs = config["num_epochs"]
    feature_extractor = config['feature_extractor']
    save_name = f'{feature_extractor}_{dataset_name}{split_string}'
    if not os.path.exists(os.path.join(save_path,save_name,dataset_name,config['Model'] )):
        os.makedirs(os.path.join(save_path,save_name,dataset_name,config['Model'] ))


    save_path = os.path.join(save_path,save_name,dataset_name,config['Model'])


    params = params_dict[config['Model']][config['feature_extractor']]

    if config['data_aug'] :  # Unused function for this work
        pass
    else:
        data_augmentations = []
    splits = config['total_splits'] if 'total_splits' in config.keys() else 5
    dataset = h5py.File(os.path.join('Data',config['feature_extractor'],f'{config["feature_extractor"]}_{dataset_name}.h5'))
    print(params)
    split_perfs_1 = [] 
    split_perfs_2 = []
    split_perfs_3 = []
    split_perfs_4 = []
    for split in range(splits):
        print(f"Running Split:  {split+1}  for model: {config['Model']}")
        model = modelclass(**params)
        batchloader = VideoData('train',config['split'],split,transforms=data_augmentations,feature_extractor=feature_extractor,trainval=True)
        batchloader = DataLoader(batchloader,batch_size=1,shuffle=True)
        testdata = VideoData('test',config['split'],split,feature_extractor=feature_extractor,trainval=True)
        testloader = DataLoader(testdata,batch_size=1,shuffle=False)
        optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"],weight_decay=config['reg'])
        best_f1_score = -float('inf')
        best_correlation = -float('inf')
        best_correlation_scenario_2 = -float('inf')
        best_correlation_scenario_3 = -float('inf')
        best_correlation_scenario_4 = -float('inf')
        model.to(device)
        if 'gradnorm_clip' in config:
            gradnorm_clip = config['gradnorm_clip']
        else:
            gradnorm_clip = 2
        # Make the directory for the split if it doesn't exist 
        if not os.path.exists(os.path.join(save_path,f'split_{split+1}')):
            os.mkdir(os.path.join(save_path,f'split_{split+1}'))
        save_path_split = os.path.join(save_path,f'split_{split+1}')
        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            total_samples = 0

            for data in batchloader:
                inputs, labels = data[0].to(device), data[1].to(device)
                optimizer.zero_grad()
                labels-=labels.min()
                labels/=labels.max()
                outputs = model(inputs)
                if len(outputs.shape)>2:
                    outputs = outputs.squeeze(-1)
                loss = criterion(outputs, labels)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), gradnorm_clip)
                optimizer.step()
                running_loss += loss.item()
                total_samples+=1
            epoch_loss = running_loss / len(batchloader)

            model.eval()
            test_datapoints = []
            test_names = []
            
# Adding the correlation scores to have the picks from the datapoints 
            for inputs_t,names in testloader:
                with torch.no_grad():
                    importance_scores = model(inputs_t.to(device))
                importance_scores = importance_scores[0].to('cpu').tolist()
                test_datapoints.append(importance_scores)
                test_names.append(names[0])
            all_scores = eval_summary(test_datapoints,dataset,test_names,dataset_name)



            correlation_dict = evaluate_correlation(test_datapoints ,dataset,test_names,dataset_name)
            scenario_2 = [knapsack_wrapper_with_rating(score,test_name,dataset,dataset_name) for score,test_name in zip(test_datapoints,test_names)]
            scenario_3 = [upsample_wrapper(score,test_name,dataset,dataset_name) for score,test_name in zip(test_datapoints,test_names)] # Eaach of these is one 
            scenario_4 = [correlation_with_average_gt(score,test_name,dataset) for score,test_name in zip(test_datapoints,test_names)] 
            if correlation_dict['Average_Kendall']> best_correlation:    
                print(f"Saving epoch {epoch+1}")
                best_correlation = correlation_dict['Average_Kendall']
                torch.save(model.state_dict(), os.path.join(save_path_split,f"best_run_corr_run_{run_number}_scenario_1" + ".pth")) 
            if np.mean(np.array(scenario_2)[:,0]) > best_correlation_scenario_2:
                best_correlation_scenario_2  = np.mean(np.array(scenario_2)[:,0])
                torch.save(model.state_dict(), os.path.join(save_path_split,f"best_run_corr_run_{run_number}_scenario_2" + ".pth")) 
            if np.mean(np.array(scenario_3)[:,0]) > best_correlation_scenario_3:
                best_correlation_scenario_3  = np.mean(np.array(scenario_3)[:,0])
                torch.save(model.state_dict(), os.path.join(save_path_split,f"best_run_corr_run_{run_number}_scenario_3" + ".pth")) 
            if np.mean(np.array(scenario_4)[:,0]) > best_correlation_scenario_4:
                best_correlation_scenario_4  = np.mean(np.array(scenario_4)[:,0])
                torch.save(model.state_dict(), os.path.join(save_path_split,f"best_run_corr_run_{run_number}_scenario_4" + ".pth")) 
            
            
            if np.mean(all_scores).item() > best_f1_score:
                best_f1_score = np.mean(all_scores).item()
                print(f"Best F1 Score:  {epoch+1}: {best_f1_score} ")
                #torch.save(model.state_dict(), os.path.join(save_path_split,"best_run_f1" + ".pth"))

        print(f'Best F1 score for split {split+1}: {best_f1_score} ')
        print(f'Best Correlation for split {split+1}: {best_correlation} ')
        print(f'Best Correlation of split {split+1}for Scenario 2: {best_correlation_scenario_2} ')
        print(f'Best Correlation of split {split+1}for Scenario 3: {best_correlation_scenario_3} ')
        print(f'Best Correlation of split {split+1}for Scenario 4: {best_correlation_scenario_4} ')
        split_perfs_1.append(best_correlation)
        split_perfs_2.append(best_correlation_scenario_2)
        split_perfs_3.append(best_correlation_scenario_3)
        split_perfs_4.append(best_correlation_scenario_4)
    print('Completed Training')
    return np.mean(split_perfs_1),np.mean(split_perfs_2),np.mean(split_perfs_3),np.mean(split_perfs_4)

In [None]:
five_trial_scenario_1 = []
five_trial_scenario_2 = []
five_trial_scenario_3 = []
five_trial_scenario_4 = []
for i in range(5):
    torch.manual_seed(seeds[i])
    best_correlation,best_correlation_scenario_2,best_correlation_scenario_3,best_correlation_scenario_4  = train(i,'Configs/MLP/googlenet_tvsum_can_1.json')
    five_trial_scenario_1.append(best_correlation)
    five_trial_scenario_2.append(best_correlation_scenario_2)
    five_trial_scenario_3.append(best_correlation_scenario_3)
    five_trial_scenario_4.append(best_correlation_scenario_4)


In [8]:
print('Mean over five iterations')
print(np.mean(five_trial_scenario_1))
print(np.mean(five_trial_scenario_2))
print(np.mean(five_trial_scenario_3))
print(np.mean(five_trial_scenario_4))


Mean over five iterations
0.17438962320409088
0.1001690568893652
0.17337357989117505
0.30798796502515585


In [9]:
print('variance over five iterations')
print(np.var(five_trial_scenario_1))
print(np.var(five_trial_scenario_2))
print(np.var(five_trial_scenario_3))
print(np.var(five_trial_scenario_4))


variance over five iterations
1.2943194708072375e-05
8.254003040649061e-06
1.3372578756724833e-05
5.004370930273496e-05


In [None]:
results_dict = {'Scenario 1':five_trial_scenario_1,'Scenario 2' : five_trial_scenario_2,'Scenario 3':five_trial_scenario_3 ,'Scenario 4' : five_trial_scenario_4 }
json.dump(results_dict,open('Results/Trial_results_Tvsum.json','w'),indent = 4)

Running Trials over the SumMe dataset 

In [None]:
five_trial_scenario_1 = []
five_trial_scenario_2 = []
five_trial_scenario_3 = []
five_trial_scenario_4 = []
for i in range(5):
    torch.manual_seed(seeds[i])
    best_correlation,best_correlation_scenario_2,best_correlation_scenario_3,best_correlation_scenario_4  = train(i,'Configs/MLP/googlenet_summe_can_1.json')
    five_trial_scenario_1.append(best_correlation)
    five_trial_scenario_2.append(best_correlation_scenario_2)
    five_trial_scenario_3.append(best_correlation_scenario_3)
    five_trial_scenario_4.append(best_correlation_scenario_4)


In [11]:
print('Mean over five iterations')
print(np.mean(five_trial_scenario_1))
print(np.mean(five_trial_scenario_2))
print(np.mean(five_trial_scenario_3))
print(np.mean(five_trial_scenario_4))

Mean over five iterations
0.08415445081748876
0.15273205176557683
0.08621844444669655
0.08415445081748876


In [12]:
print('variance over five iterations')
print(np.var(five_trial_scenario_1))
print(np.var(five_trial_scenario_2))
print(np.var(five_trial_scenario_3))
print(np.var(five_trial_scenario_4))


variance over five iterations
0.00039088435623092007
1.513135116877178e-05
0.00039205897653938156
0.00039088435623092007


In [13]:
results_dict = {'Scenario 1':five_trial_scenario_1,'Scenario 2' : five_trial_scenario_2,'Scenario 3':five_trial_scenario_3 ,'Scenario 4' : five_trial_scenario_4 }
json.dump(results_dict,open('Results/Trial_results_Summe.json','w'),indent = 4)

Training example of different scenario performance