This notebook calculates general incremental performance metrics.

In [None]:
import pickle
import torch
from collections import Counter
import pandas as pd

In [2]:
def load_data(model,mode, dataset, split,full=True,base_dir="/home/users/fschreiber/project"):
    """Load the boxes and bookkeeping data for one dataset split.

    Parameters
    ----------
    model : str
        Model name used inside the directory names. The value "TVG"
        additionally routes both box lists through TVG_prep.
    mode : str
        "inc" or "non_inc". With full=True it selects where the target boxes
        come from ("non_inc" uses the predictions stored under
        bboxes_noninc_<model>); it is always part of the binary_grouped path.
    dataset, split : str
        Dataset and split names used to build the file names.
    full : bool
        True reads from bboxes_<model>. False reads predictions and targets
        from bboxes_empty_<model>, which hold one box per sentence and are
        repeated here once per incremental unit.
    base_dir : str
        Root directory that contains all input folders.

    Returns
    -------
    tuple
        (pred_bbox_list, target_bbox_list, inc_len, data_model, binary_grouped).
        Five -1 placeholders are returned instead when a file is missing or,
        for full=True, when mode is neither "inc" nor "non_inc".
    """
    #NOTE: pickle.load and torch.load can execute code stored in the file,
    #so base_dir must only point at trusted data.
    def read_pickle(path):
        with open(path,"rb") as f:
            return pickle.load(f)

    try:
        if not full:
            #predicted and target boxes of the "empty" run share one prefix
            box_prefix=f"{base_dir}/bboxes_empty_{model}/{dataset}/{split}"
            pred_bbox_list=list(read_pickle(box_prefix+"_pred_bbox_list.p"))
            target_bbox_list=list(read_pickle(box_prefix+"_target_bbox_list.p"))
        else:
            #the predicted bounding box
            pred_bbox_list=list(read_pickle(f"{base_dir}/bboxes_{model}/{dataset}/{split}_pred_bbox_list.p"))

            if mode=="non_inc":
                #the non-incremental predictions act as the target
                target_bbox_list=list(read_pickle(f"{base_dir}/bboxes_noninc_{model}/{dataset}/{split}_pred_bbox_list.p"))
            elif mode=="inc":
                #the gold target bounding box
                target_bbox_list=list(read_pickle(f"{base_dir}/bboxes_{model}/{dataset}/{split}_target_bbox_list.p"))
            else:
                print("The mode can only be non_inc or inc")
                return -1,-1,-1,-1,-1

        #the number of one sentence split up incrementally ("the left zebra" would have length 3)
        inc_len=read_pickle(f"{base_dir}/incremental_pickles/length_incremental_units/{dataset}_{split}_length_unit.p")

        #the original model data split up incrementally
        data_model=torch.load(f"{base_dir}/ready_inc_data/{dataset}/{dataset}_{split}.pth")

        binary_grouped=read_pickle(f"{base_dir}/binary_grouped/{model}/{mode}/{dataset}{split}.p")

        def repeat_per_increment(boxes):
            #one copy of each per-sentence box for every incremental unit of that sentence
            return [box for box,length in zip(boxes,inc_len) for _ in range(length)]

        #Each list is expanded at most once. Expanding the target twice
        #(full=False together with mode="non_inc") would pair the already
        #expanded list with inc_len again and misalign it with the predictions.
        if not full:
            pred_bbox_list=repeat_per_increment(pred_bbox_list)
        if not full or mode=="non_inc":
            target_bbox_list=repeat_per_increment(target_bbox_list)

        if model=="TVG":
            pred_bbox_list,target_bbox_list=TVG_prep(pred_bbox_list,target_bbox_list)

        return pred_bbox_list,target_bbox_list,inc_len,data_model,binary_grouped

    except FileNotFoundError:
        #a missing file is reported through placeholders so callers can skip the split
        return  -1,-1,-1,-1,-1
    


#TVG needs some extra adjustments to fit the same data format as Resc
def TVG_prep(pred_bbox_list,target_bbox_list):
    """Bring paired TVG boxes into the format used for the other model.

    Every prediction is reshaped to one row, converted with xywh2xyxy and
    clamped to the range [0, 1]; every target is reshaped and converted but
    not clamped. Both lists are modified in place and returned.
    """
    pair_count=min(len(pred_bbox_list),len(target_bbox_list))
    for position in range(pair_count):
        pred_box,target_box=pred_bbox_list[position],target_bbox_list[position]

        #prediction: one row -> corner format -> clamp into [0, 1]
        pred_corners=xywh2xyxy(pred_box.view(1,-1))
        pred_bbox_list[position]=torch.clamp(pred_corners,0,1)

        #target: one row -> corner format
        target_bbox_list[position]=xywh2xyxy(target_box.view(1,-1))

    return pred_bbox_list,target_bbox_list

#copied from TransVG needed to transform the bounding box vectors
def xywh2xyxy(x):
    """Convert boxes from (x_center, y_center, width, height) to (x1, y1, x2, y2).

    The four components are read from, and written back to, the last
    dimension of x.
    """
    centre_x, centre_y, width, height = x.unbind(-1)
    half_width = 0.5 * width
    half_height = 0.5 * height
    corners = (centre_x - half_width, centre_y - half_height,
               centre_x + half_width, centre_y + half_height)
    return torch.stack(corners, dim=-1)


In [3]:
#load the unc/testB data of the ReSc run from the bboxes_empty_ folders (full=False)
(pred_bbox_list,
 target_bbox_list,
 inc_len,
 model,
 binary_grouped)=load_data("ReSc","inc","unc","testB",full=False)

In [4]:
#copied from ReSC
def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two bounding boxes

    box1 and box2 are indexed as box[:, k], i.e. one box per row with four
    columns. With x1y1x2y2=True the columns are read as corner coordinates
    (x1, y1, x2, y2); otherwise as (x_center, y_center, width, height).
    """
    def corners(box):
        #return (x1, y1, x2, y2) columns for either input format
        if x1y1x2y2:
            return box[:, 0], box[:, 1], box[:, 2], box[:, 3]
        return (box[:, 0] - box[:, 2] / 2, box[:, 1] - box[:, 3] / 2,
                box[:, 0] + box[:, 2] / 2, box[:, 1] + box[:, 3] / 2)

    left_a, top_a, right_a, bottom_a = corners(box1)
    left_b, top_b, right_b, bottom_b = corners(box2)

    # width and height of the intersection rectangle, floored at 0 for disjoint boxes
    overlap_w = torch.clamp(torch.min(right_a, right_b) - torch.max(left_a, left_b), 0)
    overlap_h = torch.clamp(torch.min(bottom_a, bottom_b) - torch.max(top_a, top_b), 0)
    inter_area = overlap_w * overlap_h

    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)

    # the small constant keeps the division defined when the union area is 0
    return inter_area / (area_a + area_b - inter_area + 1e-16)


#copied from TransVG needed to transform the bounding box vectors
def xywh2xyxy(x):
    """Turn (x_center, y_center, width, height) boxes into (x1, y1, x2, y2) boxes.

    NOTE(review): this repeats the xywh2xyxy definition from the data-loading
    cell; running this cell rebinds the name to an equivalent function.
    """
    x_c, y_c, w, h = x.unbind(-1)
    x_min, y_min = x_c - 0.5 * w, y_c - 0.5 * h
    x_max, y_max = x_c + 0.5 * w, y_c + 0.5 * h
    return torch.stack([x_min, y_min, x_max, y_max], dim=-1)


In [5]:
#group sentences that belong to one incremental unit
def group_by_increment(bbox_list,inc_len):
    """Cut bbox_list into consecutive slices whose lengths are given by inc_len.

    Returns one slice per entry of inc_len, in order. Items beyond
    sum(inc_len) are left out.
    """
    groups=[]
    start=0
    for size in inc_len:
        stop=start+size
        groups.append(bbox_list[start:stop])
        start=stop
    return groups

In [6]:
#gives the overall accuracy
#IoU between every predicted box and its target box
acc_list=[bbox_iou(pred_box,target_box,True)
          for pred_box,target_box in zip(pred_bbox_list,target_bbox_list)]

#change the IoU values into 1 (IoU above 0.5) or 0
binary_list=[1 if entry > 0.5 else 0 for entry in acc_list]

#share of predictions with an IoU above 0.5
percentage=sum(binary_list)/len(acc_list)
print(percentage)

#regroup the flat hit/miss list into one list per sentence
binary_grouped=group_by_increment(binary_list,inc_len)

0.2931183726466133


In [7]:
#is correct found at the end? final applicable
def final_applicable(binary_grouped):
    """Return the fraction of groups whose last element equals 1."""
    ends_correct=sum(1 for entry in binary_grouped if entry[-1]==1)
    return ends_correct/len(binary_grouped)

#final_applicable(binary_grouped)


In [8]:
#is the correct object found at all? first applicable
def first_applicable(binary_grouped):
    """Return the fraction of groups that contain at least one 1."""
    ever_correct=sum(1 for entry in binary_grouped if 1 in entry)
    return ever_correct/len(binary_grouped)

#first_applicable(binary_grouped=binary_grouped)

In [9]:

#first correct position
#find the position where the first correct answer is given 
def first_correct_position(binary_grouped):
    """Return the index of the first 1 of every group that contains a 1.

    Groups without any 1 contribute nothing, so the result can be shorter
    than binary_grouped.
    """
    positions=[]
    for entry in binary_grouped:
        first_hit=next((idx for idx,value in enumerate(entry) if value==1),None)
        if first_hit is not None:
            positions.append(first_hit)
    return positions

first_corr_pos=first_correct_position(binary_grouped)

#keep only the groups in which the correct answer is reached at least once;
#they line up one-to-one with first_corr_pos
only_correct=[entry for entry in binary_grouped if 1 in entry]

#position of the first correct answer relative to the length of its group
perc_first_correct_pos=[pos/len(entry) for entry,pos in zip(only_correct,first_corr_pos)]

#averages of the relative and the absolute position
average_first_correct_rel=sum(perc_first_correct_pos)/len(perc_first_correct_pos)
average_first_correct_pos=sum(first_corr_pos)/len(first_corr_pos)

print(average_first_correct_rel)
print(average_first_correct_pos)

#how often each absolute position occurs, most frequent first
c=Counter(first_corr_pos)
c.most_common()



0.0
0.0


[(0, 1631)]

In [10]:
#find first final position
def find_unchanged_position(numbers_list):
    """Find where the trailing run of 1s in numbers_list starts.

    Returns (index, index/len(numbers_list)) for the first position from
    which every remaining element equals 1, or (-1, -1) when the list is
    empty or does not end in a 1.
    """
    total=len(numbers_list)

    #walk backwards over the trailing 1s; one pass instead of re-checking the
    #whole remainder at every candidate position
    start=total
    while start>0 and numbers_list[start-1]==1:
        start-=1

    #if no position is found
    if start==total:
        return -1,-1

    return start,start/total


def first_final_position(binary_grouped):
    """Collect the first-final position of every group that has one.

    Returns two lists: the absolute positions and the positions relative to
    the group length. Groups for which find_unchanged_position reports -1
    are left out of both lists.
    """
    located=[find_unchanged_position(entry) for entry in binary_grouped]

    #remove all -1 placeholders
    first_final_pos=[index for index,_ in located if index!=-1]
    first_final_pos_rel=[rel_pos for _,rel_pos in located if rel_pos!=-1]

    return first_final_pos,first_final_pos_rel


#absolute and relative first-final positions of all groups that have one
(first_final_pos,
 first_final_pos_rel)=first_final_position(binary_grouped)

#mean of the relative positions
average_first_final=sum(first_final_pos_rel)/len(first_final_pos_rel)
print("Average first final",average_first_final)
#c=Counter(first_final_pos)
#c.most_common()



Average first final 0.0


In [11]:
# Disabled code: the text below is a bare string literal, so find_same_object
# is never defined when this cell runs. As the last expression of the cell
# the string itself is echoed as the cell output.
"""
#the function tries to determine if the predicted target boxes reference the same object
#the prediction boxes are compared to each other if their overlap is bigger than 0.5 it is assumed 
#that they describe the same object

def find_same_object(pred_bbox_list):
    
    #first we assume that every entry in the predictions describes a unique object
    #therefore it gets a unique index number
    same_list=list(range(len(pred_bbox_list)))
    
    #compare each entry to each other
    for ind_sm, value_sm in enumerate(pred_bbox_list):
        
        for ind_lg , value_lg in enumerate(pred_bbox_list):
            
            #if the index of the later entry is smaller or same as the earlier entry 
            # this entry was already seen in an earlier loop and can be skipped 
            if same_list[ind_lg] > same_list[ind_sm]:

                #if the bounding boxes of the two entries compared overlap
                #the index number of the later entry is changed to the earliest occurrence
                if bbox_iou(value_sm,value_lg)>0.5:
                    same_list[ind_lg]=same_list[ind_sm]
    
    


    #change the index position in the script to be strictly ascending
    
    unique_list=[]
    #the number of unique objects
    obj_c=0
    
    #list to match the index position to a unqiue object number
    #the first entry is the old index position
    #the second entry is the new unique object count
    match_list=[(int(0),int(0))]
    
    for num in same_list:
        
        #when the index position is not in match list
        #it gets added to the list with a new object number
        if num > match_list[-1][0]:
            
            obj_c+=1
            match_list.append((num,obj_c))
            
            unique_list.append(obj_c)

        #when the number was seen before 
        # add the object number matching the index position to the new list
        else:
            result = next((item[1] for item in match_list if item[0] == num), None)
            
            if result==None:  
                print("Number:",num,"was not found in list. This should not happen")
            else:
                
                unique_list.append(result)
                
    return unique_list

"""

'\n#the function tries to determine if the predicted target boxes reference the same object\n#the prediction boxes are compared to each other if their overlap is bigger than 0.5 it is assumed \n#that they describe the same object\n\ndef find_same_object(pred_bbox_list):\n    \n    #first we assume that every entry in the predictions describes a unique object\n    #therefore it gets a unique index number\n    same_list=list(range(len(pred_bbox_list)))\n    \n    #compare each entry to each other\n    for ind_sm, value_sm in enumerate(pred_bbox_list):\n        \n        for ind_lg , value_lg in enumerate(pred_bbox_list):\n            \n            #if the index of the later entry is smaller or same as the earlier entry \n            # this entry was already seen in an earlier loop and can be skipped \n            if same_list[ind_lg] > same_list[ind_sm]:\n\n                #if the bounding boxes of the two entries compared overlap\n                #the index number of the later entry is 

In [12]:
#calculate correctness
def correctness(binary_grouped):
    """Return the mean share of 1s per group.

    For every group the number of 1s is divided by the group length; the
    result is the average of these shares over all groups.
    """
    share_per_group=[entry.count(1)/len(entry) for entry in binary_grouped]
    return sum(share_per_group)/len(share_per_group)

#correctness(binary_grouped)

In [13]:
# Calculates the mean edits per utterance
def mean_edits_per_utterance(binary_grouped):
    """Average distance between the first correct answer and the point it becomes final.

    Groups ending in 1 contribute (first final position - first correct
    position). Groups that contain a 1 but do not end in one contribute the
    number of steps from the first 1 to the end of the group. Groups without
    any 1 are ignored.
    """
    first_correct_with_final=[]   # first correct position of groups ending in 1
    edits_without_final=[]        # remaining steps for groups that lose the answer

    for entry in binary_grouped:
        ends_correct=entry[-1]==1
        if not ends_correct and 1 not in entry:
            continue

        first_hit=next(idx for idx,value in enumerate(entry) if value==1)
        if ends_correct:
            first_correct_with_final.append(first_hit)
        else:
            edits_without_final.append(len(entry)-first_hit-1)

    # first_final_position yields one position per group ending in 1, in
    # group order, so it pairs up with first_correct_with_final
    first_final_pos,_=first_final_position(binary_grouped)
    edits_with_final=[ff-fc for fc,ff in zip(first_correct_with_final,first_final_pos)]

    all_edits=edits_with_final+edits_without_final
    return sum(all_edits)/len(all_edits)


In [14]:
#load data

#one result list per metric; every list gets one value per dataset/split that is found
found_sets_list,accuracy_list,fcp_list,fc_rel_list,fca_list, \
ffp_list,ff_rel_list,ffa_list,ed_utt_list, \
overhead_list,correctness_list,ffc_pos_list=([] for i in range(12))

mode="inc"
model_input="ReSc"
split_list=["testB","testA","val","test"]
dataset_list=["unc","unc+","gref_umd","gref"]
#dataset_list=["unc"]

non_inc_acc_Resc=[71.85, 78.61, 76.74, 56.08, 65.94, 63.21, 64.89, 64.01,61.16]
non_inc_acc_TVG=[76.9, 83.4, 80.8, 59.2, 72.5, 68.0, 68.7, 68.0, 68.0]
#dataset/split each position of the two lists above belongs to
#(order of the ReSc result table of this cell; assumed to be the same for TVG - TODO confirm)
non_inc_acc_sets=["unc testB","unc testA","unc val",
                  "unc+ testB","unc+ testA","unc+ val",
                  "gref_umd val","gref_umd test","gref val"]


def _finish_table(data,index_label,round_cols):
    """Build a result table from data, round it and index it by index_label.

    In "non_inc" mode only the columns named in round_cols (if present) are
    rounded to two decimals, in "inc" mode every numeric column is; any
    other mode prints a message and leaves the values unrounded.
    """
    frame=pd.DataFrame(data)
    if mode=="non_inc":
        for column in round_cols:
            if column in frame.columns:
                frame[column]=frame[column].round(2)
    elif mode=="inc":
        frame=frame.round(2)
    else:
        print("Mode can only be inc or non_inc. Mode is:",mode)
    return frame.set_index(index_label)


for file in dataset_list:
    for split in split_list:

        pred_bbox_list,target_bbox_list,inc_len,model,binary_grouped= load_data(model_input,mode,file,split,False)

        #load_data returns -1 placeholders when a file is not found; skip that split
        if pred_bbox_list==-1 or target_bbox_list==-1 or inc_len==-1:
            continue

        print(file+split)
        found_sets_list.append(file+" "+split)

        #calculate the overall accuracy
        acc_list=[bbox_iou(i,j,True) for i,j in zip(pred_bbox_list,target_bbox_list)]

        #change the IoU values into 1 (IoU above 0.5) or 0
        binary_list=[1 if entry > 0.5 else 0 for entry in acc_list]
        accuracy_list.append(sum(binary_list)/len(acc_list))

        binary_grouped=group_by_increment(binary_list,inc_len)

        #get the metrics

        #first correct position
        first_corr_pos=first_correct_position(binary_grouped)
        ffc_pos_list.append(Counter(first_corr_pos))

        #filter binary list for only answers where the correct answer is reached at least once
        only_correct=[entry for entry in binary_grouped if 1 in entry]

        #get relative and absolute first correct position
        perc_first_correct_pos=[pos/len(entry) for entry,pos in zip(only_correct,first_corr_pos)]

        average_first_correct_perc=sum(perc_first_correct_pos)/len(perc_first_correct_pos)
        average_first_correct_pos=sum(first_corr_pos)/len(first_corr_pos)
        fcp_list.append(average_first_correct_pos)
        fc_rel_list.append(average_first_correct_perc)

        #first correct applicable
        fca_list.append(first_applicable(binary_grouped))

        #first final position
        first_final_pos,first_final_pos_rel=first_final_position(binary_grouped)

        average_first_final_pos= sum(first_final_pos)/len(first_final_pos)
        average_first_final_rel= sum(first_final_pos_rel)/len(first_final_pos_rel)

        ffp_list.append(average_first_final_pos)
        ff_rel_list.append(average_first_final_rel)

        #first final applicable
        ffa_list.append(final_applicable(binary_grouped))

        #mean edits per utterance
        ed_utt_list.append(mean_edits_per_utterance(binary_grouped))

        #correctness
        correctness_list.append(correctness(binary_grouped))

#Create dataframes

#first correct (label now has a space before the model name, like the other two tables)
fc_label="Dataset First Correct "+model_input+" "+mode
data_fc={fc_label:found_sets_list,
         "Percentage":fc_rel_list,
         "Position":fcp_list,
         "Applicable":fca_list}
df_fc=_finish_table(data_fc,fc_label,["Percentage","Position"])
display(df_fc)

#first final
ff_label="Dataset First Final "+model_input+" "+mode
data_ff={ff_label:found_sets_list,
         "Percentage":ff_rel_list,
         "Position":ffp_list,
         "Applicable":ffa_list}
df_ff=_finish_table(data_ff,ff_label,["Percentage","Position"])
display(df_ff)

#edits, correctness and accuracy
edit_label="Dataset Edits "+model_input+" "+mode
data_edit={edit_label:found_sets_list,
           "Edits per Utterance":ed_utt_list,
           "Correctness":correctness_list,
           "Accuracy":accuracy_list}

if model_input=="ReSc":
    non_inc_acc=non_inc_acc_Resc
elif model_input=="TVG":
    non_inc_acc=non_inc_acc_TVG
else:
    non_inc_acc=None
    print("Model Input can only be ReSc or TVG not:",model_input)

if non_inc_acc is not None:
    #look the reference value up by set name, so a missing split gives NaN
    #instead of a length mismatch when the table is built
    #NOTE(review): these reference values look like percentages while
    #"Accuracy" is a fraction - confirm the intended unit
    non_inc_by_set=dict(zip(non_inc_acc_sets,non_inc_acc))
    data_edit["Non Incremental Accuracy"]=[non_inc_by_set.get(name,float("nan"))
                                           for name in found_sets_list]

df_edit=_finish_table(data_edit,edit_label,
                      ["Edits per Utterance","Correctness","Accuracy","Non Incremental Accuracy"])
display(df_edit)


unctestB
unctestA
uncval
unc+testB
unc+testA
unc+val
gref_umdval
gref_umdtest
grefval


Unnamed: 0_level_0,Percentage,Position,Applicable
Dataset First CorrectReSc inc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
unc testB,0.0,0.0,0.32
unc testA,0.0,0.0,0.32
unc val,0.0,0.0,0.32
unc+ testB,0.0,0.0,0.15
unc+ testA,0.0,0.0,0.13
unc+ val,0.0,0.0,0.14
gref_umd val,0.0,0.0,0.35
gref_umd test,0.0,0.0,0.35
gref val,0.0,0.0,0.18


Unnamed: 0_level_0,Percentage,Position,Applicable
Dataset First Final ReSc inc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
unc testB,0.0,0.0,0.32
unc testA,0.0,0.0,0.32
unc val,0.0,0.0,0.32
unc+ testB,0.0,0.0,0.15
unc+ testA,0.0,0.0,0.13
unc+ val,0.0,0.0,0.14
gref_umd val,0.0,0.0,0.35
gref_umd test,0.0,0.0,0.35
gref val,0.0,0.0,0.18


Unnamed: 0_level_0,Edits per Utterance,Correctness,Accuracy,Non Incremental Accuracy
Dataset Edits ReSc inc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
unc testB,0.0,0.32,0.29,71.85
unc testA,0.0,0.32,0.29,78.61
unc val,0.0,0.32,0.29,76.74
unc+ testB,0.0,0.15,0.13,56.08
unc+ testA,0.0,0.13,0.13,65.94
unc+ val,0.0,0.14,0.12,63.21
gref_umd val,0.0,0.35,0.35,64.89
gref_umd test,0.0,0.35,0.34,64.01
gref val,0.0,0.18,0.19,61.16
