In [2]:
import torch
import numpy as np
from models import *
#from dataset_dad import *
from torch.utils.data import DataLoader

import argparse

import torch.nn as nn
import torch.nn.functional as F
from torchvision.utils import save_image

from torch.optim.lr_scheduler import MultiStepLR

from torchmetrics.functional import pairwise_cosine_similarity
import scipy.io as io

import sklearn 
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

import time
from eval_utils import evaluation

In [3]:
torch.manual_seed(0)   #3407

<torch._C.Generator at 0x111379f10>

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classification criterion
cls_criterion = nn.CrossEntropyLoss().to(device)

best_ap = -1 
n_frames = 100

In [5]:
def test_model(epoch, model, test_dataloader):

	""" Function to evaluate the model on the test data 
	Inputs: 
	epoch: training epoch number (0 if it is only testing)
	model: trained model 
	test_dataloader: Dataloader for the testset 
	"""

	global best_ap
	print("")
	model.eval()
	total_correct, total, all_toa = 0, 0, []
 
	for batch_i, (X, edge_index, y_true, img_feat, video_adj_list, edge_embeddings, temporal_adj_list, obj_vis_feat, batch_vec, toa) in enumerate(test_dataloader):
		print("Batch: ", batch_i)
		X = X.reshape(-1, X.shape[2])
		img_feat = img_feat.reshape(-1, img_feat.shape[2])
		edge_index = edge_index.reshape(-1, edge_index.shape[2])
		edge_embeddings = edge_embeddings.view(-1, edge_embeddings.shape[-1])
		video_adj_list = video_adj_list.reshape(-1, video_adj_list.shape[2])
		temporal_adj_list = temporal_adj_list.reshape(-1, temporal_adj_list.shape[2])
		y = y_true.reshape(-1) 
		
		obj_vis_feat = obj_vis_feat.reshape(-1, obj_vis_feat.shape[-1]).to(device)
		feat_sim = pairwise_cosine_similarity(obj_vis_feat+1e-7, obj_vis_feat+1e-7)
		temporal_edge_w = feat_sim[temporal_adj_list[0, :], temporal_adj_list[1, :]]
		batch_vec = batch_vec.view(-1).long()

		X, edge_index, y, img_feat, video_adj_list = X.to(device), edge_index.to(device), y.to(device), img_feat.to(device), video_adj_list.to(device)
		temporal_adj_list, temporal_edge_w, edge_embeddings, batch_vec = temporal_adj_list.to(device), temporal_edge_w.to(device), edge_embeddings.to(device), batch_vec.to(device)
		all_toa += [toa.item()]
        
		with torch.no_grad():
			logits, probs = model(X, edge_index, img_feat, video_adj_list, edge_embeddings, temporal_adj_list, temporal_edge_w, batch_vec)
		
		pred_labels = probs.argmax(1)
		
		total_correct += (pred_labels == y).cpu().numpy().sum()
		total += y.shape[0]
		
		if batch_i == 0: 
			all_probs_vid2 = probs[:, 1].cpu().unsqueeze(0)
			all_pred = pred_labels.cpu()
			all_y =  y.cpu() #.unsqueeze(0)
			all_y_vid =  torch.max(y).unsqueeze(0).cpu() #y.cpu() #.unsqueeze(0)
		else: 
			all_probs_vid2 = torch.cat((all_probs_vid2, probs[:, 1].cpu().unsqueeze(0)))
			all_pred = torch.cat((all_pred, pred_labels.cpu()))
			all_y = torch.cat((all_y, y.cpu()))
			all_y_vid = torch.cat((all_y_vid, torch.max(y).unsqueeze(0).cpu()))

		# Empty cache
		if torch.cuda.is_available():
			torch.cuda.empty_cache()
	print("All toa :", all_toa)
	#Print the avergae precision 
	print("All Probs vid2 : ", len(all_probs_vid2))
	print("All y vid : ", len(all_y_vid))
	return all_probs_vid2, all_y_vid, all_toa
	
	avg_prec, curr_ttc, _ = evaluation(all_probs_vid2.numpy(), all_y_vid.numpy(), all_toa)    
	avg_prec = 100 * avg_prec

	#Print the confusion matrix 
	cf = confusion_matrix(all_y.numpy(), all_pred.numpy())
	print(cf)

	#class-wise accuracy 
	class_recall = cf.diagonal()/cf.sum(axis=1)
	print(np.round(class_recall, 3))

	if bool(opt.test_only):
		exit(0)

	#Saving checkpoint
	if avg_prec > best_ap:
		best_ap = avg_prec
		os.makedirs("model_checkpoints/dad", exist_ok=True)
		torch.save(model.state_dict(), f"model_checkpoints/dad/{model.__class__.__name__}_{epoch}.pth")
		print(f"Saved the model checkpoint - model_checkpoints/dad/{model.__class__.__name__}_{epoch}.pth")
	print("Best Frame avg precision: %.2f%%" % (best_ap))

	model.train()
	print("")


In [6]:
from dataset_dad import Dataset
img_dataset_path = "data/dad/i3d_feat"
dataset_path = "data/dad/obj_feat"
split_path = "splits_dad/"
obj_mapping_file = "data/dad/obj_idx_to_labels.json"
ref_interval = 20
video_batch_size = 1
# Define training set
train_dataset = Dataset(
    img_dataset_path = img_dataset_path,
    dataset_path=dataset_path,
    split_path=split_path,
    #frame_batch_size=opt.batch_size,
    ref_interval=ref_interval,
    objmap_file=obj_mapping_file,
    training=True,
)
train_dataloader = DataLoader(train_dataset, batch_size=video_batch_size, shuffle=True, num_workers=8)







In [7]:
# Define test set
test_video_batch_size = 1
test_dataset = Dataset(
    img_dataset_path = img_dataset_path,
    dataset_path=dataset_path,
    split_path=split_path,
    #frame_batch_size=opt.batch_size,
    ref_interval=ref_interval,
    objmap_file=obj_mapping_file,
    training=False,
)
test_dataloader = DataLoader(test_dataset, batch_size=test_video_batch_size, shuffle=False, num_workers=8)




In [8]:
	# Define network
input_dim = 4096
embedding_dim = 256
img_feat_dim = 2048
num_classes = 2
model = SpaceTempGoG_detr_dad(input_dim=input_dim, embedding_dim=embedding_dim, img_feat_dim=img_feat_dim, num_classes=num_classes).to(device)
print(model)
	
model.train()

SpaceTempGoG_detr_dad(
  (x_fc): Linear(in_features=4096, out_features=512, bias=True)
  (x_bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (obj_l_fc): Linear(in_features=300, out_features=128, bias=True)
  (obj_l_bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gc1_spatial): GCNConv(640, 128)
  (gc1_norm1): InstanceNorm(128)
  (gc1_temporal): GCNConv(640, 128)
  (gc1_norm2): InstanceNorm(128)
  (pool): TopKPooling(256, ratio=0.8, multiplier=1.0)
  (img_fc): Linear(in_features=2048, out_features=512, bias=True)
  (gc2_sg): GATv2Conv(256, 128, heads=1)
  (gc2_norm1): InstanceNorm(128)
  (gc2_i3d): GATv2Conv(512, 128, heads=1)
  (gc2_norm2): InstanceNorm(128)
  (classify_fc1): Linear(in_features=256, out_features=128, bias=True)
  (classify_fc2): Linear(in_features=128, out_features=2, bias=True)
  (relu): LeakyReLU(negative_slope=0.2)
  (softmax): Softmax(dim=-1)
)


SpaceTempGoG_detr_dad(
  (x_fc): Linear(in_features=4096, out_features=512, bias=True)
  (x_bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (obj_l_fc): Linear(in_features=300, out_features=128, bias=True)
  (obj_l_bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gc1_spatial): GCNConv(640, 128)
  (gc1_norm1): InstanceNorm(128)
  (gc1_temporal): GCNConv(640, 128)
  (gc1_norm2): InstanceNorm(128)
  (pool): TopKPooling(256, ratio=0.8, multiplier=1.0)
  (img_fc): Linear(in_features=2048, out_features=512, bias=True)
  (gc2_sg): GATv2Conv(256, 128, heads=1)
  (gc2_norm1): InstanceNorm(128)
  (gc2_i3d): GATv2Conv(512, 128, heads=1)
  (gc2_norm2): InstanceNorm(128)
  (classify_fc1): Linear(in_features=256, out_features=128, bias=True)
  (classify_fc2): Linear(in_features=128, out_features=2, bias=True)
  (relu): LeakyReLU(negative_slope=0.2)
  (softmax): Softmax(dim=-1)
)

In [9]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 3581698


In [10]:
	# Add weights from checkpoint model if specified
checkpoint_model ="model_checkpoints/dad_model.pth"
if checkpoint_model:
    #model.load_state_dict(torch.load(checkpoint_model, map_location=torch.device('cpu')))
    #model.load_state_dict(torch.load(checkpoint_model))
    checkpoint = torch.load(checkpoint_model, map_location=torch.device('cpu'))


In [11]:
test_only =1
if bool(test_only):
    all_probs_vid2, all_y_vid, all_toa  = test_model(0, model, test_dataloader)


Batch:  0
Batch:  1
Batch:  2
All toa : [90, 90, 101]
All Probs vid2 :  3
All y vid :  3


In [12]:
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[25], gamma=0.5)

In [13]:
time_of_accidents = all_toa
all_labels = all_y_vid.numpy()
all_pred = all_probs_vid2.numpy()
fps=20.0
preds_eval = []
min_pred = np.inf
n_frames = 0
for idx, toa in enumerate(time_of_accidents):
    print("Idx %d, toa %d"%(idx, toa))
    if all_labels[idx] > 0:
        pred = all_pred[idx, :int(toa)]  # positive video
    else:
        pred = all_pred[idx, :]  # negative video
    # find the minimum prediction
    min_pred = np.min(pred) if min_pred > np.min(pred) else min_pred
    preds_eval.append(pred)
    n_frames += len(pred)
total_seconds = all_pred.shape[1] / fps

Idx 0, toa 90
Idx 1, toa 90
Idx 2, toa 101


In [17]:
# iterate a set of thresholds from the minimum predictions
# temp_shape = int((1.0 - max(min_pred, 0)) / 0.001 + 0.5) 
print("n_frames : ",n_frames)
n_frames = 741
Precision = np.zeros((n_frames))
Recall = np.zeros((n_frames))
Time = np.zeros((n_frames))
cnt = 0
print("N_frames: ", n_frames)
print("Precision: ", Precision.shape)
print("Min Pred : ",min_pred)
print(np.arange(max(min_pred, 0), 1.0, 0.001).shape)
print("Len Pred Eval: ", preds_eval[0].shape)

n_frames :  280
N_frames:  741
Precision:  (741,)
Min Pred :  0.25901392
(741,)
Len Pred Eval:  (90,)


In [20]:
for Th in np.arange(max(min_pred, 0), 1.0, 0.001):
    print("TH:",Th)
    Tp = 0.0
    Tp_Fp = 0.0
    Tp_Tn = 0.0
    time = 0.0
    counter = 0.0  # number of TP videos
    # iterate each video sample
    for i in range(len(preds_eval)):
        # true positive frames: (pred->1) * (gt->1)
        tp =  np.where(preds_eval[i]*all_labels[i]>=Th)
        Tp += float(len(tp[0])>0)
        if float(len(tp[0])>0) > 0:
            # if at least one TP, compute the relative (1 - rTTA)
            time += tp[0][0] / float(time_of_accidents[i])
            counter = counter+1
        # all positive frames
        Tp_Fp += float(len(np.where(preds_eval[i]>=Th)[0])>0)
    

    print("TPFP : ",Tp_Fp)
    
    if Tp_Fp == 0:  # predictions of all videos are negative
        continue
    else:
        try:
            Precision[cnt] = Tp/Tp_Fp
        except ValueError as e:
            print("Error : ",e)
    if np.sum(all_labels) ==0: # gt of all videos are negative
        continue
    else:
        Recall[cnt] = Tp/np.sum(all_labels)
    if counter == 0:
        continue
    else:
        Time[cnt] = (1-time/counter)
    cnt += 1


TH: 0.25901392102241516
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26001392102241516
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26101392102241516
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26201392102241516
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26301392102241516
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26401392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26501392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26601392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26701392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26801392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.26901392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.27001392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.27101392102241517
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.2720139210224152
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.2730139210224152
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.2740139210224152
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.2750139210224152
i :  0
i :  1
i :  2
TPFP :  3.0
TH: 0.2760139210224152
i :  0
i :  

In [23]:
new_index = np.argsort(Recall)
Precision = Precision[new_index]
Recall = Recall[new_index]
Time = Time[new_index]
print("New index: ", new_index.shape)
print("Precision: ", Precision[new_index].shape)
print("Recall: ", Recall[new_index].shape)
print("Time: ", Time[new_index].shape)

New index:  (741,)
Precision:  (741,)
Recall:  (741,)
Time:  (741,)


In [24]:

# unique the recall, and fetch corresponding precisions and TTAs
_,rep_index = np.unique(Recall,return_index=1)
rep_index = rep_index[1:]
new_Time = np.zeros(len(rep_index))
new_Precision = np.zeros(len(rep_index))
for i in range(len(rep_index)-1):
        new_Time[i] = np.max(Time[rep_index[i]:rep_index[i+1]])
        new_Precision[i] = np.max(Precision[rep_index[i]:rep_index[i+1]])
# sort by descending order
new_Time[-1] = Time[rep_index[-1]]
new_Precision[-1] = Precision[rep_index[-1]]
new_Recall = Recall[rep_index]

In [25]:
AP = 0.0
if new_Recall[0] != 0:
    AP += new_Precision[0]*(new_Recall[0]-0)
for i in range(1,len(new_Precision)):
    AP += (new_Precision[i-1]+new_Precision[i])*(new_Recall[i]-new_Recall[i-1])/2


In [26]:
# transform the relative mTTA to seconds
mTTA = np.mean(new_Time) * total_seconds
print("Average Precision= %.4f, mean Time to accident= %.4f"%(AP, mTTA))
sort_time = new_Time[np.argsort(new_Recall)]
sort_recall = np.sort(new_Recall)
TTA_R80 = sort_time[np.argmin(np.abs(sort_recall-0.8))] * total_seconds
print("Recall@80%, Time to accident= " +"{:.4}".format(TTA_R80))

Average Precision= 0.9167, mean Time to accident= 4.8611
Recall@80%, Time to accident= 4.944
