-
Notifications
You must be signed in to change notification settings - Fork 5
/
dataloader_vidvrd.py
687 lines (568 loc) · 30.1 KB
/
dataloader_vidvrd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
import os
import json
from tqdm import tqdm
import numpy as np
import torch
import pickle
from copy import deepcopy
import random
from collections import defaultdict
from utils.utils_func import is_overlap, merge_duration_list,linear_interpolation
from utils.categories_v2 import vidvrd_CatName2Id,vidvrd_PredName2Id
class TrajProposal(object):
def __init__(self,video_name,
cat_ids,traj_bboxes_with_score,traj_durations,roi_features,MAX_PROPOSAL):
"""
roi_features: list[np.arry], len==num_proposals, shape == (num_frames,dim)
"""
self.MAX_PROPOSAL = MAX_PROPOSAL
self.video_name = video_name
assert len(cat_ids) == len(traj_bboxes_with_score)
assert len(cat_ids) == len(roi_features)
assert len(cat_ids) == len(traj_durations)
if len(cat_ids) == 0:
self.num_proposals = 0
print("video:{} has no proposal".format(video_name))
return
self.cat_ids = torch.LongTensor(cat_ids) # len == num_proposals
self.scores,self.bboxes_list = self._preprocess_boxes(traj_bboxes_with_score) # len == num_proposals
# list[float], list[tensor], traj_bboxes.shape == (num_frames,4)
self.traj_durations = torch.tensor(traj_durations,dtype=torch.long) # --> shape == (num_proposals, 2)
self.traj_durations[:,1] -= 1 # convert to closed interval
self.features_list = [torch.FloatTensor(r) for r in roi_features] # len==num_proposals, shape == (num_frames,dim_feat) traj_durations
# score_clipping:
self.scores = torch.FloatTensor(self.scores)
index = torch.argsort(self.scores,descending=True)
index = index[:self.MAX_PROPOSAL]
self.scores = self.scores[index]
self.bboxes_list = [self.bboxes_list[ii] for ii in index]
self.traj_durations = self.traj_durations[index]
self.features_list = [self.features_list[ii] for ii in index]
self.cat_ids = self.cat_ids[index]
self.num_proposals = len(self.bboxes_list)
if self.num_proposals > self.MAX_PROPOSAL:
self.num_proposals = self.MAX_PROPOSAL
self.dim_feat = self.features_list[0].shape[1] if self.num_proposals > 0 else "[]"
def _preprocess_boxes(self,traj_bboxes_with_score):
scores = []
traj_bboxes = []
for proposal in traj_bboxes_with_score:
# proposal: list[[xmin,ymin,xmax,ymax,score]] len == num_frames
bboxes = torch.FloatTensor([p[0:4] for p in proposal])
traj_bboxes.append(bboxes)
scores.append(
sum([p[4] for p in proposal])/len(proposal)
)
return scores,traj_bboxes
def to(self,device):
if self.num_proposals == 0:
return self
self.cat_ids = self.cat_ids.to(device)
self.scores = self.scores.to(device)
self.traj_durations = self.traj_durations.to(device)
for i in range(self.num_proposals):
self.bboxes_list[i] = self.bboxes_list[i].to(device)
self.features_list[i] = self.features_list[i].to(device)
return self
def __repr__(self):
return "TrajProposal[{},num_proposals={},feature_dim={}]".format(self.video_name,self.num_proposals,self.dim_feat)
class VideoGraph(object):
def __init__(self,video_info,split,
traj_cat_ids,traj_durations,traj_bboxes,
pred_cat_ids,pred_durations,
adj_matrix_subject,adj_matrix_object,MAX_PREDS):
self.video_name,self.video_len,self.video_wh = video_info
self.MAX_PREDS = MAX_PREDS
assert len(traj_cat_ids) == len(traj_durations)
assert len(traj_cat_ids) == len(traj_bboxes)
assert len(pred_cat_ids) == len(pred_durations)
self.num_trajs = len(traj_cat_ids)
self.num_preds = len(pred_cat_ids)
self.traj_cat_ids = torch.LongTensor(traj_cat_ids) # shape == (num_trajs,)
self.traj_durations = torch.tensor(traj_durations) # shape == (num_trajs,2),
self.traj_durations[:,1] -= 1 # convert to closed interval
self.traj_bboxes = [torch.FloatTensor(b) for b in traj_bboxes] # list[tensor] len == num_trajs, shape == (num_frames,4)
self.pred_cat_ids = torch.LongTensor(pred_cat_ids) # shape == (num_preds,)
self.pred_durations = torch.tensor(pred_durations).float() # shape == (num_preds,2),
self.pred_durations[:,1] -= 1 # convert to closed interval
self.n_frames_list = [traj.shape[0] for traj in self.traj_bboxes]
self.max_frames = max(self.n_frames_list)
assert adj_matrix_object.shape == adj_matrix_subject.shape
adj_s = torch.from_numpy(adj_matrix_subject) # shape == (num_preds,num_trajs)
adj_o = torch.from_numpy(adj_matrix_object)
self.adj_matrix = torch.stack([adj_s,adj_o],dim=0).float() # shape = (2,num_preds,num_trajs)
if split == "train" and (self.num_preds > self.MAX_PREDS):
self.pred_cat_ids = self.pred_cat_ids[:self.MAX_PREDS]
self.pred_durations = self.pred_durations[:self.MAX_PREDS,:]
self.adj_matrix = self.adj_matrix[:,:self.MAX_PREDS,:]
self.num_preds = self.pred_cat_ids.shape[0]
def _prepare_pred_duras(self,pred_durations):
start = []
end = []
for dura in pred_durations:
start.append(dura[0])
end.append(dura[1])
start = torch.FloatTensor(start)
end = torch.FloatTensor(end)
gt_duras = torch.stack([start,end],dim=1) # shape == (num_gt_pred,2)
return gt_duras
def to(self,device):
self.traj_cat_ids = self.traj_cat_ids.to(device)
self.pred_cat_ids = self.pred_cat_ids.to(device)
self.traj_durations = self.traj_durations.to(device)
self.pred_durations = self.pred_durations.to(device) # shape == (num_preds,2), start_fid, end_fid
self.adj_matrix = self.adj_matrix.to(device)
for i in range(self.num_trajs):
self.traj_bboxes[i] = self.traj_bboxes[i].to(device)
return self
def __repr__(self):
return "VideoGraph[num_trajs={},num_preds={}]".format(self.num_trajs,self.num_preds)
class Dataset(object):
def __init__(self,split,ann_dir,proposal_dir,
dim_boxfeature,min_frames_th,max_proposal,max_preds,cache_tag
):
self.split = self._get_split(split) # self.split in ["train","test"]
self.proposal_dir = proposal_dir # e.g., "proposals/vidvrd-dataset/miss30_minscore0p3/VidVRD_test_every1frames"
self.dim_boxfeature = dim_boxfeature
self.min_frames_th = min_frames_th
self.max_proposal = max_proposal
self.max_preds = max_preds
self.cache_tag = cache_tag
self.cache_dir = "datasets/cache"
if not os.path.exists(self.cache_dir):
os.mkdir(self.cache_dir)
if self.split == "train":
self.video_ann_dir = os.path.join(ann_dir,'train') # e.g., "datasets/vidvrd-dataset/train/"
else: # self.split == "test":
self.video_ann_dir = os.path.join(ann_dir,'test') # e.g., "datasets/vidvrd-dataset/test/"
self.video_name_list = self._prepare_video_names()
cache_path = self.cache_tag + "_" + "VidVRD{}_th_{}-{}-{}".format(self.split,self.min_frames_th,self.max_proposal,self.max_preds)
cache_path = os.path.join(self.cache_dir,cache_path + ".pkl")
if os.path.exists(cache_path):
print("load all data into memory, from cache file {}".format(cache_path))
with open(cache_path,'rb') as f:
data_dict = pickle.load(f)
else:
print("no cache file find, preparing data... {}".format(cache_path))
data_dict = dict()
for video_name in tqdm(self.video_name_list):
data_dict[video_name] = self.get_data(video_name)
with open(cache_path,'wb') as f:
pickle.dump(data_dict,f)
print("all data have been saved as cache file: {}".format(cache_path))
self.data_dict = data_dict
print("all data have been loaded into memory")
def _get_split(self,split):
train = {x:"train" for x in ["train","training"]}
test = {x:"test" for x in ["test","testing"]}
split_dict = {}
for x in [train,test]:
split_dict.update(x)
try:
s = split_dict[split.lower()]
except:
assert False, "only support split in {}".format(split_dict.keys())
return s
def _prepare_video_names(self):
video_name_list = os.listdir(self.video_ann_dir)
video_name_list = sorted([v.split('.')[0] for v in video_name_list])
return video_name_list
def __getitem__(self,idx):
video_name = self.video_name_list[idx]
if (video_name == "ILSVRC2015_train_00082000") or (video_name == "ILSVRC2015_train_00790000"): # videos in trainset, which cost too much gpu memory
idx = random.randint(0,len(self.video_name_list)-1)
return self.__getitem__(idx)
traj_proposal, gt_graph = deepcopy(self.data_dict[video_name])
if traj_proposal.num_proposals > 0 or self.split == "test":
return traj_proposal, gt_graph
else:
idx = random.randint(0,len(self.video_name_list)-1)
# print("video:{} has no proposal, random choose {} instead".format(video_name,self.video_name_list[idx]))
return self.__getitem__(idx)
def __len__(self):
return len(self.video_name_list)
def get_data(self,video_name):
traj_proposal = self._get_proposal(video_name)
gt_graph = self._get_gt_graph(video_name)
traj_proposal.video_len = gt_graph.video_len # TODO add traj_proposal.video_len separately and assert `traj_proposal.video_len == gt_graph.video_len`
traj_proposal.video_wh = gt_graph.video_wh
return traj_proposal, gt_graph
def _get_proposal(self,video_name):
track_res_path = os.path.join(self.proposal_dir,video_name+".npy") #ILSVRC2015_train_00010001.npy
track_res = np.load(track_res_path,allow_pickle=True)
trajs = {box_info[1]:{} for box_info in track_res}
for tid in trajs.keys():
trajs[tid]["frame_ids"] = []
trajs[tid]["bboxes"] = []
trajs[tid]["roi_features"] = []
trajs[tid]["category_id"] = [] # 如果某个tid只有len==6的box_info,那就无法获取 category_id ,默认为背景
for box_info in track_res:
if not isinstance(box_info,list):
box_info = box_info.tolist()
assert len(box_info) == 6 or len(box_info) == 12 + self.dim_boxfeature,"len(box_info)=={}".format(len(box_info))
frame_id = box_info[0]
tid = box_info[1]
tracklet_xywh = box_info[2:6]
xmin_t,ymin_t,w_t,h_t = tracklet_xywh
xmax_t = xmin_t + w_t
ymax_t = ymin_t + h_t
bbox_t = [xmin_t,ymin_t,xmax_t,ymax_t]
confidence = float(0)
if len(box_info) == 12 + self.dim_boxfeature:
confidence = box_info[6]
cat_id = box_info[7]
xywh = box_info[8:12]
xmin,ymin,w,h = xywh
xmax = xmin+w
ymax = ymin+h
bbox = [(xmin+xmin_t)/2, (ymin+ymin_t)/2, (xmax+xmax_t)/2,(ymax+ymax_t)/2]
roi_feature = box_info[12:]
trajs[tid]["category_id"].append(cat_id)
trajs[tid]["roi_features"].append(roi_feature)
if len(box_info) == 6:
bbox_t.append(confidence)
trajs[tid]["bboxes"].append(bbox_t)
trajs[tid]["roi_features"].append([0]*self.dim_boxfeature)
else:
bbox.append(confidence)
trajs[tid]["bboxes"].append(bbox)
trajs[tid]["frame_ids"].append(frame_id) #
for tid in trajs.keys():
if trajs[tid]["category_id"] == []:
trajs[tid]["category_id"] = 0
else:
# print(trajs[tid]["category_id"])
temp = np.argmax(np.bincount(trajs[tid]["category_id"])) # 求众数
trajs[tid]["category_id"] = int(temp)
frame_ids = trajs[tid]["frame_ids"]
start = min(frame_ids)
end = max(frame_ids) + 1
dura_len = end - start
duration = (start,end) # 前闭后开区间
trajs[tid]["roi_features"] = np.array(trajs[tid]["roi_features"])
trajs[tid]["bboxes"] = np.array(trajs[tid]["bboxes"])
# 将太短的视为背景,后续过滤掉
if len(frame_ids) < self.min_frames_th:
trajs[tid]["category_id"] = 0
else:
trajs[tid]["duration"] = (start,end)
# 对于非背景的traj, 看是否需要插值
if trajs[tid]["category_id"] !=0 and len(frame_ids) != dura_len:
trajs[tid]["roi_features"] = linear_interpolation(trajs[tid]["roi_features"],frame_ids)
trajs[tid]["bboxes"] = linear_interpolation(trajs[tid]["bboxes"],frame_ids)
if trajs[tid]["category_id"] !=0:
assert len(trajs[tid]["bboxes"]) == dura_len
# trajs = {k:v for k,v in trajs.items() if v["category_id"]!=0}
cat_ids = []
traj_boxes = []
roi_features_list = []
traj_durations = []
for tid in trajs.keys():
if trajs[tid]["category_id"] != 0:
dura_len = trajs[tid]["duration"][1] - trajs[tid]["duration"][0]
assert len(trajs[tid]["bboxes"]) == dura_len
cat_ids.append(trajs[tid]["category_id"])
traj_boxes.append(trajs[tid]["bboxes"])
roi_features_list.append(trajs[tid]["roi_features"])
traj_durations.append(trajs[tid]["duration"])
return TrajProposal(video_name,cat_ids,traj_boxes,traj_durations,roi_features_list,self.max_proposal)
def _get_gt_graph(self,video_name):
video_ann_path = os.path.join(self.video_ann_dir, video_name + ".json")
## 1. construct trajectory annotations from frame-level bbox annos
if os.path.exists(video_ann_path):
with open(video_ann_path,'r') as f:
video_anno = json.load(f)
else:
print(video_name,"not find its anno")
raise NotImplementedError
video_len = len(video_anno["trajectories"])
video_wh = (video_anno["width"],video_anno["height"])
traj_categories = video_anno["subject/objects"] # tid not necessary 0 ~ len(traj_categories)-1
# tid2category_map = [traj["category"] for traj in traj_categories] # This is WRONG!
tid2category_map = {traj["tid"]:traj["category"] for traj in traj_categories} # this is CORRECT
# e.g., {0: 'bicycle', 1: 'bicycle', 2: 'person', 3: 'person', 5: 'person'}
trajs = {traj["tid"]:{} for traj in traj_categories}
for tid in trajs.keys():
trajs[tid]["all_bboxes"] = []
trajs[tid]["frame_ids"] = []
trajs[tid]["cat_name"] = tid2category_map[tid]
for frame_id,frame_anno in enumerate(video_anno["trajectories"]):
for bbox_anno in frame_anno:
tid = bbox_anno["tid"]
category_name = tid2category_map[tid]
category_id = vidvrd_CatName2Id[category_name]
bbox = bbox_anno["bbox"]
bbox = [bbox["xmin"],bbox["ymin"],bbox["xmax"],bbox["ymax"]]
trajs[tid]["all_bboxes"].append(bbox)
trajs[tid]["frame_ids"].append(frame_id)
trajs[tid]["category_id"] = category_id
## 2.linear_interpolation
for tid in trajs.keys():
frame_ids = trajs[tid]["frame_ids"]
start,end = min(frame_ids),max(frame_ids)+1 # 前开后闭区间
all_bboxes = np.array(trajs[tid]["all_bboxes"]).astype(np.float)
all_bboxes = linear_interpolation(all_bboxes,frame_ids)
trajs[tid]["all_bboxes"] = all_bboxes # np.ndarray, shape == (dura_len,4)
trajs[tid]["duration"] = (start,end) # dura_len == end - start
traj_cat_ids = []
traj_durations = []
traj_bbox_list = []
tid2idx_map = {}
for idx,tid in enumerate(trajs.keys()):
traj_cat_ids.append(trajs[tid]["category_id"])
traj_durations.append(trajs[tid]["duration"])
traj_bbox_list.append(trajs[tid]["all_bboxes"])
tid2idx_map[tid] = idx
traj_cat_ids = np.array(traj_cat_ids) # shape == (num_trajs,)
traj_durations = np.array(traj_durations) # shape == (num_trajs,2)
num_trajs = len(traj_cat_ids)
# 3. merge relations
# in the train-set of vidvrd, some long relations is annotated as short segments
# we merge them to one whole relation
# e.g., a original relation might have a duration_list = [(195, 225), (225, 240), (375, 405), (390, 420)]
# we merge it to [(195, 240), (375, 420)]
# NOTE: train 的需要merge, 在vidvrd的train-set中, 一个大于30的duration都没有,在test-set中, long-duration没有被按照30一段标注
preds = video_anno["relation_instances"]
pred_cat_ids = []
pred_durations = []
trituple_list = []
trituple2durations_dict = defaultdict(list)
for pred in preds:
predicate = pred["predicate"]
subject_tid = pred["subject_tid"]
object_tid = pred["object_tid"]
trituple = str(subject_tid) + "-" + predicate + "-" + str(object_tid)
begin_fid = pred["begin_fid"]
end_fid = pred["end_fid"]
trituple2durations_dict[trituple].append((begin_fid,end_fid))
for trituple,durations in trituple2durations_dict.items():
merged_durations = merge_duration_list(durations) # e.g., [(30,60),(60,90),(120,150)] --> [(30,90),(120,150)]
trituple2durations_dict[trituple] = merged_durations
pred_name = trituple.split('-')[1]
pred_catid = vidvrd_PredName2Id[pred_name]
for duration in merged_durations:
trituple_list.append(trituple)
pred_cat_ids.append(pred_catid)
pred_durations.append(duration)
num_preds = len(pred_cat_ids)
pred_cat_ids = np.array(pred_cat_ids) # shape == (num_preds,)
pred_durations = np.array(pred_durations) # shape == (num_preds,2)
# 2.3. construct adjacency matrix
adj_matrix_subject = np.zeros((num_preds,num_trajs),dtype=np.int)
adj_matrix_object = np.zeros((num_preds,num_trajs),dtype=np.int)
for idx in range(num_preds):
trituple = trituple_list[idx]
pred_duration = pred_durations[idx]
subj_tid = int(trituple.split('-')[0])
obj_tid = int(trituple.split('-')[-1])
subj_idx = tid2idx_map[subj_tid]
obj_idx = tid2idx_map[obj_tid]
subj_duration = traj_durations[subj_idx]
if is_overlap(pred_duration,subj_duration):
adj_matrix_subject[idx,subj_idx] = 1
obj_duration = traj_durations[obj_idx]
if is_overlap(pred_duration,obj_duration): # is_overlap 可以用于 1-d np.ndarray
adj_matrix_object[idx,obj_idx] = 1
for row in adj_matrix_subject:
assert np.sum(row) == 1, "video:{} not correct".format(video_name)
for row in adj_matrix_object:
assert np.sum(row) == 1, "video:{} not correct".format(video_name)
video_info = (video_name,video_len,video_wh)
return VideoGraph(video_info,self.split,
traj_cat_ids,traj_durations,traj_bbox_list,
pred_cat_ids,pred_durations,
adj_matrix_subject,adj_matrix_object,self.max_preds
)
@property
def collator_func(self):
"""
batch is a list ,len(batch) == batch_size
batch[i] is a tuple, batch[i][0],batch[i][1] is an object of class TrajProposal, class VideoGraph, respectively
This function should be passed to the torch.utils.data.DataLoader
"""
# batch_size = len(batch)
# as for gramma of this , refer to `test_gramma/regonized/collator_func.py`
def collator_func_v2(batch):
batch = sorted(batch,key=lambda x: x[1].max_frames)
batch_proposal = [b[0] for b in batch]
batch_gt_graph = [b[1] for b in batch]
return batch_proposal,batch_gt_graph
return collator_func_v2
class Dataset_pku(Dataset):
def __init__(self, **kargs):
super().__init__(**kargs)
assert self.cache_tag[:3].lower() == "pku"
def _get_proposal(self,video_name):
if video_name == "ILSVRC2015_train_00884000": # PKU missing this file, use ours instead
video_name = "ILSVRC2015_train_00884000" + "_myFaster18"
track_res_path = os.path.join(self.proposal_dir,video_name+".npy") #ILSVRC2015_train_00010001.npy
track_res = np.load(track_res_path,allow_pickle=True)
trajs = {box_info[1]:{} for box_info in track_res}
for tid in trajs.keys():
trajs[tid]["frame_ids"] = []
trajs[tid]["bboxes"] = []
trajs[tid]["roi_features"] = []
trajs[tid]["category_id"] = [] # 如果某个tid只有len==6的box_info,那就无法获取 category_id ,默认为背景
for box_info in track_res:
if not isinstance(box_info,list):
box_info = box_info.tolist()
assert len(box_info) == 12 + self.dim_boxfeature,"len(box_info)=={}".format(len(box_info))
frame_id = int(box_info[0])
tid = int(box_info[1])
tracklet_xywh = box_info[2:6]
xmin_t,ymin_t,w_t,h_t = tracklet_xywh
xmax_t = xmin_t + w_t
ymax_t = ymin_t + h_t
bbox_t = [xmin_t,ymin_t,xmax_t,ymax_t]
confidence = box_info[6]
cat_id = int(box_info[7])
if cat_id <= 0:
confidence = 0.0
bbox = bbox_t + [confidence]
roi_feature = [0]*self.dim_boxfeature
else:
xywh = box_info[8:12]
xmin,ymin,w,h = xywh
xmax = xmin+w
ymax = ymin+h
bbox = [(xmin+xmin_t)/2, (ymin+ymin_t)/2, (xmax+xmax_t)/2,(ymax+ymax_t)/2,confidence]
roi_feature = box_info[12:]
trajs[tid]["category_id"].append(cat_id)
trajs[tid]["bboxes"].append(bbox)
trajs[tid]["roi_features"].append(roi_feature)
trajs[tid]["frame_ids"].append(frame_id) #
for tid in trajs.keys():
if trajs[tid]["category_id"] == []:
trajs[tid]["category_id"] = 0
else:
# print(video_name,trajs[tid]["category_id"])
temp = np.argmax(np.bincount(trajs[tid]["category_id"])) # 求众数
trajs[tid]["category_id"] = int(temp)
frame_ids = trajs[tid]["frame_ids"]
start = min(frame_ids)
end = max(frame_ids) + 1
dura_len = end - start
duration = (start,end) # 前闭后开区间
trajs[tid]["roi_features"] = np.array(trajs[tid]["roi_features"])
trajs[tid]["bboxes"] = np.array(trajs[tid]["bboxes"])
# 将太短的视为背景,后续过滤掉
if len(frame_ids) < self.min_frames_th:
trajs[tid]["category_id"] = 0
else:
trajs[tid]["duration"] = (start,end)
# 对于非背景的traj, 看是否需要插值
if trajs[tid]["category_id"] !=0 and len(frame_ids) != dura_len:
trajs[tid]["roi_features"] = linear_interpolation(trajs[tid]["roi_features"],frame_ids)
trajs[tid]["bboxes"] = linear_interpolation(trajs[tid]["bboxes"],frame_ids)
if trajs[tid]["category_id"] !=0:
assert len(trajs[tid]["bboxes"]) == dura_len
# trajs = {k:v for k,v in trajs.items() if v["category_id"]!=0}
cat_ids = []
traj_boxes = []
roi_features_list = []
traj_durations = []
for tid in trajs.keys():
if trajs[tid]["category_id"] != 0:
dura_len = trajs[tid]["duration"][1] - trajs[tid]["duration"][0]
assert len(trajs[tid]["bboxes"]) == dura_len
cat_ids.append(trajs[tid]["category_id"])
traj_boxes.append(trajs[tid]["bboxes"])
roi_features_list.append(trajs[tid]["roi_features"])
traj_durations.append(trajs[tid]["duration"])
return TrajProposal(video_name,cat_ids,traj_boxes,traj_durations,roi_features_list,self.max_proposal)
class Dataset_pku_i3d(Dataset):
def __init__(self, i3d_dir, dim_i3d, **kargs):
self.i3d_dir = i3d_dir
self.dim_i3d = dim_i3d
super().__init__(**kargs) # 会调用子类重载的成员函数
def _get_proposal(self,video_name):
if video_name == "ILSVRC2015_train_00884000":
video_name = "ILSVRC2015_train_00884000" + "_myFaster18"
i3d_path = os.path.join(self.i3d_dir,video_name+".npy")
i3d_features = np.load(i3d_path)
track_res_path = os.path.join(self.proposal_dir,video_name+".npy") #ILSVRC2015_train_00010001.npy
track_res = np.load(track_res_path,allow_pickle=True)
trajs = {box_info[1]:{} for box_info in track_res}
for tid in trajs.keys():
trajs[tid]["frame_ids"] = []
trajs[tid]["bboxes"] = []
trajs[tid]["roi_features"] = []
trajs[tid]["i3d_features"] = []
trajs[tid]["category_id"] = [] # 如果某个tid只有len==6的box_info,那就无法获取 category_id ,默认为背景
for idx,box_info in enumerate(track_res):
if not isinstance(box_info,list):
box_info = box_info.tolist()
assert len(box_info) == 12 + self.dim_boxfeature,"len(box_info)=={}".format(len(box_info))
frame_id = int(box_info[0])
tid = int(box_info[1])
tracklet_xywh = box_info[2:6]
xmin_t,ymin_t,w_t,h_t = tracklet_xywh
xmax_t = xmin_t + w_t
ymax_t = ymin_t + h_t
bbox_t = [xmin_t,ymin_t,xmax_t,ymax_t]
confidence = box_info[6]
cat_id = int(box_info[7])
if cat_id <= 0:
confidence = 0.0
bbox = bbox_t + [confidence]
roi_feature = [0]*self.dim_boxfeature
i3d_feature = [0]*self.dim_i3d
else:
xywh = box_info[8:12]
xmin,ymin,w,h = xywh
xmax = xmin+w
ymax = ymin+h
bbox = [(xmin+xmin_t)/2, (ymin+ymin_t)/2, (xmax+xmax_t)/2,(ymax+ymax_t)/2,confidence]
roi_feature = box_info[12:]
i3d_feature = i3d_features[idx][12:]
trajs[tid]["category_id"].append(cat_id)
if video_name == "ILSVRC2015_train_00884000" + "_myFaster18":
i3d_feature = [0]*self.dim_i3d
trajs[tid]["bboxes"].append(bbox)
trajs[tid]["roi_features"].append(roi_feature)
trajs[tid]["i3d_features"].append(i3d_feature)
trajs[tid]["frame_ids"].append(frame_id) #
# print(video_name)
for tid in trajs.keys():
if trajs[tid]["category_id"] == []:
trajs[tid]["category_id"] = 0
else:
# print(video_name,trajs[tid]["category_id"])
temp = np.argmax(np.bincount(trajs[tid]["category_id"])) # 求众数
trajs[tid]["category_id"] = int(temp)
frame_ids = trajs[tid]["frame_ids"]
start = min(frame_ids)
end = max(frame_ids) + 1
dura_len = end - start
duration = (start,end) # 前闭后开区间
roi_feature = np.array(trajs[tid]["roi_features"])
i3d_feature = np.array(trajs[tid]["i3d_features"]) # list of np.array --> 2d-numpy array
# concatenate i3d_features and visual_feature
trajs[tid]["roi_features"] = np.concatenate([roi_feature,i3d_feature],axis=-1) # shape == (n_frames,2048+832)
# print(trajs[tid]["box_feats"].shape)
trajs[tid]["bboxes"] = np.array(trajs[tid]["bboxes"])
# 将太短的视为背景,后续过滤掉
if len(frame_ids) < self.min_frames_th:
trajs[tid]["category_id"] = 0
else:
trajs[tid]["duration"] = (start,end)
# 对于非背景的traj, 看是否需要插值
if trajs[tid]["category_id"] !=0 and len(frame_ids) != dura_len:
trajs[tid]["roi_features"] = linear_interpolation(trajs[tid]["roi_features"],frame_ids)
trajs[tid]["bboxes"] = linear_interpolation(trajs[tid]["bboxes"],frame_ids)
if trajs[tid]["category_id"] !=0:
assert len(trajs[tid]["bboxes"]) == dura_len
# trajs = {k:v for k,v in trajs.items() if v["category_id"]!=0}
cat_ids = []
traj_boxes = []
roi_features_list = []
traj_durations = []
for tid in trajs.keys():
if trajs[tid]["category_id"] != 0:
dura_len = trajs[tid]["duration"][1] - trajs[tid]["duration"][0]
assert len(trajs[tid]["bboxes"]) == dura_len
cat_ids.append(trajs[tid]["category_id"])
traj_boxes.append(trajs[tid]["bboxes"])
roi_features_list.append(trajs[tid]["roi_features"])
traj_durations.append(trajs[tid]["duration"])
return TrajProposal(video_name,cat_ids,traj_boxes,traj_durations,roi_features_list,self.max_proposal)