# Object tracking with DiffusionDet

## Author : Guillaume Horent

## Installation of all dependencies

In [1]:
import sys


import cv2
import os
import torch
import numpy as np
import math
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from detectron2.modeling import detector_postprocess
from torchvision.ops import nms

from diffusiondet.config import add_diffusiondet_config
from diffusiondet.predictor import VisualizationDemo
from diffusiondet.util.model_ema import add_model_ema_configs, may_build_model_ema, may_get_ema_checkpointer, EMAHook, \
    apply_model_ema_and_restore, EMADetectionCheckpointer

from diffusiondet.detector import DiffusionDet as ddet
import glob
import pandas as pd
from tqdm import tqdm


import imageio
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

import csv
from scipy.optimize import linear_sum_assignment

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configuration of DiffusionDet

Here we use a pretrained DiffusionDet which uses a ResNet101 trained on MS-COCO.(https://github.com/ShoufaChen/DiffusionDet). 

We use a confidence_threshold equal to 0.5. 

In [3]:
#Configuration of the DiffusionDet with Resnet101 trained on MS-COCO

confidence_threshold = 0.5

def setup_cfg():#args):
  # load config from file and command-line arguments
  cfg = get_cfg()
  # To use demo for Panoptic-DeepLab, please uncomment the following two lines.
  # from detectron2.projects.panoptic_deeplab import add_panoptic_deeplab_config  # no need
  # add_panoptic_deeplab_config(cfg)
  add_diffusiondet_config(cfg)
  add_model_ema_configs(cfg)
  cfg.merge_from_file('configs/diffdet.mot17.swinbase.yaml')
  cfg.MODEL.WEIGHTS = 'models/diffdet_coco_swinbase.pth'
  cfg.MODEL.DiffusionDet.USE_NMS = True
  #cfg.merge_from_list(args.opts)
  # Set score_threshold for builtin models
  cfg.MODEL.RETINANET.SCORE_THRESH_TEST = confidence_threshold
  cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence_threshold
  cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = confidence_threshold
  cfg.MODEL.DiffusionDet.NUM_PROPOSALS = 500 # To Adapt
  cfg.MODEL.DiffusionDet.SAMPLE_STEP = 1 # To Adapt
  cfg.freeze()
  return cfg

In [4]:

def compute_iou(bbox1, bbox2):
  #Returns the ratio of the intersection over the union of the two boxes

  # Compute the coordinates of the intersection rectangle
  x_left = max(bbox1[0], bbox2[0])
  y_top = max(bbox1[1], bbox2[1])
  x_right = min(bbox1[2], bbox2[2])
  y_bottom = min(bbox1[3], bbox2[3])

  # Check if the intersection is valid
  if x_right < x_left or y_bottom < y_top:
      return 0.0

  # Compute the area of the intersection
  intersection_area = (x_right - x_left) * (y_bottom - y_top)

  # Compute the areas of the bounding boxes
  bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
  bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

  # Compute the IoU
  iou = intersection_area / (bbox1_area + bbox2_area - intersection_area)
  return iou

## Object tracking on a video file

We first configure the DiffusionDet with the previously imported weights and parameters. 

In [5]:
#Configuration of the DiffusionDet
cfg = setup_cfg()
demo = VisualizationDemo(cfg)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Then we load the folder containing the videos and intialize the frame numbers. 

In the following we iterate over every frame of every videos. 
For each frame :
- we detect the bounding boxes
- we add the boxes to the tracks
- for each box:
  - find the one that maximises IoU (Intersection over Union)
    - if the maximum IoU is over 0.5 than we consider that it is the same box
    - otherwise, create a new box with an unused ID

  - print the corresponding box on the frame
- add the frame to the tracked video


In [None]:
tracks = {}
# Initialize the dictionary of track IDs
track_ids = {}

for video in videos:
    vid_path = os.path.join(video_dir, video)
    cap = cv2.VideoCapture(vid_path)
    success, frame = cap.read()

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_name = video.split(".")[0] + "_tracked.mp4"
    video_path = os.path.join("/content/drive/MyDrive/DiffusionDet/diffusiondet/tracked_videos", video_name)
    out = cv2.VideoWriter(video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (frame.shape[1], frame.shape[0]))

    # Iterate through frames in the video
    while success:
        # Extract bounding boxes, labels, and probabilities using demo.run_on_image
        # Replace this with your own code for running object detection
        
        if frame_number not in tracks:
            tracks[frame_number] = []

        print(str(frame_number) + '/301')
        detections = demo.run_on_image(frame)
        # Format the detections for convenience
        # Replace this with your own code for formatting the detections

        bboxes = detections[0]['instances']._fields['pred_boxes'].tensor
        labels = detections[0]['instances']._fields['pred_classes']
        scores = detections[0]['instances']._fields['scores']
        keep = nms(bboxes, scores, 0.5)
        bboxes = bboxes[keep]
        scores = scores[keep]
        labels = labels[keep]

        for bbox, label, score in zip(bboxes, labels, scores):
            # Initialize a new track
            track = {'bbox': bbox, 'conf': score}

            # Find the closest track in the previous frame based on the IoU
            max_iou = 0
            best_match = None
            if frame_number > 1 and tracks[frame_number - 1]:
                for previous_track in tracks[frame_number - 1]:
                    iou = compute_iou(bbox, previous_track['bbox'])
                    if iou > max_iou:
                        max_iou = iou
                        best_match = previous_track
                        
            if max_iou > 0.5:
                # Update the track with the new bounding box and confidence
                best_match['bbox'] = bbox
                best_match['conf'] = score
                if 'id' in track:  # check if the track already has an ID
                    track['id'] = track['id']  # use the existing ID
                else:
                    track['id'] = best_match['id']  # assign the ID of the best matching track from the previous frame
            else:
                if track_ids:  # check if the dictionary of track IDs is not empty
                    # Assign a new ID that hasn't been used yet
                    unused_ids = [id for id in track_ids if id not in track_ids.values()]
                    if unused_ids:  # check if there are any unused IDs
                        track['id'] = unused_ids[0]
                        track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
                    else:
                        track['id'] = max(track_ids) + 1  # assign a new ID
                        track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
                else:
                    track['id'] = 1
                    track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
            tracks[frame_number].append(track)  # add the track to the list of tracks

        colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255),(128, 0, 0), (0, 128, 0), (0, 0, 128), (128, 128, 0), (128, 0, 128), (0, 128, 128),           (64, 0, 0), (0, 64, 0), (0, 0, 64)]

        for track in tracks[frame_number]:
            xmin, ymin, xmax, ymax = track['bbox']
            color = colors[track['id'] % len(colors)]
            cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2)
            cv2.putText(frame, str(track['id']) + ' ' + str("{:.2f}".format(track['conf'].item())), (int(xmin), int(ymin) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        out.write(frame)
        success, frame = cap.read()
        frame_number += 1

    # Release the VideoCapture and VideoWriter objects
    cap.release()
    out.release()



## Object Tracking with Diffusion Det

In this section we perform the object tracking task on a video from MOT17. While we perform the object tracking task, we also write a csv file with the following information for every frame : 

frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z

or

frame, id, bb_left, bb_top, bb_width, bb_height, conf


In the previous line, we have : 
- frame: the frame number
- id: the class if of the box (a "car" for instance)
- bb_left: left side of the box x-axis coordinate
- bb_top: upper side of the box y-axis coordinate
- bb_width: width of the box
- bb_height : height of the box
- conf : confidence detection
- x,y and z : they can be ignored in a 2D detection model

In the next section, we consider MOT17-02-SDP which doesn't includes the x,y,z coordinate in the csv file. 

In [7]:
def object_tracking_frame_txt_writing(frames_folder,video_name,detection_file_name, reverse=False,video_output=True):
    #Load the file of interest : (worked on this one because it is one of the shortest of the MOT17)
    frames_dir = "datasets/MOT17/train/"+ frames_folder
    frames_files_list = [f for f in os.listdir(frames_dir) if f.endswith(".jpg")]
    frames_files_list.sort(reverse=reverse)
    output_folder = 'results_proposals_500'
  

    #We initialise the frame number
    frame_number = 1
  
    # Initialize the dictionary of track IDs
    tracks = {}
    track_ids = {}

    #We initialise the detection function
    detections_list = []

    #We initialize the output video
    if video_output==True:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video_path = os.path.join(output_folder, video_name)
        frame_0 = cv2.imread(frames_dir + '/' + frames_files_list[0])
        out = cv2.VideoWriter(video_path, fourcc, 30, (frame_0.shape[1], frame_0.shape[0]))

    #We intialize the csv files that writes the boxes detected
    with open(output_folder + '/' + detection_file_name, 'w', newline='') as txtfile:
    
        # Iterate through frames in the video
        for i,frame_name in enumerate(tqdm(frames_files_list)):
            frame = cv2.imread(frames_dir+'/'+frame_name)
              #print(frame_name)

            if frame_number not in tracks:
                  tracks[frame_number] = []

            #print(str(frame_number) + '/' +str(len(frames_files_list)))

            detections = demo.run_on_image(frame)
            # Format the detections for convenience
            # Replace this with your own code for formatting the detections
            bboxes = detections[0]['instances']._fields['pred_boxes'].tensor
            labels = detections[0]['instances']._fields['pred_classes']
            scores = detections[0]['instances']._fields['scores']
            # keep only detections of pedestrians
            idx_pedestrians = torch.squeeze((labels == 0).nonzero(), dim=1)
            bboxes = bboxes[idx_pedestrians, :]
            scores = scores[idx_pedestrians]
            labels = labels[idx_pedestrians]
            
            keep = nms(bboxes, scores, 0.5)
            bboxes = bboxes[keep]
            scores = scores[keep]
            labels = labels[keep]





            for bbox, label, score in zip(bboxes, labels, scores):
                # Initialize a new track
                track = {'bbox': bbox, 'conf': score}

                # Find the closest track in the previous frame based on the IoU
                max_iou = 0
                best_match = None
                if frame_number > 1 and tracks[frame_number - 1]:
                    for previous_track in tracks[frame_number - 1]:
                        iou = compute_iou(bbox, previous_track['bbox'])
                        if iou > max_iou:
                            max_iou = iou
                            best_match = previous_track

                if max_iou > 0.5:
                    # Update the track with the new bounding box and confidence
                    best_match['bbox'] = bbox
                    best_match['conf'] = score
                    if 'id' in track:  # check if the track already has an ID
                        track['id'] = track['id']  # use the existing ID
                    else:
                        track['id'] = best_match['id']  # assign the ID of the best matching track from the previous frame
                else:
                    if track_ids:  # check if the dictionary of track IDs is not empty
                        # Assign a new ID that hasn't been used yet
                        unused_ids = [id for id in track_ids if id not in track_ids.values()]
                        if unused_ids:  # check if there are any unused IDs
                            track['id'] = unused_ids[0]
                            track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
                        else:
                            track['id'] = max(track_ids) + 1  # assign a new ID
                            track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
                    else:
                        track['id'] = 1
                        track_ids.update({track['id']: track['id']})  # add the new ID to the dictionary of track IDs
                tracks[frame_number].append(track)  # add the track to the list of tracks

                #Now we write each box in the file
                #frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z
                bbox_cpu = bbox.to(torch.device('cpu')).numpy()
                label_cpu = label.to(torch.device('cpu')).numpy()
                score_cpu = score.to(torch.device('cpu')).numpy()
                width_bbox = np.abs(bbox_cpu[0]-bbox_cpu[2])
                height_bbox = np.abs(bbox_cpu[1]-bbox_cpu[3])
                if(reverse==False):
                    txtfile.write(str(frame_number)+','+str(label_cpu)+','+str(bbox_cpu[0])+','+str(bbox_cpu[1])+','+str(width_bbox)+','+str(height_bbox)+','+str(track['id'])+','+str(score_cpu)+ '\n')
                    detections_list.append([frame_number,int(label_cpu),bbox_cpu[0],bbox_cpu[1],width_bbox,height_bbox,float(score_cpu)])
                if(reverse==True):
                    txtfile.write(str(len(frames_files_list)-frame_number+1)+','+str(label_cpu)+','+str(bbox_cpu[0])+','+str(bbox_cpu[1])+','+str(width_bbox)+','+str(height_bbox)+','+str(track['id'])+','+str(score_cpu)+ '\n')
                    detections_list.append([len(frames_files_list)-frame_number+1,int(label_cpu),bbox_cpu[0],bbox_cpu[1],width_bbox,height_bbox,float(score_cpu)])
            #We only create a video output if required
            if video_output==True:
                colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255),(128, 0, 0), (0, 128, 0), (0, 0, 128), (128, 128, 0), (128, 0, 128), (0, 128, 128),(64, 0, 0), (0, 64, 0), (0, 0, 64)]
                for track in tracks[frame_number]:
                    xmin, ymin, xmax, ymax = track['bbox']
                    color = colors[track['id'] % len(colors)]
                    cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color, 2)
                    cv2.putText(frame, str(track['id']) + ' ' + str("{:.2f}".format(track['conf'].item())), (int(xmin), int(ymin) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                out.write(frame)

            frame_number += 1
    if(video_output==True):
        out.release() 
    return detections_list

Now we test the above function. 

In [14]:
test_detections = object_tracking_frame_txt_writing(frames_folder='MOT17-13-DPM/img1',video_name = "13_DPM_tracked_new_test.mp4",
                                  detection_file_name='13_DPM_reversed_det.txt',reverse=False,video_output=True)

100%|████████████████████████████████████████████████████████████████████████| 750/750 [09:06<00:00,  1.37it/s]


##Performance evaluation

In order to compare the performance of the model we get the ground truth.

In [16]:
# Load the ground truth annotations and predictions for the model
gt_path = 'datasets/MOT17/train/MOT17-02-DPM/gt/gt.txt'
predictions_path = 'results/02_DPM_reversed_det.txt'

predicted_bbs = []
predicted_frame_bbs = [] #frame number of the corresponding predicated box
gt_bbs = []
gt_frame_bbs = [] #frame number of the corresponding ground true box
#Box are structured in the following way x_left,y_top,x_right,y_bottom
with open(predictions_path, 'r') as txt_file:
  for line in txt_file:
    frame, bb_id, bb_left, bb_top, bb_width, bb_height, conf = map(float, line.strip().split(','))
    predicted_bbs.append((int(frame),int(bb_id),(bb_left, bb_top, bb_left + bb_width, bb_top + bb_height)))
    predicted_frame_bbs.append(frame)
with open(gt_path, 'r') as txt_file:
  for line in txt_file:
    frame, bb_id, bb_left, bb_top, bb_width, bb_height, conf,_,_ = map(float, line.strip().split(','))
    gt_bbs.append((int(frame),int(bb_id),(bb_left, bb_top, bb_left + bb_width, bb_top + bb_height)))
    gt_frame_bbs.append(frame)

### Precision and Recall

Let's look into the precision and recall of our model. 
Here is an example of how you might calculate precision and recall for object detection:
- For each image in the test dataset, use your object detection system to generate a set of predicted object bounding boxes.
- For each ground truth object in the image, find the predicted object bounding box that has the highest overlap (intersection over union) with the ground truth box.
- If the overlap is above a certain threshold (e.g., 0.5), then the predicted object is considered a true positive. Otherwise, it is considered a false positive.
- Calculate the precision as the number of true positives divided by the total number of detected objects (true positives + false positives).
- Calculate the recall as the number of true positives divided by the total number of true objects (true positives + false negatives).
- Repeat this process for all images in the test dataset, and average the precision and recall across all images to get an overall estimate of the object detection system's performance.


We then compute the MOTA score of our object.


In [15]:
def compute_MOTA(path_pred_file, path_gt_file):
    df_pred = pd.read_csv(path_pred_file, sep=',', header=None, names=['frame', 'label', 'bb_left', 'bb_top', 'bb_width', 'bb_height','track_id', 'conf'])
    df_gt = pd.read_csv(path_gt_file, sep=',', header=None, names=['frame', 'id_nb', 'bb_left', 'bb_top', 'bb_width', 'bb_height', 'conf_score', 'class', 'visibility'])
    
    # Get only detections of people - no need now that only pedestrian detections are displayed
    #df_pred = df_pred[df_pred['label'] == 0]
    
    # In GT, get detections of people, but also distractors and person in vehicles that might get detected
    df_gt = df_gt.loc[df_gt['class'].isin([1,2,7,8,12])]
    
    
    nb_frames = np.max(df_gt['frame'].tolist())
    sum_num = 0
    sum_denom = 0
    precision = []
    list_FN = []
    list_FP = []
    corresp = {}
    
    for i in tqdm(range(1, nb_frames + 1)):
        # Dataframe corresponding to the detections of the current frame i
        df_pred_bb = df_pred[df_pred['frame'] == i] 
        df_gt_bb = df_gt[df_gt['frame'] == i]
        

        
        df_pred_bb = df_pred_bb.reset_index(drop=True)
        df_gt_bb = df_gt_bb.reset_index(drop=True)
        
        
        # Calculate IoUs between all predicted and ground truth bounding boxes
        iou_matrix = np.zeros((len(df_pred_bb.index), len(df_gt_bb.index)))
        #print("IoU matrix shape: ", iou_matrix.shape)
        for j, row_pred in df_pred_bb.iterrows():
            for k, row_gt in df_gt_bb.iterrows():
                # Convert left, top, width, height coordinates to left, top, right, down
                pred_bb = (row_pred['bb_left'], row_pred['bb_top'], row_pred['bb_left'] + row_pred['bb_width'], row_pred['bb_top'] + row_pred['bb_height'])
                gt_bb = (row_gt['bb_left'], row_gt['bb_top'], row_gt['bb_left'] + row_gt['bb_width'], row_gt['bb_top'] + row_gt['bb_height'])
                iou_matrix[j, k] = compute_iou(pred_bb, gt_bb)
        
        # Use Hungarian matching to determine optimal matching of predicted to ground truth bounding boxes
        row_ind, col_ind = linear_sum_assignment(-iou_matrix)
        
        FP = 0
        matches_true = []
        matches_dist = []
        list_classes = df_gt_bb['class'].tolist()
        list_visibility = df_gt_bb['conf_score'].tolist()
        for pred_match, gt_match in zip(row_ind, col_ind):
            if iou_matrix[pred_match, gt_match] >= 0.5:
                if list_classes[gt_match] == 1 and list_visibility[gt_match] == 1:
                    matches_true.append((pred_match, gt_match))
                else:
                    matches_dist.append((pred_match, gt_match))
            else:
                FP +=1
                
        TP = len(matches_true)
        
        
        df_gt_only_target = df_gt_bb[(df_gt_bb['class'] == 1) & (df_gt_bb['conf_score'] == 1)]
        
        
        FN = len(df_gt_only_target.index) - len(matches_true)
        GT = len(df_gt_only_target.index)
        
        list_FN.append(FN)
        list_FP.append(FP)
        #print("True positives:", TP)
        #print("False positives:", FP)
        #print("False negatives: ", FN)
        #print("Ground truth: ", GT)
        
        
        sum_num += FN + FP
        sum_denom += GT
        if (TP+FP != 0):
            precision.append(TP/(TP+FP))

    MOTA_score = 1 - sum_num/sum_denom
    print("Final MOTA score: ", MOTA_score)
    print("Mean Precision @IOU50: ", np.mean(precision))
    print("Mean False Negatives: ", np.mean(list_FN))
    print("Mean False Positives: ", np.mean(list_FP))

In [36]:
compute_MOTA('results_proposals_500/13_DPM_reversed_det.txt', 'datasets/MOT17/train/MOT17-13-DPM/gt/gt.txt')

100%|████████████████████████████████████████████████████████████████████████| 750/750 [00:12<00:00, 59.01it/s]

Final MOTA score:  0.46632880948290667
Mean Precision @IOU50:  0.8186485674971743
Mean False Negatives:  6.101333333333334
Mean False Positives:  2.1826666666666665





## Video analysis in both directions

In this section, we try to improve our results by performing the tracking task in both direction and then taking the average position of each box. 

In [None]:
def tracking_both_direction(frames_folder,video_name,detected_once_threshold,surface_min_iou,video_output=False):
  #This function performs the tracking task by taking the analysing the video moving forward and backwards

  #First we compute the detection with the frames going forward
  forward_det = object_tracking_frame_txt_writing(frames_folder=frames_folder,video_name = "",
                                  detection_file_name=video_name+'forward_det.txt',reverse=False,video_output=False)
  #Then we compute going backwards :
  backward_det = object_tracking_frame_txt_writing(frames_folder=frames_folder,video_name = "",
                                  detection_file_name=video_name+'backwards_det.txt',reverse=True,video_output=False)
  
  #then we perform the averaging of the two detections
  nb_frames=1
  for line in forward_det:
    if line[0]>nb_frames:
      nb_frames = line[0]
  average_detections = []
  for i in range(nb_frames):
      #we extract the detections of the current frame
      frames_forward_pred = [cell for cell in forward_det if (cell[0]==i+1 )] #on ne tient pas rigueur de la classe
      frames_backward_pred = [cell for cell in backward_det if (cell[0]==i+1 )]

      # Calculate IoUs between all predicted and ground truth bounding boxes
      iou_matrix = np.zeros((len(frames_forward_pred), len(frames_backward_pred)))
      for j, forward_predicted_bb in enumerate(frames_forward_pred):
        for k, backward_predicted_bb in enumerate(frames_backward_pred):
          iou_matrix[j, k] = compute_iou([forward_predicted_bb[2],forward_predicted_bb[3],forward_predicted_bb[2]+forward_predicted_bb[4],
                                          forward_predicted_bb[3]+forward_predicted_bb[5]],
                                          [backward_predicted_bb[2],backward_predicted_bb[3],backward_predicted_bb[2]+backward_predicted_bb[4],
                                          backward_predicted_bb[3]+backward_predicted_bb[5]])

      # Use Hungarian matching to determine optimal matching of predicted to ground truth bounding boxes
      row_ind, col_ind = linear_sum_assignment(-iou_matrix)
      
      for j in range(len(row_ind)):
        if(iou_matrix[row_ind[j],col_ind[j]]>=surface_min_iou):
          bb_left_av = (frames_forward_pred[row_ind[j]][2] + frames_backward_pred[col_ind[j]][2])/2
          bb_top_av = (frames_forward_pred[row_ind[j]][3] + frames_backward_pred[col_ind[j]][3])/2
          bb_width_av = (frames_forward_pred[row_ind[j]][4] + frames_backward_pred[col_ind[j]][4])/2
          bb_height_av = (frames_forward_pred[row_ind[j]][5] + frames_backward_pred[col_ind[j]][5])/2
          score_av = (frames_forward_pred[row_ind[j]][6] + frames_backward_pred[col_ind[j]][6])/2
          average_box = [frames_forward_pred[row_ind[j]][0],frames_forward_pred[row_ind[j]][1],bb_left_av,bb_top_av,bb_width_av,bb_height_av,score_av]
          average_detections.append(average_box)
        else :
          average_detections.append(frames_forward_pred[row_ind[j]])
          average_detections.append(frames_backward_pred[col_ind[j]])

      for j in range(len(frames_forward_pred)):
        if j not in row_ind and frames_forward_pred[j][6]>detected_once_threshold:
          average_detections.append(frames_forward_pred[j])
      for j in range(len(frames_backward_pred)):
        if j not in col_ind and frames_backward_pred[j][6]>detected_once_threshold:
          average_detections.append(frames_backward_pred[j])
      
  #We intialize the txt files that writes the boxes detected
  with open('/content/drive/MyDrive/DiffusionDet/diffusiondet/tracked_videos/'+ video_name+ 'both_directions.txt', 'w', newline='') as txtfile:
    for cell in average_detections:
      txtfile.write(str(cell[0])+','+str(cell[1])+','+str(cell[2])+','+str(cell[3])+','+str(cell[4])+','+str(cell[5])+','+str(cell[6])+ '\n')

  #We initialize the output video
  if video_output==True:
    frames_dir = "/content/drive/MyDrive/DiffusionDet/diffusiondet/videos/"+ frames_folder
    frames_files_list = [f for f in os.listdir(frames_dir) if f.endswith(".jpg")]
    frames_files_list.sort()
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_path = os.path.join("/content/drive/MyDrive/DiffusionDet/diffusiondet/tracked_videos", video_name)
    frame_0 = cv2.imread(frames_dir + '/' + frames_files_list[0])
    out = cv2.VideoWriter(video_path, fourcc, 30, (frame_0.shape[1], frame_0.shape[0]))
    for i,frame_name in enumerate(frames_files_list):
      frame_detected_bbs = [cell for cell in average_detections if (cell[0]==i+1 )]
      frame = cv2.imread(frames_dir+'/'+frame_name)
      for bbs in frame_detected_bbs:
        cv2.rectangle(frame, (int(bbs[2]), int(bbs[3])), (int(bbs[2]+bbs[4]), int(bbs[3]+bbs[5])), (255, 0, 0), 2)
      out.write(frame)
    out.release()
  return None 

In [None]:
tracking_both_direction(frames_folder='MOT17-02-DPM/img1',video_name= "02_DPM_tracked_both_directions.mp4",
                        detected_once_threshold = 0.4,surface_min_iou=0.5,video_output=True)


000001.jpg
1/20
000002.jpg
2/20
000003.jpg
3/20
000004.jpg
4/20
000005.jpg
5/20
000006.jpg
6/20
000007.jpg
7/20
000008.jpg
8/20
000009.jpg
9/20
000010.jpg
10/20
000011.jpg
11/20
000012.jpg
12/20
000013.jpg
13/20
000014.jpg
14/20
000015.jpg
15/20
000016.jpg
16/20
000017.jpg
17/20
000018.jpg
18/20
000019.jpg
19/20
000020.jpg
20/20
000020.jpg
1/20
000019.jpg
2/20
000018.jpg
3/20
000017.jpg
4/20
000016.jpg
5/20
000015.jpg
6/20
000014.jpg
7/20
000013.jpg
8/20
000012.jpg
9/20
000011.jpg
10/20
000010.jpg
11/20
000009.jpg
12/20
000008.jpg
13/20
000007.jpg
14/20
000006.jpg
15/20
000005.jpg
16/20
000004.jpg
17/20
000003.jpg
18/20
000002.jpg
19/20
000001.jpg
20/20
