C API for batch inference? #7541

stephanecharette · 2021-03-25T16:54:48Z

@AlexeyAB, is there a working C API for batch inference?

I see what you recently wrote here about using OpenCV: #7487 (comment)

...but is there a way to pass multiple images to darknet and have it run inference on a batch size of 2, 4, etc? Does it work?

Looking through the darknet.h file, I can see load_network_custom(), which takes an int batch parameter, and other APIs that look promising, such as network_predict_batch(). Before I sink time into trying this, I thought I'd check with you to see whether I'm on the right track and whether the API is known to work or is legacy code from a long time ago.

The text was updated successfully, but these errors were encountered:

stephanecharette · 2021-04-25T18:37:28Z

@AlexeyAB can you help us out with this?

AlexeyAB · 2021-04-25T21:56:40Z

@stephanecharette

It was added for the PyTorch example, but it is currently commented out because, as I remember, there were some issues (NMS removed duplicates across different images in the batch, whereas it should only remove them within a single image, or something else). If you want, you can try to test and/or fix these issues.

darknet/darknet_images.py

Lines 232 to 235 in 4f794aa

    
           if __name__ == "__main__":
               # Uncomment the next line for an example of batch processing.
               # batch_detection_example()
               main()

darknet/darknet_images.py

Lines 173 to 190 in 4f794aa

    
           def batch_detection_example():
               """Run batched detection on the bundled sample images and save the results.

               Loads the network with a batch size equal to the number of sample
               images, runs ``batch_detection`` on them, writes every annotated image
               to the current directory (the ``data/`` prefix is stripped from the
               file name, so the duplicated ``horses.jpg`` entry is written twice) and
               prints the per-image detections.

               Raises:
                   FileNotFoundError: if one of the sample images cannot be read.
               """
               args = parser()
               check_arguments_errors(args)
               image_names = ['data/horses.jpg', 'data/horses.jpg', 'data/eagle.jpg']
               # Derive the batch size from the image list so the two cannot drift apart.
               batch_size = len(image_names)
               random.seed(3)  # deterministic bbox colors
               network, class_names, class_colors = darknet.load_network(
                   args.config_file,
                   args.data_file,
                   args.weights,
                   batch_size=batch_size
               )
               images = []
               for image_name in image_names:
                   image = cv2.imread(image_name)
                   # cv2.imread reports an unreadable file by returning None, not by
                   # raising, so fail here with a message that names the file.
                   if image is None:
                       raise FileNotFoundError(
                           "Unable to read image: {}".format(image_name))
                   images.append(image)
               images, detections = batch_detection(network, images, class_names,
                                                    class_colors, batch_size=batch_size)
               for name, image in zip(image_names, images):
                   cv2.imwrite(name.replace("data/", ""), image)
               print(detections)

darknet/darknet_images.py

Lines 119 to 135 in 4f794aa

    
           def batch_detection(network, images, class_names, class_colors,
                               thresh=0.25, hier_thresh=.5, nms=.45, batch_size=4):
               """Run one batched prediction and draw the detections on every image.

               Args:
                   network: network handle forwarded to the darknet bindings.
                   images: list of images; entries ``0 .. batch_size - 1`` are
                       replaced in place by the result of ``darknet.draw_boxes``.
                   class_names: class labels; their count is passed to the NMS call.
                   class_colors: colors forwarded to ``darknet.draw_boxes``.
                   thresh: threshold forwarded to ``network_predict_batch``.
                   hier_thresh: hierarchical threshold forwarded likewise.
                   nms: NMS threshold; a falsy value (``0``, ``None``) skips NMS.
                   batch_size: number of images to process.

               Returns:
                   A ``(images, batch_predictions)`` tuple: the same list object that
                   was passed in, and a list holding one predictions entry per image.
               """
               image_height, image_width, _ = check_batch_shape(images, batch_size)
               darknet_images = prepare_batch(images, network)
               batch_detections = darknet.network_predict_batch(
                   network, darknet_images, batch_size, image_width,
                   image_height, thresh, hier_thresh, None, 0, 0)
               batch_predictions = []
               try:
                   for idx in range(batch_size):
                       num = batch_detections[idx].num
                       detections = batch_detections[idx].dets
                       if nms:
                           # NMS is applied to the detections of one image at a time.
                           darknet.do_nms_obj(detections, num, len(class_names), nms)
                       predictions = darknet.remove_negatives(detections, class_names, num)
                       images[idx] = darknet.draw_boxes(predictions, images[idx], class_colors)
                       batch_predictions.append(predictions)
               finally:
                   # Release the C-side detections even when post-processing raises;
                   # otherwise the buffers returned by network_predict_batch leak.
                   darknet.free_batch_detections(batch_detections, batch_size)
               return images, batch_predictions

It uses C functions:

darknet/darknet.py

Line 315 in 4f794aa

# Module-level alias for the `network_predict_batch` attribute of `lib`
# (presumably the loaded darknet shared library - confirm against darknet.py).
network_predict_batch = lib.network_predict_batch
darknet/darknet.py

Line 259 in 4f794aa

# Module-level alias for the `free_batch_detections` attribute of `lib`
# (presumably the loaded darknet shared library - confirm against darknet.py).
free_batch_detections = lib.free_batch_detections
darknet/darknet.py

Line 275 in 4f794aa

# Note the name difference: the Python alias is `load_net_custom`, while the
# attribute looked up on `lib` is `load_network_custom`.
load_net_custom = lib.load_network_custom

There are several C-functions related to the batch-inference:

network parse_network_cfg_custom(char *filename, int batch, int time_steps)
det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter)
make_network_boxes_batch
fill_network_boxes_batch
yolo_num_detections_batch
get_yolo_detections_batch

You can look at this code:

darknet/src/parser.c

Line 1342 in 4f794aa

network parse_network_cfg_custom(char *filename, int batch, int time_steps)

darknet/src/network.c

Lines 1059 to 1072 in 4f794aa

    
           /*
            * Calls network_predict() once on `im.data`, then gathers the detections of
            * each batch element separately.
            *
            * Returns a calloc'ed array of `batch_size` det_num_pair entries; entry b
            * holds the detection array built for batch element b together with the
            * count reported by make_network_boxes_batch().
            */
           det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter)
           {
               det_num_pair *results;
               int b;

               network_predict(*net, im.data);
               results = (struct det_num_pair *)calloc(batch_size, sizeof(det_num_pair));

               for (b = 0; b < batch_size; ++b) {
                   int count;
                   detection *boxes = make_network_boxes_batch(net, thresh, &count, b);

                   fill_network_boxes_batch(net, w, h, thresh, hier, map, relative, boxes, letter, b);
                   results[b].dets = boxes;
                   results[b].num = count;
               }
               return results;
           }

darknet/src/yolo_layer.c

Lines 1048 to 1061 in 4f794aa

    
           /*
            * Counts, for batch element `batch`, how many of the l.w*l.h*l.n entries
            * have an output at entry offset 4 (the value the sibling code calls
            * objectness) strictly greater than `thresh`.
            */
           int yolo_num_detections_batch(layer l, float thresh, int batch)
           {
               const int cells = l.w * l.h;
               int total = 0;
               int cell, k;

               for (cell = 0; cell < cells; ++cell) {
                   for (k = 0; k < l.n; ++k) {
                       const int obj_index = entry_index(l, batch, k * cells + cell, 4);
                       /* The comparison yields 0 or 1, so it can be added directly. */
                       total += (l.output[obj_index] > thresh);
                   }
               }
               return total;
           }

darknet/src/yolo_layer.c

Lines 1128 to 1162 in 4f794aa

    
           /*
            * Fills `dets` with one entry for every (cell, n) position of batch element
            * `batch` whose objectness output is greater than `thresh`, then passes the
            * filled range to correct_yolo_boxes().
            *
            * Per-class probabilities are objectness * class output, zeroed when not
            * above `thresh`. The `map` parameter is not used by this function.
            *
            * Returns the number of entries written to `dets`.
            */
           int get_yolo_detections_batch(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter, int batch)
           {
               const int cells = l.w * l.h;
               float *out = l.output;
               int found = 0;
               int cell, n, c;

               for (cell = 0; cell < cells; ++cell) {
                   const int row = cell / l.w;
                   const int col = cell % l.w;

                   for (n = 0; n < l.n; ++n) {
                       const int location = n * cells + cell;
                       const float objectness = out[entry_index(l, batch, location, 4)];
                       detection *det;

                       // Written as !(a > b) rather than (a <= b) so that NaN is skipped too.
                       if (!(objectness > thresh)) continue;

                       det = &dets[found];
                       det->bbox = get_yolo_box(out, l.biases, l.mask[n], entry_index(l, batch, location, 0), col, row, l.w, l.h, netw, neth, cells, l.new_coords);
                       det->objectness = objectness;
                       det->classes = l.classes;

                       if (l.embedding_output) {
                           get_embedding(l.embedding_output, l.w, l.h, l.n*l.embedding_size, l.embedding_size, col, row, n, batch, det->embeddings);
                       }

                       for (c = 0; c < l.classes; ++c) {
                           const float prob = objectness * out[entry_index(l, batch, location, 4 + 1 + c)];
                           det->prob[c] = (prob > thresh) ? prob : 0;
                       }
                       ++found;
                   }
               }

               correct_yolo_boxes(dets, found, w, h, netw, neth, relative, letter);
               return found;
           }

darknet/src/network.c

Lines 760 to 776 in 4f794aa

    
           /*
            * Runs forward_network() on `input` with train = 0 and no truth/delta
            * buffers, and returns get_network_output(net).
            *
            * When built with GPU defined and gpu_index >= 0, the call is delegated to
            * network_predict_gpu() instead.
            */
           float *network_predict(network net, float *input)
           {
           #ifdef GPU
               if (gpu_index >= 0) {
                   return network_predict_gpu(net, input);
               }
           #endif
               network_state state = {0};

               state.net = net;
               state.input = input;
               state.index = 0;
               state.train = 0;
               state.truth = 0;
               state.delta = 0;

               forward_network(net, state);
               return get_network_output(net);
           }

darknet/src/network_kernels.cu

Lines 678 to 734 in 4f794aa

    
           // Runs inference through forward_network_gpu() and returns
           // get_network_output_gpu(net).
           //
           // `input` is copied into net.input_pinned_cpu (input size * net.batch
           // floats) and pushed to net.input_state_gpu. While *net.cuda_graph_ready
           // is 0 the forward pass is executed directly; if net.use_cuda_graph == 1
           // that pass is also captured as a CUDA graph and the flag is set to 1, so
           // later calls launch the captured graph instead. With use_cuda_graph != 1
           // the flag is never set here, so the direct path runs on every call.
           float *network_predict_gpu(network net, float *input)
           {
               if (net.gpu_index != cuda_get_device())
                   cuda_set_device(net.gpu_index);
               int size = get_network_input_size(net) * net.batch;
               // NOTE(review): `state` is not zero-initialized (the CPU network_predict
               // uses `= {0}`); any field not assigned below is indeterminate - confirm
               // forward_network_gpu() does not read one before setting it.
               network_state state;
               state.index = 0;
               state.net = net;
               //state.input = cuda_make_array(input, size);   // memory will be allocated in the parse_network_cfg_custom()
               state.input = net.input_state_gpu;
               // Stage the caller's data in the pinned host buffer; this happens on
               // every call, on both the direct and the graph-launch path.
               memcpy(net.input_pinned_cpu, input, size * sizeof(float));
               state.truth = 0;
               state.train = 0;
               state.delta = 0;
               //cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
               // NOTE(review): `instance` (and `graph` below) are function-level statics,
               // so they are shared by every network that calls this function, while the
               // ready flag is per network - verify behavior with more than one network.
               static cudaGraphExec_t instance;
               if ((*net.cuda_graph_ready) == 0) {
                   static cudaGraph_t graph;
                   if (net.use_cuda_graph == 1) {
                       int i;
                       // Calls switch_stream() for indices 0..15 before capture starts;
                       // presumably this creates the streams up front - TODO confirm.
                       for (i = 0; i < 16; ++i) switch_stream(i);
                       cudaStream_t stream0 = switch_stream(0);
                       CHECK_CUDA(cudaDeviceSynchronize());
                       printf("Try to capture graph... \n");
                       //cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
                       CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
                   }
                   // Host-to-device copy and forward pass; when capturing, both calls sit
                   // between BeginCapture and EndCapture.
                   cuda_push_array(state.input, net.input_pinned_cpu, size);
                   forward_network_gpu(net, state);
                   if (net.use_cuda_graph == 1) {
                       cudaStream_t stream0 = switch_stream(0);
                       CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
                       CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
                       // From now on calls for this network take the graph-launch branch.
                       (*net.cuda_graph_ready) = 1;
                       printf(" graph is captured... \n");
                       CHECK_CUDA(cudaDeviceSynchronize());
                   }
                   CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
               }
               else {
                   // Replay path: no explicit cuda_push_array() here; presumably the
                   // input upload is part of the captured graph - TODO confirm.
                   cudaStream_t stream0 = switch_stream(0);
                   //printf(" cudaGraphLaunch \n");
                   CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
                   CHECK_CUDA( cudaStreamSynchronize(stream0) );
                   //printf(" ~cudaGraphLaunch \n");
               }
               float *out = get_network_output_gpu(net);
               reset_wait_stream_events();
               //cuda_free(state.input);   // will be freed in the free_network()
               return out;
           }

stephanecharette mentioned this issue Apr 23, 2021

Demo in batch size > 1 #7639

Open

stephanecharette mentioned this issue Aug 9, 2021

DarkHelp does not support batch inference stephanecharette/DarkHelp#10

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

C API for batch inference? #7541

C API for batch inference? #7541

stephanecharette commented Mar 25, 2021

stephanecharette commented Apr 25, 2021

AlexeyAB commented Apr 25, 2021

C API for batch inference? #7541

C API for batch inference? #7541

Comments

stephanecharette commented Mar 25, 2021

stephanecharette commented Apr 25, 2021

AlexeyAB commented Apr 25, 2021