Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C API for batch inference? #7541

Open
stephanecharette opened this issue Mar 25, 2021 · 2 comments
Open

C API for batch inference? #7541

stephanecharette opened this issue Mar 25, 2021 · 2 comments

Comments

@stephanecharette
Copy link
Collaborator

@AlexeyAB, is there a working C API for batch inference?

I see what you recently wrote here about using OpenCV: #7487 (comment)

...but is there a way to pass multiple images to darknet and have it run inference on a batch size of 2, 4, etc? Does it work?

Looking through the darknet.h file, I can see load_network_custom(), which takes an int batch parameter, and other APIs that look promising, such as network_predict_batch(). Before I sink time into trying this, I thought I'd check with you to see whether I'm on the right track and whether the API is known to work or is legacy code from a long time ago.

@stephanecharette
Copy link
Collaborator Author

@AlexeyAB can you help us out with this?

@AlexeyAB
Copy link
Owner

@stephanecharette

It was added for the PyTorch example, but it is currently commented out because, as I remember, there were some issues (with NMS, which removes duplicates across different images in the batch when it should remove them only within a single image, or something else). So if you want, you can try to test and/or fix these issues.

  • darknet/darknet_images.py

    Lines 232 to 235 in 4f794aa

    if __name__ == "__main__":
    # uncomment the next line for an example of batch processing
    # batch_detection_example()
    main()
  • darknet/darknet_images.py

    Lines 173 to 190 in 4f794aa

    # Example of batch inference: loads a network with a batch size of 3,
    # runs batch_detection() on three sample images, writes the returned
    # images to disk and prints the detections.
    def batch_detection_example():
    args = parser()
    check_arguments_errors(args)
    # equals the number of paths listed in image_names below
    batch_size = 3
    random.seed(3) # deterministic bbox colors
    network, class_names, class_colors = darknet.load_network(
    args.config_file,
    args.data_file,
    args.weights,
    batch_size=batch_size
    )
    image_names = ['data/horses.jpg', 'data/horses.jpg', 'data/eagle.jpg']
    # NOTE(review): the result of cv2.imread is used without any check here;
    # confirm how an unreadable path should be handled.
    images = [cv2.imread(image) for image in image_names]
    images, detections, = batch_detection(network, images, class_names,
    class_colors, batch_size=batch_size)
    for name, image in zip(image_names, images):
    # each image is written to its input path with "data/" removed
    cv2.imwrite(name.replace("data/", ""), image)
    print(detections)
  • darknet/darknet_images.py

    Lines 119 to 135 in 4f794aa

    # Run the network once on a batch of images and post-process each entry.
    #
    # For every index in range(batch_size) the detections of that batch entry
    # are optionally passed through NMS (when `nms` is truthy), converted with
    # darknet.remove_negatives(), and drawn onto images[idx].
    #
    # Returns a tuple (images, batch_predictions). `images` is the same list
    # that was passed in; its entries are replaced by the draw_boxes() results.
    # batch_predictions holds one remove_negatives() result per batch entry.
    def batch_detection(network, images, class_names, class_colors,
    thresh=0.25, hier_thresh=.5, nms=.45, batch_size=4):
    image_height, image_width, _ = check_batch_shape(images, batch_size)
    darknet_images = prepare_batch(images, network)
    # The trailing `None, 0, 0` presumably map to the `map`, `relative` and
    # `letter` parameters of the C network_predict_batch() -- TODO confirm
    # against the ctypes binding.
    batch_detections = darknet.network_predict_batch(network, darknet_images, batch_size, image_width,
    image_height, thresh, hier_thresh, None, 0, 0)
    batch_predictions = []
    for idx in range(batch_size):
    # num / dets describe the detections of batch entry `idx` only
    num = batch_detections[idx].num
    detections = batch_detections[idx].dets
    if nms:
    # do_nms_obj receives the detections of this single batch entry
    darknet.do_nms_obj(detections, num, len(class_names), nms)
    predictions = darknet.remove_negatives(detections, class_names, num)
    images[idx] = darknet.draw_boxes(predictions, images[idx], class_colors)
    batch_predictions.append(predictions)
    # release the structure returned by network_predict_batch()
    darknet.free_batch_detections(batch_detections, batch_size)
    return images, batch_predictions

It uses C functions:


There are several C-functions related to the batch-inference:

  • network parse_network_cfg_custom(char *filename, int batch, int time_steps)
  • det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter)
  • make_network_boxes_batch
  • fill_network_boxes_batch
  • yolo_num_detections_batch
  • get_yolo_detections_batch

You can look at this code:

  • network parse_network_cfg_custom(char *filename, int batch, int time_steps)
  • darknet/src/network.c

    Lines 1059 to 1072 in 4f794aa

    // Runs one forward pass over the batched input `im.data`, then builds the
    // detection list for each of the `batch_size` entries.
    //
    // Returns a calloc'ed array of `batch_size` det_num_pair items; element b
    // holds the detections (`dets`) and their count (`num`) for batch entry b.
    // NOTE(review): the calloc result is not checked, same as before.
    det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter)
    {
        int b;
        int count;
        det_num_pair *results;

        // the forward pass happens before the result array is allocated
        network_predict(*net, im.data);
        results = (struct det_num_pair *)calloc(batch_size, sizeof(det_num_pair));

        for (b = 0; b < batch_size; ++b) {
            detection *boxes = make_network_boxes_batch(net, thresh, &count, b);
            fill_network_boxes_batch(net, w, h, thresh, hier, map, relative, boxes, letter, b);
            results[b].num = count;
            results[b].dets = boxes;
        }

        return results;
    }
  • darknet/src/yolo_layer.c

    Lines 1048 to 1061 in 4f794aa

    // Counts the entries of batch item `batch` whose objectness value
    // (entry 4 of each prediction) in l.output is greater than `thresh`.
    int yolo_num_detections_batch(layer l, float thresh, int batch)
    {
        const int cells = l.w * l.h;
        int total = 0;
        int cell;
        int anchor;

        for (cell = 0; cell < cells; ++cell) {
            for (anchor = 0; anchor < l.n; ++anchor) {
                const int obj_index = entry_index(l, batch, anchor * cells + cell, 4);
                // strict '>' comparison: a NaN value is never counted
                if (l.output[obj_index] > thresh) {
                    total++;
                }
            }
        }

        return total;
    }
  • darknet/src/yolo_layer.c

    Lines 1128 to 1162 in 4f794aa

    // Fills `dets` with the detections of batch item `batch` from YOLO layer `l`.
    //
    // Every (cell, n) pair whose objectness exceeds `thresh` produces one entry
    // in `dets`: its box, objectness, class count, optional embeddings and
    // per-class probabilities. The filled entries are then passed to
    // correct_yolo_boxes() together with w, h, netw, neth, relative and letter.
    //
    // Returns the number of entries written.
    // NOTE(review): assumes `dets` has room for every entry written here
    // (presumably sized from yolo_num_detections_batch) -- TODO confirm.
    int get_yolo_detections_batch(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter, int batch)
    {
    int i,j,n;
    float *predictions = l.output;
    //if (l.batch == 2) avg_flipped_yolo(l);
    int count = 0;
    for (i = 0; i < l.w*l.h; ++i){
    // i is a flat cell index; split it into grid row and column
    int row = i / l.w;
    int col = i % l.w;
    for(n = 0; n < l.n; ++n){
    // entry 4 of a prediction is its objectness value
    int obj_index = entry_index(l, batch, n*l.w*l.h + i, 4);
    float objectness = predictions[obj_index];
    //if(objectness <= thresh) continue; // incorrect behavior for Nan values
    // written as '>' so that a NaN objectness fails the test and is skipped
    if (objectness > thresh) {
    //printf("\n objectness = %f, thresh = %f, i = %d, n = %d \n", objectness, thresh, i, n);
    // entry 0 is where the box values of this prediction start
    int box_index = entry_index(l, batch, n*l.w*l.h + i, 0);
    dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h, l.new_coords);
    dets[count].objectness = objectness;
    dets[count].classes = l.classes;
    if (l.embedding_output) {
    get_embedding(l.embedding_output, l.w, l.h, l.n*l.embedding_size, l.embedding_size, col, row, n, batch, dets[count].embeddings);
    }
    for (j = 0; j < l.classes; ++j) {
    // class entries follow the 4 box values and the objectness value
    int class_index = entry_index(l, batch, n*l.w*l.h + i, 4 + 1 + j);
    // class probability = objectness * class score, zeroed unless above thresh
    float prob = objectness*predictions[class_index];
    dets[count].prob[j] = (prob > thresh) ? prob : 0;
    }
    ++count;
    }
    }
    }
    correct_yolo_boxes(dets, count, w, h, netw, neth, relative, letter);
    return count;
    }
  • darknet/src/network.c

    Lines 760 to 776 in 4f794aa

    // Runs a forward (inference) pass of `net` on `input` and returns the
    // pointer obtained from get_network_output().
    //
    // When built with GPU defined and gpu_index >= 0, the call is delegated to
    // network_predict_gpu() instead.
    float *network_predict(network net, float *input)
    {
    #ifdef GPU
        if (gpu_index >= 0) {
            return network_predict_gpu(net, input);
        }
    #endif
        // zero-initialised state; train/truth/delta are 0 for this call
        network_state state = {0};
        state.train = 0;
        state.truth = 0;
        state.delta = 0;
        state.index = 0;
        state.input = input;
        state.net = net;

        forward_network(net, state);
        return get_network_output(net);
    }
  • float *network_predict_gpu(network net, float *input)
    {
    // GPU forward pass of `net` on `input`; returns the pointer obtained from
    // get_network_output_gpu(). When net.use_cuda_graph == 1 the first call
    // captures the work into a CUDA graph; calls made after
    // *net.cuda_graph_ready has been set launch that graph instead.
    if (net.gpu_index != cuda_get_device())
    cuda_set_device(net.gpu_index);
    // number of floats in the input for the whole batch
    int size = get_network_input_size(net) * net.batch;
    network_state state;
    state.index = 0;
    state.net = net;
    //state.input = cuda_make_array(input, size); // memory will be allocated in the parse_network_cfg_custom()
    state.input = net.input_state_gpu;
    // stage the caller's input in the network's pinned host buffer
    memcpy(net.input_pinned_cpu, input, size * sizeof(float));
    state.truth = 0;
    state.train = 0;
    state.delta = 0;
    //cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
    // NOTE(review): `instance` is function-static, so a single graph instance
    // is shared by every network that goes through this function.
    static cudaGraphExec_t instance;
    if ((*net.cuda_graph_ready) == 0) {
    static cudaGraph_t graph;
    if (net.use_cuda_graph == 1) {
    int i;
    // calls switch_stream for indices 0..15 -- presumably to set up the
    // streams before capture starts; TODO confirm
    for (i = 0; i < 16; ++i) switch_stream(i);
    cudaStream_t stream0 = switch_stream(0);
    CHECK_CUDA(cudaDeviceSynchronize());
    printf("Try to capture graph... \n");
    //cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
    CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
    }
    // push the pinned host input to the device buffer, then run the network
    cuda_push_array(state.input, net.input_pinned_cpu, size);
    forward_network_gpu(net, state);
    if (net.use_cuda_graph == 1) {
    cudaStream_t stream0 = switch_stream(0);
    CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
    CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
    // later calls take the else-branch below and launch `instance`
    (*net.cuda_graph_ready) = 1;
    printf(" graph is captured... \n");
    CHECK_CUDA(cudaDeviceSynchronize());
    }
    CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
    }
    else {
    // No explicit cuda_push_array here; presumably the host-to-device copy
    // is part of the captured graph -- TODO confirm.
    cudaStream_t stream0 = switch_stream(0);
    //printf(" cudaGraphLaunch \n");
    CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
    CHECK_CUDA( cudaStreamSynchronize(stream0) );
    //printf(" ~cudaGraphLaunch \n");
    }
    float *out = get_network_output_gpu(net);
    reset_wait_stream_events();
    //cuda_free(state.input); // will be freed in the free_network()
    return out;
    }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants