In [1]:
import argparse
import time
from pathlib import Path
import cv2
import torch

# Includes settings / general preprocessing / plots / metrics / datasets
from utils.utils import \
    time_synchronized,select_device, increment_path,\
    scale_coords,xyxy2xywh,non_max_suppression,split_for_trace_model,\
    driving_area_mask,lane_line_mask,plot_one_box,show_seg_result,\
    AverageMeter,\
    LoadImages

In [2]:
def make_parser():
    """Build the command-line parser for the detection demo.

    Options are registered from a table of ``(flag, keyword-arguments)`` pairs,
    in the order listed below.

    Note that ``--weights`` is declared with ``nargs='+'``: a value supplied on
    the command line is parsed into a list of strings, whereas the default is a
    single string.

    Returns:
        argparse.ArgumentParser: parser with all demo options registered.
    """
    option_specs = (
        ('--weights', dict(nargs='+', type=str, default='data/weights/yolopv2.pt', help='model.pt path(s)')),
        # file/folder, 0 for webcam
        ('--source', dict(type=str, default='data/example.jpg', help='source')),
        ('--img-size', dict(type=int, default=640, help='inference size (pixels)')),
        ('--conf-thres', dict(type=float, default=0.3, help='object confidence threshold')),
        ('--iou-thres', dict(type=float, default=0.45, help='IOU threshold for NMS')),
        ('--device', dict(default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')),
        ('--save-conf', dict(action='store_true', help='save confidences in --save-txt labels')),
        ('--save-txt', dict(action='store_true', help='save results to *.txt')),
        ('--nosave', dict(action='store_true', help='do not save images/videos')),
        ('--classes', dict(nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')),
        ('--agnostic-nms', dict(action='store_true', help='class-agnostic NMS')),
        ('--project', dict(default='runs/detect', help='save results to project/name')),
        ('--name', dict(default='exp', help='save results to project/name')),
        ('--exist-ok', dict(action='store_true', help='existing project/name ok, do not increment')),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

(GPT-4)

This code segment defines a function `make_parser` that creates and returns an instance of `argparse.ArgumentParser`. The `ArgumentParser` object is used to parse command-line arguments. The function adds several arguments to the parser using the `add_argument` method. Each argument has a name and help text; most also declare a type and a default value, while the on/off switches use `action='store_true'` and default to `False`. The arguments can be used to specify various options for the program such as the path to the weights file, the source of the input data, the image size, confidence threshold, IOU threshold for NMS, device to use for computation, whether to save confidences and results to text files, whether to save images/videos, classes to filter by, whether to use class-agnostic NMS, and the project and name for saving results. The `--exist-ok` flag specifies that an existing project/name directory is acceptable, so the run directory name is not incremented.

In [3]:
def detect():
    """Run the TorchScript model on ``opt.source`` and save annotated results.

    Configuration is read from the module-level ``opt`` namespace (the result
    of parsing the arguments defined by ``make_parser``), which must exist
    before this function is called.

    For every image/frame yielded by ``LoadImages`` the function:
      * runs the model and applies non-max suppression to the detections,
      * optionally appends the boxes to ``<save_dir>/labels/<name>.txt``
        (``--save-txt``; confidences included with ``--save-conf``),
      * draws the boxes and overlays the two segmentation masks on the frame,
      * writes the frame to ``<save_dir>`` as an image or video unless
        ``--nosave`` is set.

    Per-frame inference and NMS timings are accumulated and their averages
    printed at the end.

    Returns:
        None. Results are written to disk and timings are printed.

    Raises:
        ValueError: if more than one weights file is supplied.
    """
    # settings and directories
    source, weights, save_txt, imgsz = opt.source, opt.weights, opt.save_txt, opt.img_size
    # --weights is declared with nargs='+': a value given on the command line
    # arrives as a list, while the default is a plain string. torch.jit.load
    # takes a single file, so unwrap the list here.
    if isinstance(weights, (list, tuple)):
        if len(weights) != 1:
            raise ValueError(f'expected exactly one weights file, got {len(weights)}: {weights}')
        weights = weights[0]
    save_img = not opt.nosave and not source.endswith('.txt')  # save inference images

    save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    inf_time = AverageMeter()
    waste_time = AverageMeter()
    nms_time = AverageMeter()

    # Load model
    stride = 32
    model = torch.jit.load(weights)
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA
    model = model.to(device)

    if half:
        model.half()  # to FP16
    model.eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # Run inference
    if device.type != 'cpu':
        model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
    t0 = time.time()
    try:
        for path, img, im0s, vid_cap in dataset:
            img = torch.from_numpy(img).to(device)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0

            if img.ndimension() == 3:
                img = img.unsqueeze(0)  # add batch dimension

            # Inference
            t1 = time_synchronized()
            [pred, anchor_grid], seg, ll = model(img)
            t2 = time_synchronized()

            # waste time: the incompatibility of torch.jit.trace causes extra time consumption in demo version
            # but this problem will not appear in official version
            tw1 = time_synchronized()
            pred = split_for_trace_model(pred, anchor_grid)
            tw2 = time_synchronized()

            # Apply NMS
            t3 = time_synchronized()
            pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
            t4 = time_synchronized()

            # Record timings for THIS frame. Doing this inside the loop makes the
            # reported averages cover every frame instead of only the last one,
            # and avoids touching t1..t4 when the dataset yields nothing.
            inf_time.update(t2 - t1, img.size(0))
            nms_time.update(t4 - t3, img.size(0))
            waste_time.update(tw2 - tw1, img.size(0))

            da_seg_mask = driving_area_mask(seg)
            ll_seg_mask = lane_line_mask(ll)

            # Process detections
            for det in pred:  # detections per image
                p, s, im0, frame = path, '', im0s, getattr(dataset, 'frame', 0)

                p = Path(p)  # to Path
                save_path = str(save_dir / p.name)  # img.jpg
                txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # img.txt
                s += '%gx%g ' % img.shape[2:]  # print string
                gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
                if len(det):
                    # Rescale boxes from img_size to im0 size
                    det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                    # Write results
                    for *xyxy, conf, cls in reversed(det):
                        if save_txt:  # Write to file
                            xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                            line = (cls, *xywh, conf) if opt.save_conf else (cls, *xywh)  # label format
                            with open(txt_path + '.txt', 'a') as f:
                                f.write(('%g ' * len(line)).rstrip() % line + '\n')

                        if save_img:  # Add bbox to image
                            plot_one_box(xyxy, im0, line_thickness=3)

                # Print time (inference)
                print(f'{s}Done. ({t2 - t1:.3f}s)')
                # Return value is ignored; presumably the masks are drawn onto im0 in place — TODO confirm.
                show_seg_result(im0, (da_seg_mask, ll_seg_mask), is_demo=True)

                # Save results (image with detections)
                if save_img:
                    if dataset.mode == 'image':
                        cv2.imwrite(save_path, im0)
                        print(f" The image with the result is saved in: {save_path}")
                    else:  # 'video' or 'stream'
                        if vid_path != save_path:  # new video
                            vid_path = save_path
                            if isinstance(vid_writer, cv2.VideoWriter):
                                vid_writer.release()  # release previous video writer
                            if vid_cap:  # video
                                fps = vid_cap.get(cv2.CAP_PROP_FPS)
                                # Frame size is taken from the frame being written,
                                # not from the capture properties.
                                w, h = im0.shape[1], im0.shape[0]
                            else:  # stream
                                fps, w, h = 30, im0.shape[1], im0.shape[0]
                                save_path += '.mp4'
                            vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                        vid_writer.write(im0)
    finally:
        # Close the last video writer (also on error) so the output file is finalized.
        if isinstance(vid_writer, cv2.VideoWriter):
            vid_writer.release()

    print('inf : (%.4fs/frame)   nms : (%.4fs/frame)' % (inf_time.avg, nms_time.avg))
    print(f'Done. ({time.time() - t0:.3f}s)')

(GPT-4)

The function `detect` performs object detection on images or videos. It first extracts several options from the global `opt` variable such as the source of the input data, the path to the weights file, whether to save results to text files, and the image size. The `save_img` variable is set to `True` if images should be saved and the input source is not a text file. The `save_dir` variable is set to the path of the directory where results will be saved. The function creates several `AverageMeter` objects to keep track of inference time, waste time, and NMS time.

The function then loads the model from the specified weights file using `torch.jit.load` and moves it to the selected device using the `to` method. If the device is not a CPU, the model is converted to half precision using the `half` method. The model is then set to evaluation mode using the `eval` method.

The function creates a `LoadImages` object to load images from the specified source. If the device is not a CPU, the function runs the model once on a dummy input to warm up the GPU. The function then enters a loop that iterates over the images in the dataset. For each image, the function converts it to a PyTorch tensor, moves it to the selected device, converts it to half precision if necessary, and normalizes its pixel values to the range [0, 1]. If the image has only 3 dimensions (i.e., it is a single image rather than a batch of images), an extra dimension is added using the `unsqueeze` method.

The function then performs inference on the image by passing it through the model. The model's output is unpacked into three parts: a pair `[pred, anchor_grid]`, `seg`, and `ll`. The function records the time taken for inference (`t2 - t1`). Next, `pred` and `anchor_grid` are passed through a function called `split_for_trace_model`; the time spent in this call is recorded separately as "waste time" (`tw2 - tw1`), which the code comment attributes to an incompatibility of `torch.jit.trace` in the demo version.

The function applies non-maximum suppression (NMS) to the predicted bounding boxes using the `non_max_suppression` function. The function records the time taken for NMS. The `da_seg_mask` and `ll_seg_mask` variables are set to the outputs of the `driving_area_mask` and `lane_line_mask` functions applied to `seg` and `ll`, respectively.

The function then enters a loop that iterates over the predicted bounding boxes for each image. For each image, the function sets several variables such as the path of the image, a string to hold information about the image, and the frame number if the input source is a video. The function constructs the paths of the files where the results will be saved using the `save_dir` and `p` variables. The function also computes a normalization gain tensor `gn` that will be used to normalize bounding box coordinates.

If there are any predicted bounding boxes for the current image, the function rescales their coordinates from the size of the input image to the size of the original image using the `scale_coords` function. The function then enters a loop that iterates over the unique class indices in the predicted bounding boxes. For each class index, the function counts the number of predicted bounding boxes with that class index.

The function then enters another loop that iterates over each predicted bounding box in reverse order. If results should be saved to text files (`save_txt` is `True`), the function converts the bounding box coordinates from `(x1, y1, x2, y2)` format to `(x_center, y_center, width, height)` format using the `xyxy2xywh` function and normalizes them using the `gn` tensor. The function then constructs a line of text containing either 5 or 6 values depending on whether confidences should be saved or not (`opt.save_conf`). The line is written to a text file at `txt_path`. If images should be saved (`save_img` is `True`), the function draws a bounding box on the image using the `plot_one_box` function.



(GPT-3.5)

Next, the code prints the time taken for the inference step, which is the duration between `t1` and `t2`. It uses string formatting to display the elapsed time in seconds with three decimal places.

The `show_seg_result()` function is called to display the segmentation results on the original image (`im0`). The `is_demo` parameter is set to `True`, indicating that this is a demonstration mode.

The code checks if `save_img` is set to `True`, indicating that the results with detections need to be saved as an image or a video.

If the dataset mode is set to `'image'`, it saves the resulting image (`im0`) at the specified `save_path` using the OpenCV `cv2.imwrite()` function. It then prints the path where the image is saved.

If the dataset mode is not `'image'`, which means it's either `'video'` or `'stream'`, the code checks if the `vid_path` is different from the `save_path`. If it is, it stores the new path in `vid_path` and releases the previous video writer (if any) using the `release()` method. If `vid_cap` is set (a video file), it reads the frames per second (`fps`) from the capture and takes the width (`w`) and height (`h`) from the shape of the current frame `im0`. If `vid_cap` is None, indicating a stream, it uses a default `fps` of 30 together with the frame's width and height, and appends `'.mp4'` to the `save_path`.

Next, the code initializes a new video writer using the `cv2.VideoWriter()` function with the specified `save_path`, `cv2.VideoWriter_fourcc(*'mp4v')` as the codec for writing video in mp4 format, `fps` as the frames per second, and `(w, h)` as the frame size.

The resulting image (`im0`) is written to the video writer using the `write()` method of the `vid_writer` object.

The code calculates and updates the average inference time per frame (`inf_time.avg`), average non-maximum suppression time per frame (`nms_time.avg`), and average waste time per frame (`waste_time.avg`) based on the time durations between `t1` and `t2`, `t3` and `t4`, and `tw1` and `tw2`, respectively.

Finally, the code prints the average inference time per frame (`inf_time.avg`) and average non-maximum suppression time per frame (`nms_time.avg`), and the total time taken for the entire `detect()` function to complete (`time.time() - t0`).

In [4]:
if __name__ == '__main__':
    # parse_known_args() returns (namespace, leftover argv) instead of exiting on
    # unrecognized arguments; presumably used so extra arguments passed by the
    # notebook kernel do not abort parsing — TODO confirm.
    cli = make_parser()
    opt, unknown = cli.parse_known_args()
    print(opt)

    # Inference only: run detect() with autograd disabled. detect() reads the
    # module-level `opt` assigned above.
    with torch.no_grad():
        detect()

Namespace(weights='data/weights/yolopv2.pt', source='data/example.jpg', img_size=640, conf_thres=0.3, iou_thres=0.45, device='0', save_conf=False, save_txt=False, nosave=False, classes=None, agnostic_nms=False, project='runs/detect', name='exp', exist_ok=False)


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


384x640 Done. (2.934s)
 The image with the result is saved in: runs\detect\exp7\example.jpg
inf : (2.9344s/frame)   nms : (0.0695s/frame)
Done. (3.241s)


(GPT-4)

This code segment appears to be written in Python. The `if __name__ == '__main__':` statement checks if the script is being run as the main program and not being imported as a module. If it is being run as the main program, the code within the if block will be executed.

The `opt, unknown =  make_parser().parse_known_args()` line calls the `make_parser` function and then calls the `parse_known_args` method on the returned object. This method returns a tuple containing two values: `opt` and `unknown`. The `opt` variable contains the parsed command line arguments while `unknown` contains any unrecognized arguments.

The `print(opt)` line prints the value of the `opt` variable.

The `with torch.no_grad():` statement creates a context in which gradients are not computed by PyTorch. This can be useful when performing inference with a neural network since gradients are not needed in this case.

Finally, within this context, the `detect()` function is called.