In [4]:
!git clone https://github.com/RizwanMunawar/yolov7-pose-estimation.git

Cloning into 'yolov7-pose-estimation'...
remote: Enumerating objects: 193, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 193 (delta 42), reused 32 (delta 32), pack-reused 138[K
Receiving objects: 100% (193/193), 3.75 MiB | 8.35 MiB/s, done.
Resolving deltas: 100% (83/83), done.


In [5]:
cd yolov7-pose-estimation

/content/yolov7-pose-estimation


In [6]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.1.2


In [7]:
!pip install -r requirements.txt

Collecting thop (from -r requirements.txt (line 28))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch!=1.12.0,>=1.7.0->-r requirements.txt (line 11))
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch!=1.12.0,>=1.7.0->-r requirements.txt (line 11))
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch!=1.12.0,>=1.7.0->-r requirements.txt (line 11))
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch!=1.12.0,>=1.7.0->-r requirements.txt (line 11))
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch!=1.12.0,>=1.7.0->-r requiremen

## IMPORTANT: After running the cells above, replace the contents of `pose-estimate.py` with one of the following scripts, depending on whether you want to run pose estimation on images or on videos.

### For image


In [None]:
import cv2
import time
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
from utils.datasets import letterbox
from utils.torch_utils import select_device
from models.experimental import attempt_load
from utils.general import non_max_suppression_kpt,strip_optimizer,xyxy2xywh
from utils.plots import output_to_keypoint, plot_skeleton_kpts,colors,plot_one_box_kpt


@torch.no_grad()
def run(poseweights="yolov7-w6-pose.pt", source='rugby.jpg', device='cpu', view_img=False):
    """Run pose estimation on a single image and save two annotated copies.

    The image is letterboxed to 640x640, passed through the pose model, and
    every detection that survives non-max suppression is drawn twice: once on
    the letterboxed image and once on a black canvas of the same size.

    Args:
        poseweights: Path to the model weights, forwarded to ``attempt_load``.
        source: Path to the input image. Results are written as
            ``<source without extension>_keypoint.jpg`` and
            ``<source without extension>_keypoints_only.jpg``.
        device: Device string forwarded to ``select_device``.
        view_img: When true, show the annotated image in an OpenCV window and
            block until a key is pressed.

    Raises:
        SystemExit: If ``cv2.imread`` cannot read ``source``.
    """
    import os  # local import: only needed to derive the output file names

    def letterbox1(image, new_shape=(640, 640), color=(0, 0, 0), stride=32, auto=False):
        """Resize ``image`` to fit ``new_shape`` (width, height), keeping the
        aspect ratio, and pad the remainder with ``color``.

        ``stride`` and ``auto`` are accepted but not used.
        """
        height, width = image.shape[:2]
        new_width, new_height = new_shape
        scale = min(new_height / height, new_width / width)
        nw, nh = int(scale * width), int(scale * height)
        image_resized = cv2.resize(image, (nw, nh))

        # Split the padding as evenly as possible between opposite sides.
        top = (new_height - nh) // 2
        bottom = new_height - nh - top
        left = (new_width - nw) // 2
        right = new_width - nw - left

        return cv2.copyMakeBorder(image_resized, top, bottom, left, right,
                                  cv2.BORDER_CONSTANT, value=color)

    print(source)
    device = select_device(device)  # Select device
    model = attempt_load(poseweights, map_location=device)  # Load model
    model.eval()

    # Read image
    orig_image = cv2.imread(source)
    if orig_image is None:
        print('Error while trying to read image. Please check path again')
        raise SystemExit()

    # Preprocess: BGR -> RGB, letterbox, then build a 1 x C x H x W float tensor.
    image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
    image = letterbox1(image, new_shape=(640, 640), stride=64, auto=True)
    image = transforms.ToTensor()(image).unsqueeze(0)
    image = image.to(device).float()

    # Inference. Gradients are already disabled by the decorator on this function.
    output_data, _ = model(image)

    output_data = non_max_suppression_kpt(output_data,
                                          0.25,  # Conf. Threshold.
                                          0.65,  # IoU Threshold.
                                          nc=model.yaml['nc'],  # Number of classes.
                                          nkpt=model.yaml['nkpt'],  # Number of keypoints.
                                          kpt_label=True)

    # Rebuild a BGR uint8 image from the network input to draw on, plus a
    # black canvas that receives the same drawings without the photo.
    im0 = image[0].permute(1, 2, 0) * 255
    im0 = im0.cpu().numpy().astype(np.uint8)
    im0 = cv2.cvtColor(im0, cv2.COLOR_RGB2BGR)
    im0_kpts = np.zeros((640, 640, 3), dtype=np.uint8)

    for pose in output_data:
        if not len(pose):
            continue
        # Unpack only the first six columns so xyxy/conf/cls bind to the box
        # fields; columns 6 onward are the keypoints read below. Unpacking the
        # whole row would bind conf/cls to the last two keypoint values.
        for det, (*xyxy, conf, cls) in enumerate(pose[:, :6]):
            c = int(cls)
            kpts = pose[det, 6:]
            for canvas in (im0, im0_kpts):
                plot_one_box_kpt(xyxy, canvas, color=colors(c, True),
                                 line_thickness=3, kpt_label=True, kpts=kpts, steps=3,
                                 orig_shape=im0.shape[:2])

    # splitext strips only the final extension, so paths such as
    # "./images/a.jpg" or "a.b.jpg" keep their directory and full stem.
    base_path = os.path.splitext(source)[0]

    output_filename = f"{base_path}_keypoint.jpg"
    if cv2.imwrite(output_filename, im0):
        print(f"Output saved as {output_filename}")
    else:
        print(f"Failed to write {output_filename}")

    output_filename_kpts = f"{base_path}_keypoints_only.jpg"
    if cv2.imwrite(output_filename_kpts, im0_kpts):
        print(f"Keypoints only output saved as {output_filename_kpts}")
    else:
        print(f"Failed to write {output_filename_kpts}")

    if view_img:
        cv2.imshow("Pose Estimation Result", im0)
        cv2.waitKey(0)
        cv2.destroyAllWindows()


import argparse
def parse_args():
    """Build the command-line parser for the image script and parse ``sys.argv``.

    Returns:
        argparse.Namespace with ``source``, ``device``, ``poseweights`` (all
        strings) and ``view_img`` (bool, set by the ``--view-img`` flag).
    """
    cli = argparse.ArgumentParser(description="Run pose estimation on an image.")

    # (flag, default, help) for every plain string option, in display order.
    string_options = (
        ('--source', 'rugby.jpg', 'Path to the image file'),
        ('--device', 'cpu', 'Device to run the model on'),
        ('--poseweights', 'yolov7-w6-pose.pt', 'Path to model weights'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)

    cli.add_argument('--view-img', action='store_true',
                     help='Display the result image if specified')
    return cli.parse_args()

def main():
    """Parse the command-line options and run pose estimation on the chosen image."""
    cli_args = parse_args()
    run(
        poseweights=cli_args.poseweights,
        source=cli_args.source,
        device=cli_args.device,
        view_img=cli_args.view_img,
    )


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()

### For Video


In [None]:
import cv2
import time
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
from utils.datasets import letterbox
from utils.torch_utils import select_device
from models.experimental import attempt_load
from utils.general import non_max_suppression_kpt,strip_optimizer,xyxy2xywh
from utils.plots import output_to_keypoint, plot_skeleton_kpts,colors,plot_one_box_kpt

@torch.no_grad()
def run(poseweights="yolov7-w6-pose.pt",source="football1.mp4",device='cpu',view_img=False,
        save_conf=False,line_thickness = 3,hide_labels=False, hide_conf=True):
    """Run pose estimation on every frame of a video and write annotated videos.

    Each frame is letterboxed to 640x640, passed through the pose model, and
    the detections that survive non-max suppression are drawn on the frame and
    on a black canvas. Both streams are written as 30 fps ``mp4v`` videos, the
    average FPS is printed, and an FPS-vs-time plot is saved through
    ``plot_fps_time_comparision``.

    Args:
        poseweights: Path to the model weights, forwarded to ``attempt_load``.
        source: Path to a video file, or a numeric string that is converted to
            an integer before being passed to ``cv2.VideoCapture``.
        device: Device string forwarded to ``select_device``.
        view_img: When true, show each annotated frame in an OpenCV window.
        save_conf: Accepted for command-line compatibility; not used here.
        line_thickness: Line thickness forwarded to ``plot_one_box_kpt``.
        hide_labels: Accepted for command-line compatibility; not used here.
        hide_conf: Accepted for command-line compatibility; not used here.

    Raises:
        SystemExit: If the video source cannot be opened.
    """
    def letterbox1(image, new_shape=(640, 640), color=(0, 0, 0), stride=32, auto=False):
        """Resize ``image`` to fit ``new_shape`` (width, height), keeping the
        aspect ratio, and pad the remainder with ``color``.

        ``stride`` and ``auto`` are accepted but not used.
        """
        height, width = image.shape[:2]
        new_width, new_height = new_shape
        scale = min(new_height / height, new_width / width)
        nw, nh = int(scale * width), int(scale * height)
        image_resized = cv2.resize(image, (nw, nh))

        # Split the padding as evenly as possible between opposite sides.
        top = (new_height - nh) // 2
        bottom = new_height - nh - top
        left = (new_width - nw) // 2
        right = new_width - nw - left

        return cv2.copyMakeBorder(image_resized, top, bottom, left, right,
                                  cv2.BORDER_CONSTANT, value=color)

    frame_count = 0  #count no of frames
    total_fps = 0  #count total fps
    time_list = []   #list to store per-frame processing time
    fps_list = []    #list to store per-frame fps

    # Use the function's own argument rather than a module-level ``opt`` so
    # run() also works when it is called directly.
    device = select_device(device) #select device

    model = attempt_load(poseweights, map_location=device)  #Load model
    model.eval()

    # A purely numeric source is passed to VideoCapture as an integer.
    cap = cv2.VideoCapture(int(source) if source.isnumeric() else source)

    if not cap.isOpened():   #check if videocapture not opened
        print('Error while trying to read video. Please check path again')
        raise SystemExit()

    out_video_name = f"{source.split('/')[-1].split('.')[0]}"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # NOTE(review): this name is built from the full ``source`` string (e.g.
    # "football1.mp4_keypoint.mp4") while the keypoints-only writer uses
    # ``out_video_name``. Kept as-is so existing output paths do not move;
    # confirm whether the two were meant to match.
    out = cv2.VideoWriter(f"{source}_keypoint.mp4", fourcc, 30, (640, 640))
    out_kpts = cv2.VideoWriter(f"{out_video_name}_keypoints_only.mp4", fourcc, 30, (640, 640))

    try:
        # No frame is read before the loop, so the first frame of the video
        # is processed like every other frame.
        while cap.isOpened(): #loop until cap closed or video complete

            print("Frame {} Processing".format(frame_count+1))

            ret, frame = cap.read()  #get frame and success from video capture
            if not ret: #no more frames
                break

            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) #convert frame to RGB
            image = letterbox1(image, new_shape=(640, 640), stride=64, auto=True)
            image = transforms.ToTensor()(image).unsqueeze(0)  # 1 x C x H x W
            image = image.to(device).float()
            start_time = time.time() #start time for fps calculation

            # Gradients are already disabled by the decorator on this function.
            output_data, _ = model(image)

            output_data = non_max_suppression_kpt(output_data,   #Apply non max suppression
                                        0.25,   # Conf. Threshold.
                                        0.65, # IoU Threshold.
                                        nc=model.yaml['nc'], # Number of classes.
                                        nkpt=model.yaml['nkpt'], # Number of keypoints.
                                        kpt_label=True)

            im0 = image[0].permute(1, 2, 0) * 255 # Change format [c, h, w] to [h, w, c] for drawing.
            im0 = im0.cpu().numpy().astype(np.uint8)
            im0 = cv2.cvtColor(im0, cv2.COLOR_RGB2BGR) #convert image format to BGR
            im0_kpts = np.zeros((640, 640, 3), dtype=np.uint8)

            for pose in output_data:  # detections per image
                if not len(pose):  # nothing detected in this image
                    continue

                for c in pose[:, 5].unique(): # Print results
                    n = (pose[:, 5] == c).sum()  # detections per class
                    print("No of Objects in Current Frame : {}".format(n))

                # Iterate whole rows in reverse so each box is drawn with its
                # own keypoints (box fields in columns 0-5, keypoints after).
                for det in reversed(pose):
                    *xyxy, conf, cls = det[:6]
                    c = int(cls)  # integer class
                    kpts = det[6:]
                    for canvas in (im0, im0_kpts):
                        plot_one_box_kpt(xyxy, canvas, color=colors(c, True),
                                    line_thickness=line_thickness,kpt_label=True, kpts=kpts, steps=3,
                                    orig_shape=im0.shape[:2])

            end_time = time.time()  #Calculation for FPS
            elapsed = end_time - start_time
            fps = 1 / elapsed if elapsed > 0 else 0.0  # guard against a zero-length interval
            total_fps += fps
            frame_count += 1

            fps_list.append(fps) #per-frame FPS, not the running total
            time_list.append(elapsed) #append time in list

            # Stream results
            if view_img:
                cv2.imshow("YOLOv7 Pose Estimation Demo", im0)
                cv2.waitKey(1)  # 1 millisecond

            out.write(im0)  #writing the video frame
            out_kpts.write(im0_kpts)
    finally:
        # Always release the capture and finalize both output files, even if
        # a frame fails midway.
        cap.release()
        out.release()
        out_kpts.release()
        if view_img:
            cv2.destroyAllWindows()

    if frame_count == 0:
        # Avoid dividing by zero when the source yields no frames.
        print("No frames were processed.")
        return

    avg_fps = total_fps / frame_count
    print(f"Average FPS: {avg_fps:.3f}")

    #plot the comparison graph
    plot_fps_time_comparision(time_list=time_list,fps_list=fps_list)


def parse_opt():
    """Build the command-line parser for the video script and parse ``sys.argv``.

    Returns:
        argparse.Namespace with ``poseweights``, ``source``, ``device``,
        ``view_img``, ``save_conf``, ``line_thickness``, ``hide_labels`` and
        ``hide_conf``. ``poseweights`` is the default string when the flag is
        omitted and a list of strings when it is given (``nargs='+'``).
    """
    cli = argparse.ArgumentParser()

    # Model / input / device selection.
    cli.add_argument('--poseweights', nargs='+', type=str,
                     default='yolov7-w6-pose.pt', help='model path(s)')
    cli.add_argument('--source', type=str, default='football1.mp4',
                     help='video/0 for webcam')
    cli.add_argument('--device', type=str, default='cpu',
                     help='cpu/0,1,2,3(gpu)')

    # Display and drawing options.
    for switch, help_text in (
        ('--view-img', 'display results'),
        ('--save-conf', 'save confidences in --save-txt labels'),
    ):
        cli.add_argument(switch, action='store_true', help=help_text)

    cli.add_argument('--line-thickness', default=3, type=int,
                     help='bounding box thickness (pixels)')

    for switch, help_text in (
        ('--hide-labels', 'hide labels'),
        ('--hide-conf', 'hide confidences'),
    ):
        cli.add_argument(switch, default=False, action='store_true', help=help_text)

    return cli.parse_args()

#function for plot fps and time comparision graph
def plot_fps_time_comparision(time_list,fps_list):
    """Plot FPS against time and save the chart as a PNG in the working directory.

    Args:
        time_list: X values, labelled 'Time (s)' on the chart.
        fps_list: Y values, labelled 'FPS'; must be the same length as
            ``time_list``.

    The figure is written to "FPS_and_Time_Comparision_pose_estimate.png" and
    then closed so repeated calls do not accumulate open figures.
    """
    fig, ax = plt.subplots()
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('FPS')
    ax.set_title('FPS and Time Comparision Graph')
    ax.plot(time_list, fps_list,'b',label="FPS & Time")
    fig.savefig("FPS_and_Time_Comparision_pose_estimate.png")
    plt.close(fig)  # release the figure; pyplot otherwise keeps it alive


#main function
def main(opt):
    """Call ``run`` with every parsed command-line option as a keyword argument."""
    run_kwargs = vars(opt)
    run(**run_kwargs)


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    opt = parse_opt()
    # Project helper, called on the weights file before the model is loaded.
    strip_optimizer(opt.device,opt.poseweights)
    main(opt)

In [8]:
#download weights for pose estimation.
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6-pose.pt

--2024-07-15 09:40:27--  https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6-pose.pt
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/511187726/ad063dcb-fb9a-4511-b4d7-499601326cd8?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240715%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240715T094028Z&X-Amz-Expires=300&X-Amz-Signature=85d644824768cbe41fa699fd659f64e47514241403c084f720246efb6c74d428&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=511187726&response-content-disposition=attachment%3B%20filename%3Dyolov7-w6-pose.pt&response-content-type=application%2Foctet-stream [following]
--2024-07-15 09:40:28--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/511187726/ad063dcb-fb9a-4511-b4d7-499601326cd8?X-A

In [9]:
!python pose-estimate.py --source "video_name.mp4" --device 0

Optimizer stripped from yolov7-w6-pose.pt, 161.1MB
Fusing layers... 
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Frame 1 Processing
No of Objects in Current Frame : 1
Frame 2 Processing
No of Objects in Current Frame : 1
Frame 3 Processing
No of Objects in Current Frame : 1
Frame 4 Processing
No of Objects in Current Frame : 1
Frame 5 Processing
No of Objects in Current Frame : 1
Frame 6 Processing
No of Objects in Current Frame : 1
Frame 7 Processing
No of Objects in Current Frame : 1
Frame 8 Processing
No of Objects in Current Frame : 1
Frame 9 Processing
No of Objects in Current Frame : 1
Frame 10 Processing
No of Objects in Current Frame : 1
Frame 11 Processing
No of Objects in Current Frame : 1
Frame 12 Processing
No of Objects in Current Frame : 1
Frame 13 Processing
No of Objects in Current Frame : 1
Frame 14 Processing
No of Objects in Current Frame : 1
Frame 15 Processing
No of Objects in Current Frame : 1
Frame 16 Processing
No of Objects in Current