In [1]:
import pandas as pd
import os
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
import torch
from collections import defaultdict
import json
from tqdm import tqdm
from IPython.display import Video
import subprocess
from PIL import Image

from ultralytics import YOLO

In [2]:
# Dataset dir
# Root folder of the stroke-recognition dataset; every other path in this
# notebook is built relative to it.
dataset_dir = '/app/tennis_data/stroke_recognition_dataset/master/v1.0'
match_info_json_path = os.path.join(dataset_dir, "match_info.json")

# match_info_df: one row per match (loaded from match_info.json).
match_info_df = pd.read_json(match_info_json_path)
# dataset_df: one row per clip (loaded from clean_dataset_info.csv).
dataset_df = pd.read_csv(os.path.join(dataset_dir, "clean_dataset_info.csv"))

In [3]:
# Display the per-match metadata table.
match_info_df

Unnamed: 0,match_id,filename,match_start_top,match_start_bottom,fps,first_point_start_frame,surface,tournament,year,round
0,0,000_Novak_Djokovic_v_Andy_Murray_Australian_Op...,Novak Djokovic,Andy Murray,25.0,41250,hardcourt,Australian Open,2016,F
1,1,001_Novak_Djokovic_v_Rafael_Nadal_Australian_O...,Novak Djokovic,Rafael Nadal,25.0,18000,hardcourt,Australian Open,2012,F
2,2,002_Novak_Djokovic_v_Roger_Federer_US_Open_201...,Roger Federer,Novak Djokovic,29.97003,500,hardcourt,US Open,2015,F
3,3,003_Novak_Djokovic_v_Roger_Federer_Wimbledon_2...,Roger Federer,Novak Djokovic,25.0,350,grass,Wimbledon,2019,F
4,4,004_Rafael_Nadal_v_Nick_Kyrgios_Wimbledon_2019...,Rafael Nadal,Nick Kyrgios,25.0,0,grass,Wimbledon,2019,R2
5,5,005_Roger_Federer_v_Rafael_Nadal_Wimbledon_201...,Roger Federer,Rafael Nadal,25.0,450,grass,Wimbledon,2019,SF
6,7,007_Stan_Wawrinka_v_Novak_Djokovic_US_Open_201...,Novak Djokovic,Stan Wawrinka,23.976024,0,hardcourt,US Open,2016,F
7,8,008_Novak_Djokovic_v_Rafael_Nadal_Australian_O...,Rafael Nadal,Novak Djokovic,29.941,7440,hardcourt,Australian Open,2019,F
8,22,022_Novak_Djokovic_v_Rafael_Nadal_Wimbledon_20...,Novak Djokovic,Rafael Nadal,25.0,0,grass,Wimbledon,2018,SF
9,23,023_Alexander_Zverev_v_Dominic_Thiem_US_Open_2...,Dominic Thiem,Alexander Zverev,29.97003,0,hardcourt,US Open,2020,F


In [4]:
# Preview the first rows of the per-clip table.
dataset_df.head()

Unnamed: 0,filename,original_filename,top_cls,btm_cls,match_id,fps,ballhit_match_timestamp,ballhit_match_frame_pos,surface
0,t_serve_00001_b_other_00001_m000.mp4,m000_p000_bh00_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:27:31.896,41297,hardcourt
1,t_serve_00002_b_other_00002_m000.mp4,m000_p000_bh01_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:27:41.370,41534,hardcourt
2,t_other_00001_b_forehand_00001_m000.mp4,m000_p000_bh02_forehand_bottom_Andy_Murray.mp4,other,forehand,0,25.0,00:27:42.113,41552,hardcourt
3,t_serve_00003_b_other_00003_m000.mp4,m000_p001_bh00_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:28:02.918,42072,hardcourt
4,t_other_00002_b_backhand_00001_m000.mp4,m000_p001_bh01_backhand_bottom_Andy_Murray.mp4,other,backhand,0,25.0,00:28:03.661,42091,hardcourt


In [5]:
# subset_df = dataset_df[dataset_df['match_id'].isin([0, 1, 2])]
# subset_df = dataset_df[dataset_df['match_id'].isin([3, 4, 5])]
# subset_df = dataset_df[dataset_df['match_id'].isin([7, 8, 22])]
# subset_df = dataset_df[dataset_df['match_id'].isin([23, 24, 25])]
# subset_df = dataset_df[dataset_df['match_id'].isin([100, 101, 102, 103, 104])]
# subset_df = dataset_df[dataset_df['match_id'].isin([105, 106, 107])]
# subset_df = dataset_df[dataset_df['match_id'].isin([108, 109, 110, 111])]
# subset_df = dataset_df[dataset_df['match_id'].isin([112, 113, 114])]

In [6]:
# Work on a random sample of 500 clips; the fixed random_state makes the
# sample identical on every run.
subset_df = dataset_df.sample(500, random_state=2)
subset_df.shape

(500, 9)

In [100]:
# Preview the first rows of the sampled subset.
subset_df.head()

Unnamed: 0,filename,original_filename,top_cls,btm_cls,match_id,fps,ballhit_match_timestamp,ballhit_match_frame_pos,surface
2140,t_other_01087_b_forehand_00377_m003.mp4,m003_p391_bh02_forehand_bottom_Novak_Djokovic.mp4,other,forehand,3,25.0,04:34:21.212,411530,grass
8171,t_serve_01018_b_other_04201_m106.mp4,m106_p230_bh00_serve_top_Rafael_Nadal.mp4,serve,other,106,25.000057,03:19:22.299,299058,clay
4205,t_backhand_00701_b_other_02162_m023.mp4,m023_p003_bh05_backhand_bottom_Alexander_Zvere...,backhand,other,23,29.97003,00:01:52.135,3360,hardcourt
1205,t_other_00598_b_backhand_00232_m002.mp4,m002_p231_bh05_backhand_bottom_Novak_Djokovic.mp4,other,backhand,2,29.97003,02:07:33,229373,hardcourt
174,t_other_00089_b_serve_00017_m000.mp4,m000_p049_bh00_serve_bottom_Andy_Murray.mp4,other,serve,0,25.0,01:03:57.173,95929,hardcourt


In [7]:
# model = YOLO("yolov8x-pose-p6.pt")  # load an official model

In [8]:
# Full path of every sampled clip inside the 'final_dataset_20_frames' folder.
filepaths = [
    os.path.join(dataset_dir, 'final_dataset_20_frames', clip_name)
    for clip_name in subset_df.filename.tolist()
]

In [9]:
# Sanity check: one path per sampled clip.
len(filepaths)

500

In [10]:
def draw_bounding_boxes_on_video(video_path, output_path, pose_data_json_path, track_ids=None, draw_bb=True, draw_pose=False):
    """Write a copy of a video with track boxes and/or pose skeletons drawn on it.

    Each frame of ``video_path`` is passed through ``process_frame`` together
    with the pose data loaded from ``pose_data_json_path``; the returned frame
    is written to ``output_path`` using the ``mp4v`` codec, at the frame size
    and frame rate reported by the source video.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the annotated video to write.
        pose_data_json_path: JSON file holding the pose data; it is loaded and
            handed to ``process_frame`` unchanged.
        track_ids: Forwarded to ``process_frame``; ``None`` means "all tracks".
        draw_bb: Forwarded to ``process_frame`` (draw bounding boxes).
        draw_pose: Forwarded to ``process_frame`` (draw pose skeleton).

    Raises:
        IOError: If ``video_path`` cannot be opened by OpenCV.
    """
    # Load the JSON file
    with open(pose_data_json_path, 'r') as file:
        pose_data = json.load(file)

    # Open the video file
    cap = cv.VideoCapture(video_path)
    out = None

    try:
        # Fail loudly instead of silently producing an empty output file.
        if not cap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        # Get video properties
        frame_width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv.CAP_PROP_FPS)

        # Define the codec and create VideoWriter object
        fourcc = cv.VideoWriter_fourcc(*'mp4v')  # You can change codec if needed
        out = cv.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

        # frame_id is the 0-based position of the frame in the video; it is the
        # key process_frame uses to look up the matching pose records.
        frame_id = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Process the current frame
            frame = process_frame(frame, frame_id, pose_data, track_ids, draw_bb, draw_pose)

            # Write the frame to the output video
            out.write(frame)

            frame_id += 1
    finally:
        # Release everything, even when reading or drawing raised.
        cap.release()
        if out is not None:
            out.release()

In [11]:
# Skeleton edges as (start keypoint index, end keypoint index) pairs.
KEYPOINT_CONNECTIONS = [
    (0, 1), (1, 3),  # Nose to Left Eye to Left Ear
    (0, 2), (2, 4),  # Nose to Right Eye to Right Ear
    (5, 6),  # Left Shoulder to Right Shoulder
    (5, 7), (7, 9),  # Left Shoulder to Left Elbow to Left Wrist
    (6, 8), (8, 10),  # Right Shoulder to Right Elbow to Right Wrist
    (5, 11), (6, 12),  # Left Shoulder to Left Hip and Right Shoulder to Right Hip
    (11, 12),  # Left Hip to Right Hip
    (11, 13), (13, 15),  # Left Hip to Left Knee to Left Ankle
    (12, 14), (14, 16)  # Right Hip to Right Knee to Right Ankle
]

# One colour per keypoint, indexed by keypoint index.
KEYPOINT_COLORS = [
    # 0: Nose    1: L Eye      2: R Eye       3: L Ear       4: R Ear
    (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255), 
    # 5: L Shoulder 6: R Shoulder 7: L Elbow      8: R Elbow      9: L Wrist
    (255, 0, 255), (128, 128, 0), (128, 0, 128), (0, 128, 128), (255, 128, 0),
    # 10: R Wrist   11: L Hip       12: R Hip     13: L Knee     14: R Knee
    (0, 255, 128), (128, 0, 255), (255, 0, 128), (128, 255, 0), (0, 128, 255),
    # 15: L Ankle     16: R Ankle
    (128, 128, 128), (255, 255, 255)
]

# One colour per skeleton edge, parallel to KEYPOINT_CONNECTIONS (same length,
# same order). A stray extra entry on the (11, 12) line used to shift every leg
# colour by one position relative to the pairs named in the comments.
CONNECTION_COLORS = [
    (0, 255, 255), (255, 0, 255),  # (0, 1), (1, 3)
    (255, 255, 0), (128, 128, 255),  # (0, 2), (2, 4)
    (255, 128, 128),  # (5, 6)
    (128, 255, 128), (128, 128, 128),  # (5, 7), (7, 9)
    (128, 255, 128), (128, 128, 128),  # (6, 8), (8, 10)
    (255, 128, 255), (128, 255, 255),  # (5, 11), (6, 12)
    (255, 255, 128),  # (11, 12)
    (64, 64, 64), (0, 0, 128),  # (11, 13), (13, 15)
    (128, 0, 0), (0, 128, 64)  # (12, 14), (14, 16)
]

# Keypoint indices 0-4 are the face (nose, eyes, ears); dots are drawn only for
# the body joints, which start at index 5 (Left Shoulder).
_FIRST_BODY_KEYPOINT_IDX = 5


def process_frame(frame, frame_id, pose_data, track_ids=None, draw_bb=True, draw_pose=True):
    """Draw the boxes and/or pose skeletons recorded for one frame onto it.

    Args:
        frame: Image to draw on; it is modified in place and also returned.
        frame_id: Frame number to match against each record's ``'frame_id'``.
        pose_data: Mapping of track id -> list of per-frame records. Each
            record is read for ``'frame_id'``, ``'boxes_xyxy'`` (when
            ``draw_bb``) and ``'keypoints_xy'`` (when ``draw_pose``).
        track_ids: Collection of track ids to draw, compared as-is against the
            keys of ``pose_data``; ``None`` draws every track.
        draw_bb: Draw the bounding box and an ``ID: <track id>`` label.
        draw_pose: Draw skeleton edges and body-joint dots.

    Returns:
        The same ``frame`` object that was passed in.
    """
    for track_id, track_info in pose_data.items():
        if track_ids is not None and track_id not in track_ids:
            continue

        for frame_info in track_info:
            if frame_info['frame_id'] != frame_id:
                continue

            if draw_bb:
                x0, y0, x1, y1 = frame_info['boxes_xyxy']
                # Draw the bounding box and track ID
                cv.rectangle(frame, (int(x0), int(y0)), (int(x1), int(y1)), (0, 255, 0), 2)
                cv.putText(frame, f'ID: {track_id}', (int(x0), int(y0) - 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            if draw_pose:
                # A keypoint stored as [0, 0] is treated as "not detected" and skipped.
                keypoints = frame_info['keypoints_xy']
                valid_keypoints = {idx: (int(kp[0]), int(kp[1])) for idx, kp in enumerate(keypoints) if kp != [0, 0]}

                # Draw pose connections (only when both end points were detected)
                for i, (start_idx, end_idx) in enumerate(KEYPOINT_CONNECTIONS):
                    if start_idx in valid_keypoints and end_idx in valid_keypoints:
                        cv.line(frame, valid_keypoints[start_idx], valid_keypoints[end_idx], CONNECTION_COLORS[i], 2)

                # Draw pose keypoints. The comparison is inclusive so that the
                # Left Shoulder (index 5) gets a dot like every other body joint.
                for idx, point in valid_keypoints.items():
                    if idx >= _FIRST_BODY_KEYPOINT_IDX:
                        cv.circle(frame, point, 5, KEYPOINT_COLORS[idx], -1)

    return frame

In [12]:
# Render the pose overlay for one sampled clip as a visual check.
file_id = 1  # position of the clip inside `filepaths`
fp = filepaths[file_id]
pose_video = 'temp_pose.mp4'  # scratch output, overwritten on every run
print(fp)
# The pose JSON has the clip's base name with a .json extension and lives in 'final_dataset_json'.
json_path = os.path.join(dataset_dir, "final_dataset_json", os.path.basename(fp).replace('.mp4', '.json'))

# Skeleton only, no bounding boxes; all tracks are drawn (track_ids is left at its default).
draw_bounding_boxes_on_video(fp, pose_video, json_path, draw_bb=False, draw_pose=True)

/app/tennis_data/stroke_recognition_dataset/master/v1.0/final_dataset_20_frames/t_serve_01018_b_other_04201_m106.mp4


In [13]:
# input_video = filepaths[0]
# Re-encode the mp4v overlay video (pose_video) to H.264
# (presumably so the notebook's Video widget can play it — confirm).
output_video = 'temp_output.mp4'

# Remove any previous output before invoking ffmpeg (no '-y' overwrite flag is passed).
if os.path.exists(output_video):
    print(f"Output file {output_video} already exists. Deleting it.")
    os.remove(output_video)

ffmpeg_command = [
    'ffmpeg', '-i', pose_video,
    '-vcodec', 'libx264', '-acodec', 'aac',
    '-vf', 'scale=640:480',  # Resizing the video (optional)
    output_video
]

# check=True raises CalledProcessError when ffmpeg exits with a non-zero status.
subprocess.run(ffmpeg_command, check=True)

Output file temp_output.mp4 already exists. Deleting it.


ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

CompletedProcess(args=['ffmpeg', '-i', 'temp_pose.mp4', '-vcodec', 'libx264', '-acodec', 'aac', '-vf', 'scale=640:480', 'temp_output.mp4'], returncode=0)

In [14]:
# Embed the re-encoded clip in the notebook output.
Video(output_video, width=640, height=480, embed=True)

In [15]:
# Keypoint name -> index lookup; the index is the position of the name in the
# ordered list below.
KPS = {
    name: idx
    for idx, name in enumerate([
        "Nose",
        "Left Eye", "Right Eye",
        "Left Ear", "Right Ear",
        "Left Shoulder", "Right Shoulder",
        "Left Elbow", "Right Elbow",
        "Left Wrist", "Right Wrist",
        "Left Hip", "Right Hip",
        "Left Knee", "Right Knee",
        "Left Ankle", "Right Ankle",
    ])
}

In [111]:
def find_bottom_player_track_id(json_data):
    """Return the track id that scores highest as the bottom player.

    Note: keypoints coordinates are upside-down if plotted. y is 0 on the top
    and frame height at the bottom.

    Every per-frame record of a track adds to that track's score:

    * +1000 if the largest keypoint y is below the vertical middle of the
      image, otherwise -2000;
    * half of that largest keypoint y (separates poses that are both in the
      bottom half);
    * ``len(record) * 10``;
    * 5 x the box height (``boxes_xywh[3]``);
    * 200 x ``class_conf``.

    Args:
        json_data: Mapping of track id -> list of per-frame records. Each
            record is read for ``'img_shape'`` (height first),
            ``'keypoints_xy'``, ``'boxes_xywh'`` and ``'class_conf'``.

    Returns:
        The best-scoring track id as a string, or ``None`` when no track
        contains any record (including an empty ``json_data``).
    """
    pose_score_by_id = defaultdict(lambda: 0)

    for track_id, pose_data_list in json_data.items():
        for pose_data in pose_data_list:
            fh, _fw = pose_data['img_shape']
            lowest_point_y = np.max(np.array(pose_data['keypoints_xy'])[:, 1])

            # Check if player's legs are below 50% of the screen
            if lowest_point_y > (fh / 2):
                pose_score_by_id[track_id] += 1000
            else:
                pose_score_by_id[track_id] -= 2000

            # Additionally, add that max y to distinct between bottom poses
            pose_score_by_id[track_id] += lowest_point_y / 2

            # NOTE(review): pose_data is a single per-frame record, so
            # len(pose_data) is its number of fields, not a frame count. The
            # effect is a constant bonus per detected frame, which still
            # favours tracks seen in more frames. Kept as-is so the ranking
            # does not change.
            pose_score_by_id[track_id] += len(pose_data) * 10

            # Use pose height
            pose_score_by_id[track_id] += pose_data['boxes_xywh'][3] * 5

            # Use class confidence
            pose_score_by_id[track_id] += pose_data['class_conf'] * 200

    # Nothing was scored: report "no player" instead of letting max() raise
    # ValueError on an empty table.
    if not pose_score_by_id:
        return None

    return str(max(pose_score_by_id, key=pose_score_by_id.get))

In [62]:
def make_translation_image(skeleton):
    """Encode one skeleton as three lists of 0..255 values (one entry per joint).

    Based on https://arxiv.org/pdf/1704.05645.pdf, applied to 2D data: each of
    the three components of a joint is shifted by that component's minimum and
    scaled by the largest range found over all three components, then floored
    to an integer in 0..255.

    Args:
        skeleton: Sequence of joints. Each joint is indexable at 0, 1 and 2,
            or is ``None`` for a missing joint.

    Returns:
        ``(r_column, g_column, b_column)``: three lists of ints, built from
        components 0, 1 and 2 respectively. A ``None`` joint yields 0 in all
        three lists, and so does every joint when all components are 0.
    """
    # Missing joints must not take part in the min/max computation.
    present = [joint for joint in skeleton if joint is not None]

    # TODO fix zero hack: 0 is always included in the bounds, so the range of
    # every component contains the origin (this also covers an empty skeleton).
    lows = [min([joint[axis] for joint in present] + [0]) for axis in range(3)]
    highs = [max([joint[axis] for joint in present] + [0]) for axis in range(3)]

    # A single scale for all three components.
    span = max(high - low for low, high in zip(lows, highs))

    r_column = []
    g_column = []
    b_column = []

    for joint in skeleton:
        if joint is None or span == 0:
            # span == 0 means every value equals its minimum; emit 0 instead
            # of dividing by zero.
            p_r = p_g = p_b = 0
        else:
            p_r = int(np.floor(255 * (joint[0] - lows[0]) / span))
            p_g = int(np.floor(255 * (joint[1] - lows[1]) / span))
            p_b = int(np.floor(255 * (joint[2] - lows[2]) / span))

        r_column.append(p_r)
        g_column.append(p_g)
        b_column.append(p_b)

    # The third list is the blue column (the red column used to be returned twice).
    return r_column, g_column, b_column

In [96]:
# Output directories for the action images: one per player side and stroke
# class, listed as all 'top' classes followed by all 'bottom' classes.
folders = [
    os.path.join(dataset_dir, 'action_images_dataset_v1.0', f'{side}/{stroke}')
    for side in ('top', 'bottom')
    for stroke in ('forehand', 'backhand', 'serve', 'other')
]

# exist_ok=True makes re-running this cell harmless.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("Folders created successfully!")

Folders created successfully!


In [97]:
# Stroke class -> output directory for the bottom player's action images.
FOLDER = {
    stroke: os.path.join(dataset_dir, 'action_images_dataset_v1.0', 'bottom/' + stroke)
    for stroke in ('backhand', 'forehand', 'serve', 'other')
}

In [133]:
# Rows whose DataFrame index is below this value are skipped
# (presumably a resume point from an earlier partial run — confirm before reuse).
START_INDEX = 6330


def _build_action_image(track_pose_data):
    """Stack one track's per-frame body keypoints into a 3-channel uint8 image.

    Each record contributes one image row. The 12 body keypoints (index 5
    onwards) are encoded by ``make_translation_image`` from their
    ``(x, y, confidence)`` triples.
    """
    r_channel = []
    g_channel = []
    b_channel = []

    for frame_pose in track_pose_data:
        xy = np.array(frame_pose["keypoints_xy"][5:])
        conf = np.array(frame_pose["keypoints_conf"][5:]).reshape(12, 1)
        r_column, g_column, b_column = make_translation_image(np.hstack((xy, conf)))

        r_channel.append(r_column)
        g_channel.append(g_column)
        b_channel.append(b_column)

    # make_translation_image yields values in 0..255; store them as 8-bit
    # planes, which is what cv.merge / cv.imwrite handle for JPEG output.
    # NOTE(review): cv.imwrite treats the first plane as blue (BGR order), so
    # r_channel ends up in the file's blue channel — confirm that downstream
    # readers expect this.
    return cv.merge((np.asarray(r_channel, dtype=np.uint8),
                     np.asarray(g_channel, dtype=np.uint8),
                     np.asarray(b_channel, dtype=np.uint8)))


for index, row in tqdm(dataset_df.iterrows(), total=len(dataset_df), desc="Creating action images"):
    if index < START_INDEX:
        continue
    fp = os.path.join(dataset_dir, 'final_dataset_20_frames', row['filename'])

    json_path = os.path.join(dataset_dir, "final_dataset_json", os.path.basename(fp).replace('.mp4', '.json'))
    with open(json_path, 'r') as file:
        json_data = json.load(file)
    if not json_data:
        # No pose data at all for this clip: nothing to encode.
        continue

    bottom_player_track_id = find_bottom_player_track_id(json_data)
    if bottom_player_track_id is None:
        # No action image is added in the dataset for the current video
        # This will likely never happen
        print("No track ids detected!")
        continue

    # Look the track up only after the None check, so a missing id cannot
    # surface as a KeyError.
    bottom_player_pose_data = json_data[bottom_player_track_id]
    final_image = _build_action_image(bottom_player_pose_data)

    # The image goes to the directory of the bottom player's stroke class.
    # (The former `action_image.save(...)` call was removed: `action_image`
    # was always None, so it raised AttributeError right after this write.)
    cv.imwrite(os.path.join(FOLDER[row['btm_cls']],
                            f"{os.path.basename(fp)[:-4]}.jpg"),
               final_image)

Creating action images: 12195it [00:24, 500.22it/s] 
