In [1]:
import sys
sys.path.append("/app/src")

In [2]:
import os
import pandas as pd

In [3]:
# --- Working folders (relative paths) ---
# Folder for frames
frames_folder = "frames/"

# Folder for temporary files
temp_dir = "temp/"

# --- Absolute locations ---
# General data folder
data_dir = "/app/data/"

# Match videos
video_folder = "/app/tennis_data/match_videos/master_addition"

# Output folder: every output of this notebook is meant to be stored here
output_dir = "/app/notebooks/output/"

# Point-by-point results, plus the names used for the raw / processed variants
# (presumably sub-folders of pbp_dir -- TODO confirm)
pbp_dir = "/app/tennis_data/point_by_point_results/master"
pbp_raw_dir = "raw"
pbp_processed_dir = "processed"

# Match segmentation data
seg_dir = "/app/tennis_data/match_segmentation_data/master"

# Audio output
audio_output_dir = "/app/tennis_data/match_audios/master"

# Subtitle files (ball-hit detections): all, filtered, filtered + processed
srt_output_dir = "/app/tennis_data/ballhit/srt/master"
srt_filtered_dir = "/app/tennis_data/ballhit/srt/master/filtered"
srt_filtered_processed_dir = "/app/tennis_data/ballhit/srt/master/filtered_and_processed"

# JSON file with per-match information (relative path)
match_info_json_path = "match_info.json"

# Stroke recognition dataset (v1.0) and its manually revised info table
dataset_dir = "/app/tennis_data/stroke_recognition_dataset/master/v1.0"
dataset_rev = "/app/tennis_data/stroke_recognition_dataset/master/v1.0/dataset_info_manual_revision.csv"

# Model-specific setting
to_sample = 100

In [4]:
# Load the per-match information table from the JSON file named in the config cell.
# The path is relative, so it is resolved against the current working directory.
match_info = pd.read_json(match_info_json_path)
# Show the first rows as the cell output.
match_info.head()

Unnamed: 0,match_id,filename,match_start_top,match_start_bottom,fps,first_point_start_frame,surface,tournament,year,round
0,0,000_Novak_Djokovic_v_Andy_Murray_Australian_Op...,Novak Djokovic,Andy Murray,25.0,41250,hardcourt,Australian Open,2016,F
1,1,001_Novak_Djokovic_v_Rafael_Nadal_Australian_O...,Novak Djokovic,Rafael Nadal,25.0,18000,hardcourt,Australian Open,2012,F
2,2,002_Novak_Djokovic_v_Roger_Federer_US_Open_201...,Roger Federer,Novak Djokovic,29.97003,500,hardcourt,US Open,2015,F
3,3,003_Novak_Djokovic_v_Roger_Federer_Wimbledon_2...,Roger Federer,Novak Djokovic,25.0,350,grass,Wimbledon,2019,F
4,4,004_Rafael_Nadal_v_Nick_Kyrgios_Wimbledon_2019...,Rafael Nadal,Nick Kyrgios,25.0,0,grass,Wimbledon,2019,R2


In [5]:
# Load the manually revised dataset info table (one row per ball-hit clip).
rev_df = pd.read_csv(dataset_rev)
# Show the first rows as the cell output.
rev_df.head()

Unnamed: 0,filename,match_id,match_fps,point_id,ballhit_id,timestamp,frame_pos,class,new_class_top,new_class_bottom,discard/comment,DUMMY,stroke_description,player,position,opponent,surface,tournament,round,year
0,m000_p000_bh00_serve_top_Novak_Djokovic.mp4,0,25.0,0,0,00:27:31.896,41297,serve,,,,,"1st serve down the T, fault (net)",Novak Djokovic,top,Andy Murray,hardcourt,Australian Open,F,2016
1,m000_p000_bh01_serve_top_Novak_Djokovic.mp4,0,25.0,0,1,00:27:41.370,41534,serve,,,,,2nd serve wide,Novak Djokovic,top,Andy Murray,hardcourt,Australian Open,F,2016
2,m000_p000_bh02_forehand_bottom_Andy_Murray.mp4,0,25.0,0,2,00:27:42.113,41552,forehand,,,,,"forehand return crosscourt (long), unforced er...",Andy Murray,bottom,Novak Djokovic,hardcourt,Australian Open,F,2016
3,m000_p001_bh00_serve_top_Novak_Djokovic.mp4,0,25.0,1,0,00:28:02.918,42072,serve,,,,,1st serve wide,Novak Djokovic,top,Andy Murray,hardcourt,Australian Open,F,2016
4,m000_p001_bh01_backhand_bottom_Andy_Murray.mp4,0,25.0,1,1,00:28:03.661,42091,backhand,,,,,backhand slice return crosscourt (very deep),Andy Murray,bottom,Novak Djokovic,hardcourt,Australian Open,F,2016


In [6]:
# folders = [
#     os.path.join(dataset_dir, 'relabeled_dataset', 'top/forehand'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'top/backhand'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'top/serve'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'top/other'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'bottom/forehand'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'bottom/backhand'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'bottom/serve'),
#     os.path.join(dataset_dir, 'relabeled_dataset', 'bottom/other')
# ]

# for folder in folders:
#     os.makedirs(folder, exist_ok=True)

# print("Folders created successfully!")

In [7]:
def bad_row(row):
    """Tell whether a revision row should be left out of the clean dataset.

    A row counts as bad when its 'discard/comment' value is 'angle' or
    'ballhit' and neither 'new_class_top' nor 'new_class_bottom' holds a
    value (both are missing according to ``pd.isna``).

    Args:
        row: Mapping-like object (e.g. a row Series from ``iterrows``)
            indexable by the three column names above.

    Returns:
        A truthy value if the row should be discarded, a falsy one otherwise.
    """
    # Guard: rows not flagged with one of the discard reasons are always kept.
    if row['discard/comment'] not in ('angle', 'ballhit'):
        return False

    # A flagged row is only bad when no replacement label was supplied.
    top_missing = pd.isna(row['new_class_top'])
    return top_missing and pd.isna(row['new_class_bottom'])

In [8]:
# Build the clean dataset: one record per ball-hit clip that survives the
# manual revision, with a stroke class for the top and for the bottom player.
clean_dataset = []

stroke_cls = ['forehand', 'backhand', 'serve', 'other']
# Running counters per court position and stroke class; they provide the
# sequence numbers embedded in the new filenames.
stroke_counts = {
    'top': dict.fromkeys(stroke_cls, 0),
    'bottom': dict.fromkeys(stroke_cls, 0)
}

# Per-match lookups, built once instead of filtering `match_info` twice for
# every row. drop_duplicates keeps the first row per match_id, which is the
# row the former `.values[0]` lookup selected.
# A match_id missing from `match_info` raises KeyError at lookup time.
match_meta = match_info.drop_duplicates(subset='match_id').set_index('match_id')
fps_by_match = match_meta['fps'].to_dict()
surface_by_match = match_meta['surface'].to_dict()

for index, row in rev_df.iterrows():
    if bad_row(row):
        # flagged for discard and not relabeled -> leave out of the dataset
        continue

    # determine new stroke type for bottom and for top player
    if pd.isna(row['new_class_top']) and pd.isna(row['new_class_bottom']):
        # no manual relabel: keep the old label for the player at
        # row['position']; the player on the other side gets 'other'
        top_cls = row['class'] if row['position'] == 'top' else 'other'
        btm_cls = row['class'] if row['position'] == 'bottom' else 'other'
    else:
        top_cls = row['new_class_top']
        btm_cls = row['new_class_bottom']

    # Fail with a descriptive error (before touching the counters) when a
    # label is missing on one side or is not one of the known stroke classes.
    for side, cls in (('top', top_cls), ('bottom', btm_cls)):
        if cls not in stroke_counts[side]:
            raise ValueError(
                f"Row {index} ({row['filename']}): invalid {side} class {cls!r}; "
                f"expected one of {stroke_cls}"
            )

    stroke_counts['top'][top_cls] += 1
    stroke_counts['bottom'][btm_cls] += 1

    match_id = row['match_id']

    # new filename: per-class sequence numbers for both players + match id
    new_filename = (
        f"t_{top_cls}_{stroke_counts['top'][top_cls]:05d}_"
        f"b_{btm_cls}_{stroke_counts['bottom'][btm_cls]:05d}_"
        f"m{match_id:03d}.mp4"
    )

    clean_dataset.append({
        'filename': new_filename,
        'original_filename': row['filename'],
        'top_cls': top_cls,
        'btm_cls': btm_cls,
        'match_id': match_id,
        'fps': fps_by_match[match_id],
        'ballhit_match_timestamp': row['timestamp'],
        'ballhit_match_frame_pos': row['frame_pos'],
        'surface': surface_by_match[match_id]
    })

In [9]:
# Turn the list of per-clip record dicts into a DataFrame (one column per dict key).
clean_dataset_df = pd.DataFrame(clean_dataset)
# Show the first rows as the cell output.
clean_dataset_df.head()

Unnamed: 0,filename,original_filename,top_cls,btm_cls,match_id,fps,ballhit_match_timestamp,ballhit_match_frame_pos,surface
0,t_serve_00001_b_other_00001_m000.mp4,m000_p000_bh00_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:27:31.896,41297,hardcourt
1,t_serve_00002_b_other_00002_m000.mp4,m000_p000_bh01_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:27:41.370,41534,hardcourt
2,t_other_00001_b_forehand_00001_m000.mp4,m000_p000_bh02_forehand_bottom_Andy_Murray.mp4,other,forehand,0,25.0,00:27:42.113,41552,hardcourt
3,t_serve_00003_b_other_00003_m000.mp4,m000_p001_bh00_serve_top_Novak_Djokovic.mp4,serve,other,0,25.0,00:28:02.918,42072,hardcourt
4,t_other_00002_b_backhand_00001_m000.mp4,m000_p001_bh01_backhand_bottom_Andy_Murray.mp4,other,backhand,0,25.0,00:28:03.661,42091,hardcourt


In [10]:
# Write the clean dataset table into the dataset folder; the DataFrame index is not saved.
clean_dataset_df.to_csv(os.path.join(dataset_dir, 'clean_dataset_info.csv'), index=False)

In [11]:
# Read the file that was just written back from disk and display a slice of
# rows (7500 up to, not including, 7505) to inspect the saved content.
test_df = pd.read_csv(os.path.join(dataset_dir, 'clean_dataset_info.csv'))
test_df[7500:7505]

Unnamed: 0,filename,original_filename,top_cls,btm_cls,match_id,fps,ballhit_match_timestamp,ballhit_match_frame_pos,surface
7500,t_other_03857_b_forehand_01438_m105.mp4,m105_p211_bh09_forehand_bottom_Novak_Djokovic.mp4,other,forehand,105,25.000066,02:47:38.449,251461,clay
7501,t_backhand_01254_b_other_03859_m105.mp4,m105_p211_bh10_forehand_top_Stefanos_Tsitsipas...,backhand,other,105,25.000066,02:47:39.935,251499,clay
7502,t_serve_00947_b_other_03860_m105.mp4,m105_p215_bh00_serve_top_Stefanos_Tsitsipas.mp4,serve,other,105,25.000066,02:50:12.629,255316,clay
7503,t_other_03858_b_backhand_01193_m105.mp4,m105_p215_bh01_backhand_bottom_Novak_Djokovic.mp4,other,backhand,105,25.000066,02:50:13.558,255339,clay
7504,t_forehand_01446_b_other_03861_m105.mp4,m105_p215_bh02_forehand_top_Stefanos_Tsitsipas...,forehand,other,105,25.000066,02:50:15.787,255395,clay


In [34]:
# if we had only discarded the bad rows, without relabeling
# stroke_counts
# {'top': {'forehand': 2091, 'backhand': 1713, 'serve': 1513, 'other': 5282},
#  'bottom': {'forehand': 2071, 'backhand': 1664, 'serve': 1547, 'other': 5317}}

{'top': {'forehand': 2091, 'backhand': 1713, 'serve': 1513, 'other': 5282},
 'bottom': {'forehand': 2071, 'backhand': 1664, 'serve': 1547, 'other': 5317}}

In [68]:
# our process (bad rows discarded and manual relabels applied):
# display the final per-position, per-class counters
stroke_counts

{'top': {'forehand': 2375, 'backhand': 2002, 'serve': 1603, 'other': 6215},
 'bottom': {'forehand': 2357, 'backhand': 1948, 'serve': 1642, 'other': 6248}}