In [235]:
import os
os.chdir("/usr/src/app")
import pandas as pd
from os.path import join as jp
import cv2
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import time
import re

In [276]:
# Directories holding the downloaded .mp4 videos (Tennis TV and Grand Slams).
# All paths below are relative to the working directory set at the top of the notebook.
PATH_VIDEOS_DATASET = "videos/tennistv"
PATH_VIDEOS_GS_DATASET = "videos/grand_slams"
# Image directory; the selected reference-court frames go into its "ref_court" subfolder
PATH_IMAGES_DATASET = "images/tennistv"
PATH_IMAGES_REFCOURT = jp(PATH_IMAGES_DATASET, "ref_court")

# Master parquet tables (one for Tennis TV, one for Grand Slams)
PATH_MASTER = "deepsetstats/dataset/tennistv/parquet/master.parquet"
PATH_MASTER_GS = "deepsetstats/dataset/grandslams/parquet/master.parquet"

# Tournaments parquet table
PATH_TOURNAMENTS = "deepsetstats/dataset/tournaments/parquet/tournaments_tennistv.parquet"

# Output parquet files listing the reference video selected for each tournament
PATH_REFERENCE_VIDEOS = "deepsetstats/dataset/tennistv/parquet/reference_videos.parquet"
PATH_REFERENCE_GS_VIDEOS = "deepsetstats/dataset/grandslams/parquet/reference_videos.parquet"

# Functions

In [237]:
def parse_img_ref(string):
    """Extract the video id and tournament id from a reference-image file name.

    The expected layout is
    ``ref___l<level>___t<tournament_id>___v<video_id>___f<frame>.png``.

    Parameters
    ----------
    string : str
        File name (or any string) containing the ``___t…___v…___f`` markers.

    Returns
    -------
    tuple
        ``(video_id, tournament_id)`` with ``tournament_id`` converted to
        ``int``, or ``(None, None)`` when either marker is missing.

    Raises
    ------
    ValueError
        If the tournament part is present but is not an integer.
    """
    pattern_vid = r'___v(.*?)___f'
    pattern_tourn = r'___t(.*?)___v'
    # The original searched with the undefined name `pattern`, which raises
    # NameError on a fresh kernel; the video pattern is the one intended.
    match_vid = re.search(pattern_vid, string)
    match_tourn = re.search(pattern_tourn, string)

    if match_vid and match_tourn:
        return match_vid.group(1), int(match_tourn.group(1))
    return None, None

In [238]:
def bgr_to_rgb(img_bgr):
    """Return ``img_bgr`` converted from BGR to RGB channel order via OpenCV."""
    return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

def rgb_to_bgr(img_rgb):
    """Return ``img_rgb`` converted from RGB to BGR channel order via OpenCV."""
    return cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

In [239]:
def plot_img_rgb(img_rgb):
    """Display an RGB image with matplotlib, without axis labels or ticks."""
    plt.imshow(img_rgb)
    # Hide the axes so only the picture is shown
    plt.axis("off")
    plt.show()

In [240]:
def save_bgr_img(path, img_bgr):
    """Write a BGR image to ``path`` using OpenCV.

    Parameters
    ----------
    path : str
        Destination file path; the extension selects the image format.
    img_bgr :
        Image in BGR channel order, as produced by OpenCV.

    Raises
    ------
    OSError
        If ``cv2.imwrite`` reports that the image was not written.
    """
    # cv2.imwrite signals failure through its boolean return value instead of
    # raising, so an unchecked call can silently save nothing.
    if not cv2.imwrite(path, img_bgr):
        raise OSError(f"Could not write image to: {path}")

In [277]:
def get_random_frame(video_id, frame_num_input=None, is_grand_slam=False):
    """Read one frame of a downloaded video, plot it and return it.

    Parameters
    ----------
    video_id : str
        Video id; the file ``<video_id>.mp4`` is looked up in the videos folder.
    frame_num_input : int, optional
        Index of the frame to read. When ``None`` a random index is drawn.
    is_grand_slam : bool, default False
        Look in ``PATH_VIDEOS_GS_DATASET`` instead of ``PATH_VIDEOS_DATASET``.

    Returns
    -------
    tuple
        ``(frame_bgr, True, frame_index)`` on success. ``(False, False, 0)``
        when the video file does not exist, and
        ``(False, False, frame_index)`` when the frame could not be read.
    """
    filename = f'{video_id}.mp4'
    videos_dir = PATH_VIDEOS_GS_DATASET if is_grand_slam else PATH_VIDEOS_DATASET
    path_video_id = jp(videos_dir, filename)

    if not os.path.exists(path_video_id):
        print(f"Warning! does not exist path: {path_video_id}")
        return False, False, 0

    # Open the video capture object; it is always released in the `finally`
    cap = cv2.VideoCapture(path_video_id)
    try:
        # Total number of frames reported for the video
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if frame_num_input is None:
            # randint is inclusive on both ends; max() avoids randint(0, -1)
            # when the reported frame count is 0
            random_frame_index = random.randint(0, max(total_frames - 1, 0))
        else:
            random_frame_index = frame_num_input

        # Jump to the selected frame and read it
        cap.set(cv2.CAP_PROP_POS_FRAMES, random_frame_index)
        ret, frame_bgr = cap.read()
    finally:
        cap.release()

    if not ret or frame_bgr is None:
        # A failed read used to crash inside the colour conversion
        print(f"Warning! could not read frame {random_frame_index} from: {path_video_id}")
        return False, False, random_frame_index

    # matplotlib expects RGB, OpenCV delivers BGR
    plot_img_rgb(bgr_to_rgb(frame_bgr))

    return frame_bgr, True, random_frame_index

In [242]:
def list_videos(path):
    """Return the ids of the ``.mp4`` videos found directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    set of str
        File names ending in ``.mp4`` with that suffix removed.
    """
    extension = ".mp4"
    # Strip only the trailing extension: splitting on ".mp4" would truncate a
    # name such as "x.mp4.mp4" at the first occurrence.
    return {
        filename[:-len(extension)]
        for filename in os.listdir(path)
        if filename.endswith(extension)
    }

# <font size=20> Tennis TV </font>

# Get Downloaded Videos Id

In [279]:
# Ids of the Tennis TV videos present on disk as .mp4 files
sv = list_videos(PATH_VIDEOS_DATASET)
print("Videos downloaded:", len(sv))

Videos downloaded: 800


# Load Master table

In [280]:
# Tennis TV master table and the tournaments table, both read from parquet
df = pd.read_parquet(PATH_MASTER, engine="pyarrow")
df_tt = pd.read_parquet(PATH_TOURNAMENTS, engine="pyarrow")

In [281]:
# Keep only master rows of downloaded videos and attach each tournament's level
is_downloaded = df["video_id"].isin(sv)
tournament_levels = df_tt[["tournament_name", "level"]].drop_duplicates()
df_data = df[is_downloaded].copy().merge(
    tournament_levels,
    on=["tournament_name"],
    how="left",
)

In [304]:
df

Unnamed: 0,video_id,player_id,tournament_id,tournament_name,name,title
0,dsALS4dDxDs,1,64,ATP Finals,Novak Djokovic,Novak Djokovic vs Alexander Zverev Extended Hi...
1,dsALS4dDxDs,14,64,ATP Finals,Alexander Zverev,Novak Djokovic vs Alexander Zverev Extended Hi...
2,l6Dx9KzyRig,0,10,Cincinnati,Carlos Alcaraz,Alcaraz Battles Purcell; Djokovic Plays Fritz ...
3,l6Dx9KzyRig,1,10,Cincinnati,Novak Djokovic,Alcaraz Battles Purcell; Djokovic Plays Fritz ...
4,l6Dx9KzyRig,8,10,Cincinnati,Taylor Fritz,Alcaraz Battles Purcell; Djokovic Plays Fritz ...
...,...,...,...,...,...,...
6719,N_Es3ax5JKU,3057,11,Shanghai,David Ferrer,Tennis Masters Cup Shanghai 2007 - Federer v F...
6720,7mC4cC9BGls,138,11,Shanghai,Rafael Nadal,Tennis Masters Cup Shanghai 2007 Semi-Final Hi...
6721,7mC4cC9BGls,2059,11,Shanghai,Roger Federer,Tennis Masters Cup Shanghai 2007 Semi-Final Hi...
6722,7mC4cC9BGls,3057,11,Shanghai,David Ferrer,Tennis Masters Cup Shanghai 2007 Semi-Final Hi...


In [302]:
df_tt[["tournament_name", "level"]].drop_duplicates()

Unnamed: 0,tournament_name,level
0,Australian Open,grandslam
1,Roland Garros,grandslam
2,Wimbledon,grandslam
3,US Open,grandslam
4,Indian Wells,1000
...,...,...
60,Stockholm,250
61,Antwerp,250
62,Metz,250
63,Tel Aviv,250


In [301]:
df_tt[df_tt["tournament_name"] == "Adelaide"]

Unnamed: 0,level,city,country,court_type,month,name,date_start,date_end,tournament_name,tournament_id
26,250,Adelaide,Australia,Outdoor Hard,Jan,Adelaide International 1,2023-01-01,2023-01-08,Adelaide,26
29,250,Adelaide,Australia,Outdoor Hard,Jan,Adelaide International 2,2023-01-09,2023-01-14,Adelaide,29


In [282]:
# Number of master rows per (tournament, level) among the downloaded videos
xx = (
    df_data
    .groupby(["tournament_name", "level"])["video_id"]
    .count()
    .reset_index()
    .sort_values("level", ascending=False)
    .head(120)
)

In [283]:
xx.shape

(55, 3)

In [298]:
df[df["video_id"] == "59dAgrtP4qc"]

Unnamed: 0,video_id,player_id,tournament_id,tournament_name,name,title
1514,59dAgrtP4qc,31,26,Adelaide,Roberto Bautista Agut,Roberto Bautista Agut vs Soonwoo Kwon For The ...
1515,59dAgrtP4qc,2127,26,Adelaide,Soon Woo Kwon,Roberto Bautista Agut vs Soonwoo Kwon For The ...


# Select one video Id per tournament

In [284]:
# First video id found for every (tournament, level) pair, ordered by level
court_reference_videos = (
    df_data
    .groupby(["tournament_name", "level"])["video_id"]
    .first()
    .reset_index()
    .sort_values("level")
)

In [294]:
court_reference_videos

Unnamed: 0,tournament_name,level,video_id,tournament_id
0,Rome,1000,cu-AoDpkJJU,8
1,Indian Wells,1000,1LoMyDZP9v0,4
2,Paris,1000,rGT0FsEuc3U,12
3,Shanghai,1000,LhDFuGXXvGA,11
4,Madrid,1000,JMk6pvXzg_0,7
5,Monte Carlo,1000,VK7b5oE7zpA,6
6,Toronto,1000,YLZUFP2bkoE,9
7,Miami,1000,EiEHSxQIjpQ,5
8,Cincinnati,1000,l6Dx9KzyRig,10
9,Kitzbuhel,250,_7xeq_ycIMY,55


In [285]:
# Attach the tournament id of every selected reference video.
# The id is taken from the master rows of that very video rather than by
# joining the tournaments table on the name: one name can map to several ids
# there (e.g. "Adelaide" -> 26 and 29), which duplicated rows and attached ids
# of editions the video does not belong to.
cols = ["tournament_name", "tournament_id",]
key_cols = [cols[0], "video_id"]
court_reference_videos = pd.merge(
    # Dropping a previous id column keeps the cell safe to re-run
    court_reference_videos.drop(columns=cols[1], errors="ignore"),
    df_data[key_cols + [cols[1]]].drop_duplicates(subset=key_cols),
    on=key_cols,
    how="left",
    validate="many_to_one",
)

In [286]:
# Persist the selected Tennis TV reference videos (one row per tournament) as parquet
court_reference_videos.to_parquet(PATH_REFERENCE_VIDEOS, engine="pyarrow")

# Select reference court frame

Select a frame in which the court of that tournament can be clearly seen

In [288]:
# Tournaments and videos that already have a reference image on disk
l_existing_ref = os.listdir(PATH_IMAGES_REFCOURT)

existing_tournament_ids_ref = set()
existing_video_ids_ref = set()
for name_imref in l_existing_ref:
    if not name_imref.endswith(".png"):
        continue
    vid_id, tour_id = parse_img_ref(name_imref)
    if vid_id is None:
        # The name does not follow the reference naming scheme; do not
        # pollute the sets with None
        continue
    existing_video_ids_ref.add(vid_id)
    existing_tournament_ids_ref.add(tour_id)

In [289]:
skipped_vids = set()

# Interactive selector of the best reference court image per tournament:
# random frames are shown until the user accepts one ("y") or skips ("s").
for i, row in court_reference_videos.iterrows():
    tournament_id = row["tournament_id"]
    video_id = row["video_id"]
    tour_name = row["tournament_name"]
    level = row["level"]

    if tournament_id in existing_tournament_ids_ref:
        print(f"Tournament: {tour_name}, already done, next one please!")
        continue

    if video_id in existing_video_ids_ref:
        print(f"Video: {video_id}, already done, next one please!")
        continue

    while True:
        print("==" * 30)
        print(f"Tournament: {tour_name}")
        print("==" * 30)
        img_bgr, frame_ok, frame_num = get_random_frame(video_id)

        if not frame_ok:
            # No frame was returned (e.g. the video file is missing): there is
            # nothing to show or save, so skip instead of prompting forever or
            # handing an invalid image to the writer.
            print(f"Video: {video_id}, no frame available, skipping!")
            skipped_vids.add(video_id)
            break

        user_input = input("Do you want to save? (y/n/s(skip): ")
        clear_output(wait=True)
        time.sleep(0.2)
        answer = user_input.lower()

        if answer == "y":
            # Save the image and break the while loop to go to the next
            name_img = f"ref___l{level}___t{tournament_id}___v{video_id}___f{frame_num}.png"
            path_img_ref = jp(PATH_IMAGES_REFCOURT, name_img)
            save_bgr_img(path=path_img_ref, img_bgr=img_bgr)
            print(f"Saved image: {name_img}")
            break
        elif answer == "s":
            print("Skipping video")
            skipped_vids.add(video_id)
            break
        else:
            print("Not saving ! Generating a new image")
            time.sleep(0.5)
            clear_output(wait=True)
            continue

Saved image: ref___l500___t24___v2fRc4mUq5aw___f5743.png
Tournament: Hamburg, already done, next one please!
Tournament: Halle, already done, next one please!
Tournament: Dubai, already done, next one please!
Tournament: Basel, already done, next one please!
Tournament: Barcelona, already done, next one please!
Tournament: Acapulco, already done, next one please!
Tournament: Rio, already done, next one please!
Tournament: ATP Finals, already done, next one please!


# <font size=20> Grand Slams </font>

# Get Downloaded Videos Id

In [259]:
# Ids of the Grand Slam videos present on disk as .mp4 files (rebinds `sv`)
sv = list_videos(PATH_VIDEOS_GS_DATASET)
print("Videos downloaded:", len(sv))

Videos downloaded: 4


# Load Master table

In [261]:
# Grand Slam master table and the tournaments table, both read from parquet.
# NOTE: `df` and `df_tt` are rebound here, so from this cell on they hold the Grand Slam data.
df = pd.read_parquet(PATH_MASTER_GS, engine="pyarrow")
df_tt = pd.read_parquet(PATH_TOURNAMENTS, engine="pyarrow")

In [262]:
df

Unnamed: 0,video_id,player_id,tournament_id,tournament_name,name,title
0,k5Azrtqr_ug,123,3,US Open,Fabio Fognini,Jakub Mensik vs. Fabio Fognini Highlights | 20...
1,WdaEsoL4_ak,166,3,US Open,Vasek Pospisil,Vasek Pospisil vs. Pedro Martinez Highlights |...
2,WdaEsoL4_ak,4145,3,US Open,Alberto Martin,Vasek Pospisil vs. Pedro Martinez Highlights |...
3,TGR4PnD0cnM,162,3,US Open,Denis Kudla,Tennys Sandgren vs. Denis Kudla Highlights | 2...
4,TGR4PnD0cnM,217,3,US Open,Tennys Sandgren,Tennys Sandgren vs. Denis Kudla Highlights | 2...
...,...,...,...,...,...,...
3456,KgTLdOI-CfA,3140,2,Wimbledon,Benjamin Becker,2013 Day 1 Highlights: Andy Murray v Benjamin ...
3457,6f1YyGnldJk,138,2,Wimbledon,Rafael Nadal,2013 Day 1 Highlights: Rafael Nadal v Steve Da...
3458,6f1YyGnldJk,2386,2,Wimbledon,Steve Darcis,2013 Day 1 Highlights: Rafael Nadal v Steve Da...
3459,dCs_NNf2RmM,2059,2,Wimbledon,Roger Federer,2013 Day 1 Highlights: Roger Federer v Victor ...


In [263]:
# Keep only master rows of downloaded videos and attach each tournament's level
is_downloaded = df["video_id"].isin(sv)
tournament_levels = df_tt[["tournament_name", "level"]].drop_duplicates()
df_data = df[is_downloaded].copy().merge(
    tournament_levels,
    on=["tournament_name"],
    how="left",
)

In [264]:
# Number of master rows per (tournament, level) among the downloaded videos
xx = (
    df_data
    .groupby(["tournament_name", "level"])["video_id"]
    .count()
    .reset_index()
    .sort_values("level", ascending=False)
    .head(120)
)

In [265]:
xx.shape

(4, 3)

In [267]:
df_data

Unnamed: 0,video_id,player_id,tournament_id,tournament_name,name,title,level
0,XmW-ArFVZJQ,0,3,US Open,Carlos Alcaraz,Carlos Alcaraz vs. Frances Tiafoe Extended Hig...,grandslam
1,XmW-ArFVZJQ,9,3,US Open,Frances Tiafoe,Carlos Alcaraz vs. Frances Tiafoe Extended Hig...,grandslam
2,QNgE9-0sNjQ&t,48,1,Roland Garros,Stan Wawrinka,Rafael Nadal v Stan Wawrinka Highlights - Men'...,grandslam
3,QNgE9-0sNjQ&t,138,1,Roland Garros,Rafael Nadal,Rafael Nadal v Stan Wawrinka Highlights - Men'...,grandslam
4,F9ZGyxlCyBU,2,0,Australian Open,Daniil Medvedev,Rafael Nadal v Daniil Medvedev Condensed Match...,grandslam
5,F9ZGyxlCyBU,138,0,Australian Open,Rafael Nadal,Rafael Nadal v Daniil Medvedev Condensed Match...,grandslam
6,dvBr9Wr8BCY,0,2,Wimbledon,Carlos Alcaraz,Carlos Alcaraz vs Novak Djokovic: Extended Hig...,grandslam
7,dvBr9Wr8BCY,1,2,Wimbledon,Novak Djokovic,Carlos Alcaraz vs Novak Djokovic: Extended Hig...,grandslam


# Select one video Id per tournament

In [268]:
# First video id found for every (tournament, level) pair, ordered by level
court_reference_videos = (
    df_data
    .groupby(["tournament_name", "level"])["video_id"]
    .first()
    .reset_index()
    .sort_values("level")
)

In [269]:
# Attach the tournament id of every selected reference video.
# The id is taken from the master rows of that very video rather than by
# joining the tournaments table on the name, because a name can map to
# several ids in that table, which duplicates rows and can attach the id of
# an edition the video does not belong to.
cols = ["tournament_name", "tournament_id",]
key_cols = [cols[0], "video_id"]
court_reference_videos = pd.merge(
    # Dropping a previous id column keeps the cell safe to re-run
    court_reference_videos.drop(columns=cols[1], errors="ignore"),
    df_data[key_cols + [cols[1]]].drop_duplicates(subset=key_cols),
    on=key_cols,
    how="left",
    validate="many_to_one",
)

In [270]:
court_reference_videos

Unnamed: 0,tournament_name,level,video_id,tournament_id
0,Australian Open,grandslam,F9ZGyxlCyBU,0
1,Roland Garros,grandslam,QNgE9-0sNjQ&t,1
2,US Open,grandslam,XmW-ArFVZJQ,3
3,Wimbledon,grandslam,dvBr9Wr8BCY,2


In [271]:
# Persist the selected Grand Slam reference videos (one row per tournament) as parquet
court_reference_videos.to_parquet(PATH_REFERENCE_GS_VIDEOS, engine="pyarrow")

# Select reference court frame

Select a frame in which the court of that tournament can be clearly seen

In [273]:
# Tournaments and videos that already have a reference image on disk
l_existing_ref = os.listdir(PATH_IMAGES_REFCOURT)

existing_tournament_ids_ref = set()
existing_video_ids_ref = set()
for name_imref in l_existing_ref:
    if not name_imref.endswith(".png"):
        continue
    vid_id, tour_id = parse_img_ref(name_imref)
    if vid_id is None:
        # The name does not follow the reference naming scheme; do not
        # pollute the sets with None
        continue
    existing_video_ids_ref.add(vid_id)
    existing_tournament_ids_ref.add(tour_id)

In [278]:
skipped_vids = set()

# Interactive selector of the best reference court image per Grand Slam:
# random frames are shown until the user accepts one ("y") or skips ("s").
for i, row in court_reference_videos.iterrows():
    tournament_id = row["tournament_id"]
    video_id = row["video_id"]
    tour_name = row["tournament_name"]
    level = row["level"]

    if tournament_id in existing_tournament_ids_ref:
        print(f"Tournament: {tour_name}, already done, next one please!")
        continue

    if video_id in existing_video_ids_ref:
        print(f"Video: {video_id}, already done, next one please!")
        continue

    while True:
        print("==" * 30)
        print(f"Tournament: {tour_name}")
        print("==" * 30)
        img_bgr, frame_ok, frame_num = get_random_frame(video_id, is_grand_slam=True)

        if not frame_ok:
            # No frame was returned (e.g. the video file is missing): there is
            # nothing to show or save, so skip instead of prompting forever or
            # handing an invalid image to the writer.
            print(f"Video: {video_id}, no frame available, skipping!")
            skipped_vids.add(video_id)
            break

        user_input = input("Do you want to save? (y/n/s(skip): ")
        clear_output(wait=True)
        time.sleep(0.2)
        answer = user_input.lower()

        if answer == "y":
            # Save the image and break the while loop to go to the next
            name_img = f"ref___l{level}___t{tournament_id}___v{video_id}___f{frame_num}.png"
            path_img_ref = jp(PATH_IMAGES_REFCOURT, name_img)
            save_bgr_img(path=path_img_ref, img_bgr=img_bgr)
            print(f"Saved image: {name_img}")
            break
        elif answer == "s":
            print("Skipping video")
            skipped_vids.add(video_id)
            break
        else:
            print("Not saving ! Generating a new image")
            time.sleep(0.5)
            clear_output(wait=True)
            continue

Saved image: ref___lgrandslam___t2___vdvBr9Wr8BCY___f1611.png
