# Scraping and Pose capture from annotations
The following script sets out the steps for scraping the videos and then capturing the pose data from them using the annotation files.

Dictionaries of the 2019 recording links

In [15]:
# Maps each men's weight-class label to the YouTube URL of its recording.
# The labels double as file-name prefixes in the cells further down.
men_video_links = {
    "109+": "https://www.youtube.com/watch?v=lEriREH2eVs",
    "109": "https://www.youtube.com/watch?v=pcDOw9RkmbM",
    "102": "https://www.youtube.com/watch?v=VDrDUh-ZIXE",
    "96": "https://www.youtube.com/watch?v=PdYtEusK4I8",
    "89": "https://www.youtube.com/watch?v=D-JPFuKovcU",
    "81": "https://www.youtube.com/watch?v=8nfA8rAfHSI",
    "73": "https://www.youtube.com/watch?v=34JJ12OOpng",
    "67": "https://www.youtube.com/watch?v=0gGjCEcBG_I",
    "61": "https://www.youtube.com/watch?v=plFZt23A-Fk",
    "55": "https://www.youtube.com/watch?v=mlAht5v2Uvo",
}

## 1 Download and Rename all the recordings

In [5]:
# https://www.the-analytics.club/download-youtube-videos-in-python/
# Step I: Install Pytube using pip

# Step II : In your script import the YouTube class from pytube package.
from pytube import YouTube
from os import getcwd as wd
import os

In [3]:

def download_yt_videos(video_url:str,folder_path:str,resolution:str="720p"):
    """
    Download one YouTube video as an mp4 file into a folder.

    Parameters
    ----------
    video_url : str
        Address of the YouTube video.
    folder_path : str
        Folder the file is written to (passed to pytube's ``download``).
    resolution : str, optional
        Resolution label requested from pytube; defaults to "720p", the value
        that used to be hard-coded.

    Raises
    ------
    ValueError
        If pytube reports no mp4 stream at the requested resolution.
    """
    # Create the pytube handle for the video
    yt = YouTube(video_url)
    # Restrict the candidate streams to mp4 files
    mp4_files = yt.streams.filter(file_extension="mp4")
    # Pick the stream with the requested resolution
    mp4_down_files = mp4_files.get_by_resolution(resolution)
    # pytube hands back None when nothing matches; fail with a readable message
    # instead of an AttributeError on `None.download`.
    if mp4_down_files is None:
        raise ValueError(
            "No mp4 stream with resolution {} available for {}".format(resolution, video_url)
        )
    # Save the selected stream to the local file system
    mp4_down_files.download(folder_path)

In [None]:
# Folder (below the current working directory) that receives the men's videos.
folder_path_men = f"{wd()}/data/videos/men"

# Fetch every recording listed in the link dictionary, one after another.
for video_url in men_video_links.values():
    download_yt_videos(video_url, folder_path_men)

#### Rename the videos in the folder (original names are the full video titles)
 - Get the list of files in the folder.
 - For each file in the folder, check whether its name contains the weight string.
 - Rename the file in the folder to a simple name with the weight in the title.

In [23]:
def rename_files_in_folder(folder_path:str,
                        new_names_list,
                        addiional_str:str):
    """
    Rename downloaded videos to ``<name><addiional_str>.mp4``.

    A file is renamed when its current name contains ``"<name>kg"`` for one of
    the given names; the first matching name (in the order supplied) wins.
    Files that match no name are left untouched. Each matched name is printed.

    Parameters
    ----------
    folder_path : str
        Folder whose files are inspected.
    new_names_list : iterable of str
        Candidate names (e.g. the weight-class labels).
    addiional_str : str
        Suffix placed between the name and the ``.mp4`` extension.
    """
    # Materialise once so that one-shot iterables survive the nested loop.
    candidate_names = list(new_names_list)

    for file_name in os.listdir(folder_path):
        for name in candidate_names:
            if f"{name}kg" in file_name:
                print(name)
                original_file_name = "{}/{}".format(folder_path,file_name)
                updated_file_name = "{}/{}{}.mp4".format(folder_path,name,addiional_str)
                os.rename(original_file_name,updated_file_name)
                # The file no longer exists under its old name; testing further
                # names against it would end in a FileNotFoundError.
                break


# Shorten the downloaded men's videos to "<weight>_men_2019.mp4".
rename_files_in_folder(
    folder_path=folder_path_men,
    new_names_list=men_video_links.keys(),
    addiional_str="_men_2019",
)

109
109+


## 2 Create the folder structure
 - annotation files
 - name files
 - initial data pose files

In [24]:
import datetime
import csv

def create_csv_in_path(name_str,folder_path,columns,add_date = True):
    """
    Create a new csv file that contains only a header row.

    Parameters
    ----------
    name_str : str
        File name without extension.
    folder_path : str
        Folder the file is created in.
    columns : sequence
        Values written as the single header row.
    add_date : bool, optional
        When True (default) a ``_%Y_%m_%d__%H_%M`` timestamp of the current
        time is appended to the file name.

    Returns
    -------
    str
        Path of the created file.

    Raises
    ------
    FileExistsError
        If the target file already exists (it is opened in exclusive mode, so
        an existing file is never overwritten).
    """

    # Build the target path, optionally with a timestamp
    if add_date:
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d__%H_%M")
        csv_path = "{}/{}_{}.csv".format(folder_path,name_str,timestamp)
    else:
        csv_path = "{}/{}.csv".format(folder_path,name_str)

    with open(csv_path, mode='x', newline='') as f:
        csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(columns)

    # Echo the file back to confirm the header was written. Reading it in
    # Python (instead of shelling out to `cat`) works for paths that contain
    # spaces or shell metacharacters and on systems without `cat`.
    with open(csv_path, mode='r', newline='') as f:
        print(f.read(), end='')

    return csv_path


In [25]:
# --- Annotation files: one "<weight>_annot.csv" per weight class ------------
columns = ["name", "weight", "lift_start", "lift_end", "success"]
for name in men_video_links:
    create_csv_in_path("{}_annot".format(name), folder_path_men, columns, add_date=False)

# --- Name files: one "<weight>_name.csv" per weight class -------------------
columns = ["name", "country"]
for name in men_video_links:
    create_csv_in_path("{}_name".format(name), folder_path_men, columns, add_date=False)

# --- Pose files: one "<weight>_pose.csv" per weight class -------------------
# Eight metadata columns first ...
columns = ["id", "class", "time_ms", "success", "weightclass", "name", "country", "weight"]

# ... followed by x<i>, y<i>, z<i>, v<i> for i = 0..32, grouped per index
# (presumably one group per MediaPipe pose landmark - confirm).
select_features = [
    f'{dimension}{item}'
    for item in range(33)
    for dimension in ("x", "y", "z", "v")
]
columns += select_features

for name in men_video_links:
    create_csv_in_path("{}_pose".format(name), folder_path_men, columns, add_date=False)

## 3 Process annotations

 - Read the annotation file.
 - Read the name file.
 - Create combined csv file.

In [27]:
import pandas as pd

Create dictionaries with paths to the different files

In [40]:
files_list = os.listdir(folder_path_men)

# One lookup table per file kind, each keyed by weight-class label.
df_name_path, df_annotations_path, df_pose_path = {}, {}, {}
df_video_path, df_combined_path = {}, {}

# Marker substring -> table it fills. The order matters: the first marker found
# in a file name decides where the path is stored.
path_tables = (
    ("name", df_name_path),
    ("annot", df_annotations_path),
    ("pose", df_pose_path),
    ("mp4", df_video_path),
)

for weight in men_video_links:
    weight_marker = "{}_".format(weight)
    for item in files_list:
        if weight_marker not in item:
            continue
        for marker, table in path_tables:
            if marker in item:
                table[weight] = "{}/{}".format(folder_path_men,item)
                break

    # NOTE(review): this points into a "men" sub-folder of folder_path_men
    # (i.e. ".../men/men/") - confirm that this is the intended location.
    df_combined_path[weight] = "{}/men/{}_combined.csv".format(folder_path_men,weight)


Create a combined dataframe with the lift times converted to seconds

In [13]:
def get_sec(time_str):
    """
    Convert a clock-style time into a number of whole seconds.

    Accepts "H:M:S", "M:S" or plain "S" (each field an integer). Values that
    are not strings, such as integers parsed from a csv column, are converted
    with ``str`` first.

    Raises
    ------
    ValueError
        If there are more than three fields or a field is not an integer.
    """
    fields = str(time_str).strip().split(':')
    if len(fields) > 3:
        raise ValueError("Expected at most H:M:S, got {!r}".format(time_str))

    # Fold left to right: every further field is worth 1/60 of the previous one.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds


In [27]:
# `save_df_to_csv` is taken from the project's `util` module, which this
# notebook also uses for its final export. The call previously went through an
# alias `uf` that is not defined anywhere in the notebook (NameError on a fresh
# kernel). NOTE(review): assumed to be the same helper - confirm.
import util

for category in men_video_links.keys():
    # Athlete name/country table and the hand-made annotations of this category
    df_name = pd.read_csv(df_name_path[category])
    df_annotations = pd.read_csv(df_annotations_path[category])

    #!!!!!!! Insert athletes names and countries !!!!!!!

    # The annotation "name" column holds a number n; it is resolved to the
    # entry labelled n-1 of the name table.
    athlete_numbers = df_annotations["name"]
    # New "country" column at position 1, looked up from the name table
    df_annotations.insert(
        1, "country", athlete_numbers.apply(lambda x: df_name["country"][x-1]))
    # Replace the numbers by the athletes' names
    df_annotations["name"] = athlete_numbers.apply(lambda x: df_name["name"][x-1])

    #!!!!!!! Correct the time format in the dataframe !!!!!!!
    df_annotations["lift_start"] = df_annotations["lift_start"].apply(get_sec)
    # The annotated "lift_end" is added to the start, i.e. it is treated as an
    # offset in seconds from "lift_start".
    df_annotations["lift_end"] = df_annotations["lift_start"] + df_annotations["lift_end"]

    util.save_df_to_csv(df_annotations,df_combined_path[category])

28,TALAKHADZE,GEO,220,2745,2748,1
29,MARTIROSYAN,ARM,199,2693,2698,1
35,JIN,KOR,183,2569,2574,0
32,RIVAS,COL,182,3037,3039,0
29,DAVITADZE,GEO,172,2365,2369,1
29,KARAPETYAN,ARM,172,2333,2335,0
28,SHIZ,CHN,166,2735,2739,1
32,ISMAYILOV,TUR,154,2673,2675,0
33,LI,CHN,145,2848,2851,1
23,OM,PRK,128,2111,2113,1


## 4 Process pose for the annotations collected
Using mediapipe pose detector, capture the pose data for each weight category

In [28]:
import numpy as np
import cv2 # Import opencv
import mediapipe as mp # Import mediapipe
# Import the pose capture methods from mediapipe
mp_drawing = mp.solutions.drawing_utils # Drawing helpers (draw_landmarks, DrawingSpec)
mp_pose = mp.solutions.pose # Pose solution module (Pose, POSE_CONNECTIONS)

In [61]:
def capture_and_save_pose(category, total_index):
    """
    Detect the pose in every frame of the annotated lifts of one weight
    category and append the results to that category's pose csv file.

    The function reads the module-level dictionaries ``df_video_path``,
    ``df_combined_path`` and ``df_pose_path``. For every row of the combined
    annotation table the video is positioned at ``lift_start`` (seconds) and
    read frame by frame until the position reaches ``lift_end`` (seconds).
    For each frame in which a pose is detected one row is written:

        total_index, "other", position in ms, success, category, name,
        country, weight, then x, y, z, visibility for every landmark

    Every frame is shown in a window with the detected landmarks drawn on it.
    Pressing "q" in the window prompts on the console: answering "q" stops the
    processing, answering "p" prints the most recently written row.

    Parameters
    ----------
    category
        Key of the path dictionaries (the weight-class label).
    total_index : int
        Running lift counter; stored as the first value of every row and
        incremented each time a lift has been read to its end.

    Returns
    -------
    int
        The updated lift counter.
    """
    print("Processing {} weight category!".format(category))
    df = pd.read_csv(df_combined_path[category],index_col=0)
    len_of_list = df.shape[0]

    cap = cv2.VideoCapture(df_video_path[category]) # Using video file

    start_set = False   # True while the capture is positioned inside a lift
    list_index = 0      # position of the current lift in the annotation table
    lift = None         # annotation row of the current lift
    end_time_ms = None
    pose_row = None     # last row written; defined up front for the "p" command

    try:
        # The output file is opened once for the whole run rather than once
        # per frame.
        with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose, \
                open(df_pose_path[category], mode='a', newline='') as f:
            csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

            # Stops when all lifts are done (also covers an empty table) or
            # when the capture is not open.
            while list_index < len_of_list and cap.isOpened():

                if not start_set:
                    # Positional lookup, so duplicate index labels cannot
                    # return several rows at once.
                    lift = df.iloc[list_index]
                    start_time_ms = lift["lift_start"] * 1000
                    end_time_ms = lift["lift_end"] * 1000
                    cap.set(cv2.CAP_PROP_POS_MSEC,start_time_ms)
                    start_set = True

                current_time_ms = cap.get(cv2.CAP_PROP_POS_MSEC)

                if current_time_ms >= end_time_ms:
                    # Current lift finished: move on to the next annotation
                    start_set = False
                    list_index += 1
                    total_index += 1
                    continue

                success, frame = cap.read()

                if not success:
                    print("Camera frame empty!")
                    break # no more frames in the video stream

                # Recolor the frame from BGR to RGB before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make Detections
                results = pose.process(image)

                # Export coordinates. Frames without a detected pose carry no
                # landmarks and are skipped explicitly; any other failure
                # (e.g. a missing column or a write error) is no longer
                # swallowed silently.
                if results.pose_landmarks is not None:
                    # Normalised landmarks (the world landmarks are not used)
                    pose_row = [
                        total_index,
                        "other",
                        current_time_ms,
                        lift["success"],
                        category,
                        lift["name"],
                        lift["country"],
                        lift["weight"],
                    ]
                    for item in results.pose_landmarks.landmark:
                        pose_row.extend((item.x, item.y, item.z, item.visibility))

                    csv_writer.writerow(pose_row)

                # Recolor image back to BGR for rendering
                image.flags.writeable = True
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                # Draw Pose Detections
                mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                                        mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                        mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                                        )

                cv2.imshow('Raw Webcam Feed', image)

                # Stop playing
                if (cv2.waitKey(1) & 0xFF == ord('q')):
                    input_value = input("What?")
                    if input_value == "q": break
                    if input_value == "p":
                        print(pose_row)
    finally:
        # Release the video and close the preview window even when an error
        # interrupts the processing.
        cap.release()
        cv2.destroyAllWindows()

    print("Done! Total idx = {}".format(total_index))
    return total_index

In [62]:
# The lift counter is threaded through all categories so that it keeps
# counting across them instead of restarting at 0 for each video.
total_idx = 0
for category in men_video_links:
    total_idx = capture_and_save_pose(category, total_idx)

Processing 109+ weight category!
Done! Total idx = 29
Processing 109 weight category!
Done! Total idx = 59
Processing 102 weight category!
Done! Total idx = 95
Processing 96 weight category!
Done! Total idx = 128
Processing 89 weight category!
Done! Total idx = 158
Processing 81 weight category!
Done! Total idx = 188
Processing 73 weight category!
Done! Total idx = 217
Processing 67 weight category!
Done! Total idx = 250
Processing 61 weight category!
Done! Total idx = 284
Processing 55 weight category!
Done! Total idx = 308


Combine all the data into a single dataframe

In [63]:
# Load the pose csv of every weight category and stack them row-wise.
frames = [pd.read_csv(df_pose_path[category]) for category in men_video_links]
df = pd.concat(frames)
df.shape

(30266, 140)

Save the final collected dataframe to a csv file.

In [None]:
import util
# Target file for the stacked pose data, below the current working directory.
combined_pose_csv_path = f"{wd()}/data/combined_captured_pose_dataframe.csv"
util.save_df_to_csv(df, combined_pose_csv_path)