In [3]:
import os

# Compare the local 'videos' folder against the downloaded WLASL2000 folder and
# report which filenames are shared, only in the download, or only local.

# Paths to the directories
path_videos = 'videos'
path_downloaded = '../downloaded/WLASL2000'

# Get the set of filenames in each directory
files_in_videos = set(os.listdir(path_videos))
files_in_downloaded = set(os.listdir(path_downloaded))

# Find overlapping, new, and old files
overlapping_files = files_in_videos & files_in_downloaded
new_files = files_in_downloaded - files_in_videos
old_files = files_in_videos - files_in_downloaded

# Output results. The labels are built from the path variables so they cannot
# drift out of sync with the directories that were actually scanned.
print(f"Number of overlapping files: {len(overlapping_files)}")
print(f"Number of new files in '{path_downloaded}': {len(new_files)}")
print(f"Number of old files only in '{path_videos}': {len(old_files)}")

# Optional: Print old file names if needed.
# Sets have no stable iteration order, so sort to keep the listing reproducible.
print(f"Old files in '{path_videos}' only:")
for file in sorted(old_files):
    print(file)


Number of overlapping files: 13613
Number of new files in '../downloaded/WLASL2000': 7482
Number of old files only in 'videos': 0
Old files in 'videos' only:


In [6]:
import json
import os
import pandas as pd

# For every gloss in the WLASL metadata, count how many of its instances have a
# matching '<video_id>.mp4' in the video folder, then write the per-gloss table
# (plus one summary row) to CSV and display it.

# Path to the metadata and video folder
file_path = 'WLASL_v0.3.json'
video_folder_path = '../downloaded/WLASL2000'  # Change this to your video folder path

# JSON interchange files are UTF-8; open explicitly instead of relying on the
# platform's locale-default encoding.
with open(file_path, encoding='utf-8') as ipf:
    content = json.load(ipf)

# Create a list to store gloss and the count of instances with videos
gloss_video_data = []

for ent in content:
    gloss = ent['gloss']
    instances = ent['instances']

    # Count the number of instances with a corresponding video file
    count_with_video = sum(
        1
        for inst in instances
        if os.path.exists(os.path.join(video_folder_path, f"{inst['video_id']}.mp4"))
    )
    gloss_video_data.append({'Gloss': gloss, 'Instances with Video': count_with_video})

# Create a pandas DataFrame
df = pd.DataFrame(gloss_video_data)

# Calculate summary statistics (done before the summary row is appended, while
# the count column still holds only integers)
average_instances = df['Instances with Video'].mean()
min_instances = df['Instances with Video'].min()
max_instances = df['Instances with Video'].max()

# Add summary row
summary = pd.DataFrame({
    'Gloss': ['Summary'],
    'Instances with Video': [f'Avg: {average_instances:.2f}, Min: {min_instances}, Max: {max_instances}']
})

# Append the summary to the DataFrame.
# NOTE: the summary cell is a string, so from here on the
# 'Instances with Video' column holds mixed int/str values.
df = pd.concat([df, summary], ignore_index=True)

df.to_csv('gloss_summary_new.csv', index=False)

df


Unnamed: 0,Gloss,Instances with Video
0,book,40
1,drink,35
2,computer,30
3,before,26
4,chair,26
...,...,...
1996,waterfall,7
1997,weigh,7
1998,wheelchair,7
1999,whistle,7


In [8]:
import os

# Check that every downloaded video has a keypoint folder of the same name.

# Paths to the directories
path_downloaded = '../downloaded/WLASL2000'
path_keypoints = '../data/pose_per_individual_videos'

# Get the set of video names (without extension) in the downloaded folder.
# splitext removes only the final '.mp4', so a name containing extra dots
# (e.g. 'clip.v2.mp4' -> 'clip.v2') is not truncated at the first dot.
video_files = {
    os.path.splitext(f)[0]
    for f in os.listdir(path_downloaded)
    if f.endswith('.mp4')
}

# Get the set of keypoint folder names
keypoint_folders = set(os.listdir(path_keypoints))

# Find corresponding and missing keypoint folders
corresponding_keypoints = video_files & keypoint_folders
missing_keypoints = video_files - keypoint_folders

# Output results
print(f"Number of corresponding keypoint folders: {len(corresponding_keypoints)}")
print(f"Number of missing keypoint folders: {len(missing_keypoints)}")

# Optional: Print missing keypoint folder names if needed.
# Sets have no stable iteration order, so sort to keep the listing reproducible.
print("Missing keypoint folders for these videos:")
for video in sorted(missing_keypoints):
    print(video)


Number of corresponding keypoint folders: 21095
Number of missing keypoint folders: 0
Missing keypoint folders for these videos:
