In [2]:
import os
import pandas as pd
from glob import glob

# 1. Count the number of MP4 files in the directory
mp4_dir = 'labelbox_sidewalk_legacy_data'
# os.listdir yields bare entry names (no directory prefix) for this directory
# only; the extension test is case-insensitive, so '.MP4' is counted as well.
mp4_files = [f for f in os.listdir(mp4_dir) if f.lower().endswith('.mp4')]
print(f"Number of MP4 files: {len(mp4_files)}")

Number of MP4 files: 1501


In [3]:
# 2. Read continuous_videos.csv and find videos not in labelbox_sidewalk_legacy_data
csv_path = 'data/continuous_videos.csv'
# Load the video listing; the steps below read its 'video_name' column.
df = pd.read_csv(csv_path)

# Extract video names from the CSV (remove extensions, change to .mp4)
def mov_to_mp4(name):
    """Return ``name`` with a trailing ``.mov`` extension swapped for ``.mp4``.

    The extension check ignores case; names that do not end in ``.mov`` are
    returned unchanged.
    """
    if not name.lower().endswith('.mov'):
        return name
    # Drop the four-character extension and append the new one.
    return f"{name[:-4]}.mp4"
# Derive the .mp4 form of every listed video name.
df['video_name_mp4'] = df['video_name'].map(mov_to_mp4)

# Set of local MP4 file names for fast membership tests.
mp4_set = set(mp4_files)

# Keep the rows whose .mp4 name has no match in the local directory listing.
missing_mask = ~df['video_name_mp4'].isin(mp4_set)
missing_data_df = df.loc[missing_mask].copy()
print(f"Missing videos count: {len(missing_data_df)}")
missing_data_df.head()

Missing videos count: 0


Unnamed: 0,video_name,collector_name,material,full_path,duration,video_name_mp4


## Inspect testbed data

In [2]:
import cv2
import re
import os
import pandas as pd
from glob import glob

# Directory holding the testbed videos.
testbed_dir = 'labelbox_sidewalk_testbed_data'
# Bare names of its entries ending in '.mp4' (case-insensitive match).
testbed_files = [f for f in os.listdir(testbed_dir) if f.lower().endswith('.mp4')]

def extract_year(filename):
    """Return the first ``20xx`` four-digit run in ``filename`` as an int.

    Returns None when the name contains no such run.
    """
    found = re.search(r'(20\d{2})', filename)
    return int(found.group(1)) if found else None

def get_video_duration(filepath):
    """Return the duration of a video in seconds, or None if it is unavailable.

    The duration is the frame count divided by the frames-per-second value,
    both as reported by OpenCV for the opened file.

    Args:
        filepath: Path of the video file to probe.

    Returns:
        The duration in seconds, or None when the file cannot be opened, the
        reported FPS is not positive, or probing raises an exception.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(filepath)
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        # A non-positive FPS would make the division meaningless (or fail).
        return frame_count / fps if fps > 0 else None
    except Exception:
        # Best effort: a failure on one file is reported as None to the caller.
        return None
    finally:
        # Release the handle on every path, including the early return and
        # the exception path, so capture objects are never leaked.
        if cap is not None:
            cap.release()

# One record per testbed video: its file name, the year parsed from the name,
# and its duration in seconds (None when the duration cannot be obtained).
testbed_data = [
    {
        'file_name': fname,
        'year': extract_year(fname),
        'duration_sec': get_video_duration(os.path.join(testbed_dir, fname)),
    }
    for fname in testbed_files
]

testbed_df = pd.DataFrame(testbed_data)
testbed_df.head()

Unnamed: 0,file_name,year,duration_sec
0,2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC...,2025,57.6
1,2025-07-14_14-22-35-043_Roller_Ball_Clear_BMCC...,2025,8.133333
2,2025-07-15_14-15-31-424_Roller_Ball_Clear_BMCC...,2025,21.0
3,2025-07-15_13-51-06-043_Roller_Ball_Clear_BMCC...,2025,109.0
4,0_2024-07-30 13:58:08.8070_none.mp4,2024,59.666667


In [3]:
# Split the listing by the year parsed from each file name; rows whose year
# is neither 2024 nor 2025 end up in neither subset.
testbed_df_2024 = testbed_df.loc[testbed_df['year'].eq(2024)]
testbed_df_2025 = testbed_df.loc[testbed_df['year'].eq(2025)]

In [4]:
# (rows, columns) of the 2024 and 2025 subsets, shown together.
testbed_df_2024.shape, testbed_df_2025.shape

((170, 3), (434, 3))

In [5]:
# Preview the 2025 subset. Row labels are carried over from testbed_df, so
# they need not be consecutive.
testbed_df_2025.head()

Unnamed: 0,file_name,year,duration_sec
0,2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC...,2025,57.6
1,2025-07-14_14-22-35-043_Roller_Ball_Clear_BMCC...,2025,8.133333
2,2025-07-15_14-15-31-424_Roller_Ball_Clear_BMCC...,2025,21.0
3,2025-07-15_13-51-06-043_Roller_Ball_Clear_BMCC...,2025,109.0
5,2025-07-14_15-14-25-839_Metal_Clear_BMCC_45.mp4,2025,9.533333


In [6]:
# Show the complete file name of the first 2025 row (first row, first column);
# table previews may truncate long names.
testbed_df_2025.iloc[0, 0]

'2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC_7.mp4'

In [7]:
def extract_cane_tip_wether_location_v2(filename):
    """Parse cane tip, weather and location from a testbed video file name.

    Expected layout of the name without its extension (underscore-separated)::

        <date>_<time>_CaneTip[_CaneTip2...]_Weather_Location_Segment

    The three trailing fields are read from the right-hand end, so the cane
    tip may itself contain underscores (e.g. ``Roller_Ball``).

    Returns:
        A ``(cane_tip, weather, location)`` tuple, or ``(None, None, None)``
        when the name has fewer than six underscore-separated fields.
    """
    stem = filename.rsplit('.', 1)[0]
    fields = stem.split('_')
    if len(fields) < 6:
        # Not enough fields to match the expected layout.
        return None, None, None
    # Peel the fixed trailing fields off the right; the segment is not returned.
    *leading, weather, location, _segment = fields
    # Skip the date and time fields; the rest is the cane tip (one or more parts).
    cane_tip = '_'.join(leading[2:])
    return cane_tip, weather, location

# Parse cane tip / weather / location out of every file name. Names that do
# not follow the expected layout yield missing values in all three columns.
parsed_columns = ['cane_tip', 'wether', 'location']


def _parse_name_to_series(file_name):
    """Wrap the parsed tuple in a Series so ``apply`` expands it into three columns."""
    return pd.Series(extract_cane_tip_wether_location_v2(file_name))


testbed_df[parsed_columns] = testbed_df['file_name'].apply(_parse_name_to_series)

# Work on an independent copy of the 2025 subset; the slice itself is left untouched.
testbed_df_2025_test = testbed_df_2025.copy()
testbed_df_2025_test[parsed_columns] = testbed_df_2025_test['file_name'].apply(_parse_name_to_series)
testbed_df_2025_test.head()

Unnamed: 0,file_name,year,duration_sec,cane_tip,wether,location
0,2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC...,2025,57.6,Roller_Ball,Clear,BMCC
1,2025-07-14_14-22-35-043_Roller_Ball_Clear_BMCC...,2025,8.133333,Roller_Ball,Clear,BMCC
2,2025-07-15_14-15-31-424_Roller_Ball_Clear_BMCC...,2025,21.0,Roller_Ball,Clear,BMCC
3,2025-07-15_13-51-06-043_Roller_Ball_Clear_BMCC...,2025,109.0,Roller_Ball,Clear,BMCC
5,2025-07-14_15-14-25-839_Metal_Clear_BMCC_45.mp4,2025,9.533333,Metal,Clear,BMCC


In [8]:
# Preview the full listing after parsing; rows whose names do not match the
# expected layout have empty cane_tip / wether / location values.
testbed_df.head()

Unnamed: 0,file_name,year,duration_sec,cane_tip,wether,location
0,2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC...,2025,57.6,Roller_Ball,Clear,BMCC
1,2025-07-14_14-22-35-043_Roller_Ball_Clear_BMCC...,2025,8.133333,Roller_Ball,Clear,BMCC
2,2025-07-15_14-15-31-424_Roller_Ball_Clear_BMCC...,2025,21.0,Roller_Ball,Clear,BMCC
3,2025-07-15_13-51-06-043_Roller_Ball_Clear_BMCC...,2025,109.0,Roller_Ball,Clear,BMCC
4,0_2024-07-30 13:58:08.8070_none.mp4,2024,59.666667,,,


In [10]:
# NOTE(review): both tuple elements are the same expression, so this compares
# the frame with itself. Presumably one side was meant to be another frame
# (e.g. testbed_df_2025) — confirm.
testbed_df_2025_test.shape, testbed_df_2025_test.shape

((434, 6), (434, 6))

In [11]:
# Write the 2025 subset to CSV without the index column.
# At this point the frame has no download_link column yet; the same path is
# written again after that column is added.
testbed_df_2025_test.to_csv('./data/testbed_2025.csv', index=False)

In [12]:
# Base URL to which each file name is appended to form its download link.
prefix = "https://storage.googleapis.com/labelbox-sidewalk-testbed-2025/"

# Plain string concatenation: file names are appended verbatim (no URL-encoding).
testbed_df_2025_test['download_link'] = prefix + testbed_df_2025_test['file_name']

In [13]:
# Preview the 2025 subset now that the download_link column is present.
testbed_df_2025_test.head()

Unnamed: 0,file_name,year,duration_sec,cane_tip,wether,location,download_link
0,2025-07-15_13-45-34-116_Roller_Ball_Clear_BMCC...,2025,57.6,Roller_Ball,Clear,BMCC,https://storage.googleapis.com/labelbox-sidewa...
1,2025-07-14_14-22-35-043_Roller_Ball_Clear_BMCC...,2025,8.133333,Roller_Ball,Clear,BMCC,https://storage.googleapis.com/labelbox-sidewa...
2,2025-07-15_14-15-31-424_Roller_Ball_Clear_BMCC...,2025,21.0,Roller_Ball,Clear,BMCC,https://storage.googleapis.com/labelbox-sidewa...
3,2025-07-15_13-51-06-043_Roller_Ball_Clear_BMCC...,2025,109.0,Roller_Ball,Clear,BMCC,https://storage.googleapis.com/labelbox-sidewa...
5,2025-07-14_15-14-25-839_Metal_Clear_BMCC_45.mp4,2025,9.533333,Metal,Clear,BMCC,https://storage.googleapis.com/labelbox-sidewa...


In [14]:
# Re-export to the same path so the CSV includes the download_link column;
# this overwrites the file written earlier.
testbed_df_2025_test.to_csv('./data/testbed_2025.csv', index=False)