In [6]:
%pip install moviepy

Note: you may need to restart the kernel to use updated packages.


In [12]:
from glob import glob
import pandas as pd
import os
from pathlib import Path
# from moviepy.editor import VideoFileClip

In [13]:
# Define the root directory where we'll search for videos
root_dir = Path('10_3_2023_all_data')

def clean_material_name(material):
    """Clean and standardize material names"""
    # Convert to lowercase first
    material = material.lower()
    
    # Standard replacements
    replacements = {
        'none': 'none',
        'subway+grate': 'subway_grate',
        'cellar+door': 'cellar_door',
        'concrete+tactile': 'concrete_tactile',
        'concret+tactile': 'concrete_tactile',
        'concrete+stones': 'concrete_stones',
        'storm drains': 'storm_drain',
        'floor art': 'floor_art'
    }
    
    # Replace + with _ and clean up spaces
    material = material.replace('+', '_').strip()
    
    # Apply standard replacements
    return replacements.get(material, material)

# Find all .MOV files recursively
video_files = []
for video_path in root_dir.rglob('*.MOV'):
    # Get the collector name (parent folder name)
    collector_name = video_path.parent.name
    
    # Extract material from filename
    filename = video_path.stem  # removes .MOV extension
    material = filename.split('_')[-1]  # get last part after underscore
    
    # Store information in a dictionary
    video_info = {
        'video_name': video_path.name,
        'collector_name': collector_name,
        'material': clean_material_name(material),
        'full_path': str(video_path)
    }
    video_files.append(video_info)

# Create a DataFrame
df = pd.DataFrame(video_files)

# Save to CSV
output_file = 'video_inventory.csv'
df.to_csv(output_file, index=False)
print(f"Found {len(video_files)} video files")
print(f"Data saved to {output_file}")
df.head()

Found 1525 video files
Data saved to video_inventory.csv


Unnamed: 0,video_name,collector_name,material,full_path
0,6_2023-06-30_11-15-18-5960_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...
1,1_2023-06-30_11-07-11-9360_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...
2,1_2023-06-16_11_59_12.9130_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...
3,3_2023-06-16_11_42_44.9410_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...
4,1_2023-06-30_10-58-42-3550_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...


In [14]:
# Show unique materials and their counts
material_counts = df['material'].value_counts()
print("Unique materials and their counts after cleaning:")
print(material_counts)

Unique materials and their counts after cleaning:
material
none                755
concrete            103
manhole              92
subway_grate         87
dirt                 83
brick                73
cellar_door          71
tactile              70
metal                61
asphalt              43
grass                33
grate                12
storm_drain          11
granite               6
carpet                5
concrete_tactile      5
concrete_stones       4
stone                 4
concret_tactile       4
floor_art             3
Name: count, dtype: int64


In [15]:
# Install opencv-python if not already installed
# %pip install opencv-python

import cv2

# Function to get video duration
def get_video_duration(video_path):
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        duration = frame_count / fps
        cap.release()
        return round(duration, 2)
    except Exception as e:
        print(f"Error reading {video_path}: {str(e)}")
        return None

# Add durations to our dataframe
print("Calculating video durations...")
df['duration'] = df['full_path'].apply(get_video_duration)

# Save updated CSV
df.to_csv(output_file, index=False)

# Show summary statistics of video durations
print("\nVideo duration statistics (in seconds):")
print(df['duration'].describe())

# Show a few example rows
print("\nExample rows with durations:")
print(df[['video_name', 'collector_name', 'material', 'duration']].head())

Calculating video durations...

Video duration statistics (in seconds):
count    1525.000000
mean       34.571351
std        19.344797
min         0.200000
25%        20.100000
50%        32.350000
75%        42.270000
max       185.160000
Name: duration, dtype: float64

Example rows with durations:
                            video_name collector_name material  duration
0  6_2023-06-30_11-15-18-5960_None.MOV          wayne     none     59.79
1  1_2023-06-30_11-07-11-9360_None.MOV          wayne     none     59.79
2  1_2023-06-16_11_59_12.9130_None.MOV          wayne     none     59.79
3  3_2023-06-16_11_42_44.9410_None.MOV          wayne     none     55.33
4  1_2023-06-30_10-58-42-3550_None.MOV          wayne     none     59.76

Video duration statistics (in seconds):
count    1525.000000
mean       34.571351
std        19.344797
min         0.200000
25%        20.100000
50%        32.350000
75%        42.270000
max       185.160000
Name: duration, dtype: float64

Example rows with du

In [16]:
df.head()

Unnamed: 0,video_name,collector_name,material,full_path,duration
0,6_2023-06-30_11-15-18-5960_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.79
1,1_2023-06-30_11-07-11-9360_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.79
2,1_2023-06-16_11_59_12.9130_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.79
3,3_2023-06-16_11_42_44.9410_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,55.33
4,1_2023-06-30_10-58-42-3550_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.76


In [17]:
df.to_csv(output_file, index=False)

In [18]:
# Separate videos into continuous (none material) and static (with material)
continuous_videos = df[df['material'] == 'none'].copy()
static_videos = df[df['material'] != 'none'].copy()

print(f"Number of continuous videos (no material): {len(continuous_videos)}")
print(f"Number of static videos (with material): {len(static_videos)}")

# Save both dataframes to separate CSV files
continuous_videos.to_csv('continuous_videos.csv', index=False)
static_videos.to_csv('static_videos.csv', index=False)

# Show sample from each dataframe
print("\nSample of continuous videos:")
print(continuous_videos[['video_name', 'collector_name', 'material', 'duration']].head())

print("\nSample of static videos:")
print(static_videos[['video_name', 'collector_name', 'material', 'duration']].head())

# Show material distribution in static videos
print("\nMaterial distribution in static videos:")
print(static_videos['material'].value_counts())

Number of continuous videos (no material): 755
Number of static videos (with material): 770

Sample of continuous videos:
                            video_name collector_name material  duration
0  6_2023-06-30_11-15-18-5960_None.MOV          wayne     none     59.79
1  1_2023-06-30_11-07-11-9360_None.MOV          wayne     none     59.79
2  1_2023-06-16_11_59_12.9130_None.MOV          wayne     none     59.79
3  3_2023-06-16_11_42_44.9410_None.MOV          wayne     none     55.33
4  1_2023-06-30_10-58-42-3550_None.MOV          wayne     none     59.76

Sample of static videos:
                                      video_name  collector_name  \
122  0_2023-08-21 13_31_45.3030_Subway+grate.MOV  jiawei_testbed   
123       0_2023-08-21 13_25_10.3210_Manhole.MOV  jiawei_testbed   
125  1_2023-08-21 13_31_45.3030_Subway+grate.MOV  jiawei_testbed   
126   0_2023-08-21 13_35_23.4820_Cellar+door.MOV  jiawei_testbed   
128       0_2023-08-21 13_24_23.9400_Manhole.MOV  jiawei_testbed   

     

In [19]:
# Check for duplicates in continuous_videos
duplicate_mask = continuous_videos.duplicated(subset=['video_name'], keep=False)
duplicated_videos = continuous_videos[duplicate_mask].sort_values('video_name')

print("Number of duplicated videos found:", len(duplicated_videos) // 2)
print("\nExample of duplicated videos:")
print(duplicated_videos[['video_name', 'collector_name', 'full_path']].head(10))

# Create a clean version without duplicates (keeping the first occurrence)
continuous_videos_clean = continuous_videos.drop_duplicates(subset=['video_name'], keep='first').copy()

print("\nOriginal continuous videos count:", len(continuous_videos))
print("Clean continuous videos count:", len(continuous_videos_clean))

# Save the clean version to a new CSV
continuous_videos_clean.to_csv('continuous_videos_clean.csv', index=False)

# Show distribution of collectors in original vs clean dataset
print("\nCollector distribution in original dataset:")
print(continuous_videos['collector_name'].value_counts())

print("\nCollector distribution in clean dataset:")
print(continuous_videos_clean['collector_name'].value_counts())

Number of duplicated videos found: 24

Example of duplicated videos:
                              video_name collector_name  \
4    1_2023-06-30_10-58-42-3550_None.MOV          wayne   
267  1_2023-06-30_10-58-42-3550_None.MOV          nihal   
266  1_2023-06-30_11-07-11-9360_None.MOV          nihal   
1    1_2023-06-30_11-07-11-9360_None.MOV          wayne   
280  1_2023-06-30_11-15-18-5960_None.MOV          nihal   
34   1_2023-06-30_11-15-18-5960_None.MOV          wayne   
287  1_2023-06-30_11-23-29-2950_None.MOV          nihal   
43   1_2023-06-30_11-23-29-2950_None.MOV          wayne   
272  2_2023-06-30_10-58-42-3550_None.MOV          nihal   
20   2_2023-06-30_10-58-42-3550_None.MOV          wayne   

                                             full_path  
4    10_3_2023_all_data/testbed/old_testbed/user_da...  
267  10_3_2023_all_data/testbed/user_data/nihal/1_2...  
266  10_3_2023_all_data/testbed/user_data/nihal/1_2...  
1    10_3_2023_all_data/testbed/old_testbed/user_da..

In [20]:
# Group duplicates by video name to see where they appear
duplicate_analysis = duplicated_videos.groupby('video_name').agg({
    'collector_name': lambda x: list(x),
    'full_path': lambda x: list(x)
}).reset_index()

print("Detailed analysis of some duplicated videos:")
print("\nShowing where the same video appears in different locations:")
for _, row in duplicate_analysis.head().iterrows():
    print(f"\nVideo: {row['video_name']}")
    print("Found in:")
    for collector, path in zip(row['collector_name'], row['full_path']):
        print(f"- Collector: {collector}")
        print(f"  Path: {path}")

# Count how many videos appear in which combinations of collectors
collector_combinations = duplicated_videos.groupby('video_name')['collector_name'].apply(lambda x: tuple(sorted(set(x))))
common_patterns = collector_combinations.value_counts()

print("\nCommon patterns of video duplication:")
print(common_patterns.head())

Detailed analysis of some duplicated videos:

Showing where the same video appears in different locations:

Video: 1_2023-06-30_10-58-42-3550_None.MOV
Found in:
- Collector: wayne
  Path: 10_3_2023_all_data/testbed/old_testbed/user_data/wayne/1_2023-06-30_10-58-42-3550_None.MOV
- Collector: nihal
  Path: 10_3_2023_all_data/testbed/user_data/nihal/1_2023-06-30_10-58-42-3550_None.MOV

Video: 1_2023-06-30_11-07-11-9360_None.MOV
Found in:
- Collector: nihal
  Path: 10_3_2023_all_data/testbed/user_data/nihal/1_2023-06-30_11-07-11-9360_None.MOV
- Collector: wayne
  Path: 10_3_2023_all_data/testbed/old_testbed/user_data/wayne/1_2023-06-30_11-07-11-9360_None.MOV

Video: 1_2023-06-30_11-15-18-5960_None.MOV
Found in:
- Collector: nihal
  Path: 10_3_2023_all_data/testbed/user_data/nihal/1_2023-06-30_11-15-18-5960_None.MOV
- Collector: wayne
  Path: 10_3_2023_all_data/testbed/old_testbed/user_data/wayne/1_2023-06-30_11-15-18-5960_None.MOV

Video: 1_2023-06-30_11-23-29-2950_None.MOV
Found in:
- Col

In [21]:
duplicated_videos

Unnamed: 0,video_name,collector_name,material,full_path,duration
4,1_2023-06-30_10-58-42-3550_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.76
267,1_2023-06-30_10-58-42-3550_None.MOV,nihal,none,10_3_2023_all_data/testbed/user_data/nihal/1_2...,59.76
266,1_2023-06-30_11-07-11-9360_None.MOV,nihal,none,10_3_2023_all_data/testbed/user_data/nihal/1_2...,59.79
1,1_2023-06-30_11-07-11-9360_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.79
280,1_2023-06-30_11-15-18-5960_None.MOV,nihal,none,10_3_2023_all_data/testbed/user_data/nihal/1_2...,59.79
34,1_2023-06-30_11-15-18-5960_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.79
287,1_2023-06-30_11-23-29-2950_None.MOV,nihal,none,10_3_2023_all_data/testbed/user_data/nihal/1_2...,59.81
43,1_2023-06-30_11-23-29-2950_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.81
272,2_2023-06-30_10-58-42-3550_None.MOV,nihal,none,10_3_2023_all_data/testbed/user_data/nihal/2_2...,59.76
20,2_2023-06-30_10-58-42-3550_None.MOV,wayne,none,10_3_2023_all_data/testbed/old_testbed/user_da...,59.76


In [22]:
import pandas as pd

# 1. 处理 static_videos.csv
static_csv = 'data/static_videos.csv'
static_df = pd.read_csv(static_csv)
static_prefix = 'https://storage.googleapis.com/labelbox-sidewalk-legacy-data-static/'

def mov_to_mp4(name):
    if name.lower().endswith('.mov'):
        return name[:-4] + '.mp4'
    return name

static_df['download_link'] = static_prefix + static_df['video_name'].apply(mov_to_mp4)
static_df.to_csv(static_csv, index=False)
print(f"static_videos_with_link.csv saved, shape: {static_df.shape}")

# 2. 处理 continuous_videos_clean.csv
cont_csv = 'data/continuous_videos_clean.csv'
cont_df = pd.read_csv(cont_csv)
cont_prefix = 'https://storage.googleapis.com/labelbox-sidewalk-legacy-data-continuous/'

cont_df['download_link'] = cont_prefix + cont_df['video_name'].apply(mov_to_mp4)
cont_df.to_csv(cont_csv, index=False)
print(f"continuous_videos_clean_with_link.csv saved, shape: {cont_df.shape}")

static_videos_with_link.csv saved, shape: (770, 6)
continuous_videos_clean_with_link.csv saved, shape: (731, 6)


In [23]:
static_df.head()

Unnamed: 0,video_name,collector_name,material,full_path,duration,download_link
0,0_2023-08-21 13_31_45.3030_Subway+grate.MOV,jiawei_testbed,subway_grate,10_3_2023_all_data/testbed/user_data/jiawei_te...,59.85,https://storage.googleapis.com/labelbox-sidewa...
1,0_2023-08-21 13_25_10.3210_Manhole.MOV,jiawei_testbed,manhole,10_3_2023_all_data/testbed/user_data/jiawei_te...,5.6,https://storage.googleapis.com/labelbox-sidewa...
2,1_2023-08-21 13_31_45.3030_Subway+grate.MOV,jiawei_testbed,subway_grate,10_3_2023_all_data/testbed/user_data/jiawei_te...,3.2,https://storage.googleapis.com/labelbox-sidewa...
3,0_2023-08-21 13_35_23.4820_Cellar+door.MOV,jiawei_testbed,cellar_door,10_3_2023_all_data/testbed/user_data/jiawei_te...,9.37,https://storage.googleapis.com/labelbox-sidewa...
4,0_2023-08-21 13_24_23.9400_Manhole.MOV,jiawei_testbed,manhole,10_3_2023_all_data/testbed/user_data/jiawei_te...,7.4,https://storage.googleapis.com/labelbox-sidewa...
