In [6]:
# Install yt-dlp
%pip install -U yt-dlp --quiet


Note: you may need to restart the kernel to use updated packages.


In [7]:
# Download video using command-line version with higher resolution options
# Choose one of the following format options:

# Option 1: Best quality video + best audio (may result in separate files that get merged)
# !yt-dlp -f "bestvideo+bestaudio" -o "%(title)s.%(ext)s" https://www.youtube.com/watch?v=RB_bNAHj0Kg

# Option 2: Specific resolution (uncomment to use)
# !yt-dlp -f "best[height<=1080]" -o "%(title)s.%(ext)s" https://www.youtube.com/watch?v=RB_bNAHj0Kg

# Option 3: Best available up to 4K (uncomment to use)
# !yt-dlp -f "best[height<=2160]" -o "%(title)s.%(ext)s" https://www.youtube.com/watch?v=RB_bNAHj0Kg

# Option 4: Prefer higher resolution with fallback (uncomment to use)
# !yt-dlp -f "bestvideo[height>=720]+bestaudio/best" -o "%(title)s.%(ext)s" https://www.youtube.com/watch?v=RB_bNAHj0Kg


In [1]:
import yt_dlp

url = "https://www.youtube.com/watch?v=RB_bNAHj0Kg"

# Option 1: Highest quality (recommended)
ydl_opts = {
    "format": "bestvideo+bestaudio",
    "outtmpl": "%(title)s.%(ext)s",
    "ignoreerrors": True,           # Don't abort on errors
    "no_warnings": False,           # Show warnings but continue
    "keepvideo": True,              # Keep video file even if merge fails
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

# Alternative options (uncomment to use instead):

# Option 2: Specific resolution up to 1080p
# ydl_opts = {
#     "format": "best[height<=1080]",
#     "outtmpl": "%(title)s.%(ext)s",
# }

# Option 3: Specific resolution up to 4K
# ydl_opts = {
#     "format": "best[height<=2160]",
#     "outtmpl": "%(title)s.%(ext)s",
# }

# Option 4: Prefer 720p+ with fallback
# ydl_opts = {
#     "format": "bestvideo[height>=720]+bestaudio/best",
#     "outtmpl": "%(title)s.%(ext)s",
# }


[youtube] Extracting URL: https://www.youtube.com/watch?v=RB_bNAHj0Kg
[youtube] RB_bNAHj0Kg: Downloading webpage
[youtube] RB_bNAHj0Kg: Downloading tv client config
[youtube] RB_bNAHj0Kg: Downloading tv player API JSON
[youtube] RB_bNAHj0Kg: Downloading ios player API JSON
[youtube] RB_bNAHj0Kg: Downloading m3u8 information
[info] RB_bNAHj0Kg: Downloading 1 format(s): 137+251
[download] Destination: “From Skin to Skeleton ：Towards Biomechanically Accurate 3D Digital Humans.f137.mp4
[download]  23.2% of   42.70MiB at    5.40MiB/s ETA 00:06   

ERROR: unable to download video data: HTTP Error 403: Forbidden


In [5]:
# Load GAVD dataset to get video URLs
import pandas as pd
import os

# Load first dataset part to get URLs
data_dir = '../GAVD/data/'
first_csv = os.path.join(data_dir, 'GAVD_Clinical_Annotations_1.csv')
df = pd.read_csv(first_csv)

# Display available columns
print("Available columns:", list(df.columns))
print(f"Dataset shape: {df.shape}")

# Check if URL column exists and show sample URLs
if 'url' in df.columns:
    unique_urls = df['url'].dropna().unique()
    print(f"\nFound {len(unique_urls)} unique video URLs")
    print("Sample URLs:")
    for i, url in enumerate(unique_urls[:5]):
        print(f"  {i+1}. {url}")
else:
    print("No 'url' column found in the dataset")


Available columns: ['seq', 'frame_num', 'cam_view', 'gait_event', 'dataset', 'gait_pat', 'bbox', 'vid_info', 'id', 'url']
Dataset shape: (91624, 10)

Found 69 unique video URLs
Sample URLs:
  1. https://www.youtube.com/watch?v=B5hrxKe2nP8
  2. https://www.youtube.com/watch?v=TgkxrrhnvlM
  3. https://www.youtube.com/watch?v=5SBtTbfELUU
  4. https://www.youtube.com/watch?v=0gAHlm79Bjo
  5. https://www.youtube.com/watch?v=jzkn287X-84


  df = pd.read_csv(first_csv)


In [None]:
# Download a single video from GAVD dataset (highest quality)
import yt_dlp

cookies_path = "../tools/youtube_cookies.txt"
# Select first available URL from the dataset
if 'url' in df.columns and len(df['url'].dropna()) > 0:
    sample_url = df['url'].dropna().iloc[0]
    print(f"Downloading video from: {sample_url}")
    
    ydl_opts = {
        "format": "best[ext=mp4]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best",
        "merge_output_format": "mp4",
        "outtmpl": "GAVD_%(id)s.%(ext)s",  # saves as GAVD_<id>.mp4
        "cookiefile": cookies_path,
        "ignoreerrors": True,
        "continuedl": True,
        "retries": 3,
        "fragment_retries": 10,
        "writeinfojson": False,
        "keepvideo": False,
        "concurrent_fragment_downloads": 1,
    }
    
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([sample_url])
        print("Download completed successfully!")
    except Exception as e:
        print(f"Error downloading video: {e}")
else:
    print("No valid URLs found in the dataset")


Downloading video from: https://www.youtube.com/watch?v=B5hrxKe2nP8
[youtube] Extracting URL: https://www.youtube.com/watch?v=B5hrxKe2nP8
[youtube] B5hrxKe2nP8: Downloading webpage
[youtube] B5hrxKe2nP8: Downloading tv client config
[youtube] B5hrxKe2nP8: Downloading tv player API JSON
[info] B5hrxKe2nP8: Downloading 1 format(s): 18
[download] Destination: GAVD_B5hrxKe2nP8.mp4
[download] 100% of    5.25MiB in 00:00:02 at 1.96MiB/s     
Download completed successfully!


In [None]:
# BATCH DOWNLOAD: Process all GAVD annotation files and download unique videos
import yt_dlp
import os
import pandas as pd
from collections import defaultdict
import time

os.makedirs("GAVD-videos", exist_ok=True)

# Process all 5 annotation files
data_dir = '../GAVD/data/'
annotation_files = [
    'GAVD_Clinical_Annotations_1.csv',
    'GAVD_Clinical_Annotations_2.csv', 
    'GAVD_Clinical_Annotations_3.csv',
    'GAVD_Clinical_Annotations_4.csv',
    'GAVD_Clinical_Annotations_5.csv'
]

print("🔍 Processing all GAVD annotation files...")
all_videos = {}  # Dictionary to store unique videos: {video_id: url}
video_info = defaultdict(dict)  # Store additional info for each video

for i, filename in enumerate(annotation_files, 1):
    filepath = os.path.join(data_dir, filename)
    print(f"📁 Loading {filename}...")
    
    try:
        # Load with low_memory=False to avoid dtype warnings
        df_temp = pd.read_csv(filepath, low_memory=False)
        
        # Extract unique video URLs from this file
        valid_rows = df_temp[df_temp['url'].notna() & df_temp['id'].notna()]
        
        for _, row in valid_rows.iterrows():
            video_id = row['id']
            video_url = row['url']
            
            if video_id not in all_videos:
                all_videos[video_id] = video_url
                # Store additional metadata
                video_info[video_id] = {
                    'gait_pat': row.get('gait_pat', 'unknown'),
                    'dataset': row.get('dataset', 'unknown'),
                    'vid_info': row.get('vid_info', {}),
                    'source_file': filename
                }
        
        unique_in_file = len(valid_rows['id'].unique())
        print(f"   ✓ Found {unique_in_file} unique videos in {filename}")
        
    except Exception as e:
        print(f"   ❌ Error processing {filename}: {e}")

print(f"\n📊 SUMMARY:")
print(f"Total unique videos found: {len(all_videos)}")
print(f"Ready to download {len(all_videos)} videos")

# Show sample of what we found
print(f"\n📋 Sample videos to download:")
sample_videos = list(all_videos.items())[:5]
for video_id, url in sample_videos:
    gait_type = video_info[video_id]['gait_pat']
    print(f"  • {video_id} ({gait_type}): {url}")

if len(all_videos) > 5:
    print(f"  ... and {len(all_videos) - 5} more videos")

# Ask user confirmation before starting batch download
print(f"\n⚠️  This will download {len(all_videos)} videos. This may take a while and use significant bandwidth.")
print("💡 TIP: You can interrupt the process anytime with Ctrl+C")

# Download configuration
ydl_opts = {
    "format": "best[ext=mp4]/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best",
    "merge_output_format": "mp4",
    "outtmpl": "GAVD_%(id)s.%(ext)s",  # saves as GAVD_<id>.mp4
    "cookiefile": cookies_path,
    "ignoreerrors": True,
    "continuedl": True,
    "retries": 3,
    "fragment_retries": 10,
    "writeinfojson": False,
    "keepvideo": False,
    "concurrent_fragment_downloads": 1,
}



# Start batch download
print(f"\n🚀 Starting batch download of {len(all_videos)} videos...")
downloaded_count = 0
failed_count = 0
failed_videos = []

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    for i, (video_id, video_url) in enumerate(all_videos.items(), 1):
        gait_type = video_info[video_id]['gait_pat']
        
        print(f"\n[{i}/{len(all_videos)}] Downloading: {video_id} ({gait_type})")
        print(f"URL: {video_url}")
        
        try:
            ydl.download([video_url])
            downloaded_count += 1
            print(f"✅ Success! ({downloaded_count} completed)")
            
            # Small delay to be respectful to YouTube
            time.sleep(1)
            
        except Exception as e:
            failed_count += 1
            failed_videos.append((video_id, str(e)))
            print(f"❌ Failed: {e}")
            continue

# Final summary
print(f"\n🎉 BATCH DOWNLOAD COMPLETE!")
print(f"✅ Successfully downloaded: {downloaded_count} videos")
print(f"❌ Failed downloads: {failed_count} videos")

if failed_videos:
    print(f"\n📋 Failed videos:")
    for video_id, error in failed_videos[:10]:  # Show first 10 failures
        print(f"  • {video_id}: {error}")
    if len(failed_videos) > 10:
        print(f"  ... and {len(failed_videos) - 10} more failures")

print(f"\n📁 All downloaded videos are saved in: GAVD-videos/")
print("🔍 Each video is saved with its GAVD ID as filename")
print("💡 Note: You may have separate video (.mp4) and audio (.webm) files for highest quality")


🔍 Processing all GAVD annotation files...
📁 Loading GAVD_Clinical_Annotations_1.csv...
   ✓ Found 69 unique videos in GAVD_Clinical_Annotations_1.csv
📁 Loading GAVD_Clinical_Annotations_2.csv...
   ✓ Found 80 unique videos in GAVD_Clinical_Annotations_2.csv
📁 Loading GAVD_Clinical_Annotations_3.csv...
   ✓ Found 72 unique videos in GAVD_Clinical_Annotations_3.csv
📁 Loading GAVD_Clinical_Annotations_4.csv...
   ✓ Found 69 unique videos in GAVD_Clinical_Annotations_4.csv
📁 Loading GAVD_Clinical_Annotations_5.csv...
   ✓ Found 73 unique videos in GAVD_Clinical_Annotations_5.csv

📊 SUMMARY:
Total unique videos found: 348
Ready to download 348 videos

📋 Sample videos to download:
  • B5hrxKe2nP8 (parkinsons): https://www.youtube.com/watch?v=B5hrxKe2nP8
  • TgkxrrhnvlM (abnormal): https://www.youtube.com/watch?v=TgkxrrhnvlM
  • 5SBtTbfELUU (abnormal): https://www.youtube.com/watch?v=5SBtTbfELUU
  • 0gAHlm79Bjo (abnormal): https://www.youtube.com/watch?v=0gAHlm79Bjo
  • jzkn287X-84 (style): h

KeyboardInterrupt: 

In [3]:
# SEQUENCE EXTRACTION: Create individual sequence videos from downloaded videos
import os
import pandas as pd
import subprocess
import json
from collections import defaultdict
import glob

# Create sequences folder
sequences_dir = "GAVD-sequences"
os.makedirs(sequences_dir, exist_ok=True)

# Load all annotation files to get sequence information
data_dir = 'GAVD/data/'
annotation_files = [
    'GAVD_Clinical_Annotations_1.csv',
    'GAVD_Clinical_Annotations_2.csv', 
    'GAVD_Clinical_Annotations_3.csv',
    'GAVD_Clinical_Annotations_4.csv',
    'GAVD_Clinical_Annotations_5.csv'
]

print("🔍 Processing annotation files to extract sequence information...")

# Dictionary to store sequence info: {seq_id: {video_id, start_frame, end_frame, metadata}}
sequences_info = {}
video_sequences = defaultdict(list)  # Group sequences by video_id

for filename in annotation_files:
    filepath = os.path.join(data_dir, filename)
    print(f"📁 Loading {filename}...")
    
    try:
        df_temp = pd.read_csv(filepath, low_memory=False)
        
        # Group by sequence ID to get frame ranges
        for seq_id, seq_group in df_temp.groupby('seq'):
            if pd.isna(seq_id):
                continue
                
            video_id = seq_group['id'].iloc[0]
            if pd.isna(video_id):
                continue
                
            # Get frame range for this sequence
            start_frame = seq_group['frame_num'].min()
            end_frame = seq_group['frame_num'].max()
            
            # Store sequence metadata
            sequences_info[seq_id] = {
                'video_id': video_id,
                'start_frame': start_frame,
                'end_frame': end_frame,
                'gait_pat': seq_group['gait_pat'].iloc[0],
                'cam_view': seq_group['cam_view'].iloc[0],
                'dataset': seq_group['dataset'].iloc[0],
                'source_file': filename,
                'frame_count': end_frame - start_frame + 1
            }
            
            video_sequences[video_id].append(seq_id)
        
        print(f"   ✓ Processed {len(df_temp['seq'].unique())} sequences from {filename}")
        
    except Exception as e:
        print(f"   ❌ Error processing {filename}: {e}")

print(f"\n📊 SEQUENCE SUMMARY:")
print(f"Total sequences found: {len(sequences_info)}")
print(f"Videos with sequences: {len(video_sequences)}")

# Show sample sequences
print(f"\n📋 Sample sequences:")
sample_seqs = list(sequences_info.items())[:5]
for seq_id, info in sample_seqs:
    print(f"  • {seq_id}: {info['video_id']} frames {info['start_frame']}-{info['end_frame']} ({info['gait_pat']})")

# Check which sequences already exist
existing_sequences = set()
for seq_file in glob.glob(os.path.join(sequences_dir, "*.mp4")):
    seq_name = os.path.splitext(os.path.basename(seq_file))[0]
    existing_sequences.add(seq_name)

sequences_to_process = {k: v for k, v in sequences_info.items() if k not in existing_sequences}

print(f"\n🔄 PROCESSING STATUS:")
print(f"Already processed: {len(existing_sequences)} sequences")
print(f"To be processed: {len(sequences_to_process)} sequences")

if len(sequences_to_process) == 0:
    print("✅ All sequences have already been processed!")
else:
    print(f"\n⚠️  This will process {len(sequences_to_process)} sequences.")
    print("💡 TIP: You can interrupt the process anytime with Ctrl+C")

# Function to get video frame rate and duration
def get_video_info(video_path):
    """Get video information using ffprobe"""
    try:
        cmd = [
            'ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', video_path
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        
        for stream in data['streams']:
            if stream['codec_type'] == 'video':
                fps_str = stream.get('r_frame_rate', '30/1')
                fps = eval(fps_str) if '/' in fps_str else float(fps_str)
                duration = float(stream.get('duration', 0))
                return fps, duration
        return 30.0, 0.0  # Default values
    except Exception as e:
        print(f"   ⚠️  Could not get video info, using defaults: {e}")
        return 30.0, 0.0

# Function to extract sequence from video
def extract_sequence(video_path, seq_id, start_frame, end_frame, output_path):
    """Extract a sequence from video using ffmpeg"""
    try:
        # Get video info
        fps, duration = get_video_info(video_path)
        
        # Calculate time positions
        start_time = start_frame / fps
        end_time = end_frame / fps
        duration_seq = end_time - start_time
        
        # FFmpeg command to extract sequence
        cmd = [
            'ffmpeg', '-y',  # Overwrite output file
            '-i', video_path,
            '-ss', str(start_time),  # Start time
            '-t', str(duration_seq),  # Duration
            '-c:v', 'libx264',  # Video codec
            '-c:a', 'aac',  # Audio codec
            '-avoid_negative_ts', 'make_zero',
            output_path
        ]
        
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return True, ""
        
    except subprocess.CalledProcessError as e:
        return False, f"FFmpeg error: {e.stderr}"
    except Exception as e:
        return False, f"Error: {str(e)}"

# Check if ffmpeg is available
def check_ffmpeg():
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        return True
    except:
        return False

if not check_ffmpeg():
    print("❌ FFmpeg is not installed or not in PATH!")
    print("Please install FFmpeg to extract video sequences.")
    print("Download from: https://ffmpeg.org/download.html")
else:
    print("✅ FFmpeg is available")
    
    # Start sequence extraction
    if len(sequences_to_process) > 0:
        print(f"\n🚀 Starting sequence extraction...")
        
        processed_count = 0
        failed_count = 0
        failed_sequences = []
        
        for seq_id, seq_info in sequences_to_process.items():
            video_id = seq_info['video_id']
            start_frame = seq_info['start_frame']
            end_frame = seq_info['end_frame']
            gait_type = seq_info['gait_pat']
            
            # Find the downloaded video file (could be .mp4 or .webm)
            video_files = glob.glob(f"GAVD-videos/{video_id}.*")
            video_file = None
            
            # Prefer .mp4 files over .webm
            for vf in video_files:
                if vf.endswith('.mp4'):
                    video_file = vf
                    break
            if not video_file and video_files:
                video_file = video_files[0]  # Use any available file
            
            if not video_file:
                failed_count += 1
                failed_sequences.append((seq_id, f"Video file not found for {video_id}"))
                print(f"❌ [{processed_count+failed_count+1}/{len(sequences_to_process)}] {seq_id}: Video file not found")
                continue
            
            output_path = os.path.join(sequences_dir, f"{seq_id}.mp4")
            
            print(f"\n[{processed_count+failed_count+1}/{len(sequences_to_process)}] Extracting: {seq_id}")
            print(f"   Video: {video_id} ({gait_type})")
            print(f"   Frames: {start_frame}-{end_frame} ({seq_info['frame_count']} frames)")
            print(f"   Source: {os.path.basename(video_file)}")
            
            success, error = extract_sequence(video_file, seq_id, start_frame, end_frame, output_path)
            
            if success:
                processed_count += 1
                print(f"   ✅ Success! Saved to: {seq_id}.mp4")
                # delay for 1 second
                time.sleep(1)
            else:
                failed_count += 1
                failed_sequences.append((seq_id, error))
                print(f"   ❌ Failed: {error}")
        
        # Final summary
        print(f"\n🎉 SEQUENCE EXTRACTION COMPLETE!")
        print(f"✅ Successfully processed: {processed_count} sequences")
        print(f"❌ Failed extractions: {failed_count} sequences")
        
        if failed_sequences:
            print(f"\n📋 Failed sequences:")
            for seq_id, error in failed_sequences[:10]:  # Show first 10 failures
                print(f"  • {seq_id}: {error}")
            if len(failed_sequences) > 10:
                print(f"  ... and {len(failed_sequences) - 10} more failures")
        
        print(f"\n📁 All sequence videos are saved in: {sequences_dir}/")
        print("🔍 Each sequence is saved as: <sequence_id>.mp4")
        print("💡 Sequences are extracted based on frame numbers from the annotation data")


🔍 Processing annotation files to extract sequence information...
📁 Loading GAVD_Clinical_Annotations_1.csv...
   ✓ Processed 374 sequences from GAVD_Clinical_Annotations_1.csv
📁 Loading GAVD_Clinical_Annotations_2.csv...
   ✓ Processed 353 sequences from GAVD_Clinical_Annotations_2.csv
📁 Loading GAVD_Clinical_Annotations_3.csv...
   ✓ Processed 369 sequences from GAVD_Clinical_Annotations_3.csv
📁 Loading GAVD_Clinical_Annotations_4.csv...
   ✓ Processed 313 sequences from GAVD_Clinical_Annotations_4.csv
📁 Loading GAVD_Clinical_Annotations_5.csv...
   ✓ Processed 469 sequences from GAVD_Clinical_Annotations_5.csv

📊 SEQUENCE SUMMARY:
Total sequences found: 1874
Videos with sequences: 348

📋 Sample sequences:
  • cljan9b4p00043n6ligceanyp: B5hrxKe2nP8 frames 1757-2268 (parkinsons)
  • cljanb45y00083n6lmh1qhydd: B5hrxKe2nP8 frames 2532-2746 (parkinsons)
  • cljao8kyf000d3n6l0x9kgmav: TgkxrrhnvlM frames 1-148 (abnormal)
  • cljaoak47000i3n6lsrb9rit9: TgkxrrhnvlM frames 205-355 (abnormal)
 