In [3]:
import os
import subprocess
import random
import numpy as np
import cv2
import time
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Cell 1: Download Videos from Vimeo (Improved Version)



# Create directories to store videos and frames
os.makedirs('videos', exist_ok=True)
os.makedirs('frames/HR', exist_ok=True)
os.makedirs('frames/LR', exist_ok=True)

def download_vimeo_video(link, output_file, attempt=1, max_attempts=3):
    """
    Download a single video with better error handling and retries
    """
    # First try yt-dlp which is more up-to-date and has better Vimeo support
    try:
        cmd = [
            'yt-dlp',  # Use yt-dlp instead of youtube-dl
            link,
            '-f', 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/mp4',  # Limit to 1080p max
            '-o', output_file,
            '--no-warnings',
            '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            '--add-header', 'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            '--add-header', 'Accept-Language:en-US,en;q=0.5'
        ]
        
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        
        # Check if download was successful
        if os.path.exists(output_file) and os.path.getsize(output_file) > 100000:
            return True
            
    except FileNotFoundError:
        # yt-dlp not installed, try youtube-dl
        pass
    except Exception as e:
        if attempt < max_attempts:
            print(f"Retry {attempt}/{max_attempts} for {link} due to: {str(e)}")
            time.sleep(5)  # Wait before retry
            return download_vimeo_video(link, output_file, attempt + 1, max_attempts)
        return False
    
    # If yt-dlp failed or not installed, try youtube-dl
    try:
        cmd = [
            'youtube-dl',
            link,
            '--no-check-certificate',  # Skip certificate validation
            '-f', 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/mp4/best[height<=1080]',
            '-o', output_file,
            '--no-warnings',
            '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        
        # Check if download was successful
        if os.path.exists(output_file) and os.path.getsize(output_file) > 100000:
            return True
        else:
            if attempt < max_attempts:
                print(f"Retry {attempt}/{max_attempts} for {link}")
                time.sleep(5)  # Wait before retry
                return download_vimeo_video(link, output_file, attempt + 1, max_attempts)
            return False
            
    except Exception as e:
        if attempt < max_attempts:
            print(f"Retry {attempt}/{max_attempts} for {link} due to: {str(e)}")
            time.sleep(5)  # Wait before retry
            return download_vimeo_video(link, output_file, attempt + 1, max_attempts)
        return False

def download_videos(links_file, output_dir='videos', target_count=1000, max_attempts=2000):
    """
    Download videos from links in a text file until reaching target_count successful downloads
    """
    # Read the links file
    with open(links_file, 'r') as f:
        links = [line.strip() for line in f if line.strip()]
    
    # Shuffle links to get a random sample
    random.shuffle(links)
    
    successful_downloads = 0
    attempted_links = 0
    
    print(f"Starting downloads, aiming for {target_count} videos...")
    
    for link in tqdm(links):
        if successful_downloads >= target_count or attempted_links >= max_attempts:
            break
            
        attempted_links += 1
        video_id = link.split('/')[-1]
        output_file = os.path.join(output_dir, f"{video_id}.mp4")
        
        # Skip if already downloaded
        if os.path.exists(output_file) and os.path.getsize(output_file) > 100000:
            successful_downloads += 1
            continue
        
        # Try to download with better error handling
        success = download_vimeo_video(link, output_file)
        
        if success:
            successful_downloads += 1
            print(f"Successfully downloaded {successful_downloads}/{target_count}: {video_id}")
        else:
            print(f"Failed to download {link}")
        
        # Add a small delay to avoid rate limiting
        time.sleep(2)
    
    return successful_downloads

# Check if yt-dlp is installed, otherwise recommend it
try:
    subprocess.run(['yt-dlp', '--version'], capture_output=True)
    print("Using yt-dlp for downloads")
except FileNotFoundError:
    print("yt-dlp not found. For better results, install it with: pip install yt-dlp")
    print("Falling back to youtube-dl")

# Alternative approach: use a small test set of videos if downloading fails
def create_test_dataset():
    """Create a small test dataset using video files from the system or sample sources"""
    print("Creating a small test dataset for development...")
    
    # Try to find some sample videos on the system
    sample_dirs = [
        '/usr/share/example-videos',  # Some Linux systems have sample videos
        os.path.expanduser('~/Videos'),  # User's video directory
        '.'  # Current directory
    ]
    
    found_videos = []
    for directory in sample_dirs:
        if os.path.exists(directory):
            for file in os.listdir(directory):
                if file.endswith(('.mp4', '.mkv', '.avi', '.mov')):
                    video_path = os.path.join(directory, file)
                    dest_path = os.path.join('videos', file)
                    if os.path.exists(video_path) and os.path.getsize(video_path) > 1000000:  # >1MB
                        if not os.path.exists(dest_path):
                            # Copy file to videos directory
                            try:
                                import shutil
                                shutil.copy2(video_path, dest_path)
                                found_videos.append(dest_path)
                                print(f"Added video for testing: {file}")
                            except:
                                pass
    
    # If no videos found, create a synthetic test video
    if not found_videos:
        try:
            # Create a simple 10-second test video
            print("Creating synthetic test video...")
            import numpy as np
            import cv2
            
            output_path = os.path.join('videos', 'synthetic_test.mp4')
            
            # Create a 10 second video with moving shapes
            fps = 30
            width, height = 640, 480
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
            
            for i in range(300):  # 10 seconds at 30fps
                frame = np.zeros((height, width, 3), dtype=np.uint8)
                
                # Moving circle
                cx = int(width/2 + width/4 * np.sin(i/30))
                cy = int(height/2 + height/4 * np.cos(i/30))
                cv2.circle(frame, (cx, cy), 50, (0, 0, 255), -1)
                
                # Moving rectangle
                rx = int(width/2 + width/5 * np.cos(i/40))
                ry = int(height/2 + height/5 * np.sin(i/40))
                cv2.rectangle(frame, (rx-40, ry-40), (rx+40, ry+40), (0, 255, 0), -1)
                
                # Add some text
                cv2.putText(frame, f"Frame {i}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                
                out.write(frame)
            
            out.release()
            found_videos.append(output_path)
            
        except Exception as e:
            print(f"Error creating synthetic video: {e}")
    
    return len(found_videos) > 0

# Load the links file and download videos
links_file = 'vimeo_links.txt'  # Replace with your links file

if os.path.exists(links_file):
    successful_count = download_videos(links_file, target_count=1000)
    print(f"Downloaded {successful_count} videos successfully")
    
    # If not enough videos downloaded, create test dataset
    if successful_count < 10:
        print("Not enough videos downloaded. Creating test dataset...")
        create_test_dataset()
else:
    print(f"Links file {links_file} not found. Creating test dataset...")
    create_test_dataset()



In [2]:
# Get list of downloaded videos
video_files = [os.path.join('videos', f) for f in os.listdir('videos') if f.endswith(('.mp4', '.mkv', '.avi', '.mov'))]
print(f"Total videos available: {len(video_files)}")

Total videos available: 711


In [3]:
# Cell 2: Extract Frame Sequences



def extract_frame_sequences(video_files, output_dir='frames/HR', sequences_per_video=3, sequence_length=5):
    """
    Extract random frame sequences from videos
    """
    # Create sequence record
    sequence_data = []
    
    for video_idx, video_path in enumerate(tqdm(video_files)):
        try:
            # Open video file
            cap = cv2.VideoCapture(video_path)
            
            # Check if video opened successfully
            if not cap.isOpened():
                print(f"Error opening video: {video_path}")
                continue
                
            # Get video properties
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            
            # Skip very short videos or low resolution videos
            if total_frames < 30 or width < 480 or height < 360:
                cap.release()
                continue
                
            # Calculate valid starting points for sequences (need sequence_length consecutive frames)
            valid_starts = max(0, total_frames - sequence_length)
            
            if valid_starts <= 0:
                cap.release()
                continue
                
            # Generate random starting points for sequences
            # Make sure sequences don't overlap by enforcing minimum gap
            min_gap = 30  # At least 30 frames between sequences
            sequence_starts = []
            
            attempts = 0
            while len(sequence_starts) < sequences_per_video and attempts < 20:
                attempts += 1
                candidate = random.randint(0, valid_starts)
                
                # Check if candidate is far enough from existing starts
                if all(abs(candidate - start) >= min_gap for start in sequence_starts):
                    sequence_starts.append(candidate)
            
            # Extract the sequences
            for seq_idx, start_frame in enumerate(sequence_starts):
                # Create sequence directory
                sequence_dir = os.path.join(output_dir, f"video_{video_idx:04d}_seq_{seq_idx}")
                os.makedirs(sequence_dir, exist_ok=True)
                
                # Set video to starting frame
                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                
                # Extract sequence_length frames
                sequence_frames = []
                for frame_idx in range(sequence_length):
                    ret, frame = cap.read()
                    if not ret:
                        break
                        
                    # Save frame
                    frame_path = os.path.join(sequence_dir, f"frame_{frame_idx:02d}.png")
                    cv2.imwrite(frame_path, frame)
                    sequence_frames.append(frame_path)
                
                # Record sequence information
                if len(sequence_frames) == sequence_length:
                    sequence_data.append({
                        'video_file': os.path.basename(video_path),
                        'sequence_dir': sequence_dir,
                        'start_frame': start_frame,
                        'frames': sequence_frames,
                        'width': width,
                        'height': height,
                        'fps': fps
                    })
            
            cap.release()
            
        except Exception as e:
            print(f"Error processing {video_path}: {str(e)}")
    
    # Save sequence metadata
    sequence_df = pd.DataFrame(sequence_data)
    sequence_df.to_csv('frame_sequences.csv', index=False)
    
    return sequence_df

# Extract frame sequences
sequence_df = extract_frame_sequences(video_files)
print(f"Extracted {len(sequence_df)} sequences")

  0%|          | 0/711 [00:00<?, ?it/s]

Extracted 2125 sequences


In [5]:

# Load the previously saved sequence data from CSV

sequence_df = pd.read_csv('frame_sequences.csv')



# Convert string representation of list back to actual list for 'frames' column

sequence_df['frames'] = sequence_df['frames'].apply(eval)  # This converts the string representation back to a list



print(f"Loaded {len(sequence_df)} sequences from CSV")


Loaded 2125 sequences from CSV


In [6]:
# Cell 3: Create LR Frames from HR Frames



def create_lr_frames(sequence_df, hr_dir='frames/HR', lr_dir='frames/LR', 
                     target_hr_size=(480, 854), target_lr_size=(120, 214)):  # Changed to 4x downscaling
    """
    Create low-resolution frames from high-resolution frames
    - Resizes HR frames to standard size (480p)
    - Creates corresponding LR frames by downscaling (4x factor)
    - Adds subtle degradation to mimic low bandwidth video
    """
    # Make sure directories exist
    os.makedirs(lr_dir, exist_ok=True)
    
    # Function for high-quality downscaling with subtle degradation
    def downscale_with_degradation(img, target_size):
        # Step 1: Basic downscale with INTER_AREA (clean downscaling)
        downscaled = cv2.resize(img, target_size[::-1], interpolation=cv2.INTER_AREA)
        
        # Step 2: Apply subtle degradation to mimic low bandwidth
        # Mild JPEG compression artifacts
        encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 92]
        _, encoded_img = cv2.imencode('.jpg', downscaled, encode_param)
        degraded = cv2.imdecode(encoded_img, cv2.IMREAD_COLOR)
        
        # Subtle color subsampling effect (very mild)
        if random.random() < 0.5:  # Only apply to some frames for natural variation
            # Convert to YCrCb, slightly blur chroma channels, convert back
            ycrcb = cv2.cvtColor(degraded, cv2.COLOR_BGR2YCrCb)
            channels = list(cv2.split(ycrcb))  # Convert tuple to list so we can modify it
            
            # Apply very slight blur to chroma channels
            channels[1] = cv2.GaussianBlur(channels[1], (3, 3), 0.5)
            channels[2] = cv2.GaussianBlur(channels[2], (3, 3), 0.5)
            
            ycrcb = cv2.merge(channels)
            degraded = cv2.cvtColor(ycrcb, cv2.COLOR_YCrCb2BGR)
        
        return degraded
    
    for _, row in tqdm(sequence_df.iterrows(), total=len(sequence_df)):
        # Get sequence directory name
        seq_name = os.path.basename(row['sequence_dir'])
        
        # Create corresponding LR directory
        lr_seq_dir = os.path.join(lr_dir, seq_name)
        os.makedirs(lr_seq_dir, exist_ok=True)
        
        # Process each frame in the sequence
        for frame_path in row['frames']:
            # Get frame filename
            frame_name = os.path.basename(frame_path)
            
            # Load HR frame
            hr_frame = cv2.imread(frame_path)
            if hr_frame is None:
                print(f"Error loading frame: {frame_path}")
                continue
            
            # Resize HR frame to standard size (480p) with good quality
            hr_frame = cv2.resize(hr_frame, target_hr_size[::-1], interpolation=cv2.INTER_LANCZOS4)
            
            # Save standardized HR frame
            std_hr_path = os.path.join(os.path.dirname(frame_path), frame_name)
            cv2.imwrite(std_hr_path, hr_frame)
            
            # Create LR frame with 4x downscaling and subtle degradation
            lr_frame = downscale_with_degradation(hr_frame, target_lr_size)
            
            # Save LR frame with high quality
            lr_path = os.path.join(lr_seq_dir, frame_name)
            cv2.imwrite(lr_path, lr_frame, [cv2.IMWRITE_PNG_COMPRESSION, 0])
    
    print("Created LR frames for all sequences with 4x downscaling and subtle quality degradation")
    
    # Create train/test split
    sequence_dirs = sequence_df['sequence_dir'].apply(os.path.basename).unique()
    random.shuffle(sequence_dirs)
    
    split_idx = int(len(sequence_dirs) * 0.8)  # 80% train, 20% test
    train_sequences = sequence_dirs[:split_idx]
    test_sequences = sequence_dirs[split_idx:]
    
    # Save train/test split - using concat instead of append (which is deprecated)
    pd.concat([
        pd.DataFrame({'sequence': train_sequences, 'split': 'train'}),
        pd.DataFrame({'sequence': test_sequences, 'split': 'test'})
    ]).to_csv('data_split.csv', index=False)
    
    print(f"Created data split: {len(train_sequences)} train, {len(test_sequences)} test sequences")

# Create LR frames from HR frames
# We'll use 480p for HR and 120p for LR (4x upscaling)
create_lr_frames(sequence_df)


  0%|          | 0/2125 [00:00<?, ?it/s]

Created LR frames for all sequences with 4x downscaling and subtle quality degradation
Created data split: 1700 train, 425 test sequences
