# Adjust this to copy files from G Drive to the USB-C Drive without errors

## Naive

In [None]:
import shutil
import os
from pathlib import Path
import re

def sanitize_filename(filename):
    """Remove or replace invalid characters for Windows/USB drives"""
    # Replace problematic characters
    invalid_chars = r'[<>:"|?*]'
    filename = re.sub(invalid_chars, '_', filename)
    # Remove leading/trailing spaces and dots
    filename = filename.strip('. ')
    return filename

def copy_files_safe(source_dir, dest_dir, max_retries=3):
    """Copy files with error handling and retry logic"""
    source_path = Path(source_dir)
    dest_path = Path(dest_dir)
    
    # Create destination if it doesn't exist
    dest_path.mkdir(parents=True, exist_ok=True)
    
    copied = []
    failed = []
    skipped = []
    
    for item in source_path.rglob('*'):
        if item.is_file():
            try:
                # Calculate relative path
                rel_path = item.relative_to(source_path)
                
                # Sanitize each part of the path
                sanitized_parts = [sanitize_filename(part) for part in rel_path.parts]
                dest_file = dest_path / Path(*sanitized_parts)
                
                # Check if file already exists
                if dest_file.exists():
                    skipped.append(str(rel_path))
                    # print(f"⊘ Skipped (exists): {rel_path}")
                    continue
                
                # Create parent directories
                dest_file.parent.mkdir(parents=True, exist_ok=True)
                
                # Copy with retry
                for attempt in range(max_retries):
                    try:
                        shutil.copy2(item, dest_file)
                        copied.append(str(rel_path))
                        print(f"✓ Copied: {rel_path}")
                        break
                    except Exception as e:
                        if attempt == max_retries - 1:
                            raise
                        print(f"  Retry {attempt + 1} for {rel_path}")
                        
            except Exception as e:
                failed.append((str(item), str(e)))
                print(f"✗ Failed: {item} - {e}")
    
    print(f"\n{len(copied)} files copied successfully")
    print(f"{len(skipped)} files skipped (already exist)")
    if failed:
        print(f"{len(failed)} files failed:")
        for path, error in failed:
            print(f"  {path}: {error}")
    
    return copied, failed, skipped

source = r"G:\My Drive\Colab Notebooks\results"  # Google Drive
destination = r"D:\Ergodicity Simulations\2025-10-15 Run - Hier Freq and X Shape"  # USB drive

# copy_files_safe(source, destination, max_retries=10)

## More Optimized

In [1]:
import shutil
import os
from pathlib import Path
import re
import gc
import time
import threading

def sanitize_filename(filename):
    """Remove or replace invalid characters for Windows/USB drives"""
    invalid_chars = r'[<>:"|?*]'
    filename = re.sub(invalid_chars, '_', filename)
    filename = filename.strip('. ')
    return filename

def copy_with_timeout(src, dst, timeout=30):
    """Copy a file with a timeout to prevent hanging"""
    result = {'success': False, 'error': None}
    
    def copy_file():
        try:
            shutil.copy2(src, dst)
            result['success'] = True
        except Exception as e:
            result['error'] = e
    
    thread = threading.Thread(target=copy_file)
    thread.daemon = True
    thread.start()
    thread.join(timeout)
    
    if thread.is_alive():
        # Thread is still running - it hung
        result['error'] = TimeoutError(f"Copy operation timed out after {timeout} seconds")
        return result
    
    return result

def copy_files_safe_with_timeout(source_dir, dest_dir, max_retries=3, batch_size=1000, resume_from=0, copy_timeout=30):
    """Copy files with timeout handling to prevent hanging"""
    source_path = Path(source_dir)
    dest_path = Path(dest_dir)
    
    # Create destination if it doesn't exist
    dest_path.mkdir(parents=True, exist_ok=True)
    
    copied = []
    failed = []
    skipped = []
    processed = 0
    
    # Checkpoint file
    checkpoint_file = dest_path / "_copy_checkpoint.txt"
    
    print(f"Scanning source directory... (this may take a moment)")
    all_files = [f for f in source_path.rglob('*') if f.is_file()]
    total_files = len(all_files)
    print(f"Found {total_files} files to process")
    
    # Start from resume point if specified
    if resume_from > 0:
        print(f"Resuming from file {resume_from}")
        all_files = all_files[resume_from:]
    
    for idx, item in enumerate(all_files, start=resume_from):
        try:
            # Save checkpoint every 50 files
            if processed % 50 == 0:
                checkpoint_data = f"Index: {idx}\nProcessed: {processed}\nLast file: {item}\nTime: {time.strftime('%Y-%m-%d %H:%M:%S')}"
                checkpoint_file.write_text(checkpoint_data)
                print(f"Progress: {processed}/{total_files} files processed... [Checkpoint saved]")
            
            # Print current file being processed
            rel_path = item.relative_to(source_path)
            if processed % 10 == 0:
                print(f"  Processing: {rel_path}")
            
            # Check path length
            full_dest_path = dest_path / rel_path
            if len(str(full_dest_path)) > 250:
                print(f"⚠ Path too long, skipping: {rel_path}")
                failed.append((str(item), "Path exceeds Windows limit"))
                processed += 1
                continue
            
            # Sanitize path
            sanitized_parts = [sanitize_filename(part) for part in rel_path.parts]
            dest_file = dest_path / Path(*sanitized_parts)
            
            # Check if exists
            if dest_file.exists():
                skipped.append(str(rel_path))
                processed += 1
                continue
            
            # Create parent directories
            dest_file.parent.mkdir(parents=True, exist_ok=True)
            
            # Copy with timeout and retry
            copy_success = False
            for attempt in range(max_retries):
                try:
                    print(f"    Copying: {rel_path} (attempt {attempt + 1}/{max_retries})...")
                    result = copy_with_timeout(item, dest_file, timeout=copy_timeout)
                    
                    if result['success']:
                        copied.append(str(rel_path))
                        print(f"    ✓ Success: {rel_path}")
                        copy_success = True
                        break
                    else:
                        error = result['error']
                        if isinstance(error, TimeoutError):
                            print(f"    ⏱ Timeout on attempt {attempt + 1}: {rel_path}")
                            if attempt < max_retries - 1:
                                print(f"    Waiting 5 seconds before retry...")
                                time.sleep(5)
                        else:
                            raise error
                            
                except PermissionError as e:
                    print(f"⚠ Permission denied: {rel_path}, skipping")
                    failed.append((str(item), str(e)))
                    break
                except Exception as e:
                    if attempt == max_retries - 1:
                        print(f"    ✗ Failed after {max_retries} attempts: {e}")
                        failed.append((str(item), str(e)))
                    else:
                        print(f"    Retry {attempt + 1}/{max_retries} for {rel_path}: {e}")
                        time.sleep(2)
            
            if not copy_success and not any(str(item) in f[0] for f in failed):
                failed.append((str(item), "Copy timed out after all retries"))
            
            processed += 1
            
            # Periodic cleanup
            if processed % batch_size == 0:
                print(f"\n{'='*60}")
                print(f"BATCH CHECKPOINT at {processed} files")
                print(f"Copied: {len(copied)}, Skipped: {len(skipped)}, Failed: {len(failed)}")
                print(f"{'='*60}\n")
                gc.collect()
                
        except Exception as e:
            failed.append((str(item), str(e)))
            print(f"✗ Unexpected error: {item} - {e}")
            processed += 1
    
    # Final summary
    print("\n" + "="*60)
    print(f"COPY COMPLETE")
    print(f"{len(copied)} files copied successfully")
    print(f"{len(skipped)} files skipped (already exist)")
    print(f"{len(failed)} files failed")
    print("="*60)
    
    if failed:
        print(f"\nFailed files:")
        for path, error in failed[:20]:
            print(f"  {path}: {error}")
        if len(failed) > 20:
            print(f"  ... and {len(failed) - 20} more")
    
    # Save final report
    report_file = dest_path / "_copy_report.txt"
    with open(report_file, 'w') as f:
        f.write(f"Copy Report - {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total files: {total_files}\n")
        f.write(f"Copied: {len(copied)}\n")
        f.write(f"Skipped: {len(skipped)}\n")
        f.write(f"Failed: {len(failed)}\n\n")
        if failed:
            f.write("Failed files:\n")
            for path, error in failed:
                f.write(f"  {path}: {error}\n")
    
    return copied, failed, skipped

# Run the copy
source = r"G:\My Drive\Colab Notebooks\results"
destination = r"D:\2025-10-15 Run - Hier Freq and X Shape"

# Start fresh or resume from checkpoint
# Check _copy_checkpoint.txt to see where it stopped
copied, failed, skipped = copy_files_safe_with_timeout(
    source, 
    destination, 
    max_retries=10, 
    batch_size=1000,
    resume_from=0,  # Change this to resume from a specific index
    copy_timeout=30  # 30 second timeout per file
)

Scanning source directory... (this may take a moment)
Found 30491 files to process
Progress: 0/30491 files processed... [Checkpoint saved]
  Processing: Cap (75M) - Ded (100K) - LR (0.6) - Pol_Lim (75M) - X_Th_%le (0.0005) - X_Shape (0.38074584111791737) - X_Scale (1.0) - 1K Sims - 25 Yrs.pkl
  Processing: Cap (75M) - Ded (100K) - LR (0.6) - Pol_Lim (75M) - X_Th_%le (0.0005) - X_Shape (0.6635321154810417) - X_Scale (1.0) - 1K Sims - 25 Yrs.pkl
  Processing: Cap (75M) - Ded (100K) - LR (0.6) - Pol_Lim (75M) - X_Th_%le (0.0005) - X_Shape (0.8980832769930748) - X_Scale (1.0) - 1K Sims - 25 Yrs.pkl
  Processing: Cap (75M) - Ded (100K) - LR (0.6) - Pol_Lim (75M) - X_Th_%le (0.0005) - X_Shape (0.47517435937057084) - X_Scale (1.0) - 1K Sims - 25 Yrs.pkl
  Processing: Cap (75M) - Ded (100K) - LR (0.6) - Pol_Lim (75M) - X_Th_%le (0.0005) - X_Shape (0.322227134071213) - X_Scale (1.0) - 1K Sims - 25 Yrs.pkl
Progress: 50/30491 files processed... [Checkpoint saved]
  Processing: Cap (75M) - Ded (10

KeyboardInterrupt: 