In [None]:
#!/usr/bin/env python3
"""
SoundCloud Downloader for Linux - Fixed FFmpeg Integration
Properly handles ffmpeg path detection and passing to yt-dlp
"""

import os
import re
import subprocess
import sys
from datetime import datetime, timedelta
import yt_dlp
import requests
import time
import hashlib
import json
import shutil
from pathlib import Path


class SoundCloudDownloader:
    def __init__(self, output_dir="downloads"):
        self.output_dir = output_dir
        self.ffmpeg_path = None
        self.download_log = {}
        self.log_file = os.path.join(output_dir, ".download_log.json")
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Load download log
        self.load_download_log()
        
        # Setup ffmpeg on initialization
        self.setup_ffmpeg()
    
    def load_download_log(self):
        """Load the download log from file."""
        if os.path.exists(self.log_file):
            try:
                with open(self.log_file, 'r') as f:
                    self.download_log = json.load(f)
            except:
                self.download_log = {}
    
    def save_download_log(self):
        """Save the download log to file."""
        try:
            with open(self.log_file, 'w') as f:
                json.dump(self.download_log, f, indent=2)
        except Exception as e:
            print(f"Warning: Could not save download log: {e}")
    
    def find_ffmpeg(self):
        """Find ffmpeg binary in system."""
        # Check if ffmpeg is in PATH using shutil.which
        ffmpeg_in_path = shutil.which('ffmpeg')
        if ffmpeg_in_path:
            return ffmpeg_in_path
        
        # Common locations to check
        common_locations = [
            '/usr/bin/ffmpeg',
            '/usr/local/bin/ffmpeg',
            '/opt/homebrew/bin/ffmpeg',
            '/snap/bin/ffmpeg',
            '~/.local/bin/ffmpeg',
        ]
        
        for location in common_locations:
            expanded_path = os.path.expanduser(location)
            if os.path.exists(expanded_path) and os.access(expanded_path, os.X_OK):
                return expanded_path
        
        return None
    
    def setup_ffmpeg(self):
        """Setup ffmpeg for Linux systems - improved detection."""
        print("Checking for ffmpeg installation...")
        
        # First try to find existing ffmpeg
        self.ffmpeg_path = self.find_ffmpeg()
        
        if self.ffmpeg_path:
            # Verify it actually works
            if self.verify_ffmpeg(self.ffmpeg_path):
                print(f"✓ Found working ffmpeg at: {self.ffmpeg_path}")
                return True
            else:
                print(f"⚠ Found ffmpeg at {self.ffmpeg_path} but it doesn't work properly")
                self.ffmpeg_path = None
        
        # If not found, try to install it
        print("ffmpeg not found. Attempting to install...")
        
        if self.install_ffmpeg():
            # Re-check after installation
            self.ffmpeg_path = self.find_ffmpeg()
            if self.ffmpeg_path and self.verify_ffmpeg(self.ffmpeg_path):
                print(f"✓ Successfully installed ffmpeg at: {self.ffmpeg_path}")
                return True
        
        print("⚠ Failed to install ffmpeg automatically.")
        print("Please install it manually:")
        print("  Ubuntu/Debian: sudo apt-get install ffmpeg")
        print("  Fedora: sudo dnf install ffmpeg")
        print("  Arch: sudo pacman -S ffmpeg")
        return False
    
    def verify_ffmpeg(self, ffmpeg_path):
        """Verify that ffmpeg actually works."""
        try:
            result = subprocess.run(
                [ffmpeg_path, '-version'],
                capture_output=True,
                text=True,
                timeout=5
            )
            return result.returncode == 0 and 'ffmpeg version' in result.stdout
        except Exception as e:
            print(f"Error verifying ffmpeg: {e}")
            return False
    
    def install_ffmpeg(self):
        """Try to install ffmpeg with the package manager matching ``/etc/os-release``.

        The lower-cased content of ``/etc/os-release`` is searched for distro
        keywords; the first matching plan is executed through ``sudo``.

        Returns:
            True when the install command exits with code 0; False when no
            plan matches, the release file is absent, the command fails, or
            any exception occurs (the exception is printed, not raised).
        """
        # (keywords, banner, preparatory commands, install command), tried in order.
        install_plans = (
            (
                ('ubuntu', 'debian'),
                "Installing ffmpeg using apt...",
                (['sudo', 'apt-get', 'update'],),
                ['sudo', 'apt-get', 'install', '-y', 'ffmpeg'],
            ),
            (
                ('fedora',),
                "Installing ffmpeg using dnf...",
                (),
                ['sudo', 'dnf', 'install', '-y', 'ffmpeg'],
            ),
            (
                ('arch',),
                "Installing ffmpeg using pacman...",
                (),
                ['sudo', 'pacman', '-S', '--noconfirm', 'ffmpeg'],
            ),
        )

        try:
            if not os.path.exists('/etc/os-release'):
                return False

            with open('/etc/os-release', 'r') as release_file:
                release_text = release_file.read().lower()

            for keywords, banner, prep_commands, install_command in install_plans:
                if not any(keyword in release_text for keyword in keywords):
                    continue
                print(banner)
                # Preparatory steps are allowed to fail; only the install result counts.
                for prep in prep_commands:
                    subprocess.run(prep, check=False, capture_output=True)
                outcome = subprocess.run(install_command, capture_output=True, text=True)
                return outcome.returncode == 0
        except Exception as e:
            print(f"Error during installation: {e}")

        return False
    
    def validate_url(self, url):
        """Validate if the URL is a valid SoundCloud URL."""
        pattern = r'^https?://(?:www\.)?soundcloud\.com/[\w-]+/[\w-]+'
        return bool(re.match(pattern, url))
    
    def get_file_hash(self, url):
        """Generate a unique hash for a URL."""
        return hashlib.md5(url.encode()).hexdigest()
    
    def is_already_downloaded(self, url):
        """Check if a URL has already been successfully downloaded."""
        url_hash = self.get_file_hash(url)
        
        if url_hash in self.download_log:
            file_path = self.download_log[url_hash].get('file_path')
            if file_path and os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                if file_size > 1000000:  # > 1MB
                    print(f"✓ Already downloaded: {os.path.basename(file_path)}")
                    return True
        
        return False
    
    def download_audio(self, url, max_retries=3):
        """Download audio from SoundCloud URL with proper ffmpeg configuration."""
        if not self.validate_url(url):
            print(f"✗ Invalid URL: {url}")
            return None
        
        # Check if already downloaded
        if self.is_already_downloaded(url):
            return self.download_log[self.get_file_hash(url)]['file_path']
        
        print(f"⬇ Downloading: {url}")
        
        # Configure yt-dlp options
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': os.path.join(self.output_dir, '%(title)s.%(ext)s'),
            'noplaylist': True,
            'quiet': True,
            'no_warnings': True,
            'retries': 10,
            'fragment_retries': 10,
            'skip_unavailable_fragments': True,
            # Audio extraction settings
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'prefer_ffmpeg': True,  # Prefer ffmpeg over avconv
        }
        
        # CRITICAL: Set ffmpeg location properly
        if self.ffmpeg_path:
            # Get the directory containing ffmpeg
            ffmpeg_dir = os.path.dirname(self.ffmpeg_path)
            ydl_opts['ffmpeg_location'] = ffmpeg_dir
            print(f"  Using ffmpeg from: {ffmpeg_dir}")
        
        # Try downloading with retries
        for attempt in range(max_retries):
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    # Extract info first
                    info = ydl.extract_info(url, download=False)
                    
                    # Generate expected filename
                    filename = ydl.prepare_filename(info)
                    base, _ = os.path.splitext(filename)
                    mp3_file = f"{base}.mp3"
                    
                    # Check if file already exists
                    if os.path.exists(mp3_file) and os.path.getsize(mp3_file) > 1000000:
                        print(f"✓ File already exists: {os.path.basename(mp3_file)}")
                        # Add to log
                        self.download_log[self.get_file_hash(url)] = {
                            'url': url,
                            'file_path': mp3_file,
                            'download_date': datetime.now().isoformat()
                        }
                        self.save_download_log()
                        return mp3_file
                    
                    # Now download
                    ydl.download([url])
                    
                    # Verify the download
                    if os.path.exists(mp3_file) and os.path.getsize(mp3_file) > 0:
                        print(f"✓ Successfully downloaded: {os.path.basename(mp3_file)}")
                        
                        # Add to download log
                        self.download_log[self.get_file_hash(url)] = {
                            'url': url,
                            'file_path': mp3_file,
                            'download_date': datetime.now().isoformat()
                        }
                        self.save_download_log()
                        
                        return mp3_file
                    else:
                        raise Exception("Downloaded file is empty or missing")
                        
            except Exception as e:
                error_msg = str(e)[:100]
                print(f"  Attempt {attempt + 1}/{max_retries} failed: {error_msg}")
                
                # If ffmpeg issue, try to fix it
                if 'ffmpeg' in error_msg.lower() and attempt == 0:
                    print("  Attempting to fix ffmpeg configuration...")
                    self.setup_ffmpeg()
                
                if attempt < max_retries - 1:
                    time.sleep(5 * (attempt + 1))
        
        print(f"✗ Failed to download after {max_retries} attempts")
        return None
    
    def download_date_range(self, profile_url, start_date, end_date):
        """
        Download tracks from a SoundCloud profile within a date range.
        
        Args:
            profile_url: SoundCloud profile URL
            start_date: Start date (datetime object or string 'YYYY-MM-DD')
            end_date: End date (datetime object or string 'YYYY-MM-DD')
        
        Returns:
            Tuple of (successful_downloads, failed_urls)
        """
        # Parse dates if strings
        if isinstance(start_date, str):
            start_date = datetime.strptime(start_date, '%Y-%m-%d')
        if isinstance(end_date, str):
            end_date = datetime.strptime(end_date, '%Y-%m-%d')
        
        print(f"\n📅 Searching for tracks from {start_date.date()} to {end_date.date()}")
        print(f"📁 Output directory: {self.output_dir}\n")
        
        # Generate URLs for date range
        urls = self.generate_urls_for_range(profile_url, start_date, end_date)
        
        if not urls:
            print("No URLs found for the specified date range")
            return [], []
        
        print(f"Found {len(urls)} potential tracks to download\n")
        
        successful = []
        failed = []
        
        for i, (date, url) in enumerate(urls, 1):
            print(f"[{i}/{len(urls)}] Processing {date.date()}")
            
            # Check if URL exists (optional - can be skipped for faster processing)
            try:
                response = requests.head(url, timeout=5, allow_redirects=True)
                if response.status_code == 404:
                    print(f"  ✗ URL not found (404)")
                    failed.append(url)
                    continue
            except:
                pass  # Try downloading anyway
            
            result = self.download_audio(url)
            
            if result:
                successful.append(result)
            else:
                failed.append(url)
            
            # Small delay between downloads
            if i < len(urls):
                time.sleep(2)
        
        # Print summary
        print(f"\n{'='*50}")
        print(f"Download Summary:")
        print(f"  ✓ Successful: {len(successful)}")
        print(f"  ✗ Failed: {len(failed)}")
        print(f"  📁 Files saved to: {self.output_dir}")
        print(f"{'='*50}\n")
        
        return successful, failed
    
    def generate_urls_for_range(self, profile_url, start_date, end_date):
        """Generate potential URLs for a date range."""
        urls = []
        
        # Clean profile URL
        profile_url = profile_url.rstrip('/')
        
        current_date = start_date
        while current_date <= end_date:
            day = current_date.day
            month = current_date.strftime('%b').lower()
            year = current_date.year
            
            # Generate potential URL formats (most common first)
            potential_urls = [
                f"{profile_url}/idaacadda-{day:02d}-{month}-{year}",
                f"{profile_url}/idaacadda-{day}-{month}-{year}",
            ]
            
            # Add the first URL for this date
            urls.append((current_date, potential_urls[0]))
            
            current_date += timedelta(days=1)
        
        return urls


# Usage example
def main():
    """Run the sample job: fetch a fixed date range, then retry what failed."""
    downloader = SoundCloudDownloader(output_dir="data")

    # Fixed sample parameters: two days of the radio-ergo profile.
    _completed, failed = downloader.download_date_range(
        "https://soundcloud.com/radio-ergo",
        "2024-07-01",
        "2024-07-02",
    )
    if not failed:
        return

    # Second pass over everything that did not come through the first time.
    print("\n🔄 Retrying failed downloads...")
    recovered = []
    unrecoverable = []
    for url in failed:
        outcome = downloader.download_audio(url, max_retries=2)
        if outcome:
            recovered.append(outcome)
        else:
            unrecoverable.append(url)

    if recovered:
        print(f"✓ Successfully downloaded {len(recovered)} on retry")
    if unrecoverable:
        print(f"✗ Still failed: {len(unrecoverable)} URLs")
        for url in unrecoverable:
            print(f"  - {url}")


# Run the sample job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Checking for ffmpeg installation...
✓ Found working ffmpeg at: /usr/bin/ffmpeg

📅 Searching for tracks from 2024-07-01 to 2024-07-02
📁 Output directory: soundcloud_downloads

Found 2 potential tracks to download

[1/2] Processing 2024-07-01
⬇ Downloading: https://soundcloud.com/radio-ergo/idaacadda-01-jul-2024
  Using ffmpeg from: /usr/bin
✓ Successfully downloaded: IDAACADDA 01-JUL-2024.mp3                      
[2/2] Processing 2024-07-02
⬇ Downloading: https://soundcloud.com/radio-ergo/idaacadda-02-jul-2024
  Using ffmpeg from: /usr/bin
✓ Successfully downloaded: IDAACADDA 02-JUL-2024.mp3                      

Download Summary:
  ✓ Successful: 2
  ✗ Failed: 0
  📁 Files saved to: soundcloud_downloads



In [6]:
#!/usr/bin/env python3
"""
Enhanced SoundCloud Downloader with On-the-Fly Transcription
Keeps downloaded audio in memory for processing (staged through short-lived temporary files) and persists only the transcription output.
"""

import os
import re
import subprocess
import sys
from datetime import datetime, timedelta
import yt_dlp
import requests
import time
import hashlib
import json
import shutil
from pathlib import Path
from typing import Optional, List, Dict, Tuple, Union
import io
import tempfile
from contextlib import contextmanager

# Audio processing imports
try:
    from pydub import AudioSegment
    from pydub.utils import which
    PYDUB_AVAILABLE = True
except ImportError:
    PYDUB_AVAILABLE = False
    print("Warning: pydub not available. Install with: pip install pydub")

# Transcription imports
# `from google import genai` is provided by the `google-genai` package, so
# that is the package the install hint has to name.
try:
    from google import genai
    from google.genai import types
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False
    print("Warning: Google Gemini not available. Install with: pip install google-genai")

try:
    import whisper
    WHISPER_AVAILABLE = True
except ImportError:
    WHISPER_AVAILABLE = False


class MemoryAudioProcessor:
    """Handles audio processing in memory without disk storage."""
    
    def __init__(self):
        self.max_memory_size = 100 * 1024 * 1024  # 100MB limit
    
    def normalize_audio_format(self, audio_data: bytes, target_format: str = "mp3") -> bytes:
        """
        Re-encode in-memory audio into another container/codec via pydub.

        Args:
            audio_data (bytes): Encoded audio in any format pydub can load
            target_format (str): Format name passed to pydub's export
                (for example 'mp3', 'wav', 'flac')

        Returns:
            bytes: The re-encoded audio

        Raises:
            RuntimeError: If pydub is not available or the conversion fails
        """
        if not PYDUB_AVAILABLE:
            raise RuntimeError("pydub is required for audio format conversion")

        try:
            source = io.BytesIO(audio_data)
            sink = io.BytesIO()
            # Decode from the source buffer and encode straight into the sink.
            AudioSegment.from_file(source).export(sink, format=target_format)
            return sink.getvalue()
        except Exception as e:
            raise RuntimeError(f"Audio format conversion failed: {e}")
    
    def optimize_audio_for_transcription(self, audio_data: bytes) -> bytes:
        """
        Convert audio to mono, 16 kHz, volume-normalised WAV for transcription.

        Args:
            audio_data (bytes): Encoded audio in any format pydub can load

        Returns:
            bytes: WAV data on success; the unchanged input when pydub is
            unavailable or any processing step fails (a warning is printed).
        """
        if not PYDUB_AVAILABLE:
            return audio_data

        target_channels = 1
        target_rate_hz = 16000

        try:
            segment = AudioSegment.from_file(io.BytesIO(audio_data))

            # Down-mix anything with more than one channel.
            if segment.channels > target_channels:
                segment = segment.set_channels(target_channels)

            # Resample only when the rate differs from the target.
            if segment.frame_rate != target_rate_hz:
                segment = segment.set_frame_rate(target_rate_hz)

            segment = segment.normalize()

            wav_buffer = io.BytesIO()
            segment.export(wav_buffer, format="wav")
            return wav_buffer.getvalue()

        except Exception as e:
            print(f"Warning: Audio optimization failed: {e}")
            return audio_data


class TranscriptionEngine:
    """Handles different transcription methods."""
    
    def __init__(self, method: str = "gemini"):
        """
        Initialize transcription engine.
        
        Args:
            method (str): Transcription method ('gemini', 'whisper')
        """
        self.method = method
        self.client = None
        
        # Load environment variables first
        self._load_environment()
        
        if method == "gemini":
            self._setup_gemini()
        elif method == "whisper":
            self._setup_whisper()
    
    def _load_environment(self):
        """Load environment variables from .env file."""
        from dotenv import load_dotenv
        
        # Try different possible locations for .env file
        possible_env_paths = [
            ".env",  # Current directory
            "../.env",  # One level up (for notebooks)
            "../../.env",  # Two levels up
            "/teamspace/studios/this_studio/.env",  # Your specific path
            "/teamspace/studios/this_studio/somali-radios-with-ai-for-food-security/.env",  # Project root
        ]
        
        env_loaded = False
        for env_path in possible_env_paths:
            if os.path.exists(env_path):
                try:
                    load_dotenv(dotenv_path=env_path)
                    api_key = os.getenv("GEMINI_API_KEY")
                    if api_key:
                        print(f"✓ Loaded API key from: {env_path}")
                        env_loaded = True
                        break
                except Exception as e:
                    print(f"Warning: Error loading {env_path}: {e}")
        
        if not env_loaded:
            print("Warning: No .env file found with valid GEMINI_API_KEY")
    
    def _setup_gemini(self):
        """Create the Gemini client from ``GEMINI_API_KEY`` and store it in ``self.client``.

        Raises:
            RuntimeError: If the Gemini SDK could not be imported.
            EnvironmentError: If ``GEMINI_API_KEY`` is not set (setup
                instructions are printed first).
        """
        if not GEMINI_AVAILABLE:
            # `from google import genai` ships in the google-genai package.
            raise RuntimeError("Gemini API not available. Install google-genai")

        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            print("\nERROR: GEMINI_API_KEY not found!")
            print("Quick fix:")
            print("1. Get API key from: https://aistudio.google.com")
            print("2. Create .env file in project root:")
            print('   echo \'GEMINI_API_KEY="your_key_here"\' > .env')
            print("3. Or set environment variable:")
            print('   export GEMINI_API_KEY="your_key_here"')
            raise EnvironmentError("GEMINI_API_KEY not found in environment variables")

        self.client = genai.Client(api_key=api_key)
    
    def _setup_whisper(self):
        """Load the Whisper "base" model and store it in ``self.client``.

        Raises:
            RuntimeError: If Whisper could not be imported or the model fails
                to load.
        """
        if not WHISPER_AVAILABLE:
            raise RuntimeError("Whisper not available. Install with: pip install openai-whisper")

        try:
            loaded_model = whisper.load_model("base")
        except Exception as e:
            raise RuntimeError(f"Failed to load Whisper model: {e}")
        self.client = loaded_model
    
    def transcribe_from_memory(
        self, 
        audio_data: bytes, 
        model: str = "gemini-2.0-flash"
    ) -> str:
        """
        Transcribe audio data from memory.
        
        Args:
            audio_data (bytes): Audio data in memory
            model (str): Model to use for transcription
            
        Returns:
            str: Transcribed text
            
        Raises:
            ValueError: If transcription fails or returns empty result
        """
        if self.method == "gemini":
            return self._transcribe_gemini(audio_data, model)
        elif self.method == "whisper":
            return self._transcribe_whisper(audio_data)
        else:
            raise ValueError(f"Unsupported transcription method: {self.method}")
    
    def _transcribe_gemini(self, audio_data: bytes, model: str) -> str:
        """Transcribe in-memory audio with the Gemini API.

        The MIME type sent with the audio is derived from the data's leading
        bytes instead of always claiming MP3, because callers may hand over
        WAV (or other) data after preprocessing.

        Args:
            audio_data: Encoded audio bytes.
            model: Gemini model name.

        Returns:
            The transcript text.

        Raises:
            ValueError: If the request fails or no transcript text is returned.
        """
        try:
            audio_part = types.Part.from_bytes(
                data=audio_data,
                mime_type=self._guess_audio_mime_type(audio_data),
            )

            response = self.client.models.generate_content(
                model=model,
                contents=["Generate a transcript of the speech.", audio_part]
            )

            transcript = getattr(response, 'text', None)
            if not transcript:
                raise ValueError("No transcript text returned from Gemini API")
            return transcript

        except Exception as e:
            raise ValueError(f"Gemini transcription failed: {e}") from e

    @staticmethod
    def _guess_audio_mime_type(audio_data: bytes) -> str:
        """Guess the audio MIME type from container magic bytes (default: MP3)."""
        header = bytes(audio_data[:12])
        if header[:4] == b"RIFF" and header[8:12] == b"WAVE":
            return "audio/wav"
        if header[:4] == b"fLaC":
            return "audio/flac"
        if header[:4] == b"OggS":
            return "audio/ogg"
        if header[:4] == b"FORM" and header[8:12] in (b"AIFF", b"AIFC"):
            return "audio/aiff"
        # ID3-tagged or raw MPEG frames, and anything unrecognised, keep the
        # previous behaviour of being labelled as MP3.
        return "audio/mp3"
    
    def _transcribe_whisper(self, audio_data: bytes) -> str:
        """Transcribe using Whisper."""
        try:
            # Whisper requires a file, so we use a temporary file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                tmp_file.write(audio_data)
                tmp_file_path = tmp_file.name
            
            try:
                result = self.client.transcribe(tmp_file_path)
                return result["text"]
            finally:
                # Clean up temporary file
                os.unlink(tmp_file_path)
                
        except Exception as e:
            raise ValueError(f"Whisper transcription failed: {e}")


class StreamingSoundCloudDownloader:
    """Enhanced SoundCloud downloader with on-the-fly transcription capabilities."""
    
    def __init__(
        self,
        output_dir: str = "transcriptions",
        transcription_method: str = "gemini",
        keep_failed_audio: bool = False
    ):
        """
        Set up directories, helper components, the transcription log and ffmpeg.

        Args:
            output_dir (str): Directory for transcription files; created when missing
            transcription_method (str): Method for transcription ('gemini', 'whisper')
            keep_failed_audio (bool): Flag stored on the instance as ``keep_failed_audio``
        """
        # Plain configuration and bookkeeping state.
        self.output_dir = output_dir
        self.transcription_method = transcription_method
        self.keep_failed_audio = keep_failed_audio
        self.log_file = os.path.join(output_dir, ".transcription_log.json")
        self.transcription_log = {}
        self.ffmpeg_path = None

        # Collaborators; the output directory is created before they are built.
        os.makedirs(output_dir, exist_ok=True)
        self.audio_processor = MemoryAudioProcessor()
        self.transcription_engine = TranscriptionEngine(transcription_method)

        # Restore earlier results, then look for ffmpeg.
        self.load_transcription_log()
        self.setup_ffmpeg()
    
    def load_transcription_log(self):
        """Load the transcription log from file."""
        if os.path.exists(self.log_file):
            try:
                with open(self.log_file, 'r') as f:
                    self.transcription_log = json.load(f)
            except:
                self.transcription_log = {}
    
    def save_transcription_log(self):
        """Save the transcription log to file."""
        try:
            with open(self.log_file, 'w') as f:
                json.dump(self.transcription_log, f, indent=2)
        except Exception as e:
            print(f"Warning: Could not save transcription log: {e}")
    
    def setup_ffmpeg(self):
        """Setup ffmpeg for audio processing."""
        self.ffmpeg_path = shutil.which('ffmpeg')
        if not self.ffmpeg_path:
            # Try common locations
            common_locations = [
                '/usr/bin/ffmpeg',
                '/usr/local/bin/ffmpeg',
                '/opt/homebrew/bin/ffmpeg',
            ]
            
            for location in common_locations:
                if os.path.exists(location) and os.access(location, os.X_OK):
                    self.ffmpeg_path = location
                    break
        
        if self.ffmpeg_path:
            print(f"✓ Found ffmpeg at: {self.ffmpeg_path}")
        else:
            print("⚠ ffmpeg not found. Some audio processing may fail.")
    
    @contextmanager
    def memory_buffer_manager(self):
        """Context manager for handling memory buffers safely."""
        buffer = io.BytesIO()
        try:
            yield buffer
        finally:
            buffer.close()
    
    def download_to_memory(self, url: str) -> Optional[bytes]:
        """
        Download audio from URL directly to memory buffer.
        
        Args:
            url (str): SoundCloud URL to download
            
        Returns:
            Optional[bytes]: Audio data in memory, None if failed
        """
        print(f"⬇ Downloading to memory: {url}")
        
        with self.memory_buffer_manager() as buffer:
            # Custom hook to capture data in memory
            def progress_hook(d):
                if d['status'] == 'downloading':
                    # Write chunks to buffer as they come in
                    if 'tmpfilename' in d:
                        try:
                            with open(d['tmpfilename'], 'rb') as tmp_f:
                                chunk = tmp_f.read()
                                if chunk:
                                    buffer.seek(0)
                                    buffer.write(chunk)
                        except:
                            pass
            
            # Configure yt-dlp for memory download
            ydl_opts = {
                'format': 'bestaudio/best',
                'noplaylist': True,
                'quiet': True,
                'no_warnings': True,
                'progress_hooks': [progress_hook],
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
            }
            
            if self.ffmpeg_path:
                ffmpeg_dir = os.path.dirname(self.ffmpeg_path)
                ydl_opts['ffmpeg_location'] = ffmpeg_dir
            
            try:
                # Use temporary file that we'll read and delete immediately
                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
                    ydl_opts['outtmpl'] = tmp_file.name.replace('.mp3', '.%(ext)s')
                    
                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                        ydl.download([url])
                    
                    # Read the processed file into memory
                    processed_file = tmp_file.name.replace('.mp3', '.mp3')
                    if os.path.exists(processed_file):
                        with open(processed_file, 'rb') as f:
                            audio_data = f.read()
                        
                        # Clean up temporary file immediately
                        os.unlink(processed_file)
                        return audio_data
                    
            except Exception as e:
                print(f"  ✗ Download failed: {str(e)[:100]}")
                return None
        
        return None
    
    def transcribe_audio_data(
        self,
        audio_data: bytes,
        optimize: bool = True
    ) -> Optional[str]:
        """
        Turn an in-memory audio buffer into transcript text.

        Args:
            audio_data (bytes): Encoded audio held in memory.
            optimize (bool): When true and ``PYDUB_AVAILABLE`` is truthy, the
                buffer is first passed through
                ``self.audio_processor.optimize_audio_for_transcription``.

        Returns:
            Optional[str]: Whatever
            ``self.transcription_engine.transcribe_from_memory`` returns, or
            None when any step after the size check raises.

        Raises:
            ValueError: If ``audio_data`` is empty/None or shorter than
                1000 bytes. Later failures are reported on stdout and turned
                into a None return instead of an exception.
        """
        # Guard clause: refuse buffers that cannot plausibly hold real audio.
        if not (audio_data and len(audio_data) >= 1000):
            raise ValueError("Audio data is too small or empty")

        try:
            payload = audio_data
            if optimize and PYDUB_AVAILABLE:
                print("  🔧 Optimizing audio for transcription...")
                payload = self.audio_processor.optimize_audio_for_transcription(payload)

            # Report the size of what is about to be sent to the engine.
            size_mb = len(payload) / (1024 * 1024)
            print(f"  📊 Processing {size_mb:.1f}MB audio in memory")
            if size_mb > 100:
                print("  ⚠ Large audio file, processing may take longer")

            print("  🎯 Generating transcript...")
            result = self.transcription_engine.transcribe_from_memory(payload)
        except Exception as exc:
            # Best-effort contract: log the problem and signal failure with None.
            print(f"  ✗ Transcription error: {exc}")
            return None

        return result
    
    def process_url_streaming(
        self, 
        url: str, 
        custom_filename: Optional[str] = None
    ) -> Tuple[bool, Optional[str]]:
        """
        Complete streaming workflow: download → transcribe → save text.

        The URL's MD5 hex digest is used as the key into
        ``self.transcription_log``; if that entry points at a transcript file
        that still exists, the URL is not processed again.

        Args:
            url (str): SoundCloud URL to process
            custom_filename (str, optional): Custom name for output file
                (without extension). When omitted, the name comes from
                ``self.extract_title_from_url(url)``.

        Returns:
            Tuple[bool, Optional[str]]: (success_status, transcript_file_path).
            The path is None whenever success_status is False.
        """
        # MD5 is used purely as a stable log key, not for security.
        url_hash = hashlib.md5(url.encode()).hexdigest()

        # Check if already processed
        if url_hash in self.transcription_log:
            existing_file = self.transcription_log[url_hash].get('transcript_path')
            if existing_file and os.path.exists(existing_file):
                print(f"✓ Already transcribed: {os.path.basename(existing_file)}")
                return True, existing_file

        print(f"🔄 Processing: {url}")

        # Step 1: Download to memory
        audio_data = self.download_to_memory(url)
        if not audio_data:
            return False, None
        audio_size_mb = len(audio_data) / (1024 * 1024)

        # Step 2: Transcribe from memory. transcribe_audio_data raises
        # ValueError for buffers that are too small; report that as an
        # ordinary failure so callers always get the documented tuple.
        try:
            transcript = self.transcribe_audio_data(audio_data)
        except ValueError as exc:
            print(f"  ✗ Transcription error: {exc}")
            return False, None
        if not transcript:
            return False, None

        # Step 3: Build a filesystem-safe file name
        base_filename = custom_filename or self.extract_title_from_url(url) or ""
        safe_filename = re.sub(r'[^\w\s-]', '', base_filename)
        safe_filename = re.sub(r'[-\s]+', '-', safe_filename).strip('-')
        if not safe_filename:
            # Nothing survived sanitising; avoid writing a file named ".txt".
            safe_filename = f"soundcloud_audio_{url_hash[:8]}"

        # Handle filename conflicts by suffixing a counter to the stem. The
        # stem is kept separately so a ".txt" appearing elsewhere in the path
        # (for example in the output directory name) is never touched.
        path_stem = os.path.join(self.output_dir, safe_filename)
        transcript_file = f"{path_stem}.txt"
        counter = 1
        while os.path.exists(transcript_file):
            transcript_file = f"{path_stem}_{counter}.txt"
            counter += 1

        # Save transcript with a small metadata header
        with open(transcript_file, 'w', encoding='utf-8') as f:
            f.write("# Transcript\n")
            f.write(f"**Source**: {url}\n")
            f.write(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"**Method**: {self.transcription_method}\n\n")
            f.write("---\n\n")
            f.write(transcript)

        # Update log so the next run can skip this URL
        self.transcription_log[url_hash] = {
            'url': url,
            'transcript_path': transcript_file,
            'processing_date': datetime.now().isoformat(),
            'method': self.transcription_method,
            'file_size_mb': audio_size_mb
        }
        self.save_transcription_log()

        print(f"  ✓ Transcript saved: {os.path.basename(transcript_file)}")
        return True, transcript_file

            
    def extract_title_from_url(self, url: str) -> str:
        """
        Extract a reasonable title from SoundCloud URL.

        The title reported by yt-dlp is preferred. If that lookup fails or
        yields no title, the last two path segments of the URL are joined
        with an underscore; if the URL has fewer than two segments a
        timestamp-based name is returned.

        Args:
            url (str): URL to derive a title from.

        Returns:
            str: A non-empty title (not yet sanitised for use as a file name).
        """
        try:
            # Try to get title from yt-dlp info
            ydl_opts = {'quiet': True, 'no_warnings': True}
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
            title = info.get('title') if info else None
            if title:
                return title
        except Exception:
            # Best-effort lookup: any failure simply means we fall back to
            # the URL. KeyboardInterrupt/SystemExit are deliberately not
            # swallowed here.
            pass

        # Fallback: extract from URL, ignoring any query string/fragment and
        # empty segments produced by "//" or a trailing slash.
        trimmed = url.split('?', 1)[0].split('#', 1)[0]
        parts = [segment for segment in trimmed.split('/') if segment]
        if len(parts) >= 2:
            return f"{parts[-2]}_{parts[-1]}"

        # Final fallback
        return f"soundcloud_audio_{int(time.time())}"

    
    def process_date_range_streaming(
        self,
        profile_url: str,
        start_date: Union[str, datetime],
        end_date: Union[str, datetime],
        batch_size: int = 5
    ) -> Dict[str, List[str]]:
        """
        Process date range with streaming transcription.

        One candidate URL per day is obtained from
        ``self.generate_urls_for_range`` and handed to
        ``self.process_url_streaming``. URLs whose transcript is already
        recorded in ``self.transcription_log`` (and still exists on disk) are
        reported under ``'skipped'`` without being processed again.

        Args:
            profile_url (str): SoundCloud profile URL
            start_date: Start date for range ('YYYY-MM-DD' string or datetime)
            end_date: End date for range ('YYYY-MM-DD' string or datetime)
            batch_size (int): Number of files to process before memory cleanup

        Returns:
            Dict[str, List[str]]: ``'successful'`` holds new transcript paths,
            ``'skipped'`` holds paths of transcripts that already existed,
            ``'failed'`` holds the URLs that could not be processed.

        Raises:
            ValueError: If a date string is not in 'YYYY-MM-DD' format.
        """
        import gc

        # Parse dates if strings
        if isinstance(start_date, str):
            start_date = datetime.strptime(start_date, '%Y-%m-%d')
        if isinstance(end_date, str):
            end_date = datetime.strptime(end_date, '%Y-%m-%d')

        print(f"\n🎵 Streaming transcription: {start_date.date()} to {end_date.date()}")
        print(f"📁 Transcripts will be saved to: {self.output_dir}")
        print(f"🧠 Using {self.transcription_method} for transcription\n")

        # Generate URLs
        urls = self.generate_urls_for_range(profile_url, start_date, end_date)
        results = {'successful': [], 'failed': [], 'skipped': []}

        if not urls:
            print("No URLs generated for date range")
            return results

        total = len(urls)
        processed_since_cleanup = 0

        for i, (date, url) in enumerate(urls, 1):
            print(f"[{i}/{total}] {date.date()}")

            # Already-transcribed URLs go to 'skipped'. The lookup uses the
            # same MD5-of-URL key that process_url_streaming writes to the log.
            url_hash = hashlib.md5(url.encode()).hexdigest()
            existing = self.transcription_log.get(url_hash, {}).get('transcript_path')
            if existing and os.path.exists(existing):
                print(f"✓ Already transcribed: {os.path.basename(existing)}")
                results['skipped'].append(existing)
                continue  # nothing was fetched, so no cleanup or rate limit

            try:
                success, transcript_path = self.process_url_streaming(url)

                if success and transcript_path:
                    results['successful'].append(transcript_path)
                else:
                    results['failed'].append(url)

            except Exception as e:
                # One bad URL must not abort the rest of the range.
                print(f"  ✗ Processing error: {e}")
                results['failed'].append(url)

            # Memory management: cleanup every batch_size processed items
            processed_since_cleanup += 1
            if processed_since_cleanup >= batch_size:
                print(f"  🧹 Memory cleanup after {batch_size} items...")
                gc.collect()
                processed_since_cleanup = 0

            # Rate limiting between requests; pointless after the last URL
            if i < total:
                time.sleep(2)

        # Print final summary
        self.print_processing_summary(results, total)
        return results

    
    def generate_urls_for_range(
        self, 
        profile_url: str, 
        start_date: datetime, 
        end_date: datetime,
        slug_prefix: str = "idaacadda"
    ) -> List[Tuple[datetime, str]]:
        """
        Generate potential URLs for a date range.

        One URL is produced per day from ``start_date`` to ``end_date``
        inclusive, shaped like
        ``<profile_url>/<slug_prefix>-DD-mon-YYYY`` with a lowercase English
        three-letter month (e.g. ``idaacadda-01-jul-2024``).

        Args:
            profile_url (str): Profile URL; trailing slashes are ignored.
            start_date (datetime): First day of the range.
            end_date (datetime): Last day of the range (inclusive).
            slug_prefix (str): Leading part of the track slug.

        Returns:
            List[Tuple[datetime, str]]: (day, url) pairs in chronological
            order; empty when ``start_date`` is after ``end_date``.
        """
        # Fixed English abbreviations: strftime('%b') follows the process
        # locale and would produce wrong slugs under a non-English locale.
        month_abbreviations = (
            'jan', 'feb', 'mar', 'apr', 'may', 'jun',
            'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        )
        base_url = profile_url.rstrip('/')

        urls = []
        current_date = start_date
        while current_date <= end_date:
            month = month_abbreviations[current_date.month - 1]
            url = f"{base_url}/{slug_prefix}-{current_date.day:02d}-{month}-{current_date.year}"
            urls.append((current_date, url))
            current_date += timedelta(days=1)

        return urls

    
    def print_processing_summary(self, results: Dict[str, List[str]], total_urls: int):
        """
        Write a boxed end-of-run report to stdout.

        Args:
            results: Mapping with 'successful', 'failed' and 'skipped' lists;
                only the length of each list is reported.
            total_urls (int): Number of URLs that were in the run.
        """
        ok_count = len(results['successful'])
        fail_count = len(results['failed'])
        skip_count = len(results['skipped'])

        rule = '=' * 60
        report = [
            f"\n{rule}",
            "🎵 STREAMING TRANSCRIPTION SUMMARY",
            rule,
            f"  📊 Total URLs processed: {total_urls}",
            f"  ✅ Successfully transcribed: {ok_count}",
            f"  ⏭️  Skipped (already done): {skip_count}",
            f"  ❌ Failed: {fail_count}",
            f"  📁 Transcripts saved to: {self.output_dir}",
            "  💾 Memory-efficient processing: ✓",
            f"{rule}\n",
        ]
        # One print call; the emitted text matches line-by-line printing.
        print("\n".join(report))

    
    def validate_url(self, url: str) -> bool:
        """
        Report whether ``url`` starts like a SoundCloud track URL.

        Accepted shape: http(s) scheme, optional ``www.``, the
        ``soundcloud.com`` host, then two path segments made of word
        characters or hyphens. Only the beginning of the string is checked.
        """
        matched = re.match(
            r'^https?://(?:www\.)?soundcloud\.com/[\w-]+/[\w-]+',
            url,
        )
        return matched is not None

    
    def batch_transcribe_existing_files(
        self, 
        audio_dir: str, 
        batch_size: int = 3
    ) -> Dict[str, str]:
        """
        Transcribe existing MP3 files with memory-efficient batch processing.

        Every ``*.mp3`` directly inside ``audio_dir`` is read into memory,
        passed to ``self.transcribe_audio_data`` and, on success, written to
        ``<self.output_dir>/<stem>.txt`` (an existing file of that name is
        overwritten). Files are handled in sorted name order.

        Args:
            audio_dir (str): Directory containing MP3 files
            batch_size (int): Number of files to process before memory
                cleanup; zero or a negative value disables the periodic
                cleanup.

        Returns:
            Dict[str, str]: Per file name: "success", "failed" (no transcript
            was produced) or "error: <message>" (an exception was raised).
        """
        import gc

        # Sorted so progress output and result order are reproducible.
        mp3_files = sorted(Path(audio_dir).glob("*.mp3"))
        total = len(mp3_files)
        results = {}

        print(f"📁 Found {total} MP3 files to transcribe")
        print(f"📝 Batch size: {batch_size} files")

        for i, mp3_file in enumerate(mp3_files, 1):
            print(f"\n[{i}/{total}] Processing {mp3_file.name}")

            try:
                # Read file to memory
                with open(mp3_file, 'rb') as f:
                    audio_data = f.read()

                # Transcribe, then drop the buffer before writing output
                transcript = self.transcribe_audio_data(audio_data)
                del audio_data

                if transcript:
                    # Save transcript
                    output_file = os.path.join(
                        self.output_dir, 
                        f"{mp3_file.stem}.txt"
                    )

                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(f"# Transcript for {mp3_file.name}\n")
                        f.write(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
                        f.write(transcript)

                    results[mp3_file.name] = "success"
                    print(f"  ✓ Saved transcript: {os.path.basename(output_file)}")
                else:
                    results[mp3_file.name] = "failed"

            except Exception as e:
                # Keep going: one unreadable/untranscribable file must not
                # stop the batch.
                results[mp3_file.name] = f"error: {e}"
                print(f"  ✗ Error: {e}")

            # Batch cleanup lives outside the try block so that a bad
            # batch_size (e.g. 0) can never overwrite a file's real result
            # with a ZeroDivisionError message.
            if batch_size > 0 and i % batch_size == 0:
                print(f"  🧹 Memory cleanup after {batch_size} files...")
                gc.collect()
                time.sleep(1)

        return results


# Enhanced usage functions
def stream_transcribe_date_range(
    profile_url: str,
    start_date: str,
    end_date: str,
    output_dir: str = "streaming_transcripts",
    method: str = "gemini"
) -> Dict[str, List[str]]:
    """
    Convenience entry point: transcribe every episode in a date range.

    Builds a fresh ``StreamingSoundCloudDownloader`` and delegates to its
    ``process_date_range_streaming`` method.

    Args:
        profile_url (str): SoundCloud profile URL
        start_date (str): First day, 'YYYY-MM-DD'
        end_date (str): Last day, 'YYYY-MM-DD'
        output_dir (str): Where transcripts are written
        method (str): Transcription method ('gemini' or 'whisper')

    Returns:
        Dict[str, List[str]]: The result mapping produced by
        ``process_date_range_streaming``.
    """
    return StreamingSoundCloudDownloader(
        output_dir=output_dir,
        transcription_method=method,
    ).process_date_range_streaming(
        profile_url=profile_url,
        start_date=start_date,
        end_date=end_date,
    )

def stream_transcribe_single_url(
    url: str, 
    output_dir: str = "streaming_transcripts",
    method: str = "gemini"
) -> Tuple[bool, Optional[str]]:
    """
    Convenience entry point: transcribe one SoundCloud URL.

    Builds a fresh ``StreamingSoundCloudDownloader`` and delegates to its
    ``process_url_streaming`` method.

    Args:
        url (str): SoundCloud URL to transcribe
        output_dir (str): Where the transcript is written
        method (str): Transcription method to use

    Returns:
        Tuple[bool, Optional[str]]: (success, transcript_file_path)
    """
    return StreamingSoundCloudDownloader(
        output_dir=output_dir,
        transcription_method=method,
    ).process_url_streaming(url)

def batch_transcribe_existing_mp3s(
    input_dir: str,
    output_dir: str = "batch_transcripts",
    method: str = "gemini",
    batch_size: int = 3
) -> Dict[str, str]:
    """
    Convenience entry point: transcribe MP3 files that are already on disk.

    Builds a fresh ``StreamingSoundCloudDownloader`` and delegates to its
    ``batch_transcribe_existing_files`` method.

    Args:
        input_dir (str): Directory containing MP3 files
        output_dir (str): Where transcripts are written
        method (str): Transcription method to use
        batch_size (int): Files to process before memory cleanup

    Returns:
        Dict[str, str]: Outcome per file name
    """
    return StreamingSoundCloudDownloader(
        output_dir=output_dir,
        transcription_method=method,
    ).batch_transcribe_existing_files(input_dir, batch_size)

# Memory optimization utilities
def get_memory_usage() -> float:
    """
    Get current memory usage in MB.

    Reports the resident set size of the current process. ``psutil`` is used
    when it is installed; otherwise the value is read from
    ``/proc/self/statm`` (available on Linux).

    Returns:
        float: Memory usage in megabytes

    Raises:
        ImportError: If ``psutil`` is missing and the ``/proc`` fallback is
            not usable on this platform.
    """
    try:
        import psutil
    except ImportError as missing_psutil:
        # psutil is an optional extra; fall back to the kernel's own figure.
        try:
            with open('/proc/self/statm') as statm:
                # Second field of statm is the resident size in pages.
                resident_pages = int(statm.read().split()[1])
            page_size = os.sysconf('SC_PAGE_SIZE')
        except (OSError, ValueError, IndexError, AttributeError):
            # No usable fallback: surface the original, more helpful error.
            raise missing_psutil
        return resident_pages * page_size / 1024 / 1024

    return psutil.Process().memory_info().rss / 1024 / 1024

def cleanup_memory():
    """Run a full garbage-collection pass and announce it on stdout."""
    from gc import collect

    collect()
    print("🧹 Memory cleanup completed")


# Example usage demonstrations
# NOTE: everything below runs at module top level, so executing this cell (or
# importing the file) starts real processing immediately.
print("=== USAGE EXAMPLES ===\n")

# Example 1: Stream transcribe a date range
# Processes the two days 2024-07-01 and 2024-07-02 of the given profile and
# writes the transcripts to "data". `method` is left at its default ("gemini").
print("Example 1: Stream transcribe date range")
print("-" * 40)
example_results = stream_transcribe_date_range(
    profile_url="https://soundcloud.com/radio-ergo",
    start_date="2024-07-01", 
    end_date="2024-07-02",
    output_dir="data"
)

# Example 2: Single URL streaming transcription  
# (disabled; uncomment to transcribe one URL into "single_transcripts")
# print("\nExample 2: Single URL transcription")
# print("-" * 40)
# success, transcript_path = stream_transcribe_single_url(
#     url="https://soundcloud.com/radio-ergo/idaacadda-01-jul-2024",
#     output_dir="single_transcripts"
# )

# Example 3: Batch process existing files
# Transcribes every *.mp3 found directly in "./data" and writes the text files
# to "batch_transcripts"; `batch_results` maps file name -> outcome string.
print("\nExample 3: Batch transcribe existing MP3s")
print("-" * 40)
batch_results = batch_transcribe_existing_mp3s(
    input_dir="./data",  # Your existing MP3 directory
    output_dir="batch_transcripts",
    batch_size=3
)

# Example 4: Memory monitoring during processing
# Prints the process's resident memory in MB, then forces a garbage collection.
print("\nExample 4: Memory monitoring")
print("-" * 40)
print(f"Current memory usage: {get_memory_usage():.1f} MB")
cleanup_memory()

=== USAGE EXAMPLES ===

Example 1: Stream transcribe date range
----------------------------------------
✓ Loaded API key from: ../.env
✓ Found ffmpeg at: /usr/bin/ffmpeg

🎵 Streaming transcription: 2024-07-01 to 2024-07-02
📁 Transcripts will be saved to: data
🧠 Using gemini for transcription

[1/2] 2024-07-01
🔄 Processing: https://soundcloud.com/radio-ergo/idaacadda-01-jul-2024
⬇ Downloading to memory: https://soundcloud.com/radio-ergo/idaacadda-01-jul-2024
  🔧 Optimizing audio for transcription...                                
  📊 Processing 109.6MB audio in memory
  ⚠ Large audio file, processing may take longer
  🎯 Generating transcript...
  ✓ Transcript saved: IDAACADDA-01-JUL-2024.txt
[2/2] 2024-07-02
🔄 Processing: https://soundcloud.com/radio-ergo/idaacadda-02-jul-2024
⬇ Downloading to memory: https://soundcloud.com/radio-ergo/idaacadda-02-jul-2024
  🔧 Optimizing audio for transcription...                                
  📊 Processing 108.7MB audio in memory
  ⚠ Large audio f