In [None]:
from moviepy import VideoFileClip
import os

def extract_audio_from_video(video_path="vid.mp4", audio_path="aud.wav", codec=None):
    """
    Extract the audio track of a video file and write it to ``audio_path``.

    Args:
        video_path: Path to the input video file.
        audio_path: Path of the audio file to create. Its extension decides the
            output format when ``codec`` is None.
        codec: Optional ffmpeg audio codec name forwarded to MoviePy's
            ``write_audiofile``. The default (None) lets MoviePy choose a codec
            that matches the extension of ``audio_path``; previously
            'libmp3lame' was always forced, which put MP3 data inside files
            named ``.wav``.

    Returns:
        The video's audio clip object on success, or None when the video has no
        audio track or an error occurred (errors are printed, not raised).
        The parent video clip is closed before this function returns, so treat
        the returned object as a success marker rather than a readable clip
        (NOTE(review): closing the video is expected to close its audio reader
        as well - confirm against the installed MoviePy version).
    """
    # Pre-initialise so the `finally` block can tell whether opening succeeded.
    video_clip = None
    try:
        video_clip = VideoFileClip(video_path)

        audio = video_clip.audio
        if audio is None:
            print(f"Warning: The video file '{video_path}' does not contain any audio.")
            return None  # the clip is released once, in `finally`

        # codec=None -> MoviePy derives the codec from the file extension.
        audio.write_audiofile(audio_path, codec=codec)
        print(f"Audio extracted and saved to {audio_path}")

        return audio
    except FileNotFoundError as e:
        print(f"File not found: {e}")
    except OSError as e:
        print(f"OS error: {e}")
        # If it's related to ffmpeg, provide more detailed guidance
        if "ffmpeg" in str(e).lower():
            print("\nThis appears to be an ffmpeg-related error.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Single release point for every path (success, no audio, or error).
        if video_clip is not None:
            video_clip.close()
    return None

# Run the extraction on the sample video and print whatever the function returned.
print(extract_audio_from_video(video_path="vid.mp4", audio_path="aud.wav"))


{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso6iso2avc1mp41', 'encoder': 'Lavf60.16.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 436, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'handler_name': 'ISO Media file produced by Google Inc.'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'handler_name': 'ISO Media file produced by Google Inc.'}}], 'input_number': 0}], 'duration': 32.33, 'bitrate': 567, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [1280, 720], 'video_bitrate': 436, 'video_fps': 30.0, 'default_audio_input_number': 0, 'default_audio_stream_number': 1, 'au

                                                                    

MoviePy - Done.
Audio extracted and saved to aud.wav
<moviepy.audio.io.AudioFileClip.AudioFileClip object at 0x78fe90e29f40>




In [None]:
import os
import torch
import torchaudio
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB
from moviepy import AudioFileClip

def separate_voice_and_music(audio_path="aud.wav", output_dir="audios"):
    """
    Separate voice and music from audio using torchaudio's Hybrid Demucs model.

    The pretrained model splits the mix into the stems named in
    ``model.sources``. The "vocals" stem is written to ``vocals.wav`` and the
    sum of every other stem is written to ``music.wav``.

    Args:
        audio_path: Path to the audio file
        output_dir: Directory to save the separated audio files (created if
            it does not exist)

    Returns:
        Dictionary with the keys 'vocals' and 'music' mapping to the paths of
        the saved files, or None if anything fails (the error and its
        traceback are printed instead of being raised).

    Notes:
        - The input is resampled to the bundle's sample rate when it differs,
          and both output files are saved at that rate.
        - The whole recording is separated in one forward pass; very long
          files may need to be processed in chunks to fit in memory.
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Load the Demucs model from torchaudio
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = HDEMUCS_HIGH_MUSDB.get_model()
        model.to(device)
        model_rate = HDEMUCS_HIGH_MUSDB.sample_rate

        print(f"Separating voice and music using Demucs on {device}... This may take a while.")

        # Load the audio file
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.numel() == 0:
            raise ValueError(f"'{audio_path}' contains no audio samples")

        # The pretrained weights expect audio at the bundle's sample rate.
        if sample_rate != model_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, model_rate)

        # The model works on stereo input: duplicate mono, keep the first two
        # channels of anything wider.
        if waveform.shape[0] == 1:
            waveform = torch.cat([waveform, waveform], dim=0)
        elif waveform.shape[0] > 2:
            waveform = waveform[:2]

        # Waveform needs to be on the same device as model
        waveform = waveform.to(device)

        # Normalise the mix before inference and undo it afterwards (the usual
        # Demucs pre/post-processing). Fall back to a unit scale for silent or
        # one-sample input so we never divide by zero / NaN.
        reference = waveform.mean(dim=0)
        offset = reference.mean()
        scale = reference.std()
        if not torch.isfinite(scale) or scale <= 0:
            scale = torch.ones_like(scale)
        normalized = (waveform - offset) / scale

        # forward() takes (batch, channel, time) and returns
        # (batch, source, channel, time); gradients are not needed.
        with torch.no_grad():
            stems = model(normalized.unsqueeze(0))[0]
        stems = stems * scale + offset

        # Look the stems up by name instead of assuming their order.
        stem_names = list(model.sources)
        vocals_index = stem_names.index("vocals")
        accompaniment = [i for i in range(len(stem_names)) if i != vocals_index]
        separated = {
            "vocals": stems[vocals_index],
            "music": stems[accompaniment].sum(dim=0),  # everything except the voice
        }

        output_files = {}
        for source_name, source_audio in separated.items():
            output_path = os.path.join(output_dir, f"{source_name}.wav")
            # Move back to CPU for saving
            torchaudio.save(output_path, source_audio.cpu(), model_rate)
            output_files[source_name] = output_path
            print(f"Saved {source_name} to {output_path}")

        # Return paths to separated files
        return {
            'vocals': output_files.get('vocals'),
            'music': output_files.get('music')
        }

    except Exception as e:
        print(f"Error during separation: {e}")
        import traceback
        traceback.print_exc()
        return None
    
# Split the extracted audio into vocals and music; the cell shows the returned paths.
separate_voice_and_music(audio_path="aud.wav", output_dir="audios")

 79%|███████▉  | 253M/319M [01:08<00:21, 3.25MB/s] 