In [1]:
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav

In [2]:
# Set parameters for recording
duration = 5  # seconds
sample_rate = 44100  # Hz

def record_audio(duration, sample_rate):
    print("Recording started...")
    # Record audio using the sounddevice library
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float64')
    sd.wait()  # Wait until the recording is finished
    print("Recording finished.")
    return audio_data

# Function to save recorded audio to a .wav file
def save_audio(filename, audio_data, sample_rate):
    # Normalize the audio to 16-bit PCM format
    audio_data = (audio_data * 32767).astype(np.int16)
    wav.write(filename, sample_rate, audio_data)
    print(f"Audio saved to {filename}")

Recording started...
Recording finished.
Audio saved to recorded_audio.wav


In [None]:
# Record and save the audio
audio_data = record_audio(duration, sample_rate)
save_audio("recorded_audio.wav", audio_data, sample_rate)

In [4]:
def play_audio(filename):
    # Read the audio file
    sample_rate, audio_data = wav.read(filename)
    
    # Play the audio using sounddevice
    sd.play(audio_data, samplerate=sample_rate)
    sd.wait()  # Wait until playback is finished
    print("Playback finished.")


Playback finished.


In [16]:
# Play the saved audio file
play_audio("recorded_audio.wav")

Playback finished.


### Input Audio

In [10]:
input_audio = "recorded_audio.wav"

### Basic Noise Reduction using pydub

In [None]:
from pydub import AudioSegment
from pydub.playback import play

# Load the audio file
audio = AudioSegment.from_file("recorded_audio.wav")

# Apply low-pass filter to remove high-frequency noise
filtered_audio = audio.low_pass_filter(3000)  # Cut off frequencies above 3000Hz

# Export the filtered audio
filtered_audio.export("output_audio1.wav", format="wav")

# Play the filtered audio
play(filtered_audio)

In [39]:
import noisereduce as nr
import librosa
import soundfile as sf

# Load the noisy audio file
data, rate = librosa.load(input_audio, sr=None)

# Perform noise reduction using spectral subtraction
reduced_noise_audio = nr.reduce_noise(y=data, sr=rate)

# Save the reduced noise audio
sf.write("output_audio2.wav", reduced_noise_audio, rate)
play_audio("output_audio2.wav")

Playback finished.


In [35]:
import webrtcvad
import wave

# Load the audio file
def read_wave(path):
    with wave.open(path, 'rb') as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1  # mono
        sample_width = wf.getsampwidth()
        assert sample_width == 2  # 16-bit
        sample_rate = wf.getframerate()
        print(f'Sample rate: {sample_rate}')
        assert sample_rate in (8000, 16000, 32000, 48000)  # common sampling rates
        frames = wf.readframes(wf.getnframes())
    return frames, sample_rate

# Save the output audio
def write_wave(path, audio, sample_rate):
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)

# Perform Voice Activity Detection
def vad_filter(input_file, output_file, aggressiveness=3):
    audio, sample_rate = read_wave(input_file)
    vad = webrtcvad.Vad(aggressiveness)  # aggressiveness 0-3
    frame_duration = 10  # in ms
    frame_size = int(sample_rate * frame_duration / 1000 * 2)
    segments = []
    
    for i in range(0, len(audio), frame_size):
        frame = audio[i:i + frame_size]
        if vad.is_speech(frame, sample_rate):
            segments.append(frame)

    filtered_audio = b''.join(segments)
    write_wave(output_file, filtered_audio, sample_rate)

# Run VAD

# Load the audio file (original sample rate is detected automatically)
audio_data, original_sample_rate = librosa.load(input_audio, sr=None)

# Resample the audio to the desired sample rate (e.g., 16000 Hz)
target_sample_rate = 32000
audio_data_resampled = librosa.resample(audio_data, orig_sr=original_sample_rate, target_sr=target_sample_rate)

# Save the resampled audio
sf.write("output_audio_data_resampled.wav", audio_data_resampled, target_sample_rate)

vad_filter("output_audio_data_resampled.wav", 'output_audio3.wav')
play_audio("output_audio3.wav")

Sample rate: 32000
Playback finished.


In [None]:
import onnxruntime as ort
import numpy as np
import librosa
import soundfile as sf

# Load audio file
audio, sr = librosa.load(input_audio, sr=16000)

# Load pre-trained DNS model (download the model from Microsoft's DNS Challenge)
model = ort.InferenceSession("dns_model.onnx")

# Define frame size for real-time processing
frame_size = 512  # Example frame size

# Process the audio in frames
def denoise_audio(audio, model):
    processed_audio = []
    
    for i in range(0, len(audio), frame_size):
        frame = audio[i:i + frame_size]
        if len(frame) < frame_size:
            frame = np.pad(frame, (0, frame_size - len(frame)))
        frame = frame.reshape(1, 1, frame_size).astype(np.float32)
        
        # Model inference
        denoised_frame = model.run(None, {"input": frame})[0].flatten()
        processed_audio.extend(denoised_frame)

    return np.array(processed_audio)

# Denoise the audio
denoised_audio = denoise_audio(audio, model)

# Save the denoised audio
sf.write("output_audio4", denoised_audio, sr)


In [None]:
from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio

model = separator.from_hparams(source="speechbrain/sepformer-dns4-16k-enhancement", savedir='pretrained_models/sepformer-dns4-16k-enhancement')

# for custom file, change path
est_sources = model.separate_file(path='speechbrain/sepformer-dns4-16k-enhancement/example_dns4-16k.wav') 

torchaudio.save("enhanced_dns4-16k.wav", est_sources[:, :, 0].detach().cpu(), 16000)


In [None]:
import sounddevice as sd
import numpy as np
from speechbrain.pretrained import SpectralMaskGenerator

# Load the DNS model (make sure to adjust for your specific model and settings)
dns_model = SpectralMaskGenerator.from_hparams(source="speechbrain/VoiceEnhancement", savedir="dns_model")

# Set parameters
sample_rate = 16000  # Common sample rate for speech processing
frame_duration = 0.02  # Frame duration in seconds
frame_size = int(sample_rate * frame_duration)

def callback(indata, frames, time, status):
    if status:
        print(status)
    # Convert input to tensor and perform noise suppression
    noisy_audio = torch.tensor(indata.T)
    enhanced_audio = dns_model(noisy_audio)

    # Output the enhanced audio
    sd.play(enhanced_audio.detach().numpy().T, samplerate=sample_rate)

# Start the input stream
with sd.InputStream(callback=callback, channels=1, samplerate=sample_rate):
    print("Press Ctrl+C to stop")
    sd.sleep(100000)  # Keep the stream open for a while


### Audio API

In [3]:
# Import necessary libraries
import requests  # Used for making HTTP requests
import json  # Used for working with JSON data

# Define constants for the script
CHUNK_SIZE = 1024  # Size of chunks to read/write at a time
XI_API_KEY = "sk_632eb96bd5ff0ac2d1695753c761f837209eba3287b3c6a4"  # Your API key for authentication
VOICE_ID = "cgSgspJ2msm6clMCkdW9"  # ID of the voice model to use
TEXT_TO_SPEAK = "Hello,my name is charlie.I am your personal assistant."  # Text you want to convert to speech
OUTPUT_PATH_MP3 = "output.mp3"  # Path to save the output audio file

# Construct the URL for the Text-to-Speech API request
tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"

# Set up headers for the API request, including the API key for authentication
headers = {
    "Accept": "application/json",
    "xi-api-key": XI_API_KEY
}

# Set up the data payload for the API request, including the text and voice settings
data = {
    "text": TEXT_TO_SPEAK,
    "model_id": "eleven_multilingual_v2",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.8,
        "style": 0.0,
        "use_speaker_boost": True
    }
}
# Make the POST request to the TTS API with headers and data, enabling streaming response
response = requests.post(tts_url, headers=headers, json=data, stream=True)

# Check if the request was successful
if response.ok:
    # Open the output file in write-binary mode
    with open(OUTPUT_PATH_MP3, "wb") as f:
        # Read the response in chunks and write to the file
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            f.write(chunk)
else:
    # Print the error message if the request was not successful
    print(response.text)


In [12]:
# Import necessary libraries
import requests  # Used for making HTTP requests
import json  # Used for working with JSON data

# Define constants for the script
CHUNK_SIZE = 1024  # Size of chunks to read/write at a time
XI_API_KEY = "sk_632eb96bd5ff0ac2d1695753c761f837209eba3287b3c6a4"  # Your API key for authentication
VOICE_ID = "cgSgspJ2msm6clMCkdW9"  # ID of the voice model to use
TEXT_TO_SPEAK = 'Vision AI, also known as "computer vision," is a field of artificial intelligence that enables computers to interpret and analyze visual data like images and videos, essentially allowing them to "see" and understand their surroundings, similar to how humans do, by identifying and classifying objects within the visual input; this technology is used for tasks like facial recognition, object detection, image classification, and more, with applications in various industries like retail, healthcare, and security.'  # Text you want to convert to speech
OUTPUT_PATH = "output2.mp3"  # Path to save the output audio file

# Construct the URL for the Text-to-Speech API request
tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"

# Set up headers for the API request, including the API key for authentication
headers = {
    "Accept": "application/json",
    "xi-api-key": XI_API_KEY
}

# Set up the data payload for the API request, including the text and voice settings
data = {
    "text": TEXT_TO_SPEAK,
    "model_id": "eleven_multilingual_v2",
    "voice_settings": {
        "stability": 0.5,
        "similarity_boost": 0.8,
        "style": 0.0,
        "use_speaker_boost": True
    }
}
# Make the POST request to the TTS API with headers and data, enabling streaming response
response = requests.post(tts_url, headers=headers, json=data, stream=True)

# Check if the request was successful
if response.ok:
    # Open the output file in write-binary mode
    with open(OUTPUT_PATH, "wb") as f:
        # Read the response in chunks and write to the file
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            f.write(chunk)
    # Inform the user of success
    print("Audio stream saved successfully.")
else:
    # Print the error message if the request was not successful
    print(response.text)


Audio stream saved successfully.


In [6]:
from pydub import AudioSegment
from pydub.playback import play

In [4]:
import simpleaudio as sa

In [None]:
# Play the audio file using simpleaudio
wave_obj = sa.WaveObject.from_wave_file("output_audio1.wav")
play_obj = wave_obj.play()
play_obj.wait_done()  # Wait until playback finishes

### PyAudio

In [4]:
from pydub import AudioSegment
import pyaudio
import wave

# Convert to WAV
OUTPUT_PATH_MP3 = "output.mp3"
OUTPUT_PATH_MKV = "output.mkv"
audio = AudioSegment.from_file(OUTPUT_PATH_MP3, format="mp3")
audio.export(OUTPUT_PATH_MKV, format="wav")
print(f"Audio converted and saved as {OUTPUT_PATH_MKV}")



# Set chunk size of 1024 samples per data frame
chunk = 1024  

# Open the sound file 
wf = wave.open(OUTPUT_PATH_MKV, 'rb')

# Create an interface to PortAudio
p = pyaudio.PyAudio()

# Open a .Stream object to write the WAV file to
# 'output = True' indicates that the sound will be played rather than recorded
stream = p.open(format = p.get_format_from_width(wf.getsampwidth()),
                channels = wf.getnchannels(),
                rate = wf.getframerate(),
                output = True)

# Read data in chunks
data = wf.readframes(chunk)

# Play the sound by writing the audio data to the stream
while data != '':
    stream.write(data)
    data = wf.readframes(chunk)

# Close and terminate the stream
stream.close()
p.terminate()

Audio converted and saved as output.mkv


KeyboardInterrupt: 

### PyDub

In [1]:
from pydub import AudioSegment

# Point to ffmpeg executable directly
# AudioSegment.converter = r"C:\ffmpeg\ffmpeg-master-latest-win64-gpl\bin\ffmpeg.exe"  
# AudioSegment.ffprobe = r"C:\ffmpeg\ffmpeg-master-latest-win64-gpl\bin\ffprobe.exe"

# Convert MP3 to WAV
audio = AudioSegment.from_file("output.mp3", format="mp3")
audio.export("output.wav", format="wav")
print("Conversion complete!")


Conversion complete!


In [13]:
import requests

def text_to_speech(text, api_key, voice_id, output_path="output.mp3"):
    """
    Converts text to speech using the ElevenLabs Text-to-Speech API.

    Args:
        text (str): The text to convert to speech.
        api_key (str): Your ElevenLabs API key.
        voice_id (str): The voice ID to use for speech synthesis.
        output_path (str): The path where the audio file will be saved. Defaults to "output.mp3".

    Returns:
        str: Path to the generated audio file.

    Raises:
        Exception: If the API request fails or encounters an error.
    """
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
    
    headers = {
        "Accept": "application/json",
        "xi-api-key": api_key
    }

    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.8,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }

    try:
        response = requests.post(tts_url, headers=headers, json=data, stream=True)
        if response.ok:
            # Save the audio to the specified file
            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print(f"Audio saved to {output_path}")
            return output_path
        else:
            # Raise an exception if the API request fails
            raise Exception(f"Error: {response.status_code} - {response.text}")
    except requests.RequestException as e:
        raise Exception(f"Request failed: {e}")

# Example usage
# if __name__ == "__main__":
#     # Replace with your actual API key and voice ID
#     XI_API_KEY = "sk_632eb96bd5ff0ac2d1695753c761f837209eba3287b3c6a4"
#     VOICE_ID = "cgSgspJ2msm6clMCkdW9"
    
#     # Input text
#     input_text = "Hello, my name is Charlie. I am your personal assistant.What i can help you"
    
#     # Generate speech
#     audio_file_path = text_to_speech(input_text, XI_API_KEY, VOICE_ID)
#     print(f"Generated audio file: {audio_file_path}")


In [15]:
from pydub import AudioSegment
import pyaudio
import wave

def convert_and_play_audio(input_audio_path, output_audio_path="output.wav"):
    """
    Converts an audio file to WAV format and plays it.

    Args:
        input_audio_path (str): Path to the input audio file.
        output_audio_path (str): Path to save the converted WAV file. Defaults to "output.wav".

    Returns:
        str: Path to the saved WAV file.

    Raises:
        Exception: If any error occurs during processing or playback.
    """
    try:
        # Convert to WAV format
        audio = AudioSegment.from_file(input_audio_path)
        audio.export(output_audio_path, format="wav")
        print(f"Audio converted and saved as {output_audio_path}")

        # Play the WAV file
        chunk = 1024  # Chunk size for reading audio

        # Open the sound file
        wf = wave.open(output_audio_path, 'rb')

        # Create an interface to PortAudio
        p = pyaudio.PyAudio()

        # Open a .Stream object to write the WAV file to
        stream = p.open(
            format=p.get_format_from_width(wf.getsampwidth()),
            channels=wf.getnchannels(),
            rate=wf.getframerate(),
            output=True
        )

        # Read and play audio in chunks
        data = wf.readframes(chunk)
        while data:
            stream.write(data)
            data = wf.readframes(chunk)

        # Close and terminate the stream
        stream.close()
        p.terminate()

        print("Audio playback completed.")
        return output_audio_path

    except Exception as e:
        raise Exception(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    # Input and output file paths
    INPUT_AUDIO_PATH = "output.mp3"
    OUTPUT_AUDIO_PATH = "output.wav"
    
    # Convert and play audio
    try:
        result_path = convert_and_play_audio(INPUT_AUDIO_PATH, OUTPUT_AUDIO_PATH)
        print(f"Processed audio saved at: {result_path}")
    except Exception as error:
        print(error)


Audio converted and saved as output.wav
Audio playback completed.
Processed audio saved at: output.wav


In [16]:
if __name__ == "__main__":
    # Replace with your actual API key and voice ID
    XI_API_KEY = "sk_632eb96bd5ff0ac2d1695753c761f837209eba3287b3c6a4"
    VOICE_ID = "cgSgspJ2msm6clMCkdW9"
    
    # Input text
    input_text = "Hello, my name is Charlie. I am your personal assistant.What i can help you"
    
    # Generate speech
    audio_file_path = text_to_speech(input_text, XI_API_KEY, VOICE_ID)
    print(f"Generated audio file: {audio_file_path}")

    INPUT_AUDIO_PATH = audio_file_path
    OUTPUT_AUDIO_PATH = "output.wav"
    
    # Convert and play audio
    try:
        result_path = convert_and_play_audio(INPUT_AUDIO_PATH, OUTPUT_AUDIO_PATH)
        print(f"Processed audio saved at: {result_path}")
    except Exception as error:
        print(error)

Audio saved to output.mp3
Generated audio file: output.mp3
Audio converted and saved as output.wav
Audio playback completed.
Processed audio saved at: output.wav
