In [4]:
!pip install pedalboard
!pip install gtts
!pip install pydub

Collecting pedalboard
  Downloading pedalboard-0.9.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading pedalboard-0.9.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pedalboard
Successfully installed pedalboard-0.9.16
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [5]:
import os
import csv
import json
import librosa
import numpy as np
import soundfile as sf
from pedalboard.io import AudioFile
from pedalboard import Reverb, Compressor, Pedalboard
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio, display

General parameters used to craft different voice recipes

In [24]:
# --- Time-domain settings (consumed by the speed / sample-rate cell) ---
sample_rate = 35000   # frame rate set on the speed-perturbed clip; alternatives: 22050, 48000
speed_factor = 1.15   # multiplier applied to the clip's original frame rate

# --- Ring modulator ---
ring_mod_freq = 100   # handed to ring_modulator as carrier_freq

# --- Pitch shift ---
n_steps = 2.5         # handed to voice_modification as n_steps

# --- Reverb settings for the pedalboard ---
room_size = 0.50
damping = 0.25
dry_level = 0.75
wet_level = 0.25

# Effect chain: a reverb built from the settings above, followed by a
# compressor left at its default settings.
board = Pedalboard([
    Reverb(
        room_size=room_size,
        damping=damping,
        dry_level=dry_level,
        wet_level=wet_level,
    ),
    Compressor(),
])

# Voice modification function

This function applies a previously defined Pedalboard object as well as a pitch shift (see the parameter `n_steps`). It also adds a reverb tail: a short stretch of silence (5000 samples) appended to the end of the audio. This is important when reverb or delay is applied to the audio file, because it gives the effect room to decay and prevents an abrupt stop.

In [25]:
# Voice modification function: pitch-shifts speech and adds pedalboard effects

def voice_modification(board, input_audio, output_audio_file, n_steps, tail_samples=5000):
    """
    Pitch-shift an audio file and run it through a pedalboard effect chain.

    Parameters
    ----------
    board : pedalboard.Pedalboard
        Effect chain the pitch-shifted audio is passed through.
    input_audio : str
        Path of the audio file to read.
    output_audio_file : str
        Path the processed audio is written to.
    n_steps : float
        Forwarded to ``librosa.effects.pitch_shift``. Positive values raise
        the pitch, negative values lower it.
    tail_samples : int, optional
        Number of silent samples appended to the signal before processing so
        that reverb/delay effects can ring out instead of being cut off.
        Defaults to 5000.

    Notes
    -----
    ``librosa.load`` is called without ``sr``, so the input is resampled to
    librosa's default rate; ``tail_samples`` is counted at that rate.
    """
    import tempfile  # local import: only needed for the scratch file below

    audio, sr = librosa.load(input_audio)

    # Adds a reverb tail (silence for the effects to decay into)
    audio = np.append(audio, np.zeros(tail_samples))

    # Shifts the pitch by n_steps (up when positive, down when negative)
    audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

    # The intermediate file lives in a throwaway directory so that nothing is
    # left behind in (or overwritten in) the working directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_path = os.path.join(scratch_dir, "temp_audio.wav")
        sf.write(temp_path, audio, sr)

        with AudioFile(temp_path) as f:

            # Open an audio file to write to:
            with AudioFile(output_audio_file, "w", f.samplerate, f.num_channels) as o:

                # Read one second of audio at a time, until the file is empty:
                chunk_size = int(f.samplerate)
                first_chunk = True
                while f.tell() < f.frames:
                    chunk = f.read(chunk_size)

                    # Run the audio through our pedalboard. State is cleared
                    # on the first chunk only: leftovers from a previous file
                    # must not bleed in, but reverb tails must carry across
                    # the chunks of this file.
                    effected = board(chunk, f.samplerate, reset=first_chunk)
                    first_chunk = False

                    # Write the output to our output file:
                    o.write(effected)

# Ring Modulator Function

This function applies a ring modulator effect to the audio file, in order to mimic a robotic or alien-like voice. Audio is clipped in order to prevent artifacts or audible saturation. The carrier_freq corresponds to the frequency of the ring modulator. Adjust this parameter in order to obtain different types of ring modulation.

In [17]:
# Ring modulator function: Adds a robotic/alien tone to speech

def ring_modulator(audio_file: str, carrier_freq: float, output_file: str = "audio_ringmode.wav"):
    """
    Apply a ring modulator to distort input speech and save the result.

    The signal is multiplied sample-by-sample with a sine carrier, clipped to
    the range [-1.0, 1.0] and written to ``output_file``.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to read. It is loaded at its native sampling
        rate (``sr=None``).
    carrier_freq : float
        Frequency of the sine carrier in Hz.
    output_file : str, optional
        Path the modulated audio is written to. Defaults to
        ``"audio_ringmode.wav"``.

    Returns
    -------
    None
    """

    audio, sr = librosa.load(audio_file, sr=None)

    # Time stamp (in seconds) of every sample, used to evaluate the carrier
    t = np.arange(len(audio)) / sr
    carrier = np.sin(2 * np.pi * carrier_freq * t)

    # Ring modulation is the product of signal and carrier; clip as a guard
    # against samples outside the valid range.
    modulated_audio = np.clip(audio * carrier, -1.0, 1.0)

    sf.write(output_file, modulated_audio, sr)

# Testing our TTS recipe

Following the setup of our functions and parameters, we are ready to test each individual component of this voice recipe.

Our script is based on a simple TTS from the gTTS toolkit. Only female voices are covered. If we use as input for each subsequent unit the previously generated audio file, we can perceptually assess the full effect chain.

In [None]:
# Script to be synthesized by the gTTS cell; replace the placeholder with your own text.
text = "Insert text for speech synthesis here"

In [9]:
# Synthesize `text` with gTTS (English, "co.uk" domain) and store it as MP3
tts = gTTS(text=text, tld="co.uk", lang="en")
tts.save("audio.mp3")

# Play back the unprocessed TTS clip
display(Audio("audio.mp3", autoplay=True))

In the following unit we can adjust the speed_factor, performing a temporal distortion on our input audio file. We can also adjust the sampling rate of our file here.

In [15]:
# Load the base TTS clip
audio = AudioSegment.from_mp3("audio.mp3")

# Re-label the raw samples with a frame rate scaled by speed_factor, then
# convert the result to the configured sample_rate.
audio = audio._spawn(
    audio.raw_data,
    overrides={"frame_rate": int(audio.frame_rate * speed_factor)},
).set_frame_rate(sample_rate)

audio.export("audio_speed.mp3", format="mp3")

# Play back the speed-perturbed clip
display(Audio("audio_speed.mp3", autoplay=True))

Applies the ring modulator effect:

In [19]:
# Feed the speed-perturbed clip forward so the speed / sample-rate stage is
# part of the effect chain. Point this at "audio.mp3" instead to audition the
# ring modulator on the untouched TTS output.
input_audio = "audio_speed.mp3"

# Applies ring modulator to input audio signal (writes "audio_ringmode.wav")
ring_modulator(input_audio, carrier_freq=ring_mod_freq)

display(Audio("audio_ringmode.wav", autoplay=True))

Applies the voice_modification function:

In [27]:
# Run the ring-modulated clip through the pitch shift + pedalboard chain
input_audio = "audio_ringmode.wav"

voice_modification(
    board=board,
    input_audio=input_audio,
    output_audio_file="audio_voicemod.wav",
    n_steps=n_steps,
)

# Play back the final result
display(Audio("audio_voicemod.wav", autoplay=True))