### Human voice to Robot voice

In [177]:
# %pip install soundfile scipy numpy pywavelets librosa matplotlib simpleaudio

In [178]:
import soundfile as sf
import numpy as np
import pandas as pd 
import pywt
import librosa
import wave
import simpleaudio as sa
import matplotlib.pyplot as plt
import scipy.io.wavfile as wav
import IPython.display as ipd
from scipy.signal import butter, lfilter, filtfilt

Function to plot the signal

In [179]:
# Plot the signal
def plotSignal(filename):
    with wave.open(filename, 'rb') as wav_file:
        # Extract the signal and the sample rate
        signal = wav_file.readframes(-1)
        signal = np.frombuffer(signal, dtype='int16')
        fs = wav_file.getframerate()

    # Compute the time vector
    time = np.arange(signal.size) / fs

    # Plot the waveform
    plt.plot(time, signal)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title("Waveform of {}".format(filename))
    plt.show()

Load source voice and target voice signals

In [255]:
# Load the WAV file
fs, signal = wav.read('croc_input.wav')

# plotSignal("normal_input.wav")

print("croc_input.wav:")
ipd.Audio("croc_input.wav")

croc_input.wav:


Perform Noise filtering

In [256]:
# Define the filter parameters
cutoff_freq = 1000  # Hz
filter_order = 5

# Create a Butterworth filter
nyquist_freq = 0.5 * fs
cutoff = cutoff_freq / nyquist_freq
b, a = butter(filter_order, cutoff, btype="highpass")

In [257]:
# Apply the filter to the signal
filtered_signal = lfilter(b, a, signal)

In [258]:
# Save the filtered signal to a new WAV file
wav.write("filtered.wav", fs, ((filtered_signal + filtered_signal.min()) * (2 ** 15) / filtered_signal.ptp()).astype(np.int16))

# plotSignal("filtered.wav")

print("filtered.wav:")
ipd.Audio("filtered.wav")

filtered.wav:


## Feature Extraction

Perform Discrete Transform

Divide the generated approximation wav file into small frames of fixed time period

In [259]:
# Load the filtered WAV file
fs, signal = wav.read('filtered.wav')

# Define the frame length and hop length in seconds
frame_length = 0.1  # seconds
hop_length = 0.05  # seconds

# Convert the frame and hop lengths to samples
frame_length_samples = int(frame_length * fs)
hop_length_samples = int(hop_length * fs)

# Divide the signal into frames
frames = librosa.util.frame(signal, frame_length=frame_length_samples, hop_length=hop_length_samples)

Estimate fundamental frequency of each frame using auto correlation

Extract the frequency and bandwidth using cepstral analysis method

Compute spectral envelope using lpc method

In [260]:
# Apply pre-emphasis filter
signal = librosa.effects.preemphasis(signal.astype(np.float32), 0.95)

# Set the order of the cepstral analysis
order = 10

In [261]:
# Initialize an empty list to hold the spectral envelopes
spectral_envelopes = []

# Iterate over each frame
for i in range(frames.shape[1]):
    # Get the current frame
    frame = frames[:, i]
    
    # Apply window to the frame
    frame_windowed = frame * np.hamming(frame_length_samples)
    
    # Compute the LPC coefficients using autocorrelation method
    lpc_coeffs = librosa.lpc(frame_windowed, order)
    
    # Compute the roots of the LPC polynomial to obtain the poles
    poles = np.roots(lpc_coeffs)
    
    # Convert the poles to frequencies
    freqs = np.arctan2(poles.imag, poles.real) * (fs / (2*np.pi))
    
    # Compute the magnitude of the poles to obtain the spectral envelope
    mags = np.abs(lfilter(lpc_coeffs, [1], np.exp(-2j*np.pi*freqs/fs)))
    
    # Normalize the spectral envelope
    mags /= mags.max()
    
    # Append the spectral envelope to the list
    spectral_envelopes.append(mags)

Concatenate extracted features of each frame into a feature vector

In [262]:
# Concatenate the spectral envelopes of each frame into a feature vector
feature_vector = np.hstack(spectral_envelopes)

feature_vector, len(feature_vector)

(array([0.04667098, 0.10657962, 0.14827078, ..., 0.98770982, 1.        ,
        0.92575003]),
 3730)

## Voice Synthesis

Apply signal processing techniques (phase vocoder)

In [266]:
# # Load the approximation WAV file
# fs, signal = wav.read("approximation.wav")
signal, fs = librosa.load("filtered.wav", sr=None, mono=True)

# Define the frame length and hop length in seconds
frame_length = 0.1  # seconds
hop_length = 0.05   # seconds

# Convert the frame and hop lengths to samples
frame_length_samples = int(frame_length * fs)
hop_length_samples = int(hop_length * fs)

# Apply pre-emphasis filter
signal_preemphasized = librosa.effects.preemphasis(signal, 1)

# Define the speed-up factor
speed_up_factor = 1

# Apply phase vocoder
spectrogram = librosa.stft(signal_preemphasized)
spectrogram_stretched = librosa.phase_vocoder(spectrogram, speed_up_factor, hop_length=hop_length_samples)

Combine synthesized frames into continuous waveform

Apply post-processing (filtering/normalization)

Save synthesized voice as a audio file

In [267]:
# Set the sampling rate and window size
sr = 48000

waveform = librosa.griffinlim(spectrogram)

# Save the synthetic speech waveform as an audio file
sf.write('new_output.wav', waveform, sr)

print("new_output.wav:")
ipd.Audio("new_output.wav")

new_output.wav:


In [269]:
# Set the sampling rate and window size
sr = 44100

hop_length = 10000

feature_vector = feature_vector.reshape((-1, 10))

# Synthesize an audio waveform from the feature vector
waveform = librosa.feature.inverse.mfcc_to_audio(feature_vector, sr=sr, hop_length=hop_length)

# Compute the short-time Fourier transform of the waveform to generate the spectrogram
spectrogram = np.abs(librosa.stft(waveform, hop_length=hop_length))

# Save the synthetic speech waveform as an audio file
sf.write('new_output2.wav', waveform, sr)

print("new_output2.wav:")
ipd.Audio("new_output2.wav")

new_output2.wav:
