### Human voice to Robot voice

In [267]:
%pip install soundfile scipy numpy pywavelets librosa matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [268]:
import soundfile as sf
import numpy as np
import pywt
import librosa
import matplotlib.pyplot as plt
import scipy.io.wavfile as wav
from scipy.signal import butter, lfilter

Load source voice and target voice signals

In [269]:
# Load the WAV file
fs, signal = wav.read('input.wav')

Perform Noise filtering

In [270]:
# Define the filter parameters
cutoff_freq = 1000  # Hz
filter_order = 5

# Create a Butterworth filter
nyquist_freq = 0.5 * fs
cutoff = cutoff_freq / nyquist_freq
b, a = butter(filter_order, cutoff, btype="highpass")

In [271]:
# Apply the filter to the signal
filtered_signal = lfilter(b, a, signal)

In [272]:
# Save the filtered signal to a new WAV file
wav.write("filtered.wav", fs, ((filtered_signal + filtered_signal.min()) * (2 ** 15) / filtered_signal.ptp()).astype(np.int16))

## Feature Extraction

Perform Discrete Transform

In [273]:
# Define the wavelet to use and the number of decomposition levels
wavelet = 'db4'
levels = 5

# Perform the discrete wavelet transform
coeffs = pywt.wavedec(signal, wavelet, level=levels)

# Get the approximation coefficients (i.e., the reconstructed low-pass component)
approx = pywt.upcoef('a', coeffs[0], wavelet, level=levels, take=len(signal))

In [274]:
# Save the approximation coefficients to a new WAV file
wav.write("approximation.wav", fs, ((approx + approx.min()) * (2 ** 15) / approx.ptp()).astype(np.int16))

Divide the generated approximation wav file into small frames of fixed time period

In [275]:
# Define the frame length and hop length in seconds
frame_length = 0.1  # seconds
hop_length = 0.05   # seconds

# Convert the frame and hop lengths to samples
frame_length_samples = int(frame_length * fs)
hop_length_samples = int(hop_length * fs)

# Divide the signal into frames
frames = librosa.util.frame(signal, frame_length=frame_length_samples, hop_length=hop_length_samples)

In [276]:
# Save each frame to a separate WAV file
for i, frame in enumerate(frames):
    filename = f"./frames/frame_{i+1}.wav"
    wav.write(filename, fs, frame)

Estimate fundamental frequency of each frame using auto correlation

In [283]:
# Initialize an empty list to hold the fundamental frequencies
f0s = []

# Iterate over each frame
for i, frame in enumerate(frames):
    # Compute the autocorrelation of the frame
    autocorr = np.correlate(frame, frame, mode='full')
    
    # Get the positive half of the autocorrelation
    autocorr_pos = autocorr[len(autocorr)//2:]
    
    # Find the index of the maximum value in the positive half of the autocorrelation
    peak_index = np.argmax(autocorr_pos)
    
    # Convert the peak index to a frequency value in Hz
    f0 = fs / peak_index
    
    # Append the fundamental frequency to the list
    f0s.append(f0)

  f0 = fs / peak_index


Extract the frequency and bandwidth using cepstral analysis method

In [277]:
# Set the order of the cepstral analysis
order = 12

# Initialize an empty array to hold the frequency and bandwidth information for each frame
freq_band = np.zeros((frames.shape[1], 2))

In [278]:
# Iterate over each frame
for i in range(frames.shape[1]):
    # Apply pre-emphasis to the frame
    pre_emphasis = librosa.effects.preemphasis(frames[:, i].astype(np.float32))

    # Compute the power spectrum of the pre-emphasized frame
    power_spectrum = np.abs(np.fft.fft(pre_emphasis)) ** 2

    # Compute the logarithm of the power spectrum
    log_power_spectrum = np.log(power_spectrum)

    # Compute the cepstrum of the logarithm of the power spectrum
    cepstrum = np.real(np.fft.ifft(log_power_spectrum))

    # Set the coefficients of the cepstrum beyond the order to zero
    cepstrum[order+1:] = 0

    # Compute the inverse Fourier transform of the modified cepstrum
    modified_power_spectrum = np.exp(np.fft.fft(cepstrum))
    modified_power_spectrum[0] = 0

    # Find the index of the maximum value in the modified power spectrum
    max_index = np.argmax(modified_power_spectrum)

    # Convert the index to a frequency value in Hz
    freq = fs / max_index

    # Compute the bandwidth of the peak in the modified power spectrum
    bandwidth = fs / np.sum(modified_power_spectrum > 0.5 * np.max(modified_power_spectrum))

    # Store the frequency and bandwidth information for the frame
    freq_band[i, :] = [freq, bandwidth]

Compute spectral envelope using lpc method

In [279]:
# Apply pre-emphasis filter
signal = librosa.effects.preemphasis(signal.astype(np.float32), 0.95)

# Initialize an empty list to hold the spectral envelopes
spectral_envelopes = []

In [280]:
# Iterate over each frame
for i in range(frames.shape[1]):
    # Get the current frame
    frame = frames[:, i]
    
    # Apply window to the frame
    frame_windowed = frame * np.hamming(frame_length_samples)
    
    # Compute the LPC coefficients using autocorrelation method
    lpc_coeffs = librosa.lpc(frame_windowed, order)
    
    # Compute the roots of the LPC polynomial to obtain the poles
    poles = np.roots(lpc_coeffs)
    
    # Convert the poles to frequencies
    freqs = np.arctan2(poles.imag, poles.real) * (fs / (2*np.pi))
    
    # Compute the magnitude of the poles to obtain the spectral envelope
    mags = np.abs(lfilter(lpc_coeffs, [1], np.exp(-2j*np.pi*freqs/fs)))
    
    # Normalize the spectral envelope
    mags /= mags.max()
    
    # Append the spectral envelope to the list
    spectral_envelopes.append(mags)

Concatenate extracted features of each frame into a feature vector

In [281]:
# Concatenate the spectral envelopes of each frame into a feature vector
feature_vector = np.hstack(spectral_envelopes)

## Voice Synthesis

Apply signal processing techniques (phase vocoder)

### The code below may be incorrect

In [282]:
# # Load the approximation WAV file
# fs, signal = wav.read("approximation.wav")
# signal, fs = librosa.load(filename, sr=None, mono=True)

# # Define the speed-up factor
# speed_up_factor = 2

# # Write the stretched signal to a WAV file
# wav.write("stretched.wav", fs, feature_vector.astype(np.int16))