In [2]:
import IPython.display as ipd
import torch
from torch.fft import fft, ifft
# import soundfile
import torchaudio
from torch.nn.functional import interpolate

In [6]:
basta = "audio_16k/Basta_16k.wav"
waveform, sample_rate = torchaudio.load(basta)
# interpolate() reads its input as (batch, channels, *spatial). Adding a single
# batch dimension gives (1, channels, samples), so with mode='linear' the time
# axis is the only one that gets resampled. (Adding two dimensions and using
# 'bilinear' would also rescale the channel axis and blend the channels.)
waveform2 = waveform.unsqueeze(0)

output_08 = interpolate(input=waveform2, scale_factor=0.8, mode='linear')  # Compress
output_08 = output_08.squeeze(0)
output_12 = interpolate(input=waveform2, scale_factor=1.2, mode='linear')  # Stretch
output_12 = output_12.squeeze(0)

torchaudio.save('outputs/interpolation_0_8.wav', output_08, sample_rate)
torchaudio.save('outputs/interpolation_1_2.wav', output_12, sample_rate)

# Original clip first, then the compressed and stretched versions
ipd.display(ipd.Audio(waveform, rate=sample_rate))
ipd.display(ipd.Audio(output_08, rate=sample_rate))
ipd.display(ipd.Audio(output_12, rate=sample_rate))

In [126]:
def naive_tempo_shift(wav, factor):
    """
    Stretch an audio waveform by a given factor using nearest-sample lookup.

    Output sample ``i`` is a copy of input sample ``int(i / factor)``; no
    interpolation or filtering is applied. The lookup is done with one
    vectorized gather instead of a per-sample Python loop.

    :param wav: Input audio (tensor) waveform with time as the last dimension,
        e.g. ``(channels, samples)`` or just ``(samples,)``.
    :param factor: Float stretch factor. The output has
        ``int(samples * factor)`` samples (none if that is not positive).
    :return: Tensor for the stretched audio waveform, same dtype as ``wav``.
        When ``factor == 1.0`` the input tensor itself is returned.
    """
    if factor == 1.0:
        return wav
    # Clamp so that a zero or negative factor yields an empty result
    new_len = max(int(wav.shape[-1] * factor), 0)
    # float64 division followed by truncation reproduces Python's int(i / factor)
    positions = torch.arange(new_len, dtype=torch.float64) / factor
    old_idx = positions.long().to(wav.device)
    return wav[..., old_idx]


# Read the source clip and its sample rate
basta = 'audio_16k/Basta_16k.wav'
waveform, sample_rate = torchaudio.load(basta)

# For every factor: pick the target path, resample naively, write the result
# to disk and embed a player for the written file
factors = [0.8, 1.2]
for factor in factors:
    output_filename = f"outputs/naive_pitch_shift_{str(factor).replace('.', '_')}.wav"
    shifted_wav = naive_tempo_shift(waveform, factor)
    torchaudio.save(output_filename, shifted_wav, sample_rate)
    ipd.display(ipd.Audio(output_filename, rate=sample_rate))

In [7]:
from torch import angle, stack, view_as_complex
from math import pi


# Helper functions

def construct_hann_window(win_size):
    """
    Build a periodic Hann window.

    :param win_size: Number of samples in the window.
    :return: 1-D tensor of length ``win_size``.
    """
    hann = torch.hann_window(window_length=win_size, periodic=True)
    return hann


def get_complex_stft(signal, win_size, hop, window):
    """
    Compute the complex-valued STFT of ``signal``.

    :param signal: Input tensor passed straight to ``torch.stft``.
    :param win_size: FFT size (used as ``n_fft``).
    :param hop: Hop length between successive frames, in samples.
    :param window: Window tensor applied to each frame.
    :return: Complex tensor returned by ``torch.stft``.
    """
    spectrum = torch.stft(
        signal,
        n_fft=win_size,
        hop_length=hop,
        window=window,
        return_complex=True,
    )
    return spectrum


def get_acc_phase_delta(stft_left, stft_right):
    """
    Accumulate the phase difference between two complex STFTs over time.

    The element-wise phase difference ``angle(stft_right) - angle(stft_left)``
    is summed cumulatively along the last dimension (the frame axis of a
    ``torch.stft`` result) and wrapped into ``[0, 2*pi)``. Each frequency bin
    is accumulated independently of the others.

    :param stft_left: Complex STFT tensor, frames on the last dimension.
    :param stft_right: Complex STFT tensor of the same shape as ``stft_left``.
    :return: Real tensor of the same shape holding the wrapped accumulated phase.
    """
    # Calculate angular distance between two complex STFTs
    phase_delta = angle(stft_right) - angle(stft_left)

    # Accumulate along frames (not along frequency bins). Summing in float64
    # keeps precision over many frames, since wrapping happens only once at the end.
    accumulated = torch.cumsum(phase_delta, dim=-1, dtype=torch.float64)
    phase = torch.remainder(accumulated, 2 * pi).to(phase_delta.dtype)

    return phase


def get_re_im_from_phase(phase):
    """
    Convert phase angles into unit-magnitude real and imaginary parts.

    :param phase: Tensor of angles in radians.
    :return: Tuple ``(cos(phase), sin(phase))``.
    """
    return phase.cos(), phase.sin()


# Main function

def time_stretch(signal, factor, win_size=1024, hop=1024//4):
    """
    Time-stretch a single-channel signal by STFT resynthesis.

    Two STFTs are taken from copies of the signal offset from each other by
    ``hop`` samples, both sampled every ``hop / factor`` samples. The phase
    returned by ``get_acc_phase_delta`` for that pair is combined with the
    magnitudes of the later STFT and resynthesised with ``torch.istft`` using
    the fixed ``hop``, so the output is roughly ``factor`` times as long as
    the input.

    :param signal: 1-D tensor of samples.
    :param factor: Positive stretch factor (< 1 shortens, > 1 lengthens).
    :param win_size: Window length, also used as the FFT size, in samples.
    :param hop: Synthesis hop length in samples.
    :return: 1-D tensor with the stretched signal.
    :raises ValueError: If ``factor`` is not positive.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor}")

    # Create window
    hann_window = construct_hann_window(win_size)

    # Draw two complex STFTs. Frames are read every hop / factor samples and
    # written back every hop samples; never let the analysis hop fall below 1.
    new_hop = max(1, round(hop / factor))
    stft_left = get_complex_stft(signal[:-hop], win_size, new_hop, hann_window)
    stft_right = get_complex_stft(signal[hop:], win_size, new_hop, hann_window)

    # Calculate accumulated phase delta with modulus (2 pi)
    phase = get_acc_phase_delta(stft_left, stft_right)

    # Reconstruct component from phase, keeping the magnitudes of the later STFT
    re, im = get_re_im_from_phase(phase)
    complex_new_stft = view_as_complex(stack([re, im], dim=-1)) * abs(stft_right)
    # n_fft has to follow win_size, otherwise any non-default window size fails
    output = torch.istft(complex_new_stft, n_fft=win_size, win_length=win_size, hop_length=hop, window=hann_window)
    return output


basta = 'audio_16k/Basta_16k.wav'
signal, sr = torchaudio.load(basta)
factors = [0.8, 1.2]
for factor in factors:
    output_file = f'outputs/phase_vocoder_{str(factor).replace(".", "_")}.wav'
    # time_stretch is called on one channel at a time: stretch every channel
    # of the clip (however many there are) and stack them back along dim 0.
    stretched_signal = torch.stack([time_stretch(channel, factor) for channel in signal], dim=0)
    torchaudio.save(output_file, stretched_signal, sr)
    ipd.display(ipd.Audio(stretched_signal, rate=sr))