In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
import torchaudio
torchaudio.set_audio_backend("soundfile")
import matplotlib.pyplot as plt



In [2]:
class AudioSample:
    def __init__(self, filepath):
        loadedData = torchaudio.load(filepath)
        self.waveform = loadedData[0][0]
        self.sampleRate = loadedData[1]
        del loadedData
        self.pitchDeltas = torch.tensor([], dtype = int)
        self.pitchBorders = torch.tensor([], dtype = int)
        self.Pitch = torch.tensor([0], dtype = int)
        self.spectra = torch.tensor([[]], dtype = float)
        self.spectrum = torch.tensor([], dtype = float)
        self.excitation = torch.tensor([], dtype = float)
        self.voicedExcitation = torch.tensor([], dtype = float)
        self.VoicedExcitations = torch.tensor([], dtype = float)
        
    def CalculatePitch(self, expectedPitch, searchRange = 0.2):
        batchSize = math.floor((1. + searchRange) * self.sampleRate / expectedPitch)
        lowerSearchLimit = math.floor((1. - searchRange) * self.sampleRate / expectedPitch)
        batchStart = 0
        while batchStart + batchSize <= self.waveform.size()[0] - batchSize:
            sample = torch.index_select(self.waveform, 0, torch.linspace(batchStart, batchStart + batchSize, batchSize, dtype = int))
            zeroTransitions = torch.tensor([], dtype = int)
            for i in range(lowerSearchLimit, batchSize):
                if (sample[i-1] < 0) and (sample[i] > 0):
                    zeroTransitions = torch.cat([zeroTransitions, torch.tensor([i])], 0)
            error = math.inf
            delta = math.floor(self.sampleRate / expectedPitch)
            for i in zeroTransitions:
                shiftedSample = torch.index_select(self.waveform, 0, torch.linspace(batchStart + i.item(), batchStart + batchSize + i.item(), batchSize, dtype = int))
                newError = torch.sum(torch.pow(sample - shiftedSample, 2))
                if error > newError:
                    delta = i.item()
                    error = newError
            self.pitchDeltas = torch.cat([self.pitchDeltas, torch.tensor([delta])])
            batchStart += delta
        nBatches = self.pitchDeltas.size()[0]
        self.pitchBorders = torch.zeros(nBatches + 1, dtype = int)
        for i in range(nBatches):
            self.pitchBorders[i+1] = self.pitchBorders[i] + self.pitchDeltas[i]
        self.Pitch = torch.mean(self.pitchDeltas.float()).int()
        del batchSize
        del lowerSearchLimit
        del batchStart
        del sample
        del zeroTransitions
        del error
        del delta
        del shiftedSample
        del newError
        del nBatches
        
    def CalculateSpectra(self, iterations = 10, filterWidth = 10, preIterations = 2):
        tripleBatchSize = int(self.sampleRate / 25)
        BatchSize = int(self.sampleRate / 75)
        Window = torch.hann_window(tripleBatchSize)
        signals = torch.stft(self.waveform, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, return_complex = True)
        signals = torch.transpose(signals, 0, 1)
        signalsAbs = signals.abs()
        
        workingSpectra = torch.log(signalsAbs)
        
        workingSpectra = torch.max(workingSpectra, torch.tensor([-100]))
        self.spectra = torch.full_like(workingSpectra, -float("inf"), dtype=torch.float)
        
        for j in range(preIterations):
            workingSpectra = torch.max(workingSpectra, self.spectra)
            self.spectra = workingSpectra
            for i in range(filterWidth):
                self.spectra = torch.roll(workingSpectra, -i, dims = 1) + self.spectra + torch.roll(workingSpectra, i, dims = 1)
            self.spectra = self.spectra / (2 * filterWidth + 1)
        
        self.VoicedExcitations = torch.zeros_like(signals)
        for i in range(signals.size()[0]):
            for j in range(signals.size()[1]):
                if torch.log(signalsAbs[i][j]) > self.spectra[i][j]:
                    self.VoicedExcitations[i][j] = signals[i][j]
                
        for j in range(iterations):
            workingSpectra = torch.max(workingSpectra, self.spectra)
            self.spectra = workingSpectra
            for i in range(filterWidth):
                self.spectra = torch.roll(workingSpectra, -i, dims = 1) + self.spectra + torch.roll(workingSpectra, i, dims = 1)
            self.spectra = self.spectra / (2 * filterWidth + 1)
        
        self.spectrum = torch.mean(self.spectra, 0)
        for i in range(self.spectra.size()[0]):
            self.spectra[i] = self.spectra[i] - self.spectrum
        
        del Window
        del signals
        del workingSpectra
        
    def CalculateExcitation(self, filterWidth = 10):
        tripleBatchSize = int(self.sampleRate / 25)
        BatchSize = int(self.sampleRate / 75)
        Window = torch.hann_window(tripleBatchSize)
        signals = torch.stft(self.waveform, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, return_complex = True)
        signals = torch.transpose(signals, 0, 1)
        excitations = torch.empty_like(signals)
        for i in range(excitations.size()[0]):
            excitations[i] = signals[i] / (torch.exp(self.spectrum) + torch.exp(self.spectra[i]))
            self.VoicedExcitations[i] = self.VoicedExcitations[i] / (torch.exp(self.spectrum) + torch.exp(self.spectra[i]))
        
        VoicedExcitations = torch.transpose(self.VoicedExcitations, 0, 1)
            
        excitations = torch.transpose(excitations, 0, 1)
        self.excitation = torch.istft(excitations, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, onesided = True)
        self.voicedExcitation = torch.istft(VoicedExcitations, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, onesided = True)
        
        self.excitation = self.excitation - self.voicedExcitation
        
        self.excitation = torch.stft(self.excitation, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, return_complex = True)
        
        del Window
        del signals
        del excitations

In [3]:
class RelLoss(nn.Module):
    def __init__(self, weight=None, size_average=True):
        super(RelLoss, self).__init__()
 
    def forward(self, inputs, targets):    
        inputs = inputs.view(-1)
        targets = targets.view(-1)
        differences = torch.abs(inputs - targets)
        sums = torch.abs(inputs + targets)
        out = (differences / sums).sum() / inputs.size()[0]
        return out

In [4]:
class SpecCrfAi(nn.Module):
    def __init__(self, learningRate=1e-4):
        super(SpecCrfAi, self).__init__()
        
        self.layer1 = torch.nn.Linear(3843, 3843)
        self.ReLu1 = nn.PReLU()
        self.layer2 = torch.nn.Linear(3843, 5763)
        self.ReLu2 = nn.PReLU()
        self.layer3 = torch.nn.Linear(5763, 3842)
        self.ReLu3 = nn.PReLU()
        self.layer4 = torch.nn.Linear(3842, 1921)
        
        self.learningRate = learningRate
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.learningRate, weight_decay=0.)
        self.criterion = nn.L1Loss()
        #self.criterion = RelLoss()
        
    def forward(self, spectrum1, spectrum2, factor):
        fac = torch.tensor([factor])
        x = torch.cat((spectrum1, spectrum2, fac), dim = 0)
        x = x.float()#.unsqueeze(0).unsqueeze(0)
        x = self.layer1(x)
        x = self.ReLu1(x)
        x = self.layer2(x)
        x = self.ReLu2(x)
        x = self.layer3(x)
        x = self.ReLu3(x)
        x = self.layer4(x)
        return x
    
    def processData(self, spectrum1, spectrum2, factor):
        output = torch.squeeze(self(spectrum1, spectrum2, factor))
        return output
    
    def train(self, indata, epochs=1):
        for epoch in range(epochs):
            for data in self.dataLoader(indata):
                spectrum1 = data[0]
                spectrum2 = data[-1]
                indexList = np.arange(0, data.size()[0], 1)
                np.random.shuffle(indexList)
                for i in indexList:
                    factor = i / float(data.size()[0])
                    spectrumTarget = data[i]
                    output = torch.squeeze(self(spectrum1, spectrum2, factor))
                    loss = self.criterion(output, spectrumTarget)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, epochs, loss.data))
            
    def dataLoader(self, data):
        return torch.utils.data.DataLoader(dataset=data, shuffle=True)

In [5]:
class VocalSegment:
    def __init__(self, start1, start2, start3, end1, end2, end3, startCap, endCap, phonemeKey, vb, offset, repetititionSpacing, pitch, steadiness):
        self.start1 = start1
        self.start2 = start2
        self.start3 = start3
        self.end1 = end1
        self.end2 = end2
        self.end3 = end3
        self.startCap = startCap
        self.endCap = endCap
        self.phonemeKey = phonemeKey
        self.vb = vb
        self.offset = offset
        self.repetititionSpacing = repetititionSpacing
        self.pitch = pitch
        self.steadiness = steadiness
        
    def getSpectrum(self):
        if self.startCap:
            windowStart = self.offset
        else:
            windowStart = self.start3 - self.start1 + self.offset
        if self.endCap:
            windowEnd = self.end3 - self.start1 + self.offset
        else:
            windowEnd = self.end1 - self.start1 + self.offset
        spectrum =  self.vb.phonemeDict[self.phonemeKey].spectrum[windowStart:windowEnd]#implement looping
        spectra =  self.vb.phonemeDict[self.phonemeKey].spectrum[windowStart:windowEnd]
        return torch.exp(spectrum) +  torch.pow(1 - self.steadiness, 2) * torch.exp(spectra)
    
    def getExcitation(self):
        premul = self.vb.phonemeDict[phonemeKey].excitation.size()[0] / (self.end3 - self.start1)
        if self.startCap:
            windowStart = int((self.start2 - self.start1 + self.offset) * premul)
        else:
            windowStart = int(self.offset * premul)
        if self.endCap:
            windowEnd = int(self.end2 - self.start1 + self.offset * premul)
        else:
            windowEnd = int(self.end3 - self.start1 + self.offset * premul)
        excitation = self.vb.phonemeDict[phonemeKey].excitation[windowStart:windowEnd]
        transform = torchaudio.transforms.TimeStretch(hop_length = self.vb.sampleRate / 75,
                                                      n_freq = self.vb.sampleRate / 25, 
                                                      fixed_rate = 1. / premul)
        excitation = transform(excitation)
        window = torch.hann_window(self.vb.sampleRate / 25)
        excitation = torch.istft(excitation, self.vb.sampleRate / 25, hop_length = self.vb.sampleRate / 75, win_length = self.vb.sampleRate / 25, window = window, onesided = True)
        return excitation
    
    def getVoicedExcitation(self):
        nativePitch = self.vb.phonemeDict[self.phonemeKey].pitch
        #nativePitch = self.vb.phonemeDict[self.phonemeKey].pitches[...]
        #pitch = nativePitch + self.vb.phonemeDict[phonemeKey].pitches...
        premul = self.pitch / nativePitch * self.vb.sampleRate / 25
        windowStart = int(self.offset * premul)
        windowEnd = int(self.end3 - self.start1 + self.offset * premul)
        voicedExcitation = self.vb.phonemeDict[phonemeKey].voicedExcitation[windowStart:windowEnd]
        window = torch.hann_window(self.vb.sampleRate / 25)
        voicedExcitation = torch.stft(voicedExcitation, self.vb.sampleRate / 25, hop_length = self.vb.sampleRate / 75, win_length = self.vb.sampleRate / 25, window = Window, return_complex = True)
        transform = torchaudio.transforms.Resample(orig_freq = self.vb.sampleRate,
                                                   new_freq = self.vb.sampleRate / premul,
                                                   resampling_method = 'sinc_interpolation',
                                                   lowpass_filter_width = 6,
                                                   rolloff = 0.99,
                                                   beta = None)
        voicedExcitation = transform(voicedExcitation)
        voicedExcitation = torch.istft(voicedExcitation, self.vb.sampleRate / 25, hop_length = self.vb.sampleRate / 75, win_length = self.vb.sampleRate / 25, window = window, onesided = True)
        return voicedExcitation
        #resample segments
        #individual fourier transform
        #istft
        #windowing adaptive to borders

In [22]:
class VocalSequence:
    def __init__(self, start, end, vb, borders, phonemes, offsets):
        self.start = start
        self.end = end
        self.vb = vb
        self.synth = Synthesizer(self.vb.sampleRate)
        self.segments = []
        if len(phonemes)== 1:#rewrite border system to use tensor, implement pitch and steadiness
            self.segments.append(VocalSegment(borders[0], borders[1], borders[2], borders[3], borders[4], borders[5],
                                             True, True, phonemes[0], vb, offsets[0], None, 385, 0))
        else:
            self.segments.append(VocalSegment(borders[0], borders[1], borders[2], borders[3], borders[4], borders[5],
                                             True, False, phonemes[0], vb, offsets[0], None, 385, 0))
            for i in range(1, len(phonemes)-1):
                self.segments.append(VocalSegment(borders[3*i], borders[3*i+1], borders[3*i+2], borders[3*i+3], borders[3*i+4], borders[3*i+5],
                                                  False, False, phonemes[i], vb, offsets[i], None, 385, 0))
            endpoint = len(phonemes)-1
            self.segments.append(VocalSegment(borders[3*endpoint], borders[3*endpoint+1], borders[3*endpoint+2], borders[3*endpoint+3], borders[3*endpoint+4], borders[3*endpoint+5],
                                             False, True, phonemes[0], vb, offsets[0], None, 385, 0))

        self.requiresUpdate = np.ones(len(phonemes))
        self.update()
    def update(self):
        for i in range(self.requiresUpdate.size):
            if self.requiresUpdate[i] == 1:
                segment = self.segments[i]
                spectrum = torch.zeros((segment.end3 - segment.start1, int(self.vb.sampleRate / 25) + 1))
                excitation = torch.zeros(segment.end3 - segment.start1)
                voicedExcitation = torch.zeros(segment.end3 - segment.start1)
                if segment.startCap:
                    windowStart = 0
                else:
                    windowStart = segment.start3 - segment.start1
                    previousSpectrum = self.segments[i-1].getSpectrum()[-1]
                    previousVoicedExcitation = self.segments[i-1].getVoicedExcitation()[self.segments[i-1].end1:self.segments[i-1].end3]
                if segment.endCap:
                    windowEnd = segment.end3 - segment.start1
                else:
                    windowEnd = segment.end1 - segment.start1
                    nextSpectrum = self.segments[i+1].getSpectrum()[0]
                    nextVoicedExcitation = self.segments[i-1].getVoicedExcitation()[self.segments[i+1].start1:self.segments[i+1].start3]
                spectrum[windowStart:windowEnd] = segment.getSpectrum()
                voicedExcitation[segment.start1:segment.end3] = segment.getVoicedExcitation()
                if segment.startCap == False:
                    for i in range(segment.start3 - segment.start1):
                        spectrum[i] = self.vb.crfAi.processData(previousSpectrum, spectrum[windowStart], i / (self.start3 - self.start1))
                    voicedExcitation[segment.start1:segment.start3] += previousVoicedExcitation
                if segment.endCap == False:
                    for i in range(segment.end1 - segment.start1, segment.end3 - segment.start1):
                        spectrum[i] = self.vb.crfAi.processData(spectrum[windowEnd], nextSpectrum, (i - self.start1) / (self.end3 - self.end1))
                    voicedExcitation[segment.end1:segment.end3] += nextVoicedExcitation
                if segment.startCap:
                    windowStart = segment.offset
                else:
                    windowStart = segment.start2 - segment.start1 + segment.offset
                    previousExcitation = self.segments[i-1].getSpectrum()[-1]
                if segment.endCap:
                    windowEnd = segment.end3 - segment.start1 + segment.offset
                else:
                    windowEnd = segment.end2 - segment.start1 + segment.offset
                excitation[windowStart:windowEnd] = segment.getExcitation()
                
                skipPrevious = True#implement skipPrevious
                
            else:
                skipPrevious = False
            
            self.synth.Synthesize(0, spectrum, excitation, voicedExcitation)
    def save(self):
        self.synth.save("Output_Demo.wav")
    

In [7]:
class TempVB:
    def __init__(self):
        self.sampleRate = 48000
        self.phonemeDict = dict([])
        phonemeKeys = ["A", "E", "I", "O", "U", "G", "K", "N", "S", "T"]
        for key in phonemeKeys:
            self.phonemeDict[key] = AudioSample("Samples_rip/"+key+".wav")
            self.phonemeDict[key].CalculatePitch(385.)
            self.phonemeDict[key].CalculateSpectra(iterations = 15)
            self.phonemeDict[key].CalculateExcitation()
        self.crfAi = SpecCrfAi(learningRate=1e-4)

In [8]:
class Voicebank:
    def __init__(self, vbKey):
        self.phonemeDict = dict()
        loaded_weights = 0#(vbKey)
        self.crfAi = 0#SpecCrfAi(loaded_weights)
        #load additional parameters
        self.sampleRate = 48000

In [9]:
class Synthesizer:
    def __init__(self, sampleRate):
        self.sampleRate = sampleRate
        self.returnSignal = torch.tensor([], dtype = float)
        
    def Synthesize(self, Excitation, VoicedExcitation):
        tripleBatchSize = int(self.sampleRate / 25)
        BatchSize = int(self.sampleRate / 75)
        Window = torch.hann_window(tripleBatchSize)
        self.returnSignal = torch.stft(Excitation + VoicedExcitation, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, return_complex = True)
        self.returnSignal = torch.transpose(self.returnSignal, 0, 1)
        for i in range(Spectra.size()[0]):
            self.returnSignal[i] = self.returnSignal[i] * torch.exp(Spectrum)[i]
        self.returnSignal = torch.transpose(self.returnSignal, 0, 1)
        self.returnSignal = torch.istft(self.returnSignal, tripleBatchSize, hop_length = BatchSize, win_length = tripleBatchSize, window = Window, onesided=True, )
        del Window
        
    def save(self, filepath):
        torchaudio.save(filepath, torch.unsqueeze(self.returnSignal.detach(), 0), self.sampleRate, format="wav", encoding="PCM_S", bits_per_sample=32)

In [10]:
TrainingSamples = dict([])
TrainingKeys = ["A_E", "A_G", "A_I", "A_K", "A_N", "A_O", "A_S", "A_T", "A_U",
                   "E_A", "E_G", "E_I", "E_K", "E_N", "E_O", "E_S", "E_T", "E_U",
                   "I_A", "I_E", "I_G", "I_K", "I_N", "I_O", "I_S", "I_T", "I_U",
                   "O_A", "O_E", "O_G", "O_I", "O_K", "O_N", "O_S", "O_T", "O_U",
                   "U_A", "U_E", "U_G", "U_I", "U_K", "U_N", "U_O", "U_S", "U_T",
                   "G_A", "G_E", "G_I", "G_O", "G_U",
                   "K_A", "K_E", "K_I", "K_O", "K_U",
                   "N_A", "N_E", "N_I", "N_O", "N_U",
                   "S_A", "S_E", "S_I", "S_O", "S_U",
                   "T_A", "T_E", "T_I", "T_O", "T_U"
                  ]

for key in TrainingKeys:
    TrainingSamples[key] = AudioSample("Samples_rip/"+key+".wav")
    TrainingSamples[key].CalculatePitch(252.)
    TrainingSamples[key].CalculateSpectra(iterations = 25)
    TrainingSamples[key].CalculateExcitation()

In [11]:
trainSpectra = []
i = 0
for key in TrainingKeys:
    trainSpectra.append(torch.empty_like(TrainingSamples[key].spectra))
    for j in range(TrainingSamples[key].spectra.size()[0]):
        trainSpectra[i][j] = TrainingSamples[key].spectrum + TrainingSamples[key].spectra[j]
    i += 1

vb = TempVB()
#specCrfAi = SpecCrfAi(learningRate=1e-4)
for i in range(70):
    vb.crfAi.train(trainSpectra[i], epochs = 2)

epoch [1/2], loss:1.4085
epoch [2/2], loss:0.7905
epoch [1/2], loss:0.6390
epoch [2/2], loss:1.2114
epoch [1/2], loss:1.3043
epoch [2/2], loss:0.6266
epoch [1/2], loss:0.8756
epoch [2/2], loss:2.5752
epoch [1/2], loss:0.8390
epoch [2/2], loss:1.1239
epoch [1/2], loss:0.7671
epoch [2/2], loss:0.5695
epoch [1/2], loss:0.7172
epoch [2/2], loss:0.5776
epoch [1/2], loss:1.9729
epoch [2/2], loss:1.6665
epoch [1/2], loss:0.6442
epoch [2/2], loss:0.7506
epoch [1/2], loss:0.5274
epoch [2/2], loss:0.5160
epoch [1/2], loss:0.7016
epoch [2/2], loss:0.6520
epoch [1/2], loss:0.7348
epoch [2/2], loss:0.5469
epoch [1/2], loss:0.6946
epoch [2/2], loss:0.6906
epoch [1/2], loss:0.6787
epoch [2/2], loss:0.9228
epoch [1/2], loss:0.3633
epoch [2/2], loss:0.3958
epoch [1/2], loss:0.5550
epoch [2/2], loss:0.6227
epoch [1/2], loss:2.9978
epoch [2/2], loss:0.6827
epoch [1/2], loss:0.9355
epoch [2/2], loss:0.5118
epoch [1/2], loss:0.4209
epoch [2/2], loss:0.4778
epoch [1/2], loss:0.5622
epoch [2/2], loss:0.3717


In [23]:
borders = [0, 1, 2,
           70, 75, 80,
           145, 150, 155,
           222, 225, 228,
           297, 300, 303,
           373, 374, 375
          ]
phonemes = ["A", "N", "A", "T", "A"]
offsets = [0, 10, 10, 15, 10]

sequence = VocalSequence(0, 400, vb, borders, phonemes, offsets)

AttributeError: 'VocalSequence' object has no attribute 'offset'

In [None]:
sequence.update()

In [None]:
sequence.save()