In [1]:
from autovc.utils.dataloader import TrainDataLoader
from autovc.utils.model_loader import load_models
# from autovc.auto_encoder.model_vc import Generator
# from autovc.speaker_encoder.model import SpeakerEncoder
# model = Generator()
# model.load_model('models/AutoVC/AutoVC_SMK.pt', device = 'cpu')

# speaker_encoder = SpeakerEncoder()
# speaker_encoder.load_model('models/SpeakerEncoder/SpeakerEncoder.pt')

# Checkpoints to restore, keyed by the model-type name passed to load_models.
checkpoints = {
    "auto_encoder": 'models/AutoVC/AutoVC_SMK.pt',
    "speaker_encoder": 'models/SpeakerEncoder/SpeakerEncoder.pt',
}

# load_models receives two parallel lists (types and paths); its result is
# unpacked into two objects, presumably in the same order as the lists.
model, speaker_encoder = load_models(
    model_types=list(checkpoints.keys()),
    model_paths=list(checkpoints.values()),
)

# Build the training data from the sample directory. The speaker encoder is
# handed to the loader (presumably to embed the speakers — TODO confirm).
dataset = TrainDataLoader(data_dir_path='data/samples', speaker_encoder=speaker_encoder)
dataloader = dataset.get_dataloader(batch_size=2, shuffle=True)

# Short training run: two epochs with batches of two shuffled samples.
model.learn(dataloader, n_epochs=2)


Loaded encoder "models/SpeakerEncoder/SpeakerEncoder.pt" trained to step 1564501
Training beginning on cpu
Step: 1
Step: 2
Step: 3
Step: 4


In [1]:
from autovc.speaker_encoder.model import SpeakerEncoder
from autovc.speaker_encoder.utils import wav_to_mel_spectrogram, preprocess_wav
import torch
# Restore the pretrained speaker encoder. The directory is spelled `models/`
# (lower case) to agree with every other cell in this notebook; `Models/`
# only resolves on case-insensitive file systems.
speaker_encoder = SpeakerEncoder()
speaker_encoder.load_model('models/SpeakerEncoder/SpeakerEncoder.pt')

# For each sample: preprocess the wav, convert it to a mel spectrogram, add a
# leading batch dimension of size 1 and run the encoder forward pass.
# NOTE: despite its name, `mels` collects the encoder outputs (one per file),
# not the spectrograms. The name is kept because the following cell reads it.
wav_paths = ["data/samples/chooped7.wav", "data/samples/mette_183.wav"]
mels = []
for wav in wav_paths:
    mel = torch.from_numpy(wav_to_mel_spectrogram(preprocess_wav(wav)))
    mels.append(speaker_encoder(mel.unsqueeze(0)))


In [2]:
# Stack the per-file encoder outputs along a new leading dimension and
# evaluate the speaker-encoder loss on the result.
# NOTE(review): the recorded output of this cell is NaN. Each file contributes
# a single utterance here; presumably the loss needs several utterances per
# speaker to be well defined — TODO confirm against SpeakerEncoder.loss.
embedding_batch = torch.stack(mels)
speaker_encoder.loss(embedding_batch)


tensor(nan, grad_fn=<NllLossBackward>)

In [1]:
from autovc.utils.audio import get_mel_frames, audio_to_melspectrogram
import torch
import numpy as np
import matplotlib.pyplot as plt
from autovc.utils.model_loader import load_model
from torch.nn.functional import pad
import soundfile as sf
from autovc.utils.audio import remove_noise
# --- Models -----------------------------------------------------------------
AE = load_model('auto_encoder', 'models/AutoVC/AutoVC_SMK.pt')
SE = load_model('speaker_encoder', 'models/SpeakerEncoder/SpeakerEncoder.pt')
vocoder = load_model('vocoder', 'models/WaveRNN/WaveRNN_Pretrained.pyt')

# --- Configuration ----------------------------------------------------------
N = 1000                 # number of mel frames per partial utterance (chunk)
source = 'data/samples/hilde_301.wav'
target = 'data/samples/HaegueYang_5.wav'
min_pad_coverage = 0.1
overlap = 0.5            # fraction by which consecutive chunks overlap

# --- Chop the source utterance into mel-spectrogram chunks -------------------
frames = get_mel_frames(
    source,
    audio_to_melspectrogram,
    sr=22050,
    mel_window_step=12.5,
    order='MF',
    partial_utterance_n_frames=N,
    min_pad_coverage=min_pad_coverage,
    overlap=overlap,
)

# --- Conversion ---------------------------------------------------------------
# This cell only performs inference, so autograd is switched off: no graph is
# recorded for the forward passes and the outputs do not carry gradients.
with torch.no_grad():
    X = torch.stack(frames)

    # One embedding per utterance, repeated so that every chunk in the batch
    # gets its own row (expand creates a view, not a copy).
    c_source = SE.embed_utterance(source).unsqueeze(0).expand(X.size(0), -1)
    c_target = SE.embed_utterance(target).unsqueeze(0).expand(X.size(0), -1)

    # The auto-encoder returns three tensors; `post_out` is the one the
    # following cells stitch together and send to the vocoder.
    out, post_out, content_codes = AE(X, c_source, c_target)


Loaded auto encoder "models/AutoVC/AutoVC_SMK.pt" trained to step 13801
Loaded speaker encoder "models/SpeakerEncoder/SpeakerEncoder.pt" trained to step 0
Loaded vocoder "models/WaveRNN/WaveRNN_Pretrained.pyt"


In [100]:

# Stitch the converted chunks back into one spectrogram by overlap-averaging.
# Each chunk is padded with NaN on both sides of its last dimension so that it
# sits at its own offset in the full-length timeline; nanmean then averages
# only over the chunks that actually cover a given position.
# NOTE(review): T assumes get_mel_frames advances by N * (1 - overlap) frames
# between chunks — confirm against its implementation.
T = int(N * (1 - overlap))
chunks = list(post_out)
M = len(chunks)

frames = []
for i, chunk in enumerate(chunks):
    left_pad = i * T               # positions before this chunk starts
    right_pad = (M - 1 - i) * T    # positions after this chunk ends
    frames.append(pad(chunk, (left_pad, right_pad), mode='constant', value=torch.nan))

X = torch.stack(frames)
X_paste = X.nanmean(axis=0)
X_paste.shape

torch.Size([80, 1000])

In [101]:
# Add a leading batch dimension of size 1 to the stitched spectrogram and let
# the vocoder synthesise the waveform from it.
vocoder_input = X_paste.unsqueeze(0)
waveform = vocoder.generate(vocoder_input)

| ████████████████ 288000/290400 | Batch Size: 24 | Gen Rate: 6.5kHz | 



In [102]:

# Sample rate shared by the noise-removal step and the written file.
sample_rate = 22050

# Convert the vocoder output to a NumPy array and pass it through remove_noise.
wave = np.asarray(waveform)
wave = remove_noise(wave, sample_rate)

# Derive the file name from the configured overlap so the label on disk always
# matches the run (the previous hard-coded name said "00p" while overlap was
# 0.5, and misspelled "paste").
output_path = f'chop_and_paste_{int(round(overlap * 100)):02d}p_overlap.wav'
sf.write(output_path, wave, samplerate=sample_rate)

In [4]:
# Write a speaker-encoder checkpoint that also carries a `speakers` table.
#
# The earlier `SE.state_dict()['test'] = 2` statement was removed: state_dict()
# builds a new dictionary on every call, so that assignment only touched a
# temporary object and never reached the dictionary saved below.
checkpoint = {
    'step': 0,
    'model_state': SE.state_dict(),
    # Random placeholder tensors of shape (1, 256), one per speaker name.
    'speakers': {
        'hilde': torch.randn((1, 256)),
        'yang': torch.randn((1, 256)),
    },
}
torch.save(checkpoint, 'models/SpeakerEncoder/SpeakerEncoder2.pt')

In [7]:
# Load the checkpoint written by the previous cell to check that a file with
# the additional `speakers` entry can be restored.
SE2 = load_model(
    'speaker_encoder',
    'models/SpeakerEncoder/SpeakerEncoder2.pt',
)



Loaded speaker encoder "models/SpeakerEncoder/SpeakerEncoder2.pt" trained to step 0


dict_values([tensor([[-0.8979, -0.7079,  0.5696, -0.0379,  1.1863, -0.3241, -0.7413,  0.5706,
         -0.8708, -0.7614,  0.8727, -0.3884, -1.1303,  0.2941,  0.5893,  0.0048,
          1.6446, -0.5212,  2.1275,  1.0680,  2.3424, -0.2470, -0.5285,  1.2640,
          1.0019, -3.6085,  0.5445,  0.0365, -0.2123, -0.0211,  1.4480, -0.1225,
         -1.1412, -1.3419,  0.6223, -0.7319,  0.4823,  0.2453, -1.1903,  0.6038,
          0.8117,  0.3642, -1.5035,  0.1531, -1.1984,  0.0442,  0.0104, -0.0516,
         -0.5144,  1.2197,  1.5099,  0.5336, -1.4649, -0.3608, -0.6761,  1.5573,
         -0.3179, -0.0367,  0.3051,  0.0157,  1.5915,  0.3776,  0.8285, -0.6463,
          0.2736, -0.4032,  0.0318, -0.7122,  0.6429, -0.6561, -0.1261,  0.4313,
          0.2231, -1.3784,  0.5166, -0.9819, -0.1305,  1.5581,  0.1344,  0.1749,
         -1.0496,  1.4752,  0.5158, -0.2854,  1.6855,  1.2184, -0.5191,  0.0500,
         -0.4805, -1.2850, -2.9035,  0.2492, -0.6376,  1.3687, -0.8934,  1.0843,
         -0.372