In [16]:
import os
from copy import deepcopy

import numpy as np
import torch
from datasets import load_dataset
from IPython.display import Audio
from torch import nn
from torch.utils.data import DataLoader
from TTS.api import TTS

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [26]:
# data: English "train" split of Common Voice 13.0, shuffled with a fixed seed (42)
cv_13 = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "en",
    split="train",
    trust_remote_code=True,
)
cv_13 = cv_13.shuffle(seed=42)

In [27]:
# Speech-to-text: load the English Silero STT model, its decoder and helper
# utilities through torch.hub, placing the model on the selected device.
model_silero, decoder, utils = torch.hub.load(
    'snakers4/silero-models',
    'silero_stt',
    language='en',
    device=device,
)
# `utils` holds four helpers; only the first and the last are used in this notebook.
(read_batch, _, _, prepare_model_input) = utils

Using cache found in /home/ansafronov/.cache/torch/hub/snakers4_silero-models_master


In [28]:
# model: text-to-speech (XTTS v2), moved to the selected device
xtts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(xtts_model_name).to(device)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


In [29]:
# One flattened embedding vector per speaker registered in the loaded TTS model.
speaker_table = tts.synthesizer.tts_model.speaker_manager.speakers
pretrained_speaker_embedding = [
    entry['speaker_embedding'].numpy().flatten()
    for entry in speaker_table.values()
]

# Element-wise mean over all of those speakers, kept as a (1, 512) tensor.
embedding_matrix = np.asarray(pretrained_speaker_embedding)
mean_embedding = torch.tensor(embedding_matrix.mean(axis=0)).reshape(1, 512)

In [34]:
# Fetch the first record right here so this cell does not depend on a cell that
# appears further down the notebook (it would raise NameError on a fresh
# Restart & Run All otherwise), then display it.
data = next(iter(cv_13))
data

{'client_id': '1b3926baec025c02cadcf9816c13268ced2a32f97cedf810bf21d819ae338bbff4fe8cf77d8eb2c85de9e78e9b2011ff14f07fc6cc05efff2f74351b7563af1c',
 'path': '/home/ansafronov/.cache/huggingface/datasets/downloads/extracted/49ccba06b8c20fe2a44a984eb08a4241a4273bf197c98f398e257725c2ee8f32/en_train_9/common_voice_en_23841314.mp3',
 'audio': {'path': '/home/ansafronov/.cache/huggingface/datasets/downloads/extracted/49ccba06b8c20fe2a44a984eb08a4241a4273bf197c98f398e257725c2ee8f32/en_train_9/common_voice_en_23841314.mp3',
  'array': array([ 0.00000000e+00, -9.61121509e-14, -5.16199225e-14, ...,
          6.76553464e-06,  3.94860181e-05,  4.30507207e-05]),
  'sampling_rate': 48000},
 'sentence': 'The pavilion is decorated with a flame motif in gilded black lacquer.',
 'up_votes': 2,
 'down_votes': 1,
 'age': 'twenties',
 'gender': 'male',
 'accent': '',
 'locale': 'en',
 'segment': '',
 'variant': ''}

In [31]:
# End-to-end smoke test on a single clip: transcribe it with Silero, then
# re-synthesise the transcript with XTTS using the same clip as voice reference.
data = next(iter(cv_13))

file_path = data['path']

# Named `stt_input` rather than `input` so the builtin `input()` is not shadowed
# for the rest of the kernel session.
stt_input = prepare_model_input(read_batch([file_path]),
                                device=device)

# Pure inference: no autograd bookkeeping is needed.
with torch.no_grad():
    output = model_silero(stt_input)
text = decoder(output[0].cpu())
output_file = "output.wav"
tts.tts_to_file(text, speaker_wav=file_path, language="en", file_path=output_file)

 > Text splitted to sentences.
['the paavilion is decorated with a flame orortifen created plate liquor']
 > Processing time: 1.8811466693878174
 > Real-time factor: 0.3158064628761221


'output.wav'

In [32]:
# Play the source clip selected in the previous cell.
Audio(file_path)

In [33]:
# Play the re-synthesised clip written by tts_to_file.
Audio(output_file)

In [36]:
class ConstantWrapper(nn.Module):
    """Module that ignores its input and always returns a fixed value.

    Args:
        constant: value returned by every ``forward`` call.
    """

    def __init__(self, constant):
        super().__init__()
        # Stored as a plain attribute (not a registered buffer), so it is not
        # moved by ``Module.to(...)``; ``forward`` handles device placement.
        self.constant = constant

    def forward(self, X):
        """Return the stored constant; ``X`` is only used to pick the device.

        When both the constant and ``X`` are tensors, the constant is returned
        on ``X``'s device so callers get a result placed like the output of the
        module this wrapper stands in for. Otherwise it is returned unchanged.
        """
        if torch.is_tensor(self.constant) and torch.is_tensor(X):
            # `.to` is a no-op (returns the same tensor) when devices already match.
            return self.constant.to(X.device)
        return self.constant

class AddWrapper(nn.Module):
    """Module that adds a fixed offset to its input.

    Args:
        constant: value added to the input on every ``forward`` call.
    """

    def __init__(self, constant):
        super().__init__()
        # Stored as a plain attribute (not a registered buffer), so it is not
        # moved by ``Module.to(...)``; ``forward`` handles device placement.
        self.constant = constant

    def forward(self, X):
        """Return ``X + constant``.

        A tensor offset is first moved to ``X``'s device; without this the
        addition raises a device-mismatch error whenever ``X`` lives on the GPU
        and the offset was created on the CPU.
        """
        offset = self.constant
        if torch.is_tensor(offset) and torch.is_tensor(X):
            offset = offset.to(X.device)
        return X + offset

class SpeakerEncoderWrapper(nn.Module):
    """Wrap a speaker encoder and post-process the embedding it produces.

    Args:
        speaker_encoder: callable invoked as ``speaker_encoder(x, l2_norm)``.
        type_of_augmentation: how the encoder output is altered:
            * falsy (default ``None``) - output is passed through unchanged;
            * ``'mean'`` - output is replaced by the notebook-level
              ``mean_embedding`` tensor;
            * ``'noise'`` - a ``(1, 512)`` standard-normal tensor, sampled once
              at construction time, is added to the output.

    Raises:
        ValueError: if ``type_of_augmentation`` is truthy but not one of the
            supported names. Previously such a value left ``correct_embeding``
            undefined and only failed later, inside ``forward``.
    """

    def __init__(self, speaker_encoder, type_of_augmentation=None):
        super().__init__()
        self.speaker_encoder = speaker_encoder
        self.aug_type = type_of_augmentation

        if not self.aug_type:
            self.correct_embeding = nn.Identity()
        elif self.aug_type == 'mean':
            self.correct_embeding = ConstantWrapper(mean_embedding)
        elif self.aug_type == 'noise':
            self.correct_embeding = AddWrapper(torch.randn(1, 512))
        else:
            raise ValueError(
                f"Unknown type_of_augmentation {self.aug_type!r}; "
                "expected None, 'mean' or 'noise'"
            )

    def forward(self, x, l2_norm=False):
        """Run the wrapped encoder on ``x`` and return the post-processed embedding."""
        out = self.speaker_encoder(x, l2_norm)
        corrected = self.correct_embeding(out)
        return corrected

In [37]:
class SpeechAnonymizer(nn.Module):
    """Speech -> text -> speech pipeline that re-synthesises an utterance.

    A clip is transcribed with the Silero speech-to-text model and the
    transcript is spoken again by XTTS v2, using the original clip as the
    ``speaker_wav`` reference. With ``anonymize=True`` the XTTS speaker encoder
    is replaced by a ``SpeakerEncoderWrapper`` so the speaker embedding can be
    altered before it is used.

    Args:
        anonymize: when true, wrap the TTS speaker encoder.
        augment: forwarded to ``SpeakerEncoderWrapper`` as
            ``type_of_augmentation``; only used when ``anonymize`` is true.

    Both models are loaded onto the notebook-level ``device``.
    """

    # Directory prefix for synthesised clips. The output file reuses the source
    # file's base name, extension included.
    # NOTE(review): confirm the container written by tts_to_file matches that
    # extension (Common Voice sources are '.mp3').
    OUTPUT_DIR = '../output/audio/'

    def __init__(self, anonymize=False, augment=None):
        super().__init__()

        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        model_silero, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                                      model='silero_stt',
                                                      language='en',
                                                      device=device)
        read_batch, _, _, prepare_model_input = utils

        self.speech_to_text = model_silero
        self.text_decoder = decoder
        self.text_to_speech = tts

        self._prepeare_model_inputs = prepare_model_input
        self._read_batch = read_batch

        # Inference only: put both models in eval mode.
        self.speech_to_text.eval()
        self.text_to_speech.eval()

        self.anonymize = anonymize
        if self.anonymize:
            # Swap the speaker encoder for a wrapper around the original one.
            vocoder = self.text_to_speech.synthesizer.tts_model.hifigan_decoder
            vocoder.speaker_encoder = SpeakerEncoderWrapper(
                vocoder.speaker_encoder,
                type_of_augmentation=augment,
            )

    def forward(self, data):
        """Transcribe ``data['path']``, re-synthesise it and return the output path.

        Args:
            data: mapping with a ``'path'`` entry pointing at the source audio file.

        Returns:
            Path of the synthesised file (``OUTPUT_DIR`` + source base name).
        """
        file_path = data['path']
        output_path = self._get_output_path(file_path)
        # Make sure the target directory exists before the TTS writes to it.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        text = self._transcribe(file_path)

        # text-to-speech
        self.text_to_speech.tts_to_file(text, speaker_wav=file_path, language="en", file_path=output_path)
        return output_path

    def get_speaker_embeding(self, data):
        """Return the speaker-encoder output observed while synthesising ``data``.

        A forward hook is attached to the TTS speaker encoder for the duration
        of one ``tts`` call and removed afterwards, so repeated calls do not
        accumulate hooks.

        Args:
            data: mapping with a ``'path'`` entry pointing at the source audio file.

        Returns:
            The detached output captured by the hook.

        Raises:
            KeyError: if the hook never fired during synthesis.
        """
        captured = {}

        def capture_output(module, inputs, output):
            captured['speaker_encoder'] = output.detach()

        file_path = data['path']
        text = self._transcribe(file_path)

        speaker_encoder = self.text_to_speech.synthesizer.tts_model.hifigan_decoder.speaker_encoder
        handle = speaker_encoder.register_forward_hook(capture_output)
        try:
            self.text_to_speech.tts(text, speaker_wav=file_path, language="en")
        finally:
            handle.remove()

        # NOTE(review): forward hooks only run when the module is invoked via
        # ``module(...)``; if the TTS library calls ``.forward(...)`` directly
        # nothing is captured and the lookup below raises KeyError - confirm.
        return captured['speaker_encoder']

    def _transcribe(self, file_path):
        """Run speech-to-text on one audio file and return the decoded text."""
        # speech-to-text, using the helpers loaded by this instance
        stt_input = self._prepeare_model_inputs(self._read_batch([file_path]), device=device)
        with torch.no_grad():
            stt_output = self.speech_to_text(stt_input)
        return self.text_decoder(stt_output[0].cpu())

    def _get_output_path(self, file_path):
        """Map a source audio path to its location under ``OUTPUT_DIR``."""
        return os.path.join(self.OUTPUT_DIR, os.path.basename(file_path))
        

In [38]:
def show_audio(original_file, anonimized_file):
    """Display audio players for an original clip and its anonymized version.

    Args:
        original_file: either a dataset record (a dict with a ``'path'`` entry
            and, optionally, a ``'sentence'`` entry that is printed first) or a
            plain path to the original audio file.
        anonimized_file: path to the anonymized audio file.
    """
    if isinstance(original_file, dict):
        # Record form: show the reference transcript and play the file it points to.
        sentence = original_file.get('sentence')
        if sentence is not None:
            print(sentence)
        original_path = original_file['path']
    else:
        # Path form: there is no transcript to show.
        original_path = original_file

    print('Original file')
    display(Audio(original_path))
    print('Anonimized file')
    display(Audio(anonimized_file))

In [39]:
# Anonymize the first N_SAMPLES clips of the shuffled dataset, replacing every
# speaker embedding with the mean embedding.
model = SpeechAnonymizer(anonymize=True, augment='mean')

N_SAMPLES = 1000  # number of clips to process

output_data = []
for i, data in enumerate(cv_13):
    # Check the budget before doing any work so exactly N_SAMPLES clips are
    # processed (checking after the append handled one clip too many).
    if i >= N_SAMPLES:
        break
    out_dict = deepcopy(data)
    out_dict['out_path'] = model(data)
    output_data.append(out_dict)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


Using cache found in /home/ansafronov/.cache/torch/hub/snakers4_silero-models_master


 > Text splitted to sentences.
['the paavilion is decorated with a flame orortifen created plate liquor']
 > Processing time: 1.73490309715271
 > Real-time factor: 0.30123640301922366
 > Text splitted to sentences.
['here goods had to be trainshipped on the meorgoard strain']
 > Processing time: 1.7326281070709229
 > Real-time factor: 0.30640037341936555
 > Text splitted to sentences.
['there were also three separate laboratory actions that resulted in infection']
 > Processing time: 1.8861727714538574
 > Real-time factor: 0.29860790932335984
 > Text splitted to sentences.
['he did not seek it will turn to provincial office after this time']
 > Processing time: 2.1280486583709717
 > Real-time factor: 0.3075093905124772
 > Text splitted to sentences.
['the artist did in fact paint much of their work in these wild areas']
 > Processing time: 1.521899938583374
 > Real-time factor: 0.29586237168291896
 > Text splitted to sentences.
['language has mittturn your lghter script']
 > Processing

In [None]:
# NOTE(review): this import would normally live in the notebook's first (imports) cell.
from datasets import Dataset

# Turn the collected records (original fields plus 'out_path') into a Dataset
# and write it to disk.
output_dataset = Dataset.from_list(output_data)
output_dataset.save_to_disk('../output/')

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
# Pick the second processed record and play the original next to its anonymized version.
cur = output_data[1]

# NOTE(review): show_audio indexes its first argument with ['sentence'], but a
# path string is passed here - confirm which form is intended.
show_audio(cur['path'], cur['out_path'])

Original file


Anonimized file


In [23]:
# Inspect the record selected for playback above.
cur

NameError: name 'cur' is not defined