In [None]:
!gdown 1ROgY8hfDKaocoeUuSOiNOaCPKQhtQzH5
!unzip Homework_48.zip

Downloading...
From: https://drive.google.com/uc?id=1ROgY8hfDKaocoeUuSOiNOaCPKQhtQzH5
To: /content/Homework_48.zip
  0% 0.00/5.90M [00:00<?, ?B/s] 18% 1.05M/5.90M [00:00<00:00, 9.73MB/s]100% 5.90M/5.90M [00:00<00:00, 34.2MB/s]
Archive:  Homework_48.zip
replace test_1.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
!pip install -q TTS torch torchaudio soundfile numpy

[0m

In [None]:
import pandas as pd
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, MarianMTModel, MarianTokenizer
import torch  # Библиотека для работы с нейронными сетями
from TTS.api import TTS  # API для Text-to-Speech
import soundfile as sf  # Работа с аудиофайлами
import librosa  # Библиотека для анализа аудио
import numpy as np  # Работа с массивами
import matplotlib.pyplot as plt  # Визуализация
from IPython.display import Audio, display  # Воспроизведение аудио

# Класс для распознавания и транскрибирования речи

In [None]:
class ASR:
    """Speech-to-text wrapper around a Hugging Face ASR pipeline.

    Args:
        model_name: Checkpoint passed to ``pipeline``. Defaults to
            ``"openai/whisper-large-v3"`` (the previously hard-coded value).
        device: Device passed to ``pipeline`` (e.g. ``"cuda:0"``, ``"cpu"``,
            or an integer index where a negative value means CPU).
            ``None`` selects ``"cuda:0"`` when CUDA is available and
            ``"cpu"`` otherwise.
    """

    def __init__(self, model_name="openai/whisper-large-v3", device=None):
        # Fall back to the CPU instead of failing when no GPU is visible.
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"

        if isinstance(device, int):
            on_gpu = device >= 0
        else:
            on_gpu = torch.device(device).type == "cuda"

        # Half precision is kept on CUDA (the original setting); everywhere
        # else the model is loaded in float32.
        dtype = torch.float16 if on_gpu else torch.float32

        self.asr = pipeline(
            "automatic-speech-recognition",
            model_name,
            torch_dtype=dtype,
            device=device,
        )

    def transcribe(self, path_to_audiofile):
        """Run the pipeline on an audio file and return its ``"text"`` field.

        Args:
            path_to_audiofile: Path of the audio file handed to the pipeline.

        Returns:
            The recognised text.
        """
        transcription = self.asr(path_to_audiofile)
        return transcription["text"]


# Класс для перевода текста транскрибированной речи

In [None]:
class Translator:
    """Text translator backed by Helsinki-NLP MarianMT checkpoints.

    Supported directions are ru -> en, ru -> fr and en -> fr. Language codes
    are compared by their base part, so ``"fr"`` and ``"fr-fr"`` are
    equivalent.

    Args:
        origin: Source language this instance was created for (stored as
            ``self.origin``; ``translate`` still takes explicit languages).
        target: Target language this instance was created for (stored as
            ``self.target``).
    """

    # (tokenizer, model) pairs already loaded in this process, keyed by
    # checkpoint name, so several Translator instances share one copy.
    _loaded = {}

    # Supported (source, target) pairs -> suffix of the matching
    # ``tokenizer_*`` / ``model_*`` attributes.
    _DIRECTIONS = {
        ("ru", "en"): "ru_en",
        ("ru", "fr"): "ru_fr",
        ("en", "fr"): "en_fr",
    }

    def __init__(self, origin, target):
        self.origin = origin
        self.target = target

        self.model_name_ru_en = "Helsinki-NLP/opus-mt-ru-en"
        self.model_name_ru_fr = "Helsinki-NLP/opus-mt-ru-fr"
        self.model_name_en_fr = "Helsinki-NLP/opus-mt-en-fr"

        self.tokenizer_ru_en, self.model_ru_en = self._load(self.model_name_ru_en)
        self.tokenizer_ru_fr, self.model_ru_fr = self._load(self.model_name_ru_fr)
        self.tokenizer_en_fr, self.model_en_fr = self._load(self.model_name_en_fr)

    @classmethod
    def _load(cls, model_name):
        """Return the cached ``(tokenizer, model)`` pair for ``model_name``."""
        if model_name not in cls._loaded:
            cls._loaded[model_name] = (
                MarianTokenizer.from_pretrained(model_name),
                MarianMTModel.from_pretrained(model_name),
            )
        return cls._loaded[model_name]

    @staticmethod
    def _base_language(code):
        """Reduce a language code such as ``"fr-fr"`` to its base (``"fr"``)."""
        return str(code).split("-")[0].lower()

    def translate(self, text_to_translate, text_language = "ru", target_language = "en"):
        """Translate text between one of the supported language pairs.

        Args:
            text_to_translate: Text to translate.
            text_language: Language of the input text (``"ru"`` or ``"en"``).
            target_language: Desired output language (``"en"``, ``"fr"`` or
                ``"fr-fr"``).

        Returns:
            The translated text; an empty string for blank input.

        Raises:
            ValueError: If the requested direction has no loaded checkpoint.
        """
        source = self._base_language(text_language)
        target = self._base_language(target_language)
        suffix = self._DIRECTIONS.get((source, target))
        if suffix is None:
            raise ValueError(
                f"Unsupported translation direction: {text_language!r} -> {target_language!r}"
            )

        # Nothing to translate; avoid asking the model to generate from nothing.
        if isinstance(text_to_translate, str) and not text_to_translate.strip():
            return ""

        # Each checkpoint has a single target language, so the text is passed
        # as-is: a ``>>xx<<`` prefix is a target-language tag that only
        # multi-target Marian models understand.
        return self.translate_custom(
            getattr(self, f"model_{suffix}"),
            getattr(self, f"tokenizer_{suffix}"),
            text_to_translate,
        )

    def translate_custom(self, model, tokenizer, text):
        """Translate ``text`` with an explicit model/tokenizer pair.

        Args:
            model: Object exposing ``generate(**encoded_inputs)``.
            tokenizer: Callable that encodes ``text`` and exposes ``decode``.
            text: Text to translate.

        Returns:
            The decoded first generated sequence.
        """
        encoded = tokenizer(text, return_tensors="pt", padding=True)
        generated = model.generate(**encoded)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

## Класс для клонирования голоса

In [None]:
class VoiceCloningTTS:
    """Voice-cloning text-to-speech built on the Coqui ``TTS`` API."""

    def __init__(self):
        # Load the multilingual "your_tts" checkpoint.
        self.tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts",
                       progress_bar=False)

    def clone_voice(self, reference_audio_path, text, language="en", output_path="output.wav"):
        """Synthesize ``text`` in the voice of a reference recording.

        Args:
            reference_audio_path: Audio file passed to the model as
                ``speaker_wav``.
            text: Text to speak.
            language: Language code passed to the TTS model.
            output_path: Where the generated audio is written.

        Returns:
            ``(True, message)`` on success, or ``(False, message)`` with the
            error text if synthesis or writing the file raised.
        """
        try:
            # The reference file is handed to the model by path; it is not
            # decoded here because nothing in this method uses the samples.
            wav = self.tts.tts(
                text=text,
                speaker_wav=reference_audio_path,
                language=language
            )

            # Write the result at the synthesizer's own output sample rate.
            sf.write(output_path, wav, self.tts.synthesizer.output_sample_rate)
            return True, "Голос успешно клонирован и сгенерирован"
        except Exception as e:
            # Errors are reported through the return value by design, so the
            # caller decides how to surface them.
            return False, f"Ошибка при клонировании голоса: {str(e)}"

    def analyze_voices(self, original_path, cloned_path):
        """Correlate the MFCC features of two recordings.

        Args:
            original_path: Path to the reference recording.
            cloned_path: Path to the generated recording.

        Returns:
            Pearson correlation between the flattened MFCC matrices, computed
            over the frames both recordings have in common.
        """
        # Keep the reference at its native rate and resample the clone to the
        # same rate: MFCC frames are a fixed number of samples long, so frame
        # i only covers the same time span in both files when the rates match.
        y_orig, sample_rate = librosa.load(original_path, sr=None)
        y_clone, _ = librosa.load(cloned_path, sr=sample_rate)

        mfcc_orig = librosa.feature.mfcc(y=y_orig, sr=sample_rate)
        mfcc_clone = librosa.feature.mfcc(y=y_clone, sr=sample_rate)

        # Trim both matrices to the shorter recording.
        min_frames = min(mfcc_orig.shape[1], mfcc_clone.shape[1])
        mfcc_orig = mfcc_orig[:, :min_frames]
        mfcc_clone = mfcc_clone[:, :min_frames]

        correlation = np.corrcoef(mfcc_orig.flatten(), mfcc_clone.flatten())[0, 1]
        return correlation

# Декоратор переводчика

In [None]:
class AudioTranslator:
    """Speech-to-speech pipeline: transcribe, translate, re-voice.

    Args:
        origin_language: Language spoken in the input audio; passed to the
            translator as the source language.
        target_language: Language of the generated audio; passed to both the
            translator and the TTS model.
    """

    def __init__(self, origin_language, target_language):
        self.asr = ASR()
        self.translator = Translator(origin_language, target_language)
        self.tts = VoiceCloningTTS()
        self.origin_language = origin_language
        self.language = target_language

    def translate_audio(self, path_to_audiofile, output_file):
        """Translate a recording and speak the result in the original voice.

        Prints progress markers, then either displays audio players for the
        input and the generated file, or prints the error message returned by
        the TTS step.

        Args:
            path_to_audiofile: Input recording; also used as the voice
                reference for cloning.
            output_file: Path where the generated audio is written.
        """
        transcription = self.asr.transcribe(path_to_audiofile)
        print("transcribed")
        # Pass the configured languages explicitly; translate() otherwise
        # falls back to its own ru -> en defaults regardless of how this
        # pipeline was constructed.
        translation = self.translator.translate(
            transcription,
            self.origin_language,
            self.language,
        )
        print("translated")
        success, message = self.tts.clone_voice(
            path_to_audiofile,
            translation,
            self.language,
            output_file
        )

        if success:
            print("\nОригинальный голос:")
            display(Audio(path_to_audiofile))
            print("\nСгенерированный голос:")
            display(Audio(output_file))
        else:
            print(message)


In [None]:
# One pipeline per language pair. Each constructor call builds its own ASR,
# translation and TTS objects, so this cell is slow and memory-hungry.
translator = AudioTranslator(origin_language="ru", target_language="en")
translator_2 = AudioTranslator(origin_language="ru", target_language="fr-fr")
translator_3 = AudioTranslator(origin_language="en", target_language="fr-fr")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > External Speaker Encoder Loaded !!
 > initialization of language-embedding layers.
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10

Device set to use cuda:0


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

Device set to use cuda:0


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

In [None]:
# Run the ru -> en pipeline on the sample; the result is written to result_4_en.wav.
translator.translate_audio(path_to_audiofile="test_4.wav", output_file="result_4_en.wav")



transcribed
translated
 > Text splitted to sentences.
['* Little known but interesting fact: Ryan Gosling did not die at the end of the film Drive.']
* little known but interesting fact, ryan gosling did not die at the end of the film drive.
 [!] Character '*' not found in the vocabulary. Discarding it.
 > Processing time: 15.43992805480957
 > Real-time factor: 3.062870076335959

Оригинальный голос:


\Сгенерированный голос:


In [None]:
# Run the ru -> fr-fr pipeline on the same sample.
translator_2.translate_audio(path_to_audiofile="test_4.wav", output_file="result_4_fr.wav")



transcribed
translated
 > Text splitted to sentences.
['* Little known but interesting fact: Ryan Gosling did not die at the end of the film Drive.']
* little known but interesting fact, ryan gosling did not die at the end of the film drive.
 [!] Character '*' not found in the vocabulary. Discarding it.
 > Processing time: 12.205081701278687
 > Real-time factor: 2.6208034574358354

Оригинальный голос:


\Сгенерированный голос:


In [None]:
# Run the en -> fr-fr pipeline; its input, result_4_en.wav, is the file
# produced by the ru -> en run, so that cell must be executed first.
translator_3.translate_audio(path_to_audiofile="result_4_en.wav", output_file="result_4_en_fr.wav")



transcribed
translated
 > Text splitted to sentences.
['*Little known but intersting fact, Ryan Gosling did not die at the end of the Film Drive.']
*little known but intersting fact, ryan gosling did not die at the end of the film drive.
 [!] Character '*' not found in the vocabulary. Discarding it.
 > Processing time: 12.424740076065063
 > Real-time factor: 2.596601896774308

Оригинальный голос:


\Сгенерированный голос:
