### Основные импорты, может занять какое-то время


In [None]:
%%capture
!pip install nemo_toolkit['all'] pydub transformers

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nemo.collections.asr as nemo_asr
from pydub import AudioSegment
from omegaconf import open_dict

# Translation pipeline built straight from the checkpoint name.
# NOTE(review): `pipe` is not referenced by any code visible in this file;
# `stt_model.translate` builds its own pipeline from `model` and `tokenizer`.
pipe = pipeline("translation", model="facebook/nllb-200-distilled-600M")
# Tokenizer and seq2seq model for the same checkpoint, shared as module-level
# globals and passed to `pipeline(...)` inside `stt_model.translate`.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

# Pretrained NeMo speech-recognition model, used as a module-level global by
# `stt_model.get_word_timestamps`.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(model_name="nvidia/stt_ru_fastconformer_hybrid_large_pc")

# Turn on alignment preservation and timestamp computation in the decoding
# config, then apply the modified config to the model.
decoding_cfg = asr_model.cfg.decoding
# The config is edited inside `open_dict` (presumably so that keys missing
# from the structured config can be added — confirm against omegaconf docs).
with open_dict(decoding_cfg):
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    asr_model.change_decoding_strategy(decoding_cfg)



In [None]:
%%capture
!pip install TTS["all"]

import torch
from TTS.api import TTS

### Классы для взаимодействия

In [None]:
class stt_model:
  """Speech-to-text front end: extract audio, transcribe it, build translated sentences.

  The methods are meant to be called in order: ``get_audio`` ->
  ``get_word_timestamps`` -> ``make_sentence``.

  Relies on names defined at module level elsewhere in the notebook:
  ``AudioSegment``, ``asr_model``, ``pipeline``, ``model`` and ``tokenizer``.

  Attributes:
    video_path: path of the input media file.
    audio_path: path of the WAV file written by ``get_audio`` (``None`` before).
    word_timestamps: word-level entries stored by ``get_word_timestamps``.
    time_stride: factor that converts the offsets in ``word_timestamps``
      into time (presumably seconds; ``audiopip`` multiplies the results by
      1000 before handing them to pydub).
    sentences: list stored by ``make_sentence``; every item is
      ``[source_text, translated_text, start, end, duration]``.
  """

  def __init__(self, video_path):
    self.video_path = video_path
    self.audio_path = None
    self.word_timestamps = None
    self.time_stride = None
    self.sentences = None

  def get_audio(self):
    """Write the audio track of ``video_path`` as a mono 16 kHz WAV file.

    The file is written next to the input as ``<video_path>.wav``.

    Returns:
      The path of the written WAV file (also stored in ``audio_path``).
    """
    audio = AudioSegment.from_file(self.video_path)
    audio = audio.set_channels(1).set_frame_rate(16000)

    wav_path = self.video_path + '.wav'
    # export() hands back the open output file object; close it right away so
    # the handle is not leaked.
    audio.export(wav_path, format="wav").close()

    self.audio_path = wav_path
    return wav_path

  def get_word_timestamps(self):
    """Transcribe ``audio_path`` and store the word-level timestamps.

    Returns:
      The word timestamp entries of the first hypothesis.

    Raises:
      RuntimeError: if ``get_audio`` has not been called yet.
    """
    if self.audio_path is None:
      raise RuntimeError('No audio file to transcribe, call "get_audio" first')

    hypotheses = asr_model.transcribe([self.audio_path], return_hypotheses=True)
    # transcribe() may return a 2-tuple; the first element holds the hypotheses.
    if isinstance(hypotheses, tuple) and len(hypotheses) == 2:
      hypotheses = hypotheses[0]

    timestamp_dict = hypotheses[0].timestep
    # 8 is presumably the encoder's subsampling factor — TODO confirm for the
    # loaded checkpoint.
    self.time_stride = 8 * asr_model.cfg.preprocessor.window_stride
    self.word_timestamps = timestamp_dict['word']
    return self.word_timestamps

  @staticmethod
  def translate(source_lan, target_lan, text):
    """Translate ``text`` with the module-level ``model`` / ``tokenizer``.

    Args:
      source_lan: source language code passed as ``src_lang``.
      target_lan: target language code passed as ``tgt_lang``.
      text: text to translate.

    Returns:
      The raw pipeline output (indexed as ``[0]['translation_text']`` by
      ``make_sentence``).
    """
    translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=source_lan, tgt_lang=target_lan)
    return translator(text, max_length=400)

  def _build_sentence(self, words, time_start, time_end, source_lan, target_lan):
    """Join ``words`` into one sentence and pair it with its translation and timing."""
    sentence = "".join(words)
    translated = self.translate(source_lan=source_lan, target_lan=target_lan, text=sentence)[0]['translation_text']
    return [sentence, translated, time_start, time_end, time_end - time_start]

  def make_sentence(self, source_lan='rus_Cyrl', target_lan='eng_Latn'):
    """Group the word timestamps into sentences and translate each of them.

    A stand-alone "." entry closes the current sentence. Words left over after
    the last "." are emitted as a final sentence instead of being dropped, and
    a "." that is not preceded by any word is ignored.

    The source sentences are written to ``russia_text.txt`` and the
    translations to ``foreign_text.txt`` in the current working directory.

    Args:
      source_lan: language code of the transcript.
      target_lan: language code to translate into.

    Returns:
      The list of ``[source_text, translated_text, start, end, duration]``
      items (also stored in ``sentences``).

    Raises:
      RuntimeError: if ``get_word_timestamps`` has not been called yet.
    """
    if self.word_timestamps is None or self.time_stride is None:
      raise RuntimeError('No word timestamps available, call "get_word_timestamps" first')

    sentences = []
    words = []
    time_start = 0.0
    end = 0.0

    for stamp in self.word_timestamps:
      start = stamp['start_offset'] * self.time_stride
      end = stamp['end_offset'] * self.time_stride
      word = stamp['char'] if 'char' in stamp else stamp['word']

      if word != ".":
        if not words:
          # First word of a new sentence fixes the sentence start time.
          time_start = start
        words.append(" " + word)
        continue

      if not words:
        # A period with nothing before it carries no speech to translate.
        continue

      words.append(word)
      sentences.append(self._build_sentence(words, time_start, end, source_lan, target_lan))
      words = []

    if words:
      # The transcript did not end with "."; keep the trailing words.
      sentences.append(self._build_sentence(words, time_start, end, source_lan, target_lan))

    # Explicit UTF-8 so the Cyrillic text does not depend on the locale default.
    with open('russia_text.txt', 'w', encoding='utf-8') as file:
      file.writelines(sent[0] for sent in sentences)

    with open('foreign_text.txt', 'w', encoding='utf-8') as file:
      file.writelines(sent[1] for sent in sentences)

    self.sentences = sentences
    return sentences


In [None]:
class tts_model:
  """Voice-cloning text-to-speech wrapper around the ``your_tts`` model.

  Relies on ``torch`` and ``TTS`` being imported at module level.

  Attributes:
    speaker_audio_path: reference recording whose voice is cloned.
    device: ``"cuda"`` when available, otherwise ``"cpu"``.
    languages: maps the short codes accepted by ``cloning_voice`` to the
      language identifiers passed to the TTS model.
    tts: the loaded TTS model, moved to ``device``.
  """

  def __init__(self, speaker_audio_path):
    self.speaker_audio_path = speaker_audio_path
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.languages = {'fr': 'fr-fr', 'en': 'en', 'pt': 'pt-br'}
    self.tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(self.device)

  def cloning_voice(self, output_file_path, text, language):
    """Synthesize ``text`` in the reference speaker's voice and save it.

    Args:
      output_file_path: where the synthesized WAV file is written.
      text: text to speak.
      language: short language code; must be a key of ``languages``.

    Returns:
      ``output_file_path``.

    Raises:
      RuntimeError: if ``language`` is not a key of ``languages``.
    """
    try:
      model_language = self.languages[language]
    except KeyError:
      # One formatted message listing the codes this wrapper accepts (the
      # former two-argument RuntimeError rendered as a tuple).
      raise RuntimeError(
          f'language {language!r} is not supported, choose one of {sorted(self.languages)}'
      ) from None

    self.tts.tts_to_file(text, speaker_wav=self.speaker_audio_path, language=model_language, file_path=output_file_path)
    return output_file_path


In [None]:
from pydub import AudioSegment
import os
import shutil
from IPython.display import Audio, display
from moviepy.editor import VideoFileClip, AudioFileClip

class audiopip:
  """Mix synthesized speech clips into the original audio and attach it to the video.

  Typical order of calls: ``make_lower_volume`` (optional) ->
  ``insert_audio_at_time`` -> ``merge`` -> ``make_result_video``.

  Relies on module-level imports: ``AudioSegment``, ``os``, ``shutil``,
  ``VideoFileClip`` and ``AudioFileClip``.

  Attributes:
    original_audio_path: WAV file with the original audio track.
    original_video_path: the source video file.
    clone_dir: directory holding one synthesized ``.wav`` clip per sentence.
    sentences: items indexed as ``sent[2]`` (start) and ``sent[3]`` (end);
      both are multiplied by 1000 to obtain pydub millisecond positions.
    inserted_audio: track assembled by ``insert_audio_at_time``.
    background_audio: processed original track stored by
      ``make_lower_volume`` / ``delete_voise_part``; ``merge`` uses it when set.
    merged_audio: result of ``merge``.
    result_video_path: output file of ``make_result_video``.
    delet_bufer: when true, ``insert_audio_at_time`` removes ``clone_dir``.
  """

  def __init__(self, speaker_audio_path, original_video_path, clone_dir, sentences):
    self.original_audio_path = speaker_audio_path
    self.original_video_path = original_video_path
    self.clone_dir = clone_dir
    self.sentences = sentences
    self.inserted_audio = None
    self.merged_audio = None
    self.background_audio = None
    self.result_video_path = 'result_video.mp4'
    self.delet_bufer = True

  @staticmethod
  def _natural_key(file_name):
    """Sort key that orders embedded numbers by value ("_2" before "_10")."""
    import re
    return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', file_name)]

  def _sentence_spans_ms(self):
    """Return the (start, end) position of every sentence in milliseconds."""
    return [(sent[2] * 1000, sent[3] * 1000) for sent in self.sentences]

  def make_lower_volume(self, reduction_db=23):
    """Lower the volume of the original audio during every sentence.

    The result is also kept in ``background_audio`` so that ``merge`` mixes
    the synthesized speech over the quietened track rather than over the
    untouched original.

    Args:
      reduction_db: amount subtracted from the gain of each sentence span.

    Returns:
      The processed audio segment.
    """
    audio = AudioSegment.from_file(self.original_audio_path, format="wav")

    for start_time, end_time in self._sentence_spans_ms():
      lowered = audio[start_time:end_time] - reduction_db
      audio = audio[:start_time] + lowered + audio[end_time:]

    self.background_audio = audio
    return audio

  def delete_voise_part(self):
    """Replace every sentence span of the original audio with silence.

    The result is also kept in ``background_audio`` for ``merge``.

    Returns:
      The processed audio segment.
    """
    audio = AudioSegment.from_file(self.original_audio_path, format="wav")

    for start_time, end_time in self._sentence_spans_ms():
      silence_part = AudioSegment.silent(duration=end_time - start_time)
      audio = audio[:start_time] + silence_part + audio[end_time:]

    self.background_audio = audio
    return audio

  # TODO: rework — a clip longer than the gap before the next sentence pushes
  # all following clips later than their intended start time.
  def insert_audio_at_time(self):
    """Lay the synthesized clips on one track, each at its sentence start time.

    Clips are read from ``clone_dir`` in natural (numeric-aware) file name
    order and paired with ``sentences`` in that order. Silence is inserted
    when a clip would otherwise start before its sentence start time. When
    ``delet_bufer`` is true, ``clone_dir`` is removed afterwards.

    Returns:
      The assembled track (also stored in ``inserted_audio``).

    Raises:
      ValueError: if ``clone_dir`` holds more clips than there are sentences.
    """
    buffer_dir_path = self.clone_dir

    # Plain sorted() would put "..._10.wav" before "..._2.wav" and shift
    # every clip against its start time once there are more than 10 clips.
    audio_files = sorted(
        (name for name in os.listdir(buffer_dir_path) if name.endswith(".wav")),
        key=self._natural_key,
    )
    if len(audio_files) > len(self.sentences):
      raise ValueError(
          f'{len(audio_files)} clips in {buffer_dir_path!r} but only {len(self.sentences)} sentences'
      )

    track = AudioSegment.silent(duration=0)
    for sent, audio_file in zip(self.sentences, audio_files):
      start_time = round(sent[2], 3) * 1000
      audio = AudioSegment.from_file(os.path.join(buffer_dir_path, audio_file), format="wav")

      time_difference = start_time - len(track)
      if time_difference > 0:
        track += AudioSegment.silent(duration=time_difference)
      track += audio

    if self.delet_bufer:
      shutil.rmtree(buffer_dir_path)

    self.inserted_audio = track
    return track

  def merge(self, clone=None):
    """Overlay the synthesized track on the original audio.

    Args:
      clone: optional path of a WAV file to overlay; when omitted, the track
        built by ``insert_audio_at_time`` is used.

    Returns:
      The mixed audio segment (also stored in ``merged_audio``).

    Raises:
      RuntimeError: if ``clone`` is omitted and ``insert_audio_at_time`` has
        not produced a track yet.
    """
    if clone is None:
      if self.inserted_audio is None:
        raise RuntimeError('No track to insert, try to use "insert_audio_at_time" first')
      overlay_track = self.inserted_audio
    else:
      overlay_track = AudioSegment.from_file(clone, format="wav")

    # Prefer the track prepared by make_lower_volume / delete_voise_part.
    if self.background_audio is not None:
      base_track = self.background_audio
    else:
      base_track = AudioSegment.from_file(self.original_audio_path, format="wav")

    output = base_track.overlay(overlay_track)
    self.merged_audio = output
    return output

  def make_result_video(self):
    """Write the source video with the merged audio to ``result_video_path``.

    The merged audio goes through a temporary ``buffer.wav`` file, which is
    always removed. After a successful write the file at
    ``original_audio_path`` is deleted as well.

    Returns:
      ``result_video_path``.

    Raises:
      RuntimeError: if ``merge`` has not been called yet.
    """
    # Check before opening any media so nothing is left open on failure.
    if self.merged_audio is None:
      raise RuntimeError('No merged audio to attach, try to use "merge" first')

    buffer_path = "buffer.wav"
    # export() returns the open output file object; close it to flush and
    # release the handle before the file is read back.
    self.merged_audio.export(buffer_path, format="wav").close()
    try:
      # Context managers close the underlying readers even if encoding fails.
      with VideoFileClip(self.original_video_path) as video, AudioFileClip(buffer_path) as audio:
        result = video.set_audio(audio)
        result.write_videofile(self.result_video_path, codec="libx264", audio_codec="aac")
    finally:
      os.remove(buffer_path)

    os.remove(self.original_audio_path)
    return self.result_video_path

### Работа с библиотеками

In [3]:
# Colab-specific module used here for the file upload call below.
from google.colab import files
# Language code later passed to `tts_model.cloning_voice` (prompt text:
# "Enter the language to translate into"). The text translation target used
# in `stt_model.make_sentence` is set separately and does not read this value.
lang = input('Введите язык на который нужно превести')
# Ask the user to upload files; the first key of the returned mapping is
# taken as the path of the video to process.
uploaded = files.upload()
video_path = list(uploaded.keys())[0]
# Directory that will hold the per-sentence synthesized clips.
buffer_dir = 'buffer'

KeyboardInterrupt: ignored

In [None]:
%%capture
# Speech-to-text stage; each call stores its result on the instance, so the
# order matters.
stt_preprocessor = stt_model(video_path)
# Writes `<video_path>.wav` and sets `audio_path`.
stt_preprocessor.get_audio()
# Transcribes the WAV file and sets `word_timestamps` / `time_stride`.
stt_preprocessor.get_word_timestamps()
# Groups words into sentences, translates them and sets `sentences`.
stt_preprocessor.make_sentence()


In [None]:
# Hand the STT results to the following stages: the extracted WAV is used both
# as the voice-cloning reference and as the original audio track for mixing.
speaker_audio_path = stt_preprocessor.audio_path
# Items are [source_text, translated_text, start, end, duration].
sentences = stt_preprocessor.sentences


In [None]:
%%capture
# Text-to-speech stage: synthesize one clip per translated sentence.
tts_preprocessor = tts_model(speaker_audio_path)

# Fails if the directory already exists, which guarantees it starts empty.
os.mkdir(buffer_dir)


for i, sentenc in enumerate(sentences):
  # Zero-padded index: the clips are later read back in sorted file name
  # order, and without padding "..._10.wav" would sort before "..._2.wav".
  output_file_path = f'{buffer_dir}/buffer_audio_file_{i:05d}.wav'
  # sentenc[1] is the translated text of the sentence.
  tts_preprocessor.cloning_voice(output_file_path, sentenc[1], lang)

In [None]:

# Final stage: mix the synthesized clips with the original audio and write
# the resulting video.
audio_preprocessor = audiopip(speaker_audio_path, video_path, buffer_dir, sentences)

# NOTE(review): the audio segment returned by this call is not captured here.
audio_preprocessor.make_lower_volume()
# Builds the synthesized track from the clips in `buffer_dir`.
audio_preprocessor.insert_audio_at_time()
# Overlays the synthesized track and stores the mix on the instance.
audio_preprocessor.merge()
# Writes the video with the mixed audio and returns its path.
res_video_path = audio_preprocessor.make_result_video()



Moviepy - Building video result_video.mp4.
MoviePy - Writing audio in result_videoTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video result_video.mp4





Moviepy - Done !
Moviepy - video ready result_video.mp4
