# Downloads

In [None]:
# Fetch the input recording by its Google Drive file ID with gdown
# (saved as task_record.wav, per the trailing shell comment).
!gdown 1Qqi4O6BfSAjx_t1bQ-OmiSo4ZRNa6Qq9 # task_record.wav

In [None]:
!wget https://github.com/tsurumeso/vocal-remover/releases/download/v5.0.4/vocal-remover-v5.0.4.zip
!unzip vocal-remover-v5.0.4.zip

In [None]:
cd vocal-remover/

In [None]:
# Install the dependencies listed in requirements.txt of the current directory,
# then upgrade librosa and add resampy on top of them.
# NOTE(review): presumably the versions from requirements.txt alone do not work
# on this runtime -- confirm why the two extra installs are needed.
!pip install -r requirements.txt
!pip install --upgrade librosa
!pip install resampy

In [None]:
cd ../

In [None]:
!mkdir outputs

In [None]:
!pip install transformers
!pip install torchaudio
!pip install razdel
!pip install transformers[sentencepiece]
!pip install sacremoses

In [None]:
# Quiet (-q) install of torchaudio, which the import cell of this notebook uses.
# NOTE(review): the preceding install cell also installs torchaudio, so this
# cell looks redundant -- confirm before removing it.
!pip install -q torchaudio

In [None]:
# Provides the `ffmpeg-normalize` command that the preprocessing section runs.
!pip3 install ffmpeg-normalize

# Libraries

In [None]:
import torch
import torchaudio
from transformers import pipeline
import pickle
from razdel import sentenize
from IPython.display import Audio

# Use the first CUDA GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Main pipeline

## Music Voice Separator

In [None]:
cd vocal-remover/

In [None]:
# Run vocal-remover's inference script on the downloaded recording.
# The following cell expects the results as task_record_Instruments.wav and
# task_record_Vocals.wav in the current directory.
!python inference.py --input /content/task_record.wav

In [None]:
path2res = "/content/outputs"
!mv task_record_Instruments.wav $path2res
!mv task_record_Vocals.wav $path2res

In [None]:
cd ../

## Preprocessing

Remove silence (audio below -50 dB) from the separated vocal track with ffmpeg's `silenceremove` filter

In [None]:
!ffmpeg -i /content/outputs/task_record_Vocals.wav -af silenceremove=1:0:-50dB /content/outputs/vocal_silence.wav

Normalize the loudness with ffmpeg-normalize: https://github.com/slhck/ffmpeg-normalize

In [None]:
!ffmpeg-normalize /content/outputs/vocal_silence.wav -o /content/outputs/vocal_silence_normalized.wav

Voice activity detection with Silero VAD: keep only the segments that contain speech

In [None]:
# Sample rate (Hz) passed to the Silero helpers when reading, analysing and
# saving audio.
SAMPLING_RATE = 16000
# Limit torch to one intra-op CPU thread.
# NOTE(review): this is a process-wide torch setting, so it also applies to any
# later CPU inference in this notebook -- confirm that is intended.
torch.set_num_threads(1)

USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    # onnxruntime is only installed when the ONNX variant is requested.
    !pip install -q onnxruntime

# Load the Silero VAD model and its helper functions through torch.hub.
# force_reload=True makes torch.hub fetch the repository again instead of
# reusing a cached copy.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

# `utils` is unpacked positionally, so the order of these five names matters.
# Of the cells visible here, only get_speech_timestamps, save_audio, read_audio
# and collect_chunks are used afterwards.
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

In [None]:
# Read the normalized vocal track at the VAD sample rate.
path2vocal = "/content/outputs/vocal_silence_normalized.wav"
wav = read_audio(path2vocal, sampling_rate=SAMPLING_RATE)

# Locate the speech regions across the whole recording.
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)

# Glue the detected regions together and write them out as one speech-only file.
speech_only = collect_chunks(speech_timestamps, wav)
save_audio('/content/outputs/vocal_silence_normalized_vad.wav',
           speech_only, sampling_rate=SAMPLING_RATE)

# Last expression of the cell: renders an inline player for the result.
Audio('/content/outputs/vocal_silence_normalized_vad.wav')

## ASR

In [None]:
# Speech-only track produced by the VAD step.
path2data = "/content/outputs/vocal_silence_normalized_vad.wav"
# Sample rate (Hz) the waveform is converted to before it is given to the ASR model.
default_sample_rate = 16000

signal, sample_rate = torchaudio.load(path2data)
# Average over the channel axis so a single channel remains
# (a mono file passes through unchanged).
signal = signal.mean(dim=0, keepdim=True)
# Convert from the file's own sample rate to default_sample_rate.
resampler = torchaudio.transforms.Resample(sample_rate, default_sample_rate)
signal = resampler(signal)
# Drop the singleton channel axis and hand over a NumPy array.
signal = signal.squeeze().numpy()

In [None]:
# Speech recognition pipeline backed by Whisper (medium), placed on `device`.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium",
    device=device,
)

# Transcribe the waveform as Russian in 30-second windows, 8 windows per batch,
# and request timestamps so the result carries a per-chunk list.
asr_output = pipe(
    signal,
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe", "language":"ru"},
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
)
# Keep only the chunk list; each chunk's 'text' entry is used downstream.
transcription_whisper_medium = asr_output["chunks"]

In [None]:
# Full transcript: the text of every ASR chunk, concatenated in order with
# nothing inserted between chunks.
text = ''.join(chunk['text'] for chunk in transcription_whisper_medium)

In [None]:
# Split the transcript into sentences with razdel and keep only each
# sentence's text.
sentences = [span.text for span in sentenize(text)]

In [None]:
# Persist the ASR results as two pickle files in the outputs directory.
path = "/content/outputs"

# Sentence-level transcript (list of strings).
with open(f'{path}/asr_result_sentences.pkl', 'wb') as sentences_file:
    pickle.dump(sentences, sentences_file)

# Full transcript as a single string.
with open(f'{path}/asr_result_text.pkl', 'wb') as text_file:
    pickle.dump(text, text_file)

In [None]:
# Display the full transcript for a quick manual check
# (bare last expression, so the notebook renders it).
text

## Translation

In [None]:
# Russian -> English translation model.
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"
# Place the translator on the same device as the ASR pipeline; without an
# explicit `device` the choice is left to the library's default, which may be
# the CPU even when a GPU is available.
translator = pipeline("translation", model=model_checkpoint, device=device)

In [None]:
# Translate one sentence per call; the pipeline returns a list of result dicts
# and the first one's 'translation_text' holds the translation.
translated_sentences = []
for sentence in sentences:
    translation = translator(sentence)
    translated_sentences.append(translation[0]['translation_text'])

In [None]:
# Full English text. The sentences are translated one by one and carry no
# trailing whitespace of their own, so they are joined with a single space;
# plain concatenation would run them together ("...first.Second...").
# An empty sentence list yields an empty string.
whole_english_text = ' '.join(translated_sentences)

In [None]:
# Persist the translation results as two pickle files in the outputs directory.
path = "/content/outputs"

# Sentence-level translations (list of strings).
with open(f'{path}/translation_result_sentences.pkl', 'wb') as sentences_file:
    pickle.dump(translated_sentences, sentences_file)

# Whole English text as a single string.
with open(f'{path}/translation_result_text.pkl', 'wb') as text_file:
    pickle.dump(whole_english_text, text_file)