# 系統配置確認(建議使用GPU執行)

In [1]:
# Shell escape: print the GPU, driver and CUDA versions visible to this environment.
!nvidia-smi

Fri Dec 29 19:04:55 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0 Off |                  Off |
|  0%   42C    P3              49W / 450W |      3MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Shell escape: print the working directory; later cells use paths relative to it
# (e.g. ./seamless_m4t_hf/assets/sample_input.wav).
!pwd

/home/llm/Jupyter_Lab/Samuel/NCU/資訊工程概論


# Import

In [3]:
import os
from typing import Union
import gradio as gr
import numpy as np
import torch
import torchaudio
from seamless_communication.inference import Translator
from seamless_m4t_hf.lang_list import (
  LANGUAGE_NAME_TO_CODE,
  S2ST_TARGET_LANGUAGE_NAMES,
  S2TT_TARGET_LANGUAGE_NAMES,
  T2TT_TARGET_LANGUAGE_NAMES,
  TEXT_SOURCE_LANGUAGE_NAMES,
)
from IPython.display import Audio
import librosa
from pydub import AudioSegment
import wave
import soundfile as sf

# SeamlessM4T Setup
**Translator choices:** "seamlessM4T_medium", "seamlessM4T_large"


In [4]:
# Opt-in flag read from the environment; true only when CACHE_EXAMPLES is exactly "1".
CACHE_EXAMPLES = os.environ.get("CACHE_EXAMPLES") == "1"

# Human-readable task labels. The leading token of each label (e.g. "S2ST") is the
# task code that predict() extracts with task_name.split()[0].
TASK_NAMES = [
    f"{code} ({description})"
    for code, description in (
        ("S2ST", "Speech to Speech translation"),
        ("S2TT", "Speech to Text translation"),
        ("T2ST", "Text to Speech translation"),
        ("T2TT", "Text to Text translation"),
        ("ASR", "Automatic Speech Recognition"),
    )
]

# Input audio is resampled to this rate (Hz) and truncated to the maximum length
# before it is handed to the translator.
AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Shared translator instance used by predict().
translator = Translator(
    model_name_or_card="seamlessM4T_large",
    vocoder_name_or_card="vocoder_36langs",
    device=device,
    dtype=None,
)

Using the cached checkpoint of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_36langs. Set `force` to `True` to download again.


In [5]:
def predict(
    task_name: str,
    audio_source: str,
    input_audio_mic: Union[str, None],
    input_audio_file: Union[str, None],
    input_text: Union[str, None],
    source_language: Union[str, None],
    target_language: str,
) -> tuple[Union[tuple[int, np.ndarray], None], str]:
    """Run one SeamlessM4T task through the module-level ``translator``.

    Args:
        task_name: Task label; only its first whitespace-separated token is used
            (e.g. "S2TT (Speech to Text translation)" -> "S2TT").
        audio_source: "microphone" selects ``input_audio_mic``; any other value
            selects ``input_audio_file``. Only consulted for S2ST, S2TT and ASR.
        input_audio_mic: Path of the recorded audio file, or None.
        input_audio_file: Path of the uploaded audio file, or None.
        input_text: Text input, used for every task other than S2ST, S2TT and ASR.
        source_language: Language name looked up in ``LANGUAGE_NAME_TO_CODE``;
            a name that is missing (or None) results in ``src_lang=None``.
        target_language: Language name; must be a key of ``LANGUAGE_NAME_TO_CODE``.

    Returns:
        A pair ``(audio, text)``. ``audio`` is ``(sample_rate, waveform)`` for
        S2ST and T2ST and ``None`` otherwise. ``text`` is the text output of
        ``translator.predict`` passed through unchanged.

    Raises:
        KeyError: If ``target_language`` is not in ``LANGUAGE_NAME_TO_CODE``.
        ValueError: If a speech-input task is requested without an audio path.
    """
    import tempfile  # local import: only needed for the scratch audio file

    task_name = task_name.split()[0]
    source_language_code = LANGUAGE_NAME_TO_CODE.get(source_language, None)
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]

    # Path of the scratch file holding the resampled audio (speech-input tasks only).
    resampled_path = None
    try:
        if task_name in ["S2ST", "S2TT", "ASR"]:
            audio_path = input_audio_mic if audio_source == "microphone" else input_audio_file
            if audio_path is None:
                raise ValueError(f"Task {task_name} requires an input audio file, but none was given.")

            arr, org_sr = torchaudio.load(audio_path)
            new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
            max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
            if new_arr.shape[1] > max_length:
                new_arr = new_arr[:, :max_length]
                gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")

            # Write the resampled/truncated clip to a scratch file instead of
            # overwriting the caller's file, which would silently destroy the
            # original recording (e.g. the bundled sample asset).
            fd, resampled_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            torchaudio.save(resampled_path, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
            input_data = resampled_path
        else:
            input_data = input_text

        text_out, batchedspeechoutput = translator.predict(
            input=input_data,
            task_str=task_name,
            tgt_lang=target_language_code,
            src_lang=source_language_code,
        )
    finally:
        # Always remove the scratch file, including when loading or inference fails.
        if resampled_path is not None and os.path.exists(resampled_path):
            os.remove(resampled_path)

    if task_name in ["S2ST", "T2ST"]:
        waveform = batchedspeechoutput.audio_wavs[0][0][0].cpu().detach().numpy()
        return (batchedspeechoutput.sample_rate, waveform), text_out
    return None, text_out

In [6]:
def process_s2tt(
    input_audio_file: str,
    source_language: str,
    target_language: str,
) -> tuple[Union[tuple[int, np.ndarray], None], str]:
    """Translate the speech in an audio file to text (S2TT) via ``predict``.

    Args:
        input_audio_file: Path of the audio file to translate.
        source_language: Source language name (e.g. 'English').
        target_language: Target language name (e.g. 'Mandarin Chinese').

    Returns:
        The ``(audio, text)`` pair returned by ``predict``. For S2TT the audio
        element is always ``None``; the text element is the translator's text
        output (callers in this notebook read the translation as ``result[1][0]``).
    """
    return predict(
        task_name="S2TT",
        audio_source="file",
        input_audio_mic=None,
        input_audio_file=input_audio_file,
        input_text=None,
        source_language=source_language,
        target_language=target_language,
    )

In [7]:
# Smoke test: translate the bundled English sample into Mandarin Chinese text.
test_text = process_s2tt(
    input_audio_file='./seamless_m4t_hf/assets/sample_input.wav',
    source_language='English',
    target_language='Mandarin Chinese',
)
# process_s2tt returns (audio, text); the translation is the first item of the text element.
print('Translated Text:', test_text[1][0], sep='\n')
# S2TT produces no audio, so the clip played below is the original input sample.
print('Input Audio:')
Audio('./seamless_m4t_hf/assets/sample_input.wav')

Translated Text:
我最喜欢的动物是大象 ⁇ 
Tranlated Audio:


# VITS-fast-fine-tuning Interface Setup

In [8]:
# Switch into the VITS-fast-fine-tuning directory; the cells below use paths relative
# to it (./fine_tune_models/..., ./raw_audio/...) and presumably its local modules.
# NOTE(review): the path is relative, so re-running this cell in the same kernel
# session would look for a nested directory — run it once per session.
%cd VITS-fast-fine-tuning

/home/llm/Jupyter_Lab/Samuel/NCU/資訊工程概論/VITS-fast-fine-tuning


In [9]:
import os
import numpy as np
import torch
from torch import no_grad, LongTensor
import argparse
import commons
from mel_processing import spectrogram_torch
import utils
from models import SynthesizerTrn
import gradio as gr
import librosa
import webbrowser

from text import text_to_sequence, _clean_text

In [10]:
# Marker string associated with each language label.
language_marks = dict(
    [
        ("Japanese", ""),
        ("日本語", "[JA]"),
        ("简体中文", "[ZH]"),
        ("English", "[EN]"),
        ("Mix", ""),
    ]
)

# Languages offered in the UI dropdown. Only Simplified Chinese is enabled here;
# the full set of labels would be '日本語', '简体中文', 'English' and 'Mix'.
lang = ["简体中文"]

In [11]:
def save_as_wav(filepath, sr, audio_data):
    """Write ``audio_data`` to ``filepath`` as a mono, 16-bit WAV file.

    Args:
        filepath: Destination path of the WAV file.
        sr: Frame rate (samples per second) stored in the WAV header.
        audio_data: Samples to store; they are converted with ``np.int16``
            as-is (no scaling is applied).
    """
    with wave.open(filepath, "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname):
        # one channel, two bytes per sample, uncompressed.
        writer.setparams((1, 2, sr, 0, "NONE", "not compressed"))
        pcm_bytes = np.int16(audio_data).tobytes()
        writer.writeframes(pcm_bytes)
        print(f'Saved at {filepath}')

In [12]:
def get_text(text, hps, is_symbol):
    """Turn ``text`` into a ``LongTensor`` of the sequence built by ``text_to_sequence``.

    Args:
        text: Input string.
        hps: Hyper-parameter object; ``hps.symbols``, ``hps.data.text_cleaners``
            and ``hps.data.add_blank`` are read.
        is_symbol: When truthy, no text cleaners are applied.

    Returns:
        A ``LongTensor`` holding the sequence, with 0 interspersed between
        items when ``hps.data.add_blank`` is set.
    """
    cleaners = hps.data.text_cleaners if not is_symbol else []
    sequence = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence)

In [13]:
def create_tts_fn(model, hps, speaker_ids):
    """Build the text-to-speech callback used by the Gradio "Generate!" button.

    Args:
        model: Synthesizer whose ``infer`` method produces the audio.
        hps: Hyper-parameters; passed to ``get_text`` and read for
            ``hps.data.sampling_rate``.
        speaker_ids: Mapping from speaker name to speaker id.

    Returns:
        A function ``tts_fn(text, speaker, language, speed)`` that returns
        ``("Success", (sampling_rate, audio))``.
    """
    def tts_fn(text, speaker, language, speed):
        # NOTE(review): `language` is accepted (the UI passes the dropdown value)
        # but is not used anywhere in this function.
        speaker_id = speaker_ids[speaker]
        tokens = get_text(text, hps, False)
        with no_grad():
            token_batch = tokens.unsqueeze(0).to(device)
            token_lengths = LongTensor([tokens.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            # A larger `speed` gives a smaller length_scale.
            inferred = model.infer(
                token_batch,
                token_lengths,
                sid=sid,
                noise_scale=.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )
            audio = inferred[0][0, 0].data.cpu().float().numpy()
        # Drop the tensor references before returning.
        del tokens, token_batch, token_lengths, sid, inferred
        return "Success", (hps.data.sampling_rate, audio)

    return tts_fn

In [14]:
def create_vc_fn(model, hps, speaker_ids):
    """Build the callback used by the Gradio "Convert!" button.

    The callback saves the recorded audio to ``./raw_audio/output.wav``,
    translates it from English speech to Mandarin Chinese text with
    ``process_s2tt``, and synthesizes that text with ``model`` using the
    selected speaker.

    Args:
        model: Synthesizer whose ``infer`` method produces the audio.
        hps: Hyper-parameters; passed to ``get_text`` and read for
            ``hps.data.sampling_rate``.
        speaker_ids: Mapping from speaker name to speaker id.

    Returns:
        A function ``vc_fn(original_speaker, record_audio)`` where
        ``record_audio`` is a ``(sampling_rate, samples)`` pair or ``None``.
        It returns ``("Success", (sampling_rate, audio))``, or a message and
        ``None`` when no audio was recorded.
    """
    def vc_fn(original_speaker, record_audio):
        # Nothing was recorded: report it instead of failing on the tuple
        # unpack below.
        if record_audio is None:
            return "You need to record an audio", None
        sampling_rate, audio = record_audio

        # Make sure the scratch directory exists before writing the recording.
        raw_audio_path = "./raw_audio/output.wav"
        os.makedirs(os.path.dirname(raw_audio_path), exist_ok=True)
        sf.write(raw_audio_path, audio, sampling_rate)

        # process_s2tt returns (audio, text); the translation is the first
        # item of the text element.
        _, translated = process_s2tt(
            input_audio_file=raw_audio_path,
            source_language='English',
            target_language='Mandarin Chinese',
        )
        output_text = str(translated[0])

        speaker_id = speaker_ids[original_speaker]
        stn_tst = get_text(output_text, hps, False)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                                length_scale=1.0)[0][0, 0].data.cpu().float().numpy()
        del stn_tst, x_tst, x_tst_lengths, sid
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn

In [15]:
# Hyper-parameters of the fine-tuned model, including its speaker table.
hps = utils.get_hparams_from_file("./fine_tune_models/finetune_speaker.json")

# Positional sizes for the synthesizer, derived from the configuration.
symbol_count = len(hps.symbols)
filter_half_plus_one = hps.data.filter_length // 2 + 1
segment_hops = hps.train.segment_size // hps.data.hop_length

# Build the synthesizer, move it to the shared device and switch it to eval mode.
net_g = SynthesizerTrn(
    symbol_count,
    filter_half_plus_one,
    segment_hops,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
_ = net_g.eval()

# Load the fine-tuned generator weights; no optimizer is passed.
_ = utils.load_checkpoint("./fine_tune_models/G_latest.pth", net_g, None)

# Speaker name -> id table and the list of names shown in the UI dropdowns.
speaker_ids = hps.speakers
speakers = list(speaker_ids.keys())

# Callbacks wired to the Gradio buttons.
tts_fn = create_tts_fn(model=net_g, hps=hps, speaker_ids=speaker_ids)
vc_fn = create_vc_fn(model=net_g, hps=hps, speaker_ids=speaker_ids)

2023-12-29 19:05:03,736 INFO -- root: Loaded checkpoint './fine_tune_models/G_latest.pth' (iteration 61)


# Interface

In [16]:
with gr.Blocks() as app:
    # Tab 1: synthesize typed text with the selected speaker.
    with gr.Tab("Text-to-Speech"):
        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                textbox = gr.TextArea(
                    label="Text",
                    placeholder="Type your sentence here",
                    value="你好 這是測試。",
                    elem_id="tts-input",
                )
                # Speaker selection; defaults to the first configured speaker.
                char_dropdown = gr.Dropdown(
                    choices=speakers,
                    value=speakers[0],
                    label='character',
                )
                language_dropdown = gr.Dropdown(
                    choices=lang,
                    value=lang[0],
                    label='language',
                )
                duration_slider = gr.Slider(
                    minimum=0.1,
                    maximum=5,
                    value=1,
                    step=0.1,
                    label='速度 Speed',
                )
            # Right column: outputs and the trigger button.
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                btn = gr.Button("Generate!")
                btn.click(
                    tts_fn,
                    inputs=[textbox, char_dropdown, language_dropdown, duration_slider],
                    outputs=[text_output, audio_output],
                )
    # Tab 2: record speech, translate it, and speak the result with the chosen voice.
    with gr.Tab("Voice Conversion"):
        gr.Markdown("""
                        錄製你的聲音，並挑選欲轉換的音色。User代表的音色是你自己。
        """)
        with gr.Column():
            record_audio = gr.Audio(label="record your voice", source="microphone")
            # NOTE(review): the default assumes a speaker named "User" exists in `speakers`.
            source_speaker = gr.Dropdown(choices=speakers, value="User", label="user")
        with gr.Column():
            message_box = gr.Textbox(label="Message")
            converted_audio = gr.Audio(label='converted audio')
        btn = gr.Button("Convert!")

        btn.click(
            vc_fn,
            inputs=[source_speaker, record_audio],
            outputs=[message_box, converted_audio],
        )



In [17]:
# Ask the default browser to open the local Gradio address.
# NOTE(review): this cell runs before the cell that calls app.launch(), and the
# port is hard-coded; the recorded run returned False — confirm the ordering is intended.
webbrowser.open("http://127.0.0.1:7860")

False

In [18]:
# Start the Gradio server. share=True also requests a temporary public share link
# (the "public URL" in the output), so anyone holding that link can reach the app.
app.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://d4b2574f980ddfe398.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


