In [2]:
from faster_whisper import WhisperModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import MarianMTModel, MarianTokenizer

In [4]:
# Identifier of the pretrained checkpoint used for translation; the same
# identifier is used to load both the tokenizer and the model below.
model_name = "Helsinki-NLP/opus-mt-ja-en"

# Both objects are read as module-level globals by translate_text.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model_translator = MarianMTModel.from_pretrained(model_name)

def translate_text(input_text, do_sample=True):
    """Translate a piece of text with the module-level Marian model.

    Args:
        input_text: Text to translate. ``None``, an empty string, or a
            whitespace-only string is returned as ``""`` without running
            the model.
        do_sample: When True (the default, matching the original
            behaviour) decoding samples with ``top_k=25`` / ``top_p=0.95``
            so repeated calls may return different text. Pass False for
            beam search without sampling.

    Returns:
        The decoded translation of the first generated sequence, with
        special tokens removed.
    """
    # The transcription step can yield segments whose text is empty; feeding
    # those to a sampling decoder only produces made-up output, so bail out.
    if input_text is None or not input_text.strip():
        return ""

    # Calling the tokenizer (instead of .encode) also returns the attention
    # mask, which is forwarded to generate() together with the input ids.
    encoded = tokenizer(
        input_text.strip(),
        return_tensors='pt',
        padding=True,
        truncation=True,
    )

    generation_options = {
        'max_length': 200,
        'num_beams': 5,
        'length_penalty': 1.0,
        'no_repeat_ngram_size': 2,
        'early_stopping': False,
        'do_sample': do_sample,
    }
    if do_sample:
        # top_k / top_p are only passed when sampling is requested.
        generation_options['top_k'] = 25
        generation_options['top_p'] = 0.95

    translation_ids = model_translator.generate(**encoded, **generation_options)

    # Only the first returned sequence is decoded, as in the original code.
    return tokenizer.decode(translation_ids[0], skip_special_tokens=True)


In [5]:
# Name of the Whisper checkpoint passed to WhisperModel.
model_size = "small"

# Run on GPU with FP16
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

# transcribe() hands back the segments plus an info object that carries the
# detected language and its probability.
segments, info = model.transcribe("song4.mp3", beam_size=5)

print(f"Detected language '{info.language}' with probability {info.language_probability:f}")

# One line per segment: start/end times (two decimals) followed by its text.
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")


Detected language 'ja' with probability 0.987793
[0.00s -> 8.00s] ねえあたし知ってるよ君だ一人知ってるの知ってるよ
[8.00s -> 14.00s] ビクンビクンツーでさ声もただ漏れなどは正直にチャイヨバレてるんだし
[14.00s -> 18.00s] チャイヨ起点の普通普通恥ずかしみんな隠しているだけ
[18.00s -> 29.00s] ねえあたし知ってるよ君だ一人並みで知ってるの知ってるよ
[29.00s -> 34.00s] グスグスヘコンでさ弱ね人からインデナイン合わせてくるまで一緒行数もっとない
[34.00s -> 37.00s] たってなんとだってカット見てあげる
[37.00s -> 40.00s] もう構うしないでいっぱい出してね
[40.00s -> 43.00s] ねえお願い君が欲しいの
[43.00s -> 45.00s] 無くさめさせて Shake Shake
[45.00s -> 48.00s] 愛の才能で 泣いてくれなきゃ
[48.00s -> 50.00s] 腫れてしまう 濡れてんたい
[50.00s -> 53.00s] ねぇいいでしょ 舐めとって 飲み干したい
[53.00s -> 54.00s] ガッガッガッガッガッ
[54.00s -> 57.00s] うーわ お願い君が欲しいの
[57.00s -> 61.00s] 頼り散らして シックラップなんて最高ね
[61.00s -> 64.00s] 負けてくれなきゃ 君が痛い 感じてたい
[64.00s -> 67.00s] ねぇいいでしょ 吸い取って 救いたい
[67.00s -> 69.00s] ガッガッガッガッガッ
[70.00s -> 72.00s] ガッガッガッガッガッガッ
[72.00s -> 76.00s] ガッガッガッガッガッガッガッガッガッが
[76.00s -> 78.00s] 君が愛に 今 彼愛に 今 彼に
[78.00s -> 83.50s] ガッガッガッガッガッガッガッ benefit
[83.50s -> 86.20s] ねぇ 私知ってるよ 君はひとり悔しかった
[86.20s -> 89.20s] 不意の知ってるよ 突き うつきに伝わくるちゃう
[89.20s -> 90.92s]

In [8]:
# Transcribe the second clip with the Whisper model loaded in an earlier cell.
segments, info = model.transcribe("clip2.mp3", beam_size=5)

print(f"Detected language '{info.language}' with probability {info.language_probability:f}")

# For every segment, print the timestamped transcript line and then its
# translation on the following line.
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
    translated_text = translate_text(segment.text)
    print(translated_text)


Detected language 'ja' with probability 0.984375
[0.92s -> 2.76s] おはよう センパー
Good morning, Senper.
[3.36s -> 5.22s] おはよう
Good morning.
[5.94s -> 8.12s] わぁ!可愛い今日は
It's so cute, isn't it?
[9.34s -> 10.08s] 今日は
Hello.
[10.08s -> 11.56s] センパー
Smper
[12.92s -> 16.22s] おー!ベンキュー ベンシュ ベンシュ
Oh, Benké, bench.
[19.94s -> 21.94s]  horse house
horsse house
[25.76s -> 26.94s]  forwards house
forwards house
[26.94s -> 28.94s] そう、つくる
Yes, I do.
[28.94s -> 30.94s] あーつくるー
Oh, I'm going to make it.
[30.94s -> 32.94s] あっちは専長ハウス
That's the chief house.
[32.94s -> 34.94s] マリンハウス
Marine House.
[34.94s -> 36.94s] はい
Yes, sir.
[36.94s -> 38.94s] 専長ハウス
The Long House.
[38.94s -> 40.94s] 専長ハウス
- A long house. - Mm-hmm.
[40.94s -> 42.94s] 音楽
Music
[42.94s -> 44.94s] プレイ
Play
[44.94s -> 46.94s] かっこいい
That's cool.
[46.94s -> 48.94s] いい曲
It's good.
[48.94s -> 50.94s] ナイスミュージック
Nice music.
[50.94s -> 52.94s] おしゃれ
Snacks.
[52.94s -> 54.94s] ハウスつくる
I'm going to build a house.
[54.94s -> 56.94s] ビルディン・ジョイズ
Bildin Joys.
[