In [1]:
!pip install git+https://github.com/openai/whisper.git transformers sentencepiece

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /private/var/folders/pz/qq52dgsj7dn690q5b3f2nb440000gn/T/pip-req-build-z57sd01n
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /private/var/folders/pz/qq52dgsj7dn690q5b3f2nb440000gn/T/pip-req-build-z57sd01n
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting numba (from openai-whisper==20231117)
  Using cached numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting numpy (from openai-whisper=

In [6]:
# from google.colab import drive
# drive.mount('/content/drive')

import whisper
import os
from datetime import timedelta

def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Works on integer milliseconds so the result always has a two-digit hour
    and exactly three fractional digits, whatever the magnitude of the input.
    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_srt_from_audio(audio_file_path, output_srt_file_path, model_name="medium"):
    """Transcribe an audio file with Whisper and write the result as an SRT file.

    Args:
        audio_file_path (str): Path of the audio file to transcribe.
        output_srt_file_path (str): Path of the SRT file to create (overwritten
            if it already exists).
        model_name (str): Name passed to ``whisper.load_model``. Defaults to
            ``"medium"``, the model this notebook has been using.
    """
    # Load the speech-recognition model (e.g. "large-v2" is a heavier alternative).
    model = whisper.load_model(model_name)

    # Transcribe; verbose=True echoes each recognised segment while running.
    result = model.transcribe(audio_file_path, verbose=True)

    # One numbered cue per segment: index, time range, text, blank separator line.
    with open(output_srt_file_path, 'w', encoding='UTF-8') as srt_file:
        for i, segment in enumerate(result["segments"], start=1):
            start_srt = _format_srt_timestamp(segment["start"])
            end_srt = _format_srt_timestamp(segment["end"])
            # Segment text comes back with surrounding whitespace; keep cues clean.
            text = segment["text"].strip()
            srt_file.write(f"{i}\n")
            srt_file.write(f"{start_srt} --> {end_srt}\n")
            srt_file.write(f"{text}\n\n")

    print(f"SRT 文件已保存至：{output_srt_file_path}")

# Audio inputs: the folder and the file names inside it to transcribe.
audio_folder_path = '/Users/zhoudexiao/Desktop/Project/testaudio/'
audio_files = [
    'segment-7-f2-v1-a.mp3'
]

# Write one .srt file next to each audio file.
for audio_file in audio_files:
    audio_file_path = os.path.join(audio_folder_path, audio_file)
    # splitext removes only the final extension, so a name with extra dots
    # (e.g. "talk.part1.mp3") keeps its full stem instead of being cut at the first dot.
    audio_name = os.path.splitext(audio_file)[0]
    output_srt_file_path = os.path.join(audio_folder_path, f'{audio_name}.srt')
    generate_srt_from_audio(audio_file_path, output_srt_file_path)


  checkpoint = torch.load(fp, map_location=device)


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: German
[00:00.000 --> 00:04.760]  Das hängt von ihren Noten ab und machen unterschiedliche Schulabschlüsse.
[00:04.760 --> 00:07.240]  Der höchste Schulabschluss ist das Abitur.
[00:07.240 --> 00:09.440]  Damit kann man an einer Universität schlafen.
SRT 文件已保存至：/Users/zhoudexiao/Desktop/Project/testaudio/segment-7-f2-v1-a.srt


In [4]:
# 直接模型翻译

import whisper
import os
from datetime import timedelta
from transformers import MarianMTModel, MarianTokenizer

def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Works on integer milliseconds so the result always has a two-digit hour
    and exactly three fractional digits, whatever the magnitude of the input.
    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_translated_srt(audio_file_path, output_srt_file_path, source_lang="de", target_lang="zh"):
    """Transcribe an audio file and write a bilingual SRT file (translation + original).

    Each cue holds the translated line followed by the original transcript line.
    The translation is done in a single hop with the
    ``Helsinki-NLP/opus-mt-{source_lang}-{target_lang}`` Marian model.

    Args:
        audio_file_path (str): Path of the audio file to transcribe.
        output_srt_file_path (str): Path of the SRT file to create (overwritten
            if it already exists).
        source_lang (str): Language code of the audio; passed to Whisper and
            used to pick the translation model.
        target_lang (str): Language code to translate into.
    """
    # Load the Whisper model ("large-v3" is a heavier alternative).
    whisper_model = whisper.load_model("medium")

    # Transcribe with the language fixed, so no auto-detection is needed.
    result = whisper_model.transcribe(audio_file_path, language=source_lang, verbose=True)

    # Load the translation model and its tokenizer.
    translation_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
    translation_model = MarianMTModel.from_pretrained(translation_model_name)

    with open(output_srt_file_path, 'w', encoding='UTF-8') as srt_file:
        for i, segment in enumerate(result["segments"], start=1):
            start_srt = _format_srt_timestamp(segment["start"])
            end_srt = _format_srt_timestamp(segment["end"])
            # Strip surrounding whitespace so neither the translator input nor
            # the written cue starts with a stray space.
            text = segment["text"].strip()

            # Translate this segment on its own.
            print(f"Translating text: {text}")  # debug output
            inputs = tokenizer.encode(text, return_tensors='pt', padding=True)
            translated_tokens = translation_model.generate(inputs, max_length=400, num_beams=4, early_stopping=True)
            translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

            print(f"Translated text: {translated_text}")  # debug output

            # TODO: try an intermediate (pivot) language to improve quality.

            # Cue layout: index, time range, translated line, original line, blank line.
            srt_file.write(f"{i}\n")
            srt_file.write(f"{start_srt} --> {end_srt}\n")
            srt_file.write(f"{translated_text}\n")
            srt_file.write(f"{text}\n\n")

    print(f"SRT 文件已保存至：{output_srt_file_path}")

# Folder holding the audio, and the file names inside it to process.
audio_folder_path = '/Users/zhoudexiao/Desktop/Project/testaudio/'
audio_files = [
    'segment-7-f2-v1-a.mp3',
    # add further audio file names here
]

# Produce one bilingual .srt per audio file, saved alongside the audio.
for audio_file in audio_files:
    # File name without its extension becomes the subtitle file's stem.
    audio_name = os.path.splitext(audio_file)[0]
    audio_file_path = os.path.join(audio_folder_path, audio_file)
    output_srt_file_path = os.path.join(audio_folder_path, f'{audio_name}.srt')
    generate_translated_srt(audio_file_path, output_srt_file_path)




[00:00.000 --> 00:04.760]  Das hängt von ihren Noten ab und machen unterschiedliche Schulabschlüsse.
[00:04.760 --> 00:07.240]  Der höchste Schulabschluss ist das Abitur.
[00:07.240 --> 00:09.440]  Damit kann man an einer Universität schlafen.




Translating text:  Das hängt von ihren Noten ab und machen unterschiedliche Schulabschlüsse.
Translated text: 那得看他们的成绩 学历不同
Translating text:  Der höchste Schulabschluss ist das Abitur.
Translated text: 高中毕业典礼是毕业典礼
Translating text:  Damit kann man an einer Universität schlafen.
Translated text: 你可以睡在大学里
SRT 文件已保存至：/Users/zhoudexiao/Desktop/Project/testaudio/segment-7-f2-v1-a.srt


In [1]:
import ssl
# WARNING (security): this swaps the ssl module's default HTTPS context factory
# for the unverified one, so HTTPS clients that rely on that default stop
# validating server certificates for the rest of the kernel session.
# NOTE(review): presumably added to get past certificate errors while
# downloading model weights — confirm, and prefer fixing the local CA bundle
# over shipping this in a shared notebook.
ssl._create_default_https_context = ssl._create_unverified_context

In [1]:
# 英语作为中间语言翻译

# from google.colab import drive
# drive.mount('/content/drive')

import whisper
import os
from datetime import timedelta
from transformers import MarianMTModel, MarianTokenizer

def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Works on integer milliseconds so the result always has a two-digit hour
    and exactly three fractional digits, whatever the magnitude of the input.
    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _translate_text(text, tokenizer, model):
    """Translate one string with a tokenizer/model pair and return the decoded text."""
    inputs = tokenizer.encode(text, return_tensors='pt', padding=True)
    output_tokens = model.generate(inputs, max_length=400, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)


def generate_translated_srt(audio_file_path, output_srt_file_path, source_lang="de", target_lang="zh"):
    """Transcribe an audio file and write a bilingual SRT file, translating via English.

    Each segment is translated in two hops: ``source_lang`` to English, then
    English to ``target_lang``, using the matching ``Helsinki-NLP/opus-mt-*``
    Marian models. Each cue holds the translated line followed by the original
    transcript line.

    Args:
        audio_file_path (str): Path of the audio file to transcribe.
        output_srt_file_path (str): Path of the SRT file to create (overwritten
            if it already exists).
        source_lang (str): Language code of the audio; passed to Whisper and
            used to pick the first translation model.
        target_lang (str): Language code to translate into.
    """
    # Load the Whisper model ("large" and "small" are the alternatives tried here).
    whisper_model = whisper.load_model("medium")

    # Transcribe with the language fixed, so no auto-detection is needed.
    result = whisper_model.transcribe(
        audio_file_path,
        language=source_lang,
        verbose=True,
        no_speech_threshold=0.6,
        condition_on_previous_text=True
        # beam_size=5,
        # temperature=0.7
    )

    # First hop: source language -> English.
    intermediate_lang = "en"
    translation_model_intermediate = f'Helsinki-NLP/opus-mt-{source_lang}-{intermediate_lang}'
    tokenizer_intermediate = MarianTokenizer.from_pretrained(translation_model_intermediate)
    model_intermediate = MarianMTModel.from_pretrained(translation_model_intermediate)

    # Second hop: English -> target language.
    translation_model_final = f'Helsinki-NLP/opus-mt-{intermediate_lang}-{target_lang}'
    tokenizer_final = MarianTokenizer.from_pretrained(translation_model_final)
    model_final = MarianMTModel.from_pretrained(translation_model_final)

    with open(output_srt_file_path, 'w', encoding='UTF-8') as srt_file:
        for i, segment in enumerate(result["segments"], start=1):
            start_srt = _format_srt_timestamp(segment["start"])
            end_srt = _format_srt_timestamp(segment["end"])
            # Strip surrounding whitespace so neither the translator input nor
            # the written cue starts with a stray space.
            text = segment["text"].strip()

            intermediate_text = _translate_text(text, tokenizer_intermediate, model_intermediate)
            translated_text = _translate_text(intermediate_text, tokenizer_final, model_final)

            # Cue layout: index, time range, translated line, original line, blank line.
            srt_file.write(f"{i}\n")
            srt_file.write(f"{start_srt} --> {end_srt}\n")
            srt_file.write(f"{translated_text}\n")
            srt_file.write(f"{text}\n\n")

            # Debug output of the translation for this segment.
            print(f"Segment {i}: {text} -> {translated_text}")

    # Report only after the with-block has closed the file.
    print(f"SRT 文件已保存至：{output_srt_file_path}")

# Input folder and the audio files to subtitle.
# audio_folder_path = './drive/MyDrive/Colab Notebooks/KI2/'
audio_folder_path = '/Users/zhoudexiao/Desktop/Project/testaudio/'
audio_files = [
    'segment-7-f2-v1-a.mp3',
    # 'Kardiologische Implantate Teil 2 Woche 10.mp3',
    # 'Kardiologische Implantate Teil 2 Woche 11.mp3',
    # 'Kardiologische Implantate Teil 2 Woche 12.mp3',
]

# Produce one bilingual .srt per audio file, saved alongside the audio.
for audio_file in audio_files:
    # File name without its extension becomes the subtitle file's stem.
    audio_name = os.path.splitext(audio_file)[0]
    audio_file_path = os.path.join(audio_folder_path, audio_file)
    output_srt_file_path = os.path.join(audio_folder_path, f'{audio_name}.srt')
    generate_translated_srt(audio_file_path, output_srt_file_path)


  from .autonotebook import tqdm as notebook_tqdm
  checkpoint = torch.load(fp, map_location=device)


[00:00.000 --> 00:04.760]  Das hängt von ihren Noten ab und machen unterschiedliche Schulabschlüsse.
[00:04.760 --> 00:07.240]  Der höchste Schulabschluss ist das Abitur.
[00:07.240 --> 00:09.440]  Damit kann man an einer Universität schlafen.




Segment 1:  Das hängt von ihren Noten ab und machen unterschiedliche Schulabschlüsse. -> 取决于他们的成绩,成绩不同。
Segment 2:  Der höchste Schulabschluss ist das Abitur. -> 最高学校结业证书是Abitur。
Segment 3:  Damit kann man an einer Universität schlafen. -> 你可以在大学里用它睡觉
SRT 文件已保存至：/Users/zhoudexiao/Desktop/Project/testaudio/segment-7-f2-v1-a.srt
