In [None]:
#@title 1安装依赖
!pip uninstall torch torchvision torchaudio -y

# Workaround from: https://github.com/m-bain/whisperX/issues/1027#issuecomment-2627525081
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121


!pip install ctranslate2==4.4.0
!pip install faster-whisper==1.1.1
!pip install pysrt
!pip install pyannote.audio
!pip install imageio-ffmpeg
!pip install "audio-separator[gpu]"



!apt-get update
!apt-get install libcudnn8=8.9.2.26-1+cuda12.1
!apt-get install libcudnn8-dev=8.9.2.26-1+cuda12.1
!pip install pyannote.audio

!python -c "import torch; torch.backends.cuda.matmul.allow_tf32 = True; torch.backends.cudnn.allow_tf32 = True"

In [1]:
#@title 2挂载云盘
from google.colab import drive

# Mount Google Drive at the location the rest of the notebook builds its paths from.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point, force_remount=True)

Mounted at /content/gdrive


In [1]:
#@title 3导入配置
import os
# Central settings for the notebook. The four folder names pre/work/asr/log are
# replaced with absolute Drive paths right after this definition.
config = dict(
    pre_path="a_pre",      # source videos waiting for vocal extraction
    work_path="b_work",    # separator output; audio files picked up by the transcription step
    asr_path="c_asr",      # transcription results
    tsl_path="d_tsl",      # NOTE(review): presumably translation output - not used by the visible cells
    model_path="/content",  # where downloaded models are stored
    log_path="z_log",      # intermediate/log files

    prompt="",             # title of the audio being recognised, passed as extra information
    language="ja",
    space=2,               # gap inserted between speech segments after cutting, in seconds
    min_duration_on=0.0,   # speech must last at least this long to count as a valid segment; takes priority over min_duration_off
    min_duration_off=0.2,  # silence between two segments must last at least this long to count as a real pause

    separator="htdemucs_ft.yaml",
    vad="4evergr8/pyannote-segmentation-3.0",  # pyannote VAD model; per the author, a copy that "fixes the forced-login bug" of the original
    asr="zh-plus/faster-whisper-large-v2-japanese-5k-steps",  # Whisper model, ctranslate2 format only

    output=[               # output file formats; several may be selected
        "lrc",
        "log",
        # "srt",
        # "vtt"
    ],
)


def get_path(config_value, base_dir='/content/gdrive/MyDrive/ASMRASR'):
    """Resolve a folder name under ``base_dir``, create it if needed, and return it.

    Args:
        config_value: Folder name (or relative path) to place under ``base_dir``.
        base_dir: Root folder the name is joined onto. Defaults to the project
            folder on the mounted Google Drive, so existing one-argument calls
            behave exactly as before.

    Returns:
        The joined path as a string. The directory exists when this returns.

    The path is printed on every call, whether or not the directory already existed.
    """
    path = os.path.join(base_dir, config_value)
    os.makedirs(path, exist_ok=True)
    print(f"创建文件夹: {path}")
    return path

# Swap the folder names for absolute Drive paths, creating each folder on the way.
# Evaluation order (pre, work, asr, log) matches the order of the printed messages.
config.update({
    path_key: get_path(config[path_key])
    for path_key in ("pre_path", "work_path", "asr_path", "log_path")
})










创建文件夹: /content/gdrive/MyDrive/ASMRASR/a_pre
创建文件夹: /content/gdrive/MyDrive/ASMRASR/b_work
创建文件夹: /content/gdrive/MyDrive/ASMRASR/c_asr
创建文件夹: /content/gdrive/MyDrive/ASMRASR/z_log


In [None]:
#@title 4人声分离
from audio_separator.separator import Separator

# Vocal-separation stage: reads the inputs from config["pre_path"], keeps only the
# "vocals" stem, and writes the results to config["work_path"], which the
# transcription cell walks afterwards.
separator = Separator(
    output_dir=config["work_path"],
    model_file_dir=config["model_path"],
    output_single_stem="vocals",
    demucs_params={"segment_size": "22", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
)
separator.load_model(model_filename=config["separator"])
output_files = separator.separate(config["pre_path"])
# The original message was the garbled placeholder "<UNK>"; report what the number means.
print(f"分离完成，输出文件数: {len(output_files)}")

INFO:audio_separator.separator.separator:Separator version 0.32.0 instantiating with output_dir: /content/gdrive/MyDrive/ASMRASR/b_work, output_format: WAV
INFO:audio_separator.separator.separator:Using model directory from model_file_dir parameter: /content
INFO:audio_separator.separator.separator:Operating System: Linux #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
INFO:audio_separator.separator.separator:System: Linux Node: d08638d6f997 Release: 6.1.123+ Machine: x86_64 Proc: x86_64
INFO:audio_separator.separator.separator:Python Version: 3.11.12
INFO:audio_separator.separator.separator:PyTorch Version: 2.5.1+cu121
INFO:audio_separator.separator.separator:FFmpeg installed: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
INFO:audio_separator.separator.separator:ONNX Runtime GPU package installed with version: 1.21.1
INFO:audio_separator.separator.separator:ONNX Runtime CPU package installed with version: 1.21.1
INFO:audio_separator.separator.

In [2]:
#@title 5音频转写
import gc
import pysrt
import torch
import numpy as np
from pyannote.audio import Model
from faster_whisper import WhisperModel
import librosa
from pyannote.audio.pipelines import VoiceActivityDetection
from dataclasses import dataclass
import os


# 数据结构
@dataclass
class AudioSegmentInfo:
    """One speech segment found by VAD, tracked on two timelines.

    ``start``/``end`` are taken from the VAD timeline of the source audio;
    ``group_start``/``group_end`` are filled in later with the segment's position
    inside the concatenated audio of its group. All four are in seconds.
    """
    start: float  # segment start in the source audio
    end: float  # segment end in the source audio
    group_start: float  # segment start inside the concatenated group audio
    group_end: float  # segment end inside the concatenated group audio
    text: str = "..."  # placeholder until an ASR result is matched to this segment

@dataclass
class AudioData:
    """A group of speech segments together with their concatenated audio."""
    audio_array: np.ndarray  # the group's speech segments joined into one array (empty if the group has none)
    segment_info_list: list  # AudioSegmentInfo entries belonging to this group




# 硬件
# Use the GPU with half precision when CUDA is available; otherwise fall back
# to the CPU with 8-bit integer computation.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"
print('设备:', device, '类型:', compute_type)

# The VAD model and the ASR model are each loaded per audio file and deleted as
# soon as their stage is finished (see _detect_speech / _transcribe_groups), so
# the two are not held at the same time.

GROUP_WINDOW_SECONDS = 30 * 60  # width of the source-timeline window used to bucket speech segments


def _srt_time(seconds):
    """Convert seconds to a pysrt.SubRipTime, truncated to whole milliseconds."""
    return pysrt.SubRipTime.from_ordinal(int(seconds * 1000))


def _write_srt(entries, srt_path):
    """Save an iterable of (start_seconds, end_seconds, text) as a sequentially numbered SRT file."""
    subs = pysrt.SubRipFile()
    for start, end, text in entries:
        subs.append(pysrt.SubRipItem(
            index=len(subs) + 1,
            start=_srt_time(start),
            end=_srt_time(end),
            text=text,
        ))
    subs.save(srt_path)


def _detect_speech(audio_path):
    """Run the pyannote VAD pipeline on one file and return its timeline of speech segments."""
    vad_model = Model.from_pretrained(checkpoint=config["vad"], cache_dir=config["model_path"])
    vad_model.to(torch.device(device))
    vad_pipeline = VoiceActivityDetection(segmentation=vad_model)
    vad_pipeline.instantiate({
        "min_duration_on": config["min_duration_on"],
        "min_duration_off": config["min_duration_off"],
    })

    vad_result = vad_pipeline(str(audio_path))

    # VAD is done: drop the model and cached GPU memory before the ASR model is loaded.
    del vad_pipeline, vad_model
    gc.collect()
    torch.cuda.empty_cache()

    return vad_result.get_timeline()


def _build_groups(audio, sr, timeline):
    """Bucket speech segments into windows and concatenate each bucket's audio.

    A segment goes to the group whose index is ``segment.end // GROUP_WINDOW_SECONDS``.
    Within a group the segments are joined with ``config["space"]`` seconds of
    silence between them, and each segment's ``group_start``/``group_end`` is set
    to its position in that joined audio.

    Returns:
        list of AudioData, indexed by window; windows without speech are present
        but have an empty ``audio_array``.
    """
    silence = np.zeros(int(sr * config["space"]), dtype=audio.dtype)

    groups = []
    for segment in timeline:
        group_index = int(segment.end // GROUP_WINDOW_SECONDS)
        # Create any missing groups up to and including this one.
        while len(groups) <= group_index:
            groups.append(AudioData(audio_array=np.zeros(0, dtype=audio.dtype), segment_info_list=[]))
        groups[group_index].segment_info_list.append(
            AudioSegmentInfo(start=segment.start, end=segment.end, group_start=0.0, group_end=0.0)
        )

    for group in groups:
        pieces = []
        cursor = 0  # write position in the joined audio, in samples
        for i, info in enumerate(group.segment_info_list):
            # Silence goes between segments, not before the first one.
            if i > 0:
                pieces.append(silence)
                cursor += len(silence)

            clip = audio[int(info.start * sr):int(info.end * sr)]

            # Positions come from the actual sample counts, so they stay aligned with
            # the joined audio even though int() truncation (or a segment running past
            # the end of the audio) makes a clip slightly shorter than end - start.
            info.group_start = cursor / sr
            cursor += len(clip)
            info.group_end = cursor / sr
            pieces.append(clip)

        if pieces:
            group.audio_array = np.concatenate(pieces)

    return groups


def _transcribe_groups(audio_groups, basename):
    """Transcribe every group and copy the recognised text onto the VAD segments.

    Each ASR segment is assigned to the VAD segment of the same group it overlaps
    most (on the group timeline). The VAD segments are updated in place.

    Returns:
        list of (start_seconds, end_seconds, text) for the raw ASR segments, with
        times relative to the start of each group's joined audio.
    """
    asr_model = WhisperModel(
        config["asr"],
        device=device,
        compute_type=compute_type,
        download_root=config["model_path"],
        num_workers=20
    )
    # Use the configured prompt when one is set; otherwise fall back to the file name.
    prompt = config.get("prompt") or basename

    asr_entries = []
    filled = set()  # id() of segments that already received recognised text

    for group in audio_groups:
        # A window without speech has no audio to transcribe.
        if group.audio_array.size == 0:
            continue

        segments, _ = asr_model.transcribe(
            audio=group.audio_array,
            beam_size=2,
            vad_filter=False,
            initial_prompt=prompt,
            language=config['language']
        )

        for seg in segments:
            seg_text = seg.text.strip()
            asr_entries.append((seg.start, seg.end, seg_text))

            best_match = None
            max_overlap = 0.0
            for info in group.segment_info_list:
                overlap_start = max(seg.start, info.group_start)
                overlap_end = min(seg.end, info.group_end)
                overlap_duration = max(0.0, overlap_end - overlap_start)
                # ">=" means the later segment wins a tie; zero overlap is rejected below.
                if overlap_duration >= max_overlap:
                    max_overlap = overlap_duration
                    best_match = info

            if best_match is not None and max_overlap > 0:
                if id(best_match) in filled:
                    # Several ASR segments can land on the same VAD segment; append
                    # instead of overwriting so earlier text is not lost.
                    best_match.text = " ".join(t for t in (best_match.text, seg_text) if t)
                else:
                    best_match.text = seg_text
                    filled.add(id(best_match))

    del asr_model
    gc.collect()
    return asr_entries


# Process every file found under the separator's output folder.
for root, _dirs, files in os.walk(config["work_path"]):
    for filename in files:
        audio_path = os.path.join(root, filename)
        basename = os.path.splitext(filename)[0]
        print(f"\n处理音频: {audio_path}")

        # Step 1: load the audio as 16 kHz mono.
        audio, sr = librosa.load(str(audio_path), sr=16000, mono=True)

        gc.collect()
        torch.cuda.empty_cache()

        # Step 2: voice activity detection.
        timeline = _detect_speech(audio_path)

        # Step 3: cut out the speech and join it per group.
        audio_groups = _build_groups(audio, sr, timeline)
        del audio

        # Log the segment layout on the group timeline before any text is recognised.
        srt_path = os.path.join(config["log_path"], f"before-{basename}.srt")
        _write_srt(
            ((info.group_start, info.group_end, info.text)
             for group in audio_groups for info in group.segment_info_list),
            srt_path,
        )
        print(f"log写入: {srt_path}")

        # Step 4: transcribe, and log the raw ASR segments.
        gc.collect()
        asr_entries = _transcribe_groups(audio_groups, basename)
        srt_path = os.path.join(config["log_path"], f"asr-{basename}.srt")
        _write_srt(asr_entries, srt_path)

        # Step 5: final subtitles on the source-audio timeline.
        srt_path = os.path.join(config["asr_path"], f"{basename}.srt")
        _write_srt(
            ((info.start, info.end, info.text)
             for group in audio_groups for info in group.segment_info_list),
            srt_path,
        )
        print(f"字幕写入: {srt_path}")




DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


设备: cuda 类型: float16

处理音频: /content/gdrive/MyDrive/ASMRASR/b_work/ゆるギャルJK!!_track02.mp3


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



log写入: /content/gdrive/MyDrive/ASMRASR/z_log/before-ゆるギャルJK!!_track02.srt
字幕写入: /content/gdrive/MyDrive/ASMRASR/c_asr/ゆるギャルJK!!_track02.srt


#6翻译字幕

In [None]:
import os
# Send signal 9 to this notebook's own Python process, terminating it immediately.
# NOTE(review): presumably used to shut the runtime down once the work is finished - confirm.
os.kill(os.getpid(), 9)
