# 1 挂载云盘

In [None]:

# Mount Google Drive at /content/gdrive. Later cells read audio from, and write
# subtitles to, /content/gdrive/MyDrive/ASR, so this cell has to run first.
# force_remount=True is passed so that re-running the cell requests a fresh
# mount rather than relying on an existing one.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# 2 安装依赖

In [None]:
!pip uninstall torch torchvision torchaudio -y

# Workaround from: https://github.com/m-bain/whisperX/issues/1027#issuecomment-2627525081
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121

# WhisperX-related packages:
!pip install ctranslate2==4.4.0
!pip install faster-whisper==1.1.1

!apt-get update
!apt-get install libcudnn8=8.9.2.26-1+cuda12.1
!apt-get install libcudnn8-dev=8.9.2.26-1+cuda12.1
!pip install pyannote.audio

!python -c "import torch; torch.backends.cuda.matmul.allow_tf32 = True; torch.backends.cudnn.allow_tf32 = True"

In [None]:
import gc
import torch
import os
import librosa
from faster_whisper import WhisperModel

# Configuration
# `path` is the project root on the mounted Drive. The input and output
# directories are resolved against it right away; both point at the same
# folder, so subtitles are written next to the audio root.
path = '/content/gdrive/MyDrive/ASR'
config = {
    "work_path": os.path.join(path, "1work"),   # directory scanned for audio
    "asr_path": os.path.join(path, "1work"),    # directory receiving .srt files
    "model_path": "model",                      # left relative: not joined with `path`

    "prompt": "",                               # not read by the code in this cell
    "language": "ja",

    "asr": "large-v2",
}

# 工具函数
def timestamp_to_srt(ts: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds once and then split into
    fields. Truncating the fractional part separately (the previous approach)
    lost a millisecond to floating-point noise, e.g. 1.001 s became ``,000``.
    Rounding first also lets a value such as 59.9996 s carry cleanly into the
    next second. Negative inputs are clamped to zero.

    Args:
        ts: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string, with hours zero-padded to at least two digits.
    """
    # float() normalises numpy scalars so that round() yields a plain int.
    total_ms = max(0, int(round(float(ts) * 1000)))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

# Hardware: use the GPU with half precision when CUDA is available, otherwise
# fall back to the CPU with int8 weights.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"
print('设备:', device, '类型:', compute_type)

# Load the ASR model once; it is reused for every file in the loop below.
# Model files are downloaded to / read from config["model_path"].
asr_model = WhisperModel(
    config["asr"],
    download_root=config["model_path"],
    device=device,
    compute_type=compute_type,
)

# Walk the work directory (including sub-directories) and transcribe every
# audio file into an SRT file of the same base name.
for root, dirs, files in os.walk(config["work_path"]):
    for filename in files:
        # Compare the extension case-insensitively so that files such as
        # "clip.WAV" or "clip.Mp3" are not silently skipped.
        if not filename.lower().endswith(('.wav', '.mp3')):
            continue

        audio_path = os.path.join(root, filename)
        basename = os.path.splitext(filename)[0]
        print(f"\n处理音频: {audio_path}")

        # Load the audio as 16 kHz mono samples.
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)

        gc.collect()
        torch.cuda.empty_cache()

        # Transcribe the whole file in one call with the built-in VAD filter.
        # The file's base name (not config["prompt"]) is passed as the
        # initial prompt.
        segments, _ = asr_model.transcribe(
            audio=audio,
            beam_size=2,
            vad_filter=True,
            initial_prompt=basename,
            language=config['language'],
            word_timestamps=False
        )

        # Write the SRT file. All outputs go flat into config["asr_path"],
        # so two inputs with the same base name overwrite each other.
        os.makedirs(config["asr_path"], exist_ok=True)
        srt_path = os.path.join(config["asr_path"], f"{basename}.srt")

        with open(srt_path, "w", encoding="utf-8") as f:
            # NOTE(review): faster-whisper is understood to return `segments`
            # as a lazy generator, so decoding happens during this loop and a
            # failure part-way through leaves a partial file - confirm.
            for idx, seg in enumerate(segments, start=1):
                start_time = timestamp_to_srt(seg.start)
                end_time = timestamp_to_srt(seg.end)
                text = seg.text.strip()

                f.write(f"{idx}\n")
                f.write(f"{start_time} --> {end_time}\n")
                f.write(f"{text}\n\n")
        print(f"字幕写入: {srt_path}")

        # Drop per-file data before moving on to the next file.
        del audio, segments
        gc.collect()
        torch.cuda.empty_cache()

# Finally release the ASR model.
del asr_model
gc.collect()
torch.cuda.empty_cache()


设备: cuda 类型: float16

处理音频: /content/gdrive/MyDrive/ASR/1work/MIDV-771.wav
字幕写入: /content/gdrive/MyDrive/ASR/1work/MIDV-771.srt


# 5 转写音频

In [None]:
import gc
import torch
import numpy as np
from pyannote.audio import Model
import os
from faster_whisper import WhisperModel
import librosa
from pyannote.audio.pipelines import VoiceActivityDetection
from dataclasses import dataclass

# Configuration
# `path` is the project root on the mounted Drive. The input and output
# directories are resolved against it right away; both point at the same
# folder.
path = '/content/gdrive/MyDrive/ASR'
config = {
    "work_path": os.path.join(path, "1work"),   # directory scanned for audio
    "asr_path": os.path.join(path, "1work"),    # directory receiving .srt files
    "log_path": "log",                          # not read by the code in this cell
    "model_path": "model",                      # left relative: not joined with `path`

    "prompt": "",                               # not read by the code in this cell
    "language": "ja",
    "space": 3,                                 # seconds of silence inserted between speech clips
    "min_duration_on": 0.0,                     # forwarded to the VAD pipeline
    "min_duration_off": 0.2,                    # forwarded to the VAD pipeline

    "asr": "large-v2",
    "vad": "4evergr8/pyannote-segmentation-3.0",

    "output": ["lrc", "srt", "vtt"]             # not read by the code in this cell; only SRT is written
}

# 数据结构
@dataclass
class AudioSegmentInfo:
    """Timing of one detected speech segment, in two time bases (seconds)."""
    # Position of the clip inside the concatenated group audio that is handed
    # to the ASR model (inserted silence between clips is counted).
    start_in_group: float
    end_in_group: float
    # Position of the same clip in the original audio file, as reported by
    # the VAD timeline; these are the times written to the subtitle file.
    start_in_origin: float
    end_in_origin: float

@dataclass
class AudioSegmentGroup:
    """A batch of speech clips concatenated into one array for a single ASR call."""
    # Speech clips joined end to end with a block of silence between
    # neighbouring clips (no trailing silence after the last one).
    audio: np.ndarray
    # AudioSegmentInfo entries, in the order their clips appear in `audio`.
    segments: list
    # Start (seconds, original-file time) of the time window this group was
    # collected from. Stored when the group is built; the visible code does
    # not read it afterwards.
    offset: float

# 工具函数
def timestamp_to_srt(ts: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds once and then split into
    fields. Truncating the fractional part separately (the previous approach)
    lost a millisecond to floating-point noise, e.g. 1.001 s became ``,000``.
    Rounding first also lets a value such as 59.9996 s carry cleanly into the
    next second. Negative inputs are clamped to zero.

    Args:
        ts: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string, with hours zero-padded to at least two digits.
    """
    # float() normalises numpy scalars so that round() yields a plain int.
    total_ms = max(0, int(round(float(ts) * 1000)))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

# Hardware: use the GPU with half precision when CUDA is available, otherwise
# fall back to the CPU with int8 weights.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"
print('设备:', device, '类型:', compute_type)

# Load the ASR model once; it is reused for every file in the loop below.
# Model files are downloaded to / read from config["model_path"].
asr_model = WhisperModel(
    config["asr"],
    download_root=config["model_path"],
    device=device,
    compute_type=compute_type,
)

# Walk the work directory (including sub-directories). For every audio file:
# detect speech with the VAD model, cut the speech clips out, transcribe them
# in groups, and write one subtitle cue per detected speech segment.
for root, dirs, files in os.walk(config["work_path"]):
    for filename in files:
        # Compare the extension case-insensitively so that files such as
        # "clip.WAV" or "clip.Mp3" are not silently skipped.
        if not filename.lower().endswith(('.wav', '.mp3')):
            continue

        audio_path = os.path.join(root, filename)
        basename = os.path.splitext(filename)[0]
        print(f"\n处理音频: {audio_path}")

        # Step 1: load the audio as 16 kHz mono samples.
        audio, sr = librosa.load(audio_path, sr=16000, mono=True)

        gc.collect()
        torch.cuda.empty_cache()

        # Step 2: voice activity detection. The VAD model is loaded per file
        # and released again right after use.
        vad_model = Model.from_pretrained(checkpoint=config["vad"], cache_dir=config["model_path"])
        vad_model.to(torch.device(device))
        vad_pipeline = VoiceActivityDetection(segmentation=vad_model)
        vad_pipeline.instantiate({
            "min_duration_on": config["min_duration_on"],
            "min_duration_off": config["min_duration_off"],
        })

        # The pipeline reads the file from disk itself (not the `audio` array).
        vad_result = vad_pipeline(str(audio_path))

        # VAD is done: release the model.
        del vad_pipeline, vad_model
        gc.collect()
        torch.cuda.empty_cache()

        # Step 3: cut out the speech clips and group them by 30-minute
        # windows of original-file time. Inside a group the clips are joined
        # with config["space"] seconds of silence between them.
        timeline = vad_result.get_timeline()
        silence = np.zeros(int(sr * config["space"]), dtype=audio.dtype)

        groups = []
        current_audio = []
        current_segments = []
        current_time_concat = 0.0
        current_group_start = 0.0
        group_time_limit = 30 * 60

        for segment in timeline:
            if segment.start >= current_group_start + group_time_limit:
                # The segment starts past the current window: flush the
                # group collected so far (dropping the trailing silence).
                if current_segments:
                    final_audio = np.concatenate(current_audio[:-1])
                    groups.append(AudioSegmentGroup(audio=final_audio, segments=current_segments, offset=current_group_start))

                current_audio = []
                current_segments = []
                current_time_concat = 0.0
                # Advance by as many windows as needed. A single step is not
                # enough after a gap longer than one window and would leave
                # the following segments in one-segment groups until the
                # window catches up.
                while segment.start >= current_group_start + group_time_limit:
                    current_group_start += group_time_limit

            start_sample = int(segment.start * sr)
            end_sample = int(segment.end * sr)

            audio_seg = audio[start_sample:end_sample]
            duration = (end_sample - start_sample) / sr

            segment_info = AudioSegmentInfo(
                start_in_group=current_time_concat,
                end_in_group=current_time_concat + duration,
                start_in_origin=segment.start,
                end_in_origin=segment.end
            )

            current_segments.append(segment_info)
            current_audio.append(audio_seg)
            current_audio.append(silence)
            current_time_concat += duration + config["space"]

        if current_segments:
            final_audio = np.concatenate(current_audio[:-1])
            groups.append(AudioSegmentGroup(audio=final_audio, segments=current_segments, offset=current_group_start))

        if not groups:
            print(f"{audio_path} 没有有效人声段，跳过")
            continue

        # Step 4: transcribe each group and map the recognised text back onto
        # the VAD segments (and therefore onto original-file timestamps).
        srt_segments = []
        subtitle_index = 1

        for group in groups:
            segments, _ = asr_model.transcribe(
                audio=group.audio,
                beam_size=2,
                vad_filter=False,
                initial_prompt=basename,
                language=config['language'],
                word_timestamps=False
            )

            # Assign every ASR segment to the VAD segment it overlaps most.
            # Taking only the first ASR segment with any overlap (the earlier
            # behaviour) dropped all further text inside a long VAD segment
            # and could pick a neighbour's text on a small boundary overlap.
            texts_per_segment = [[] for _ in group.segments]
            for seg in segments:
                asr_text = seg.text.strip()
                if not asr_text:
                    continue

                best_index = None
                best_overlap = 0.0
                for index, segment_info in enumerate(group.segments):
                    latest_start = max(segment_info.start_in_group, seg.start)
                    earliest_end = min(segment_info.end_in_group, seg.end)
                    overlap = earliest_end - latest_start
                    if overlap > best_overlap:
                        best_index, best_overlap = index, overlap

                # Text that overlaps no speech clip (i.e. lies entirely in
                # inserted silence) is discarded.
                if best_index is not None:
                    texts_per_segment[best_index].append(asr_text)

            for segment_info, texts in zip(group.segments, texts_per_segment):
                # "..." marks a detected speech segment with no recognised text.
                found_text = " ".join(texts) if texts else "..."

                srt_segments.append((
                    subtitle_index,
                    segment_info.start_in_origin,
                    segment_info.end_in_origin,
                    found_text
                ))
                subtitle_index += 1

        # Step 5: write the SRT file. All outputs go flat into
        # config["asr_path"], so two inputs with the same base name overwrite
        # each other.
        os.makedirs(config["asr_path"], exist_ok=True)
        srt_path = os.path.join(config["asr_path"], f"{basename}.srt")
        with open(srt_path, "w", encoding="utf-8") as f:
            for idx, start, end, text in srt_segments:
                f.write(f"{idx}\n")
                f.write(f"{timestamp_to_srt(start)} --> {timestamp_to_srt(end)}\n")
                f.write(f"{text}\n\n")
        print(f"字幕写入: {srt_path}")

        # Drop per-file data before moving on to the next file.
        del audio, groups, timeline, srt_segments
        gc.collect()
        torch.cuda.empty_cache()

# Release the ASR model.
del asr_model
gc.collect()
torch.cuda.empty_cache()


DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


设备: cuda 类型: float16


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



处理音频: /content/gdrive/MyDrive/ASR/1work/MIDV-771.wav


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



In [None]:
import os
# NOTE(review): this only evaluates the function object - the recorded cell
# output is its repr - and nothing is called. If the intent was to terminate
# or restart the runtime, the call needs a pid and a signal; confirm the
# intent and either complete the call or delete this cell.
os.kill

<function posix.kill(pid, signal, /)>