# 1下载文件

In [None]:
import gdown
import os

def download_from_gdrive(url: str, output_dir: str, filename: str):
    """Download a shared Google Drive file to ``output_dir/filename``.

    Args:
        url: Google Drive link, either of the ``.../d/<id>/...`` form or one
            that carries the file id in an ``id=`` query parameter.
        output_dir: Destination directory; created when it does not exist.
        filename: Name of the file to write inside ``output_dir``.

    Returns:
        Whatever ``gdown.download`` returns for the request.

    Raises:
        ValueError: If no file id can be extracted from ``url``.
    """
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    # Look at the path only, so a trailing "?usp=sharing" never ends up in the id.
    _, marker, tail = parsed.path.partition('/d/')
    if marker:
        file_id = tail.split('/')[0]
    else:
        # Fallback for links that carry the id as a query parameter.
        file_id = parse_qs(parsed.query).get('id', [''])[0]
    if not file_id:
        raise ValueError(f"Cannot extract a Google Drive file id from: {url!r}")

    download_url = f'https://drive.google.com/uc?id={file_id}'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)
    return gdown.download(download_url, output=output_path, quiet=False, fuzzy=True)
# Fetch the sample recording into the project's "b_work" directory.
output_dir = '/kaggle/working/ASMRASR/b_work'
download_from_gdrive(
    url="https://drive.google.com/file/d/1BAdZZFu_aYTFVnIbyjDqAFCFF5OLSWqB/view?usp=sharing",
    output_dir=output_dir,
    filename="asmr_data.wav",
)



# 2安装依赖

In [None]:
# Install the third-party packages imported by the transcription cell
# (pysrt, pyannote.audio, faster_whisper). These are IPython shell magics.
# NOTE(review): other cells also import gdown, audio_separator,
# imageio_ffmpeg, librosa and google.genai, none of which are installed here;
# presumably the runtime image already provides them - confirm.
!pip install pysrt
!pip install pyannote.audio
!pip install faster-whisper==1.1.1








# 3导入配置

In [None]:
import os
# Central settings shared by every stage of the notebook.
config = {
    # Directory names; each one is resolved to an absolute path by get_path().
    # Input folder scanned by the vocal-separation stage.
    "pre_path": "a_pre",
    # Output of vocal separation and input of the transcription stage.
    "work_path": "b_work",
    # Final transcription .srt files; input of the translation stage.
    "asr_path": "c_asr",
    # Translated .srt files.
    "tsl_path": "d_tsl",
    # Passed as model/cache/download directory to the separator, VAD and ASR models.
    "model_path": "/kaggle/working/model",
    # Intermediate vad-/slice-/asr-/match- .srt logs.
    "log_path": "z_log",
    # When True the VAD / ASR step is redone even if its log file already exists.
    "overwrite_vad": False,
    "overwrite_asr": True,
    # Model file name handed to Separator.load_model().
    "separator": "MDX23C-8KFFT-InstVoc_HQ_2.ckpt",
    # Checkpoint handed to pyannote's Model.from_pretrained().
    "vad": "4evergr8/pyannote-segmentation-3.0",
    # model_size_or_path handed to faster-whisper's WhisperModel.
    "asr": "large-v3-turbo",
    # Model name used when creating the translation chat.
    "translate": "gemini-1.5-flash",
    # batch_size entry of the separator's mdxc_params.
    "batch_size": 8,
    # Hyper-parameters passed to the VoiceActivityDetection pipeline.
    "min_duration_on": 0.0,
    "min_duration_off": 0.2,
    # Seconds of silence inserted between speech segments when they are concatenated.
    "space": 2,
    # Language passed to WhisperModel.transcribe().
    "language": "ja",
    # num_workers passed to WhisperModel.
    "num_workers": 15,
    # Key passed to genai.Client(); empty here - presumably filled in by the
    # user before the translation cell runs.
    "api_key": "",
    # Prefix of the system instruction for the translation chat; the source
    # lines of the current chunk are appended to it.
    "prompt": """你是一名专业的翻译家，请你按照以下流程进行翻译：
第一步：初步直译
将日语文本逐行直译成简体中文文本，保留序号、标记符、占位符、换行符等特殊内容，保持原来的格式。

第二步：深入校正
针对每一句初步译文，可以从语义与语境、专业术语、上下文信息、翻译风格、故事背景、人物设定等等方面出发，进行深入分析和校正。

第三步：最终意译与润色
整合直译结果和校正建议，进行最终的意译和润色，生成自然流畅、符合简体中文表达习惯的最终译文。

### 翻译原则
忠实准确地进行翻译。原作是伟大艺术作品，允许使用直白或粗俗的描述，不应随意删减或篡改，不应回避或淡化，以保持作品的原貌。
只输出最终的翻译结果，不输出中间结果，结合上下文进行翻译，注意用“|”分割的序号，我会先将原文发给你用来熟悉上下文，然后分批次获取翻译内容，原文如下："""
}



def get_path(config_value, base_dir='/kaggle/working/ASMRASR'):
    """Resolve ``config_value`` under ``base_dir`` and make sure it exists.

    Args:
        config_value: Directory name (or relative path) to place under
            ``base_dir``.
        base_dir: Root directory the value is joined onto; defaults to the
            project root used by this notebook.

    Returns:
        The joined path, which exists as a directory when the call returns.
    """
    path = os.path.join(base_dir, config_value)
    os.makedirs(path, exist_ok=True)
    print(f"创建文件夹: {path}")
    return path

# Replace each directory entry of the config with its resolved (and created)
# path, in the same order as before.
for _dir_key in ("pre_path", "work_path", "asr_path", "tsl_path", "log_path"):
    config[_dir_key] = get_path(config[_dir_key])

# 4人声分离

In [None]:
import os
import shutil
import subprocess
from imageio_ffmpeg import get_ffmpeg_exe
from audio_separator.separator import Separator

# Vocal separation: every audio file in config["pre_path"] is cut into
# 20-minute slices, each slice is reduced to its vocal stem, and the stems
# are concatenated again into config["work_path"]/<name>.wav.
ffmpeg = get_ffmpeg_exe()
# "chcp 65001" switches a Windows console to UTF-8; the command only exists
# on Windows, so it is skipped everywhere else.
if os.name == "nt":
    os.system("chcp 65001")


for audio_filename in os.listdir(config["pre_path"]):
    if not audio_filename.endswith((".wav", ".mp3", ".flac")):
        continue
    audio_path = os.path.join(config["pre_path"], audio_filename)
    if not os.path.isfile(audio_path):
        continue

    basename = os.path.splitext(audio_filename)[0]
    slice_path = os.path.join(config["pre_path"], f"{basename}-slice")
    split_path = os.path.join(config["pre_path"], f"{basename}-split")

    # Always start from an empty slice folder.
    if os.path.exists(slice_path):
        shutil.rmtree(slice_path)
    os.makedirs(slice_path)
    # The "already separated?" check below lists this folder, so it has to
    # exist even before the first separation has written anything.
    os.makedirs(split_path, exist_ok=True)

    segment_length = 1200  # 20 minutes, in seconds
    # NOTE(review): the stream is copied (not re-encoded) into .wav slices;
    # presumably fine for PCM .wav input - confirm for .mp3/.flac sources.
    command = [
        ffmpeg, "-i", audio_path,  # input audio
        "-f", "segment",  # segment muxer: split into numbered pieces
        "-segment_time", str(segment_length),  # length of each piece in seconds
        "-c", "copy",  # keep the original encoding
        os.path.join(slice_path, "%03d.wav")  # output naming pattern
    ]
    subprocess.run(command, check=True)

    separator = Separator(
        model_file_dir=config["model_path"],
        output_dir=split_path,
        output_single_stem="vocals",
        sample_rate=16000,
        mdxc_params={"segment_size": 256, "override_model_segment_size": False, "batch_size": config["batch_size"],
                     "overlap": 8, "pitch_shift": 0}
    )
    separator.load_model(model_filename=config["separator"])

    for filename in os.listdir(slice_path):
        if not filename.endswith(".wav"):
            continue
        slice_basename = os.path.splitext(filename)[0]  # e.g. '001'
        # A slice counts as done when any file in split_path starts with its number.
        exists = any(name.startswith(slice_basename) for name in os.listdir(split_path))
        if not exists:
            output_files = separator.separate(os.path.join(slice_path, filename))
            # NOTE(review): the "<UNK>" prefix looks like a lost log label; kept as is.
            print(f"<UNK>{len(output_files)}")
        else:
            print(f"已存在分离结果，跳过：{filename}")

    # Order the separated stems by the 3-digit slice number at the start of their name.
    file_list = sorted(
        [name for name in os.listdir(split_path) if name.endswith(".wav")],
        key=lambda x: int(x[:3])
    )
    list_path = os.path.join(config["pre_path"], f"{basename}_list.txt")
    with open(list_path, "w", encoding="utf-8") as list_file:
        for f_name in file_list:
            full_path = os.path.join(split_path, f_name)
            # Inside a single-quoted ffmpeg string a quote is written as '\''.
            escaped_path = full_path.replace("'", "'\\''")
            list_file.write(f"file '{escaped_path}'\n")

    output_path = os.path.join(config["work_path"], f"{basename}.wav")
    command = [
        ffmpeg,
        "-y",  # overwrite a result left by an earlier run instead of failing
        "-f", "concat",
        "-safe", "0",
        "-i", list_path,
        "-c", "copy",
        output_path
    ]
    subprocess.run(command, check=True)




In [None]:
#@title 5音频转写
import gc
import pysrt
import torch
import numpy as np
from pyannote.audio import Model
from faster_whisper import WhisperModel
import librosa
from pyannote.audio.pipelines import VoiceActivityDetection
import os


# Run on the GPU with float32 when CUDA is available, otherwise on the CPU
# with int8.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float32"
else:
    device, compute_type = "cpu", "int8"
print('设备:', device, '类型:', compute_type)

# Transcription pipeline, run once per audio file in config["work_path"]:
#   1. VAD: detect speech segments and number them in 1000-wide blocks, one
#      block per 30 minutes of audio (1000.., 2000.., ...).
#   2. Slice log: re-time the segments as they will sit in the per-group
#      audio (segments back to back, separated by config["space"] seconds).
#   3. ASR: per group, concatenate the speech segments and transcribe them.
#   4. Match: give every slice entry the text of the ASR entry of the same
#      group that overlaps it the most.
#   5. Result: copy the matched text onto the original VAD timestamps.
# NOTE(review): the index scheme assumes fewer than 1000 entries per group;
# a busier group would spill into the next block - confirm this cannot happen.
for filename in os.listdir(config["work_path"]):
    if not filename.endswith((".wav", ".mp3", ".flac")):
        continue
    audio_path = os.path.join(config["work_path"], filename)
    print(f"\n处理音频: {audio_path}")
    basename = os.path.splitext(filename)[0]

    # ---- 1. voice activity detection ----
    vad_log_path = os.path.join(config["log_path"], f"vad-{basename}.srt")
    if not os.path.exists(vad_log_path) or config["overwrite_vad"]:
        vad_model = Model.from_pretrained(checkpoint=config["vad"], cache_dir=config["model_path"])
        vad_model.to(torch.device(device))
        vad_pipeline = VoiceActivityDetection(segmentation=vad_model)
        vad_pipeline.instantiate({
            "min_duration_on": config["min_duration_on"],
            "min_duration_off": config["min_duration_off"],
        })

        vad_result = vad_pipeline(str(audio_path))
        # Drop the VAD model and give its memory back before the ASR model loads.
        del vad_pipeline, vad_model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

        timeline = vad_result.get_timeline()
        vad_log = pysrt.SubRipFile()

        group_duration = 1800  # seconds per group (30 minutes)
        group_counts = {}  # group base index -> number of entries issued so far
        for segment in timeline:
            group_index = int(segment.end // group_duration)
            group_base_idx = 1000 + group_index * 1000
            position = group_counts.get(group_base_idx, 0)
            group_counts[group_base_idx] = position + 1
            sub_index = group_base_idx + position

            sub = pysrt.SubRipItem(
                index=sub_index,
                start=pysrt.SubRipTime.from_ordinal(int(segment.start * 1000)),
                end=pysrt.SubRipTime.from_ordinal(int(segment.end * 1000)),
                text="默认占位" + str(sub_index)
            )
            vad_log.append(sub)

        vad_log.save(vad_log_path)
        print(f"VAD记录写入: {vad_log_path}")
    else:
        vad_log = pysrt.open(vad_log_path)
        print('VAD记录存在，跳过')

    # ---- 2. slice log: timestamps relative to the concatenated group audio ----
    slice_log = pysrt.SubRipFile()
    silence_duration = config["space"]  # seconds of silence between segments
    current_group_end = 0.0  # where the next segment of the current group starts

    for subtitle in vad_log:
        segment_start = subtitle.start.ordinal / 1000  # seconds
        segment_end = subtitle.end.ordinal / 1000  # seconds

        # The first entry of a group has an index divisible by 1000 and
        # starts at 00:00:00; later entries follow the previous one.
        if subtitle.index % 1000 == 0:
            group_start = 0.0
        else:
            group_start = current_group_end

        # The segment keeps its duration.
        group_end = group_start + (segment_end - segment_start)

        # The entries are re-timed in place; vad_log is re-read from disk
        # below whenever the original timestamps are needed again.
        subtitle.start = pysrt.SubRipTime.from_ordinal(int(group_start * 1000))
        subtitle.end = pysrt.SubRipTime.from_ordinal(int(group_end * 1000))

        current_group_end = group_end + silence_duration

        slice_log.append(subtitle)

    slice_log_path = os.path.join(config["log_path"], f"slice-{basename}.srt")
    slice_log.save(slice_log_path)
    print(f"slice记录写入: {slice_log_path}")

    # ---- 3. speech recognition ----
    asr_log_path = os.path.join(config["log_path"], f"asr-{basename}.srt")
    if not os.path.exists(asr_log_path) or config["overwrite_asr"]:

        asr_model = WhisperModel(
            model_size_or_path=config["asr"],
            device=device,
            compute_type=compute_type,
            download_root=config["model_path"],
            num_workers=config["num_workers"]
        )

        asr_log = pysrt.SubRipFile()

        # Re-read the VAD log to get the original timestamps back, then
        # collect the (start, end) pairs of every group.
        vad_log = pysrt.open(vad_log_path)
        group_dict = {}

        for subtitle in vad_log:
            segment_start = subtitle.start.ordinal / 1000  # seconds
            segment_end = subtitle.end.ordinal / 1000  # seconds
            group_id = subtitle.index // 1000  # 1000-1999 -> 1, 2000-2999 -> 2, ...
            group_dict.setdefault(group_id, []).append((segment_start, segment_end))

        for group_id, segments in group_dict.items():
            # The audio is loaded per group and deleted again before
            # transcription starts.
            audio, sr = librosa.load(str(audio_path), sr=16000, mono=True)

            silence_duration = config["space"]
            silence = np.zeros(int(sr * silence_duration), dtype=np.float32)

            # Cut out the speech segments of this group and join them with
            # silence in between.
            group_audio = []
            for i, (start, end) in enumerate(segments):
                start_sample = int(start * sr)
                end_sample = int(end * sr)
                audio_seg = audio[start_sample:end_sample]

                if i > 0:
                    group_audio.append(silence)
                group_audio.append(audio_seg)

            group_array = np.concatenate(group_audio) if group_audio else np.array([], dtype=np.float32)
            del audio

            asr_segments, _ = asr_model.transcribe(
                audio=group_array,
                language=config['language'],
                task="transcribe",
                log_progress=True,
                beam_size=5,
                best_of=5,
                patience=1,
                length_penalty=1,
                repetition_penalty=1.1,
                no_repeat_ngram_size=3,
                temperature=[0.2, 0.4, 0.6, 0.8, 1.0],
                compression_ratio_threshold=2.4,
                log_prob_threshold=-1.0,
                no_speech_threshold=0.6,
                condition_on_previous_text=True,
                prompt_reset_on_temperature=0.5,
                prefix=None,
                suppress_blank=True,
                suppress_tokens=[-1],
                without_timestamps=False,
                max_initial_timestamp=1.0,
                word_timestamps=True,
                hallucination_silence_threshold=1.5,
                prepend_punctuations="\"'“¿([{-",
                append_punctuations="\"'.。,，!！?？:：”)]}、",
                multilingual=False,
                vad_filter=False,
                clip_timestamps="0",
                language_detection_threshold=None,
                language_detection_segments=1,
                hotwords='イッたんだよ やだ まって…やばい… 恥ずかしい'
            )

            # Number the ASR entries inside their own group's 1000-block.
            # The matching stage only compares entries whose index // 1000
            # agrees, so a fixed base of 1000 would leave every group after
            # the first one without any match.
            group_base_index = group_id * 1000
            for i, seg in enumerate(asr_segments):
                subtitle = pysrt.SubRipItem(
                    index=group_base_index + i,
                    start=pysrt.SubRipTime.from_ordinal(int(seg.start * 1000)),
                    end=pysrt.SubRipTime.from_ordinal(int(seg.end * 1000)),
                    text=seg.text.strip()
                )
                asr_log.append(subtitle)

            # Saved after every group so finished groups are already on disk.
            asr_log.save(asr_log_path)
            print(f"组 {group_id} 的识别结果已写入: {asr_log_path}")

        # Drop the ASR model and give its memory back.
        del asr_model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        print("模型已删除")

    else:
        asr_log = pysrt.open(asr_log_path)
        print('ASR记录存在，跳过')

    # ---- 4. match ASR text to slice entries by largest time overlap ----
    for slice_sub in slice_log:
        segment_start = slice_sub.start.ordinal / 1000
        segment_end = slice_sub.end.ordinal / 1000
        group_prefix = (slice_sub.index // 1000) * 1000

        max_overlap = 0.0
        best_match = None

        # Only ASR entries of the same group are candidates.
        for asr_sub in asr_log:
            if (asr_sub.index // 1000) * 1000 != group_prefix:
                continue

            seg_start = asr_sub.start.ordinal / 1000
            seg_end = asr_sub.end.ordinal / 1000

            overlap_start = max(seg_start, segment_start)
            overlap_end = min(seg_end, segment_end)
            overlap_duration = overlap_end - overlap_start

            if overlap_duration > max_overlap:
                max_overlap = overlap_duration
                best_match = asr_sub

        # Entries without any overlapping ASR text keep their placeholder.
        if best_match is not None:
            slice_sub.text = best_match.text

    match_path = os.path.join(config["log_path"], f"match-{basename}.srt")
    slice_log.save(match_path)
    print(f"match结果写入: {match_path}")

    # ---- 5. put the matched text back onto the original timestamps ----
    vad_log = pysrt.open(vad_log_path)

    # First occurrence wins if an index should ever appear twice.
    text_by_index = {}
    for slice_sub in slice_log:
        text_by_index.setdefault(slice_sub.index, slice_sub.text)

    for vad_sub in vad_log:
        if vad_sub.index in text_by_index:
            vad_sub.text = text_by_index[vad_sub.index]

    # The published file uses plain 1..n numbering.
    for i, sub in enumerate(vad_log, 1):
        sub.index = i

    result_path = os.path.join(config["asr_path"], f"{basename}.srt")
    vad_log.save(result_path)
    print(f"结果写入: {result_path}")




In [None]:
#@title 6结果翻译
import os
import time
import pysrt
from google import genai
from google.genai import types



# Translation: every .srt in config["asr_path"] is translated in chunks and
# written to config["tsl_path"] under the same name. A partially translated
# output file is continued from where it stopped.
for filename in os.listdir(config["asr_path"]):
    if not filename.endswith(".srt"):
        continue
    src_path = os.path.join(config["asr_path"], filename)
    dst_path = os.path.join(config["tsl_path"], filename)

    original_subs = pysrt.open(src_path, encoding='utf-8')

    # Drop entries that still carry the placeholder text, then renumber.
    original_subs = pysrt.SubRipFile(items=[sub for sub in original_subs if "默认占位" not in sub.text])
    original_subs.clean_indexes()

    # A missing or unreadable output file simply means "nothing translated yet".
    try:
        translated_subs = pysrt.open(dst_path, encoding='utf-8')
    except (OSError, UnicodeError):
        translated_subs = pysrt.SubRipFile()

    # Same length as the source: this file is already done.
    if len(translated_subs) == len(original_subs):
        print(f"文件 {filename} 已经全部翻译完，跳过.")
        continue

    # Only the part that has not been translated yet.
    original_subs = original_subs[len(translated_subs):]

    # Created only once there is work to do, and reused for all chunks of the file.
    client = genai.Client(api_key=config["api_key"])

    for i in range(0, len(original_subs), 200):
        chunk = original_subs[i:i + 200]

        # One chat per 200-line chunk; its system instruction is the prompt
        # followed by the chunk's source lines.
        chat = client.chats.create(
            model=config['translate'],
            config=types.GenerateContentConfig(
                system_instruction=config["prompt"] + "\n" + "\n".join(
                    sub.text for sub in chunk),
            )
        )

        for j in range(0, len(chunk), 10):
            sub_chunk = chunk[j:j + 10]
            # Header line first, then one "<number>|<text>" line per subtitle.
            prompt = "要翻译的部分\n" + "\n".join(
                f"{k + 1}|{sub.text}" for k, sub in enumerate(sub_chunk))

            # Retried until the reply has exactly one "|" line per subtitle.
            # NOTE(review): there is no retry limit - a permanent failure
            # (e.g. an invalid API key) loops forever.
            while True:
                # Reset so the error report below never reads an unbound or
                # stale reply.
                response = None
                try:
                    response = chat.send_message(prompt)
                    lines = response.text.strip().splitlines()
                    valid_lines = [line for line in lines if "|" in line]

                    if len(valid_lines) == len(sub_chunk):
                        # Lines are assigned by position; only the text after
                        # the first "|" is kept.
                        for k, line in enumerate(valid_lines):
                            sub_chunk[k].text = line.split("|", 1)[1].strip()
                        translated_subs.extend(sub_chunk)

                        print(f"翻译成功: {len(translated_subs)} 行已翻译.")
                        time.sleep(5)  # short pause after a success
                        break
                    else:
                        print("返回行数与原始字幕不一致，等待重试...")
                        time.sleep(5)

                except Exception as e:
                    print(f"异常：{e}")
                    print(f"发送：{prompt}")
                    print(f"接收：{response}")
                    time.sleep(5)

        # Written after every 200-line chunk so progress survives an interruption.
        translated_subs.save(dst_path, encoding='utf-8')
        print(f"字幕已保存到: {dst_path}")






In [None]:
#@title 7清理内存
import os
# Send signal 9 to the current process: this ends the notebook process
# immediately (the cell title calls it "清理内存", i.e. freeing memory).
# Nothing after this line runs.
os.kill(os.getpid(), 9)