# N46Whisper

N46Whisper 是基于 Google Colab 的应用。开发初衷是提高乃木坂46（以及坂道系）字幕组日语视频的制作效率，但亦适用于所有外语视频的字幕制作。本应用的目标并非生产完美的字幕文件，而是搭建并提供一个简单且自动化的使用平台，以节省生产成品字幕的时间和精力。Whisper 模型有其本身的应用场景限制，AI 翻译的质量亦还不能尽如人意。

<font size='4'>**对于中文用户，推荐在使用前阅读[常见问题说明](https://github.com/Ayanaminn/N46Whisper/blob/main/FAQ.md)。如果你觉得本应用对你有所帮助，欢迎帮助扩散给更多的人。**</font>


<font size='4'>**联系作者/Contact me：[E-mail](mailto:admin@ikedateresa.cc)**</font>


**<font size='5'>以下选择文件方式按需执行其中一种即可，不需要全部运行</font>**

In [None]:
#@title **从谷歌网盘选择文件**

# @markdown <br/>从网盘目录中选择要转换的文件(视频/音频），单击选中文件，点击'Select'按钮以确认。</font><br/>
# @markdown <br/>若希望从本地上传文件，则跳过此步执行下一单元格。</font><br/>
# @markdown <br/>若到这一步才上传文件到谷歌盘，则重复执行本单元格以刷新文件列表。</font>

# Shell magic: install geemap into the runtime.
# NOTE(review): presumably installed for its dependencies (ipytree is imported
# below) — confirm whether a lighter install would do.
!pip install geemap

from IPython.display import clear_output
from google.colab import drive

# Hide the pip log, then mount Google Drive; file_tree() browses '/drive/MyDrive'.
clear_output()
drive.mount('/drive')

print('谷歌云盘挂载完毕，请选择要转换的文件')

from ipytree import Tree, Node
from google.colab import output

# Let Colab render third-party widgets (the ipytree tree shown by file_tree()).
output.enable_custom_widget_manager()
# Flag read by the transcription cell to take its inputs from `drive_dir`.
use_drive = True
# NOTE: `global` at module scope has no effect; the assignment below already
# creates a module-level name.
global drive_dir
# Paths picked in the file browser; the "Select" button callback appends to it.
drive_dir = []


def file_tree(tree_widget=None):
    """Build a simple Google Drive media browser and return it as a widget.

    Walks ``/drive/MyDrive`` top-down, mirrors its folders and media files in
    an ``ipytree`` tree, and shows a "Select" button once a file is clicked.
    Pressing the button appends the path shown in the text box to the
    module-level ``drive_dir`` list.

    Args:
        tree_widget: Ignored. The original signature required this argument
            but never read it (the name is rebound to a fresh ``Output``
            widget); it is now optional so ``file_tree()`` works.

    Returns:
        The ``ipywidgets.HBox`` holding the path box, the tree and, after a
        file has been clicked, the "Select" button.
    """
    # Imported locally because this cell runs before the cell that imports
    # these names at notebook level.
    import os

    import ipywidgets as widgets
    from IPython.display import display

    # Leading dots are required: a bare 'ts' would also match "notes",
    # "documents", and so on.
    media_suffixes = (
        '.mp3', '.m4a', '.flac', '.aac', '.wav', '.mp4', '.mkv', '.ts', '.flv',
    )

    # Layout: [path box + tree] on the left, [Select button] on the right.
    full_widget = widgets.HBox()
    left_widget = widgets.VBox()
    right_widget = widgets.VBox()

    path_widget = widgets.Text()
    path_widget.layout.min_width = '300px'
    select_widget = widgets.Button(
        description='Select', button_style='primary', tooltip='Select current media file.'
    )

    right_widget.children = [select_widget]
    # The button column stays hidden until a file node is clicked.
    full_widget.children = [left_widget]

    tree_widget = widgets.Output()
    tree_widget.layout.max_width = '300px'
    # `overflow` is a Layout trait; setting it on the widget itself did nothing.
    tree_widget.layout.overflow = 'auto'

    left_widget.children = [path_widget, tree_widget]

    my_tree = Tree(multiple_selection=False)
    my_tree_dict = {}  # absolute path -> tree Node

    def select_file(b):
        """Record the currently shown path as a file to transcribe."""
        drive_dir.append(path_widget.value)
        print(f'已选择文件 {path_widget.value}，可以继续选择或执行下个单元格')

    select_widget.on_click(select_file)

    def handle_file_click(event):
        """Show the clicked file's path and enable selection if it is readable."""
        if not event['new']:
            return
        cur_node = event['owner']
        for key, node in my_tree_dict.items():
            if node is not cur_node or not os.path.isfile(key):
                continue
            path_widget.value = key
            try:
                # Opening is only a readability probe; nothing is read.
                with open(key, 'rb'):
                    pass
            except OSError:
                path_widget.disabled = True
                select_widget.disabled = True
            else:
                path_widget.disabled = False
                select_widget.disabled = False
                full_widget.children = [left_widget, right_widget]
            return

    def handle_folder_click(event):
        """Hide the Select button while a folder is highlighted."""
        if event['new']:
            full_widget.children = [left_widget]

    # Root the tree at the default Drive path.
    my_dir = '/drive/MyDrive'
    my_root_node = Node(my_dir.split('/')[-1])
    my_tree_dict[my_dir] = my_root_node
    my_tree.add_node(my_root_node)
    my_root_node.observe(handle_folder_click, 'selected')

    for root, d_names, f_names in os.walk(my_dir):
        # Prune hidden directories in place so os.walk never descends into
        # them (removing items while iterating the list skipped entries).
        d_names[:] = sorted(d for d in d_names if not d.startswith('.'))
        media_names = sorted(
            f for f in f_names if f.lower().endswith(media_suffixes)
        )

        if root not in my_tree_dict:
            # Top-down traversal guarantees the parent folder already has a node.
            parent_node = my_tree_dict[os.path.dirname(root)]
            node = Node(root.split('/')[-1])
            my_tree_dict[root] = node
            parent_node.add_node(node)
            node.observe(handle_folder_click, 'selected')

        if media_names:
            parent_node = my_tree_dict[root]
            parent_node.opened = False
            for f_name in media_names:
                node = Node(f_name)
                node.icon = 'file'
                my_tree_dict[os.path.join(root, f_name)] = node
                parent_node.add_node(node)
                node.observe(handle_file_click, 'selected')

    with tree_widget:
        tree_widget.clear_output()
        display(my_tree)

    return full_widget


# Pass an explicit placeholder: file_tree never reads its argument, and calling
# it with no argument fails whenever the parameter has no default.
tree = file_tree(None)
# Bare expression as the last line of the cell so the notebook renders the widget.
tree


In [None]:
#@title **从本地上传文件(可多选）**
# @markdown <br/>若已选择谷歌盘中的文件，则跳过此步执行下一单元格。</font>

from google.colab import files

# Tell the transcription cell to read local uploads instead of Drive selections.
use_drive = False
# Opens the browser upload dialog; returns a dict of {file name: file bytes}.
uploaded = files.upload()
# Keep every uploaded file: the cell title promises multi-select and the
# transcription cell loops over `file_names`, but only the first key was kept.
file_names = list(uploaded.keys())
if not file_names:
    # Fail here with a clear message instead of an IndexError later on.
    raise ValueError('未上传任何文件 / No file was uploaded.')

print('已上传文件，可以执行下个单元格')

**<font size='5'>以下顺次点击下方每个单元格左侧的“运行”图标，不可跳过步骤</font>**

**<br/>【重要】:** 务必在"修改"->"笔记本设置"->"硬件加速器"中选择GPU！否则处理速度会非常慢。

In [None]:
#@title **参数设置:**


# @markdown **</br>【重要】:** 选择上传的文件类型(视频-video/音频-audio）</font>
# NOTE(review): a source-encoding declaration is only honored on the first or
# second line of a file, so the comment below has no effect at this position.
# encoding:utf-8
# Input kind; the transcription cell runs an ffmpeg audio extraction when this is "video".
file_type = "audio"  # @param ["audio","video"]

# @markdown <br/>模型大小将影响转录时间和质量, **默认使用稳定的large-v2模型以节省时间**
# @markdown <br/>默认识别语言为日语，若使用其它语言的视频请自行输入即可。请注意：使用两字母语言代码如'en'，'ja'
# @markdown <br/>【请注意】：large-v3在某些情况下可能未必优于large-v2或更早的模型，请用户自行选择

# Model name handed to WhisperModel(...) in the transcription cell.
model_size = "large-v2"  # @param ["base","small","medium", "large-v1","large-v2","large-v3"]
# Language code handed to model.transcribe(language=...).
language = "ja"  # @param {type:"string"}

# @markdown <br/>将存在空格的单行文本分割为多行（多句）。分割后的若干行均临时采用相同时间戳，且添加了adjust_required标记提示调整时间戳避免叠轴
# @markdown <br/>普通分割（Modest): 当空格后的文本长度超过5个字符，则另起一行
# @markdown <br/>全部分割（Aggressive): 只要遇到空格即另起一行
# @markdown <br/>标点分割（Punctuation): 只要遇到句号即另起一行，在未来可能添加更加智能的标点分割方法
# Line-splitting options.
# NOTE(review): neither name is referenced by any code visible in this notebook
# excerpt — confirm whether the splitting step still exists.
is_split = "No"  # @param ["No","Yes"]
split_method = "Modest"  # @param ["Modest","Aggressive", "Punctuation"]

# @markdown **使用VAD过滤**
# @markdown <font size="2">使用[Silero VAD model](https://github.com/snakers4/silero-vad)以检测并过滤音频中的无声段落（推荐小语种使用）
# @markdown <br/>【注意】使用VAD filter有优点亦有缺点，请用户自行根据音频内容决定是否启用. [关于VAD filter](https://github.com/Ayanaminn/N46Whisper/blob/main/FAQ.md)
# Stored as the string "True"/"False", not a bool: consumers must compare it
# against "True" rather than rely on truthiness (a non-empty string is truthy).
is_vad_filter = "False"  # @param ["True", "False"]

# @markdown **设置Beam Size**
# @markdown <font size="2">Beam Size数值越高，在识别时探索的路径越多，这在一定范围内可以帮助提高识别准确性，但是相对的VRAM使用也会更高. 同时，Beam Size在超过5-10后有可能降低精确性，详情请见https://arxiv.org/pdf/2204.05424.pdf
# @markdown <br/> 默认设置为 5
# Beam width used for transcription unless `beam_size_off` is set.
set_beam_size = 5  #@param

# @markdown <font size="2">在不设置Beam Size时，Whisper将会使用贪心解码，这在一定程度上可能与英语等其他语言的换行功能有联系，详情请见https://github.com/Ayanaminn/N46Whisper/issues/46
# @markdown <br/> 默认设置为 false
# When True, the transcription cell does not use `set_beam_size`.
beam_size_off = False  # @param {type:"boolean"}

In [None]:
#@title **运行Whisper**
# Hugging Face Hub
# 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

#@markdown 完成后ass文件将自动下载到本地/ass file will be auto downloaded after finish.
! pip install ffmpeg
! pip install pysubs2
! pip install faster-whisper

import torch
from faster_whisper import WhisperModel
import ipywidgets as widgets
from IPython.display import display, clear_output

clear_output()
print('语音识别库配置完毕，将开始转换')

import os
from tqdm import tqdm
import time
from pathlib import Path
import sys
import pysubs2

# Enable nested asyncio in Jupyter Notebook
# nest_asyncio.apply()

# assert file_name != ""
# assert language != ""
import warnings

warnings.filterwarnings("ignore")

file_basenames = []

if use_drive:
    output_dir = os.path.dirname(drive_dir[0])
    try:
        file_names = drive_dir
        for i in range(len(file_names)):
            file_basenames.append(file_names[i].split('.')[0])
        # print(file_name)
        output_dir = os.path.dirname(drive_dir[0])
    except Exception as e:
        print(f'error: {e}')
else:
    sys.path.append('/drive/content')
    if not os.path.exists(file_names[0]):
        raise ValueError(f"No {file_names[0]} found in current path.")
    else:
        try:
            for i in range(len(file_names)):
                file_basenames.append(Path(file_names[i]).stem)
            output_dir = Path(file_names[0]).parent.resolve()
            # print(file_basename)
            # print(output_dir)
        except Exception as e:
            print(f'error: {e}')

clear_output()
print('加载模型..')

model = WhisperModel(model_size)
torch.cuda.empty_cache()

for i in range(len(file_names)):
    file_name = file_names[i]
    #Transcribe
    file_basename = file_basenames[i]
    if file_type == "video":
        print('提取音频中...')
        os.system(f'ffmpeg -i {file_name} -f mp3 -ab 192000 -vn {file_basename}.mp3')
        print('提取完毕.')
    # print(file_basename)
    tic = time.time()
    clear_output()
    print(f'识别中... \n{file_name}')

    if beam_size_off:
        segments, info = model.transcribe(
            audio=f'{file_name}',
            language=language,
            vad_filter=is_vad_filter,
            vad_parameters=dict(min_silence_duration_ms=1000)
        )
    else:
        segments, info = model.transcribe(
            audio=f'{file_name}',
            beam_size=set_beam_size,
            language=language,
            vad_filter=is_vad_filter,
            vad_parameters=dict(min_silence_duration_ms=1000)
        )

    # segments is a generator so the transcription only starts when you iterate over it
    # to use pysubs2, the argument must be a segment list-of-dicts
    total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
    results = []
    with tqdm(total=total_duration, unit=" seconds") as pbar:
        for s in segments:
            segment_dict = {'start': s.start, 'end': s.end, 'text': s.text}
            results.append(segment_dict)
            segment_duration = s.end - s.start
            pbar.update(segment_duration)

    #Time comsumed
    toc = time.time()
    print('识别完毕 Done')
    print(f'Time consumpution {toc - tic}s')

    subs = pysubs2.load_from_whisper(results)
    srt_filename = file_basename + '.srt'
    subs.save(srt_filename)

    files.download(srt_filename)
    
    print('第', i + 1, '个文件字幕生成完毕/', i + 1)
    torch.cuda.empty_cache()

print('所有字幕生成完毕 All done!')