<a href="https://colab.research.google.com/github/Eve-tsai/faster-whisper/blob/main/%E8%87%AA%E8%A8%AD_Faster_Whisper_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **套件安裝**

In [3]:
# System package: the CUDA 11 cuBLAS library, installed through apt.
!apt-get install -y libcublas11
# Pinned Python packages — presumably pinned to stay compatible with the
# libcublas11 runtime installed above; TODO confirm before bumping.
!pip install ctranslate2==4.4.0
!pip install numpy==1.24.0

# Transcription library and the widget toolkit used by the UI cells (both unpinned).
!pip install faster-whisper ipywidgets

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libcublas11 is already the newest version (11.7.4.6~11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting faster-whisper
  Using cached faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Using cached onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Using cached av-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Using cached hum

In [4]:
from ipywidgets import widgets, VBox, HBox, Layout
from IPython.display import display, clear_output
from faster_whisper import WhisperModel
import os
from google.colab import files as colab_files

# **上傳檔案 version**

## 初始化 WhisperModel

In [3]:
# Checkpoint to load; pick another size if needed:
# tiny, base, small, medium, large, large-v2, large-v3
model_size = "large-v2"
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

## 轉譯辨識

In [11]:

def _format_timestamp(seconds):
    """Render an offset given in seconds as ``hh:mm:ss.SSS``.

    The value is rounded to the nearest whole millisecond *before* it is split
    into fields. Splitting the float first and truncating afterwards turned
    offsets such as 2.3 s into ``00:00:02.299``.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, whole_seconds, millis)


def transcribe(audio_path, mode):
    """Transcribe one audio file, print the text, save it and trigger a download.

    Uses the notebook-level ``model`` (through ``model.transcribe``) and
    ``output`` (as a context manager that receives everything printed here).

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        mode: ``"normal"`` joins the segment texts with "，"; ``"timeline"``
            writes one ``[start -> end] text`` line per segment; ``"subtitle"``
            writes numbered blocks with a ``start --> end`` line. Any other
            value produces an empty transcript.

    Side effects:
        Writes ``<audio file stem>_轉譯.txt`` (UTF-8) into the current working
        directory and passes that file to ``colab_files.download``.
    """
    transcription = ""
    with output:
        clear_output()
        print("正在進行語音辨識，請稍候...")
        segments, _info = model.transcribe(audio_path, beam_size=5, language="zh", initial_prompt="繁體")

        if mode == "normal":
            transcription = "，".join(segment.text for segment in segments)
        elif mode == "timeline":
            transcription = "".join(
                "[%s -> %s] %s\n" % (
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for segment in segments
            )
        elif mode == "subtitle":
            transcription = "".join(
                "%d\n%s --> %s\n%s\n\n" % (
                    index,
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for index, segment in enumerate(segments, 1)
            )
        print("辨識完成！結果如下：")
        print(transcription)

        file_name = os.path.splitext(os.path.basename(audio_path))[0]
        output_name = f"{file_name}_轉譯.txt"
        # Explicit UTF-8 so the Chinese text is written the same way whatever
        # the platform's default encoding is.
        with open(output_name, "w", encoding="utf-8") as file:
            file.write(transcription)
        print(f"辨識結果已保存為 {output_name}")

        # Kept from the original: only ImportError is treated as "not in Colab".
        try:
            colab_files.download(output_name)
        except ImportError:
            print("自動下載功能只在 Colab 環境中有效。")


## 控制件初始化

In [6]:

# Dropdown offering the three transcript formats.
mode_selector = widgets.Dropdown(
    options=[
        ("一般版本", "normal"),
        ("加入時間軸版本", "timeline"),
        ("產生字幕檔的版本", "subtitle"),
    ],
    value="normal",
    description="模式:",
    # Fixed dropdown width, centred on the cross axis.
    layout=Layout(width="250px", align_self="center"),
)

# Button for uploading a file.
upload_button = widgets.Button(
    description="上傳檔案",
    button_style="info",
    icon="upload",
    # Fixed button width, centred on the cross axis.
    layout=Layout(width="150px", align_self="center"),
)

# Button for starting speech recognition.
transcribe_button = widgets.Button(
    description="進行語音辨識",
    button_style="success",
    icon="check",
    # Fixed button width, centred on the cross axis.
    layout=Layout(width="150px", align_self="center"),
)

# Output area used by the handlers as a context manager for their messages.
output = widgets.Output()


## 檔案上傳
注意：透過此方式上傳檔案的速度非常慢。

In [7]:

# Names of the files uploaded so far, in upload order.
uploaded_files = []


def on_upload_button_clicked(b):
    """Open the Colab upload dialog and remember every uploaded file name.

    Args:
        b: The clicked button (unused).
    """
    # Iterating the mapping returned by upload() yields its keys, i.e. the names.
    for filename in colab_files.upload():
        print(f"已上傳檔案: {filename}")
        uploaded_files.append(filename)

def on_transcribe_button_clicked(b):
    """Transcribe the most recently uploaded file in the selected mode.

    A message is printed into ``output`` instead when nothing has been
    uploaded yet or when the latest upload no longer exists on disk.

    Args:
        b: The clicked button (unused).
    """
    if uploaded_files:
        latest_upload = uploaded_files[-1]
        selected_mode = mode_selector.value
        if os.path.exists(latest_upload):
            transcribe(latest_upload, selected_mode)
            return
        problem = "指定的檔案路徑不存在，請檢查！"
    else:
        problem = "尚未上傳檔案，請先上傳！"

    with output:
        clear_output()
        print(problem)

In [12]:
# Connect each button to its click handler.
upload_button.on_click(on_upload_button_clicked)
transcribe_button.on_click(on_transcribe_button_clicked)

# Lay the three controls out in one row. `spacing` is not among the ipywidgets
# Layout traits, so the intended gap never reached the browser; `grid_gap` is
# the Layout trait that emits the CSS gap between children.
controls = HBox(
    [mode_selector, upload_button, transcribe_button],
    layout=Layout(justify_content="flex-start", align_items="flex-start", grid_gap="10px"),
)

# Stack the control row above the output area, again with a real gap.
ui = VBox([controls, output], layout=Layout(grid_gap="20px"))
display(ui)

正在進行語音辨識，請稍候...




---





# **掛接google drive version**

## 初始化whisper model

In [5]:
# Initialise the WhisperModel.
# Pick another checkpoint size if needed:
# tiny, base, small, medium, large, large-v2, large-v3
model_size = "large-v2"
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

## 掛載google drive

In [6]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 轉譯辨識
此版本的轉譯結果可能會出現簡體中文。

In [None]:
from ipywidgets import widgets, VBox, Dropdown

def _format_timestamp(seconds):
    """Render an offset given in seconds as ``hh:mm:ss.SSS``.

    The value is rounded to the nearest whole millisecond *before* it is split
    into fields. Splitting the float first and truncating afterwards turned
    offsets such as 2.3 s into ``00:00:02.299``.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, whole_seconds, millis)


def transcribe(audio_path, mode):
    """Transcribe one audio file, print the text, save it and trigger a download.

    Uses the notebook-level ``model`` (through ``model.transcribe``) and
    ``output`` (as a context manager that receives everything printed here).
    No ``language`` argument is passed to the model in this variant.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        mode: ``"normal"`` joins the segment texts with "，"; ``"timeline"``
            writes one ``[start -> end] text`` line per segment; ``"subtitle"``
            writes numbered blocks with a ``start --> end`` line. Any other
            value produces an empty transcript.

    Side effects:
        Writes ``<audio file stem>_transcription.txt`` (UTF-8) into the current
        working directory and, when ``google.colab`` can be imported, passes
        that file to ``files.download``.
    """
    transcription = ""
    with output:
        clear_output()
        print("正在進行語音辨識，請稍候...")
        segments, _info = model.transcribe(audio_path, beam_size=5, initial_prompt="繁體")

        if mode == "normal":
            transcription = "，".join(segment.text for segment in segments)
        elif mode == "timeline":
            transcription = "".join(
                "[%s -> %s] %s\n" % (
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for segment in segments
            )
        elif mode == "subtitle":
            transcription = "".join(
                "%d\n%s --> %s\n%s\n\n" % (
                    index,
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for index, segment in enumerate(segments, 1)
            )
        print("辨識完成！結果如下：")
        print(transcription)

        file_name = os.path.splitext(os.path.basename(audio_path))[0]
        output_name = f"{file_name}_transcription.txt"
        # Explicit UTF-8 so the Chinese text is written the same way whatever
        # the platform's default encoding is.
        with open(output_name, "w", encoding="utf-8") as file:
            file.write(transcription)
        print(f"辨識結果已保存為 {output_name}")

        # The download helper is imported lazily so a missing google.colab
        # package only disables the download step.
        try:
            from google.colab import files
            files.download(output_name)
        except ImportError:
            print("自動下載功能只在 Colab 環境中有效。")

# Dropdown offering the three transcript formats.
mode_selector = Dropdown(
    options=[
        ("一般版本", "normal"),
        ("加入時間軸版本", "timeline"),
        ("產生字幕檔的版本", "subtitle"),
    ],
    value="normal",
    description="模式:",
)

# Free-text field for the path of the audio file.
file_path_input = widgets.Text(
    value="",
    placeholder="請輸入檔案路徑",
    description="檔案路徑:",
    disabled=False,
)

# Button for starting speech recognition.
transcribe_button = widgets.Button(
    description="進行語音辨識",
    disabled=False,
    button_style="info",
    tooltip="Click me",
    icon="check",
)

# Output area used by the handlers as a context manager for their messages.
output = widgets.Output()

def on_transcribe_button_clicked(b):
    """Transcribe the file named in the path field using the selected mode.

    When the path does not exist, a message is printed into ``output`` and
    nothing is transcribed.

    Args:
        b: The clicked button (unused).
    """
    requested_path, chosen_mode = file_path_input.value, mode_selector.value
    if not os.path.exists(requested_path):
        with output:
            clear_output()
            print("指定的檔案路徑不存在，請檢查！")
        return
    transcribe(requested_path, chosen_mode)

# Run the handler whenever the transcribe button is clicked.
transcribe_button.on_click(on_transcribe_button_clicked)

# Clear the cell output before the form is displayed.
clear_output()

# Stack the path field, mode dropdown, button and output area vertically.
ui = VBox(
    children=[
        file_path_input,
        mode_selector,
        transcribe_button,
        output,
    ]
)
display(ui)


# 轉譯辨識(cc)
使用 OpenCC 將轉譯結果強制由簡體轉為繁體。

In [None]:
# Install OpenCC (no version pin).
! pip install opencc

# Converter class used by the transcribe() definition in the next cell.
from opencc import OpenCC

In [None]:
# CC 版本
from ipywidgets import widgets, VBox, Dropdown

def _format_timestamp(seconds):
    """Render an offset given in seconds as ``hh:mm:ss.SSS``.

    The value is rounded to the nearest whole millisecond *before* it is split
    into fields. Splitting the float first and truncating afterwards turned
    offsets such as 2.3 s into ``00:00:02.299``.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, whole_seconds, millis)


def transcribe(audio_path, mode):
    """Transcribe one audio file, convert the text with OpenCC, save and download it.

    Uses the notebook-level ``model`` (through ``model.transcribe``) and
    ``output`` (as a context manager that receives everything printed here).
    The finished transcript is passed through ``OpenCC('s2t').convert``.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        mode: ``"normal"`` joins the segment texts with "，"; ``"timeline"``
            writes one ``[start -> end] text`` line per segment; ``"subtitle"``
            writes numbered blocks with a ``start --> end`` line. Any other
            value produces an empty transcript.

    Side effects:
        Writes ``<audio file stem>_轉譯.txt`` (UTF-8) into the current working
        directory and, when ``google.colab`` can be imported, passes that file
        to ``files.download``.
    """
    transcription = ""
    cc = OpenCC('s2t')  # Simplified -> Traditional Chinese converter
    with output:
        clear_output()
        print("正在進行語音辨識，請稍候...")
        segments, _info = model.transcribe(
            audio_path,
            beam_size=5,
            language="zh",
            initial_prompt="繁體",
        )

        if mode == "normal":
            transcription = "，".join(segment.text for segment in segments)
        elif mode == "timeline":
            transcription = "".join(
                "[%s -> %s] %s\n" % (
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for segment in segments
            )
        elif mode == "subtitle":
            transcription = "".join(
                "%d\n%s --> %s\n%s\n\n" % (
                    index,
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for index, segment in enumerate(segments, 1)
            )

        # Convert the whole transcript from Simplified to Traditional Chinese.
        transcription = cc.convert(transcription)

        print("辨識完成！結果如下：")
        print(transcription)

        file_name = os.path.splitext(os.path.basename(audio_path))[0]
        output_name = f"{file_name}_轉譯.txt"
        # Explicit UTF-8 so the Chinese text is written the same way whatever
        # the platform's default encoding is.
        with open(output_name, "w", encoding="utf-8") as file:
            file.write(transcription)
        print(f"辨識結果已保存為 {output_name}")

        # The download helper is imported lazily so a missing google.colab
        # package only disables the download step.
        try:
            from google.colab import files
            files.download(output_name)
        except ImportError:
            print("自動下載功能只在 Colab 環境中有效。")

# Dropdown offering the three transcript formats.
mode_selector = Dropdown(
    options=[
        ("一般版本", "normal"),
        ("加入時間軸版本", "timeline"),
        ("產生字幕檔的版本", "subtitle"),
    ],
    value="normal",
    description="模式:",
)

# Free-text field for the path of the audio file.
file_path_input = widgets.Text(
    value="",
    placeholder="請輸入檔案路徑",
    description="檔案路徑:",
    disabled=False,
)

# Button for starting speech recognition.
transcribe_button = widgets.Button(
    description="進行語音辨識",
    disabled=False,
    button_style="info",
    tooltip="Click me",
    icon="check",
)

# Output area used by the handlers as a context manager for their messages.
output = widgets.Output()

def on_transcribe_button_clicked(b):
    """Transcribe the file named in the path field using the selected mode.

    When the path does not exist, a message is printed into ``output`` and
    nothing is transcribed.

    Args:
        b: The clicked button (unused).
    """
    requested_path, chosen_mode = file_path_input.value, mode_selector.value
    if not os.path.exists(requested_path):
        with output:
            clear_output()
            print("指定的檔案路徑不存在，請檢查！")
        return
    transcribe(requested_path, chosen_mode)

# Run the handler whenever the transcribe button is clicked.
transcribe_button.on_click(on_transcribe_button_clicked)

# Clear the cell output before the form is displayed.
clear_output()

# Stack the path field, mode dropdown, button and output area vertically.
ui = VBox(
    children=[
        file_path_input,
        mode_selector,
        transcribe_button,
        output,
    ]
)
display(ui)



---



# 轉譯辨識
此版本使用 `initial_prompt="As a default, provide response in zh-tw unless specified otherwise"`。

In [8]:
from ipywidgets import widgets, VBox, Dropdown

def _format_timestamp(seconds):
    """Render an offset given in seconds as ``hh:mm:ss.SSS``.

    The value is rounded to the nearest whole millisecond *before* it is split
    into fields. Splitting the float first and truncating afterwards turned
    offsets such as 2.3 s into ``00:00:02.299``.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, whole_seconds, millis)


def transcribe(audio_path, mode):
    """Transcribe one audio file, print the text, save it and trigger a download.

    Uses the notebook-level ``model`` (through ``model.transcribe``) and
    ``output`` (as a context manager that receives everything printed here).
    This variant passes an English ``initial_prompt`` asking for zh-tw output.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        mode: ``"normal"`` joins the segment texts with "，"; ``"timeline"``
            writes one ``[start -> end] text`` line per segment; ``"subtitle"``
            writes numbered blocks with a ``start --> end`` line. Any other
            value produces an empty transcript.

    Side effects:
        Writes ``<audio file stem>_transcription.txt`` (UTF-8) into the current
        working directory and, when ``google.colab`` can be imported, passes
        that file to ``files.download``.
    """
    transcription = ""
    with output:
        clear_output()
        print("正在進行語音辨識，請稍候...")
        segments, _info = model.transcribe(
            audio_path,
            beam_size=5,
            language="zh",
            initial_prompt="As a default, provide response in zh-tw unless specified otherwise",
        )

        if mode == "normal":
            transcription = "，".join(segment.text for segment in segments)
        elif mode == "timeline":
            transcription = "".join(
                "[%s -> %s] %s\n" % (
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for segment in segments
            )
        elif mode == "subtitle":
            transcription = "".join(
                "%d\n%s --> %s\n%s\n\n" % (
                    index,
                    _format_timestamp(segment.start),
                    _format_timestamp(segment.end),
                    segment.text,
                )
                for index, segment in enumerate(segments, 1)
            )
        print("辨識完成！結果如下：")
        print(transcription)

        file_name = os.path.splitext(os.path.basename(audio_path))[0]
        output_name = f"{file_name}_transcription.txt"
        # Explicit UTF-8 so the Chinese text is written the same way whatever
        # the platform's default encoding is.
        with open(output_name, "w", encoding="utf-8") as file:
            file.write(transcription)
        print(f"辨識結果已保存為 {output_name}")

        # The download helper is imported lazily so a missing google.colab
        # package only disables the download step.
        try:
            from google.colab import files
            files.download(output_name)
        except ImportError:
            print("自動下載功能只在 Colab 環境中有效。")

# Dropdown offering the three transcript formats.
mode_selector = Dropdown(
    options=[
        ("一般版本", "normal"),
        ("加入時間軸版本", "timeline"),
        ("產生字幕檔的版本", "subtitle"),
    ],
    value="normal",
    description="模式:",
)

# Free-text field for the path of the audio file.
file_path_input = widgets.Text(
    value="",
    placeholder="請輸入檔案路徑",
    description="檔案路徑:",
    disabled=False,
)

# Button for starting speech recognition.
transcribe_button = widgets.Button(
    description="進行語音辨識",
    disabled=False,
    button_style="info",
    tooltip="Click me",
    icon="check",
)

# Output area used by the handlers as a context manager for their messages.
output = widgets.Output()

def on_transcribe_button_clicked(b):
    """Transcribe the file named in the path field using the selected mode.

    When the path does not exist, a message is printed into ``output`` and
    nothing is transcribed.

    Args:
        b: The clicked button (unused).
    """
    requested_path, chosen_mode = file_path_input.value, mode_selector.value
    if not os.path.exists(requested_path):
        with output:
            clear_output()
            print("指定的檔案路徑不存在，請檢查！")
        return
    transcribe(requested_path, chosen_mode)

# Run the handler whenever the transcribe button is clicked.
transcribe_button.on_click(on_transcribe_button_clicked)

# Clear the cell output before the form is displayed.
clear_output()

# Stack the path field, mode dropdown, button and output area vertically.
ui = VBox(
    children=[
        file_path_input,
        mode_selector,
        transcribe_button,
        output,
    ]
)
display(ui)


VBox(children=(Text(value='', description='檔案路徑:', placeholder='請輸入檔案路徑'), Dropdown(description='模式:', options…