<a href="https://colab.research.google.com/github/0ldriku/CAF-Annotator/blob/main/Maruko_Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to use
 **Important: Ensure that the runtime of Google Colab is set to the GPU runtime for optimal performance and faster processing.**

To use this notebook:
1. Upload the audio or video files you wish to transcribe. You can select and upload multiple files at once.
2. Adjust the settings according to your preferences, selecting the model size and specifying the language.
3. Run the "3. Transcribe" cell to start the transcription; the results are downloaded automatically as a ZIP file when it finishes.
4. **Important:** Ensure that the runtime of Google Colab is set to the GPU runtime for optimal performance and faster processing.



In [None]:
#@title **1. Upload Local File**
# @markdown You can upload multiple files.
from google.colab import files

# Inputs come from a direct upload, not from the Google Drive browser.
use_drive = False

# Opens the Colab upload dialog; the returned mapping is keyed by file name.
uploaded = files.upload()

# Rebuilt from scratch on every run: re-running this cell replaces the
# previous list instead of adding to it.
file_names = [uploaded_name for uploaded_name in uploaded.keys()]
print('File uploaded，please continue to upload more or execute next cell')


In [None]:
#@title **2. Required settings:**

# @markdown **【IMPORTANT】:** Choose the model size and the language spoken in the uploaded files.

# @markdown <br/>The model size affects both the transcription time and the transcription quality.
# @markdown <br/>The default model is the stable large-v2 model.
# @markdown <br/>The default recognition language is English. If your audio is in another language, change the language code (for example 'en', 'ja').
# @markdown <br/>【Please note】: large-v3 may not necessarily be better than large-v2 or earlier models in some cases. Users should choose for themselves.

model_size = "large-v2"  # @param ["base","small","medium", "large-v1","large-v2","large-v3"]
language = "en"  # @param {type:"string"}

# Decoding options that are not exposed as form fields.
set_beam_size = 5
# A real boolean: the previous value, the string "False", is non-empty and
# therefore truthy, so it would have switched the filter ON if used as a flag.
is_vad_filter = False



In [None]:
#@title **3. Transcribe**
#@markdown The transcription files are downloaded automatically when this cell finishes.

!pip install faster-whisper

import os
import sys
import time
from pathlib import Path
import json
import zipfile
import torch
from faster_whisper import WhisperModel
from faster_whisper.tokenizer import Tokenizer
from google.colab import files
from IPython.display import clear_output

# Work out which files to transcribe (`file_names`), the base name used for
# each file's outputs (`file_basenames`) and the folder of the first input
# (`output_dir`).
if use_drive:
    # Copy the Drive selection so later clicks in the file browser cannot
    # change the list while this cell is running.
    file_names = list(drive_dir)
else:
    file_names = list(file_names)

# Fail early with a clear message instead of an IndexError on `[0]` below.
if not file_names:
    raise ValueError("No input file was selected. Upload a file or select one from Google Drive first.")

# Check every input, not only the first one, so a missing file is reported
# before the model is loaded.
missing_files = [name for name in file_names if not os.path.exists(name)]
if missing_files:
    raise ValueError(f"No {', '.join(missing_files)} found in current path.")

if use_drive:
    # Keep the Drive folder in the base name, so outputs land next to the
    # source file, but strip only the final extension: splitting on the first
    # '.' would truncate any path that has a dot in a folder or file name.
    file_basenames = [os.path.splitext(name)[0] for name in file_names]
    output_dir = os.path.dirname(file_names[0])
else:
    file_basenames = [Path(name).stem for name in file_names]
    output_dir = Path(file_names[0]).parent.resolve()

clear_output()
print('Loading model...')
# Prefer the GPU; without one, fall back to an int8 CPU model so the cell
# still runs (much more slowly) instead of failing while loading the model.
if torch.cuda.is_available():
    model = WhisperModel(model_size, device="cuda", compute_type="float16")
else:
    print('No GPU detected, falling back to CPU (this will be much slower).')
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
torch.cuda.empty_cache()

# Don't include Arabic numerals and symbols for sequencematcher.py:
# collect every token id whose decoded text (ignoring one leading space)
# consists only of the characters below, so they can be suppressed later.
tokenizer = Tokenizer(tokenizer=model.hf_tokenizer, task="transcribe", language="en", multilingual=True)
suppressed_chars = "0123456789$@\\*"
# NOTE(review): all() is True for an empty string, so a token that decodes to
# "" or to a single space is also collected here - confirm this is intended.
number_tokens = [
    token_id
    for token_id in range(tokenizer.eot)
    if all(char in suppressed_chars for char in tokenizer.decode([token_id]).removeprefix(" "))
]

# Transcribe every input file, write a JSON file (segments with word-level
# timestamps) and a plain-text subtitle file per input, bundle them all into
# one ZIP archive and download it.

# Take the decoding options from the settings cell instead of hard-coding them.
transcribe_language = language or None  # an empty value leaves language detection to the model
# Accept both a real boolean and the strings "True"/"False".
use_vad_filter = str(is_vad_filter).strip().lower() == "true"

# Create a single ZIP file
combined_zip_filename = "transcription_results.zip"
with zipfile.ZipFile(combined_zip_filename, 'w') as zipf:
    for i, file_name in enumerate(file_names):
        file_basename = file_basenames[i]
        _, extension = os.path.splitext(file_name)

        tic = time.time()
        clear_output()
        print("Transcribe in progress...")
        segments, info = model.transcribe(
            file_name,
            suppress_tokens=[-1] + number_tokens,
            beam_size=set_beam_size,
            word_timestamps=True,
            language=transcribe_language,
            vad_filter=use_vad_filter,
        )

        transcribe_results = []
        for segment in segments:
            transcribe_results.append({
                "start": segment.start,
                "end": segment.end,
                "subtitle": segment.text,
                # Attribute access works whether the library exposes words as
                # named tuples or as dataclasses; positional indexing does not.
                "word_timestamps": [
                    {"start": word.start, "end": word.end, "text": word.word}
                    for word in (segment.words or [])
                ],
            })
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        # Time consumed
        toc = time.time()
        print('Done')
        print(f'Time consumpution {toc-tic}s')

        # Save the transcription results to a JSON file
        output_file = f"{file_basename}{extension}.transcribe.json"
        with open(output_file, "w", encoding="utf-8") as json_file:
            json.dump(transcribe_results, json_file, indent=2, ensure_ascii=False)

        # Extract and save subtitles to a text file
        subtitles_path = f"{file_basename}{extension}.subtitles.txt"
        with open(subtitles_path, "w", encoding="utf-8") as subtitles_file:
            for result in transcribe_results:
                subtitles_file.write(result["subtitle"] + "\n")

        # Write both files to the ZIP archive under "<base name>/<file name>".
        # Only the last path component is used, so a base name that contains
        # directories does not leak its full path into the archive.
        archive_folder = Path(file_basename).name
        zipf.write(output_file, f"{archive_folder}/{Path(output_file).name}")
        zipf.write(subtitles_path, f"{archive_folder}/{Path(subtitles_path).name}")

        print(i+1, 'file(s) was completed!')

    torch.cuda.empty_cache()

print('All done!')

# Download the single ZIP file containing all the transcription results and subtitles
files.download(combined_zip_filename)

In [None]:
#@title **DON'T RUN [Select File From Google Drive]**

# @markdown Importing files from Google Drive is not recommended as it can take a considerable amount of time to import all the data.


!pip install geemap
from google.colab import drive
from google.colab import files
import os
import logging
from IPython.display import clear_output
import geemap

# Clear whatever this cell has printed so far before mounting.
clear_output()
# Mount Google Drive at /drive.
drive.mount('/drive')

print('Google Drive is mounted，please select file')


from ipytree import Tree, Node
import ipywidgets as widgets
from ipywidgets import interactive
# import os
from google.colab import output
# presumably required for the ipytree widget to render in Colab - confirm
output.enable_custom_widget_manager()
# Tells the transcription cell to take its inputs from `drive_dir`.
use_drive = True
# NOTE(review): a `global` statement at module level has no effect; the
# assignment below already creates a module-level name.
global drive_dir
# Paths chosen with the "Select" button are appended to this list.
drive_dir = []

def file_tree():
    """Build a simple Google Drive browser for picking media files.

    Walks ``/drive/MyDrive`` once and mirrors its folders and media files in
    an ipytree ``Tree``. Selecting a file node copies its path into a text
    field and, if the file can be opened, shows a "Select" button. Each click
    on "Select" appends the path in the text field to the module-level
    ``drive_dir`` list.

    Returns:
        The ``HBox`` widget that holds the path field, the tree and (once a
        readable file is selected) the "Select" button.
    """
    # Dotted suffixes so that only real file extensions match; a bare 'ts'
    # would also match any file whose name merely ends in "ts".
    media_suffixes = ('.mp3', '.m4a', '.flac', '.aac', '.wav', '.mp4', '.mkv', '.ts', '.flv')

    # create widgets as a simple file browser
    full_widget = widgets.HBox()
    left_widget = widgets.VBox()
    right_widget = widgets.VBox()

    path_widget = widgets.Text()
    path_widget.layout.min_width = '300px'
    select_widget = widgets.Button(
        description='Select', button_style='primary', tooltip='Select current media file.'
    )

    right_widget.children = [select_widget]
    full_widget.children = [left_widget]

    tree_widget = widgets.Output()
    tree_widget.layout.max_width = '300px'
    # `overflow` belongs to the widget's layout; assigning it on the widget
    # itself only creates an unused attribute.
    tree_widget.layout.overflow = 'auto'

    left_widget.children = [path_widget, tree_widget]

    # init file tree
    my_tree = Tree(multiple_selection=False)
    my_tree_dict = {}  # absolute path -> tree node

    def select_file(b):
        """Record the path currently shown in the text field as an input file."""
        drive_dir.append(path_widget.value)
        print('File selected，please continue to select more or execute next cell')

    select_widget.on_click(select_file)

    def handle_file_click(event):
        """Show the selected file's path; offer "Select" only if it can be opened."""
        if not event['new']:
            return
        cur_node = event['owner']
        for key, node in my_tree_dict.items():
            if node is cur_node and os.path.isfile(key):
                path_widget.value = key
                try:
                    # Opening the file is only a readability probe.
                    with open(key):
                        pass
                except Exception:
                    path_widget.disabled = True
                    select_widget.disabled = True
                else:
                    path_widget.disabled = False
                    select_widget.disabled = False
                    full_widget.children = [left_widget, right_widget]
                return

    def handle_folder_click(event):
        """Hide the "Select" button while a folder is selected."""
        if event['new']:
            full_widget.children = [left_widget]

    # start from the default drive root path and add nodes
    my_dir = '/drive/MyDrive'
    my_root_node = Node(os.path.basename(my_dir))
    my_tree_dict[my_dir] = my_root_node
    my_tree.add_node(my_root_node)
    my_root_node.observe(handle_folder_click, 'selected')

    for root, d_names, f_names in os.walk(my_dir):
        # Prune hidden folders in place so os.walk never descends into them.
        # Removing entries while iterating over the same list skips elements.
        d_names[:] = sorted(name for name in d_names if not name.startswith('.'))
        # only add media files
        media_names = sorted(name for name in f_names if name.lower().endswith(media_suffixes))

        if root not in my_tree_dict:
            # os.walk is top-down, so the parent folder already has a node.
            folder_node = Node(os.path.basename(root))
            my_tree_dict[root] = folder_node
            my_tree_dict[os.path.dirname(root)].add_node(folder_node)
            folder_node.observe(handle_folder_click, 'selected')

        if media_names:
            parent_node = my_tree_dict[root]
            parent_node.opened = False
            for f_name in media_names:
                file_node = Node(f_name)
                file_node.icon = 'file'
                my_tree_dict[os.path.join(root, f_name)] = file_node
                parent_node.add_node(file_node)
                file_node.observe(handle_file_click, 'selected')

    with tree_widget:
        tree_widget.clear_output()
        display(my_tree)

    return full_widget


# Build the Drive file browser; the bare name on the last line makes the
# notebook render the returned widget as the cell output.
tree= file_tree()
tree
