In [None]:
import whisperx
import pandas as pd
import os
import torch
import re
import csv

# import custom-made functions
from utils.textgrid_export import export_transcript_as_textgrid


# Folder scanned for recordings; only entries ending in ".wav" or ".mp4" are transcribed.
input_folder = "../online/Audiofiles/"
# Root folder for results: the transcription step looks for already-finished
# transcripts in its "tsv" subfolder and hands this folder to the TextGrid export.
output_folder = "../output/"

In [None]:
### Load the whisper model
# Choose runtime settings from the available hardware: a GPU gets the larger
# batch and half precision, the CPU gets a small batch and the default
# compute type (lower the GPU batch size to 4 when GPU memory is tight).
if torch.cuda.is_available():
    device, batch_size, compute_type = "cuda", 16, "float16"
else:
    device, batch_size, compute_type = "cpu", 4, "default"
model_size = "large-v3"

# Report the chosen configuration before the (slow) model load.
print(
    f"* Using device: {device} \n"
    f"* Batch size: {batch_size} \n"
    f"* Model size: {model_size} \n"
    f"* Compute type: {compute_type}"
)

# Load the WhisperX model with the language fixed to Dutch ('nl').
model = whisperx.load_model(
    model_size,
    device,
    compute_type=compute_type,
    language='nl',
)


### Transcribe all audio/video files in the input folder
# Transcripts are written to <output_folder>/tsv/ -- the same location the
# "already transcribed" check inspects -- so re-running skips finished files.
tsv_folder = os.path.join(output_folder, "tsv")
os.makedirs(tsv_folder, exist_ok=True)

# Alignment models are cached per language code so each one is loaded once
# rather than once per input file.
align_models = {}

for filename in os.listdir(input_folder):
    # only wav and mp4 files are transcribed; everything else is ignored
    if not filename.endswith((".wav", ".mp4")):
        continue

    path = os.path.join(input_folder, filename)

    # splitext drops only the final extension, so names that contain extra
    # dots (e.g. "session.1.wav") keep distinct output names
    output_filename = os.path.splitext(filename)[0] + ".txt"
    tsv_path = os.path.join(tsv_folder, output_filename)

    # skip files that already have a transcript
    if os.path.exists(tsv_path):
        print(f"{output_filename} already exists in the output folder")
        continue

    # 1. Transcribe with original whisper (batched)
    try:
        audio = whisperx.load_audio(path)
    except Exception as err:
        # An unreadable file should not abort the whole batch; report it and
        # move on. (Exception, not bare except, so Ctrl-C still interrupts.)
        print(f"The audio file is not working: {filename} ({err})")
        continue

    result = model.transcribe(audio, batch_size=batch_size)

    # 2. Align whisper output
    language = result["language"]
    if language not in align_models:
        align_models[language] = whisperx.load_align_model(language_code=language, device=device)
    model_a, metadata = align_models[language]
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # Normalise every aligned word in place (punctuation removed, lowercase)
    # so the TextGrid export below sees the same cleaned words as the tsv.
    transcribed_words = []
    for segment in result["word_segments"]:
        word = re.sub(r'[^\w\s]', '', segment["word"]).lower()
        segment["word"] = word
        transcribed_words.append(word)

    # tsv/txt output: one tab-separated row holding all words of the file;
    # utf-8 is explicit so accented characters do not depend on the OS default
    with open(tsv_path, 'w', newline='', encoding='utf-8') as f_output:
        tsv_output = csv.writer(f_output, delimiter='\t')
        tsv_output.writerow(transcribed_words)

    # textgrid output (skipped when nothing was transcribed)
    if transcribed_words:
        export_transcript_as_textgrid(result, output_filename, output_folder)
            
            
            
