# 🗣️ [**AudioToText**](https://github.com/Carleslc/AudioToText)

[![Donate](https://www.ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/carleslc)

### 🛠 [Whisper by OpenAI](https://github.com/openai/whisper)


## [Step 1] ⚙️ Install the required libraries

Click the ▶️ button below to install the dependencies for this notebook.

In [None]:
!nvidia-smi
!apt-get install libcudnn8
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')
#@title { display-mode: "form" }
import subprocess
from sys import platform as sys_platform

# Install ffmpeg if not present
status, ffmpeg_version = subprocess.getstatusoutput("ffmpeg -version")
if status != 0:
    if sys_platform == 'linux':
        !apt-get -qq update && apt-get -qq install -y ffmpeg
    else:
        print("Install ffmpeg: https://ffmpeg.org/download.html")
else:
    print(ffmpeg_version.split('\n')[0])

# Install faster-whisper and other dependencies
print("Installing Python dependencies...")
!pip install --upgrade pip
!pip install faster-whisper deepgram-sdk pydub typing-extensions
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install openai-whisper
!pip install srt requests tqdm googletrans==4.0.0rc1 httpx aiometer
# https://stackoverflow.com/a/77671445
!apt install libcublas11

## [Step 2] 📁 Upload your audio files to the Files folder

⬅️ The Files folder in Google Colab is in the left-hand menu

Almost any audio or video file format is [supported](https://gist.github.com/Carleslc/1d6b922c8bf4a7e9627a6970d178b3a6).

In [2]:
#@title { display-mode: "form" }
import os
import torch
import math
from faster_whisper import WhisperModel
import subprocess
from pydub import AudioSegment
from pydub.silence import split_on_silence

# select task
task = "Transcribe" #@param ["Transcribe", "Translate to English"]
task = "transcribe" if task == "Transcribe" else "translate"

# set model
use_model = "large-v2" #@param ["tiny", "base", "small", "medium", "large-v1", "large-v2"]

# other parameters
prompt = "" #@param {type:"string"}
coherence_preference = "More coherence, but may repeat text" #@param ["More coherence, but may repeat text", "Less repetitions, but may have less coherence"]
api_key = '' #@param {type:"string"}

# Set device
if api_key:
    print("Using API")
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {'GPU' if DEVICE == 'cuda' else 'CPU ⚠️'}")
    if DEVICE == "cuda":
        !nvidia-smi -L
    else:
        print("Not using GPU can result in a very slow execution")
        print("Ensure Hardware accelerator by GPU is enabled in Google Colab: Runtime > Change runtime type")

# Load model
if not api_key:
    model_size = use_model
    print(f"\nLoading {model_size} model...")
    model = WhisperModel(model_size, device=DEVICE, compute_type="float16" if DEVICE == "cuda" else "int8")
    print(f"Model {model_size} loaded successfully.\n")

# Set options for faster-whisper
options = {
    'task': task,
    'beam_size': 5,
    'vad_filter': True,
    'initial_prompt': prompt or None,
    'word_timestamps': False,
    'language': None # Auto-detect language
}

Using GPU
GPU 0: Tesla T4 (UUID: GPU-0fb922ce-7d06-95a6-1d70-d27ce0d02a89)

Loading large-v2 model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Model large-v2 loaded successfully.



In [None]:
#@title { display-mode: "form" }
import os
import zipfile
import glob
from whisper.utils import get_writer

# Folder whose direct children are inspected (not recursive). Change if needed.
directory_to_scan = '.'

# Extensions (lowercase, with leading dot) accepted as input files
video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.webm', '.mka']

# Folder that receives the generated files; created on first run
output_dir = "audio_transcription"
os.makedirs(output_dir, exist_ok=True)

# Output formats, comma separated: https://github.com/openai/whisper/blob/v20231117/whisper/utils.py#L283
output_formats = "srt" #@param ["txt,vtt,srt,tsv,json", "txt,vtt,srt", "txt,vtt", "txt,srt", "txt", "vtt", "srt", "tsv", "json"] {allow-input: true}
output_formats = list(map(str.strip, output_formats.split(',')))


# --- Collect the video files that do not have an output file yet ---
audio_files = []
for file_path in glob.glob(os.path.join(directory_to_scan, '*')):
    # Directories and other non-regular entries are ignored.
    if not os.path.isfile(file_path):
        continue

    stem, ext = os.path.splitext(file_path)
    if ext.lower() not in video_extensions:
        continue

    # Only the first requested format is checked to decide whether the
    # file has already been processed.
    existing_output = os.path.join(output_dir, f"{os.path.basename(stem)}.{output_formats[0]}")
    if os.path.exists(existing_output):
        print(f"Skipping {os.path.basename(file_path)}: Subtitle file already exists.")
        continue

    audio_files.append(file_path)

# Transcribe every file collected in `audio_files`, write each result in all
# requested `output_formats` under `output_dir`, then archive that folder.
if not audio_files:
    print("No new video files found to process.")
else:
    # Announce which task (set in the configuration cell) is about to run.
    if task == "translate":
        print("-- TRANSLATE TO ENGLISH --")
    else:
        print("-- TRANSCRIPTION --")

    # Maps each processed file path to a dict with 'text', 'segments' and
    # 'language' keys, which is what the writers below receive.
    results = {}

    for audio_path in audio_files:
        print(f"\nProcessing: {os.path.basename(audio_path)}\n")

        if api_key:
            # No API backend exists in this cell. Skip the file explicitly:
            # falling through would reference an unbound `result` on the first
            # file, or store the previous file's transcript under this path.
            print("API processing is not implemented; skipping this file.")
            continue

        # Faster-Whisper
        segments, info = model.transcribe(audio_path, **options)
        print(f"Detected language: {info.language.title()}\n")

        # Collect and print in a single pass. faster-whisper documents
        # `segments` as a lazy generator, so each line appears as soon as
        # that segment has been decoded.
        transcription_text = []
        result_segments = []
        for segment in segments:
            transcription_text.append(segment.text)
            result_segments.append({
                'start': segment.start,
                'end': segment.end,
                'text': segment.text
            })
            print(f"[{segment.start:.2f} --> {segment.end:.2f}] {segment.text}")

        results[audio_path] = {
            'text': "\n".join(transcription_text),
            'segments': result_segments,
            'language': info.language
        }

    # Save results
    print("\nWriting results...")

    def save_file_from_result(result, output_format, output_dir, output_file_name):
        """Write `result` to `<output_dir>/<output_file_name>.<output_format>`.

        `output_file_name` is the target name without an extension.
        Any exception raised by the whisper writer propagates to the caller.
        """
        target_name = f"{output_file_name}.{output_format}"
        writer = get_writer(output_format, output_dir)
        # whisper's writer strips the last extension from the name it is given
        # before appending its own. Passing the full target name makes it
        # strip exactly that suffix, so a dotted name such as "my.video" is
        # written as "my.video.srt" rather than "my.srt".
        writer(result, target_name)
        output_file_path = os.path.join(output_dir, target_name)
        print(f"Saved: {output_file_path}")

    for audio_path, result in results.items():
        output_file_name_without_ext, _ = os.path.splitext(os.path.basename(audio_path))

        for output_format in output_formats:
            # Best effort: a failure for one format is reported and must not
            # prevent the remaining formats or files from being written.
            try:
                save_file_from_result(result, output_format, output_dir, output_file_name_without_ext)
            except Exception as e:
                print(f"Could not save {output_file_name_without_ext}.{output_format}: {e}")

    # --- Create a zip file of the subtitles ---
    zip_filename = "subtitles.zip"
    zip_path = os.path.join('.', zip_filename)

    with zipfile.ZipFile(zip_path, 'w') as zf:
        print(f"\nCreating {zip_filename}...")
        # Every file found under output_dir is added (including files from
        # earlier runs), stored under its path relative to output_dir.
        for root, dirs, files in os.walk(output_dir):
            for file in files:
                file_path = os.path.join(root, file)
                zf.write(file_path, os.path.relpath(file_path, output_dir))

    print(f"All subtitles have been archived into {zip_filename}.")