# Imports

In [None]:
# Install faster-whisper
!pip install faster-whisper

In [None]:
# Import necessary libraries
import os
import subprocess

import torch
from faster_whisper import WhisperModel
from google.colab import files

# Functions

In [None]:
# Function to check if file is a video
def is_video_file(file_path):
    """Return True when ``file_path`` ends in a recognised video extension.

    Only the final suffix of the path is examined, and the comparison is
    case-insensitive.
    """
    _, suffix = os.path.splitext(file_path)
    # Add other video extensions here if needed.
    return suffix.lower() in ('.mp4', '.avi', '.mov', '.mkv', '.webm')

# Function to extract audio from video
def extract_audio_from_video(video_file, output_audio_file):
    """Extract the audio stream of ``video_file`` into ``output_audio_file``.

    Invokes the ``ffmpeg`` command-line tool, overwriting any existing
    output file (``-y``).

    Args:
        video_file: Path of the input video.
        output_audio_file: Path of the audio file to create.

    Raises:
        RuntimeError: If ffmpeg cannot be started or exits with a non-zero
            status. The tail of ffmpeg's stderr is included in the message.
    """
    # Global options such as -y conventionally precede the inputs.
    command = ['ffmpeg', '-y', '-i', video_file, '-q:a', '0', '-map', 'a', output_audio_file]
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as err:
        # Typically raised when the ffmpeg executable is not installed / not on PATH.
        raise RuntimeError(f"Could not run ffmpeg: {err}") from err

    if result.returncode != 0:
        # Fail here rather than letting a later step trip over a missing audio file.
        stderr_text = result.stderr.decode("utf-8", errors="replace")
        tail = "\n".join(stderr_text.strip().splitlines()[-10:])
        raise RuntimeError(
            f"ffmpeg failed with exit code {result.returncode} "
            f"while extracting audio from {video_file!r}:\n{tail}"
        )

# Load model function
def load_model(model_size, compute_type, device):
    """Build a ``WhisperModel`` and report the configuration that was used.

    Args:
        model_size: Model identifier forwarded to ``WhisperModel``.
        compute_type: Compute type forwarded to ``WhisperModel``.
        device: Device name forwarded to ``WhisperModel``.

    Returns:
        The constructed ``WhisperModel`` instance.
    """
    whisper = WhisperModel(model_size, compute_type=compute_type, device=device)
    summary = f"Model loaded: {model_size} | Compute type: {compute_type} | Device: {device}"
    print(summary)
    return whisper

# Transcription function
def transcribe_file(file_path, language, model_size, compute_type, beam_size, condition_on_previous_text, word_timestamps):
    """Transcribe an audio or video file and save the result to ``transcript.txt``.

    A model is loaded on CUDA when available, otherwise on CPU. If
    ``file_path`` has a video extension, its audio is first extracted to an
    ``.mp3`` file with the same base name and that file is transcribed.

    Args:
        file_path: Path of the media file, or ``None``.
        language: Language code forwarded to ``model.transcribe``.
        model_size: Model identifier forwarded to ``load_model``.
        compute_type: Compute type forwarded to ``load_model``.
        beam_size: Beam size forwarded to ``model.transcribe``.
        condition_on_previous_text: Forwarded to ``model.transcribe``.
        word_timestamps: When true, the transcript holds one
            ``start -> end word`` line per word; otherwise one line per
            segment.

    Returns:
        The transcript text (each line terminated by a newline), or the
        string ``"Please upload a file"`` when ``file_path`` is ``None``.
    """
    if file_path is None:
        return "Please upload a file"

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = load_model(model_size, compute_type, device)

    if is_video_file(file_path):
        print(f"Extracting audio from video file: {file_path}...")
        audio_file = os.path.splitext(file_path)[0] + ".mp3"
        extract_audio_from_video(file_path, audio_file)
        file_path = audio_file

    print(f"Transcribing {file_path}...")
    segments, info = model.transcribe(file_path, language=language, beam_size=beam_size,
                                      condition_on_previous_text=condition_on_previous_text,
                                      word_timestamps=word_timestamps)

    # Collect lines and join once instead of growing a string with += in the loop.
    lines = []
    for segment in segments:
        if word_timestamps:
            # Guard against a segment without word data (words may be None).
            lines.extend(
                f"{word.start:.2f} -> {word.end:.2f} {word.word}"
                for word in (segment.words or ())
            )
        else:
            lines.append(f"{segment.text}")
    transcription = "".join(f"{line}\n" for line in lines)

    # Save the transcript
    output_path = "transcript.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(transcription)

    print("Transcription completed. Transcript saved as 'transcript.txt'.")
    return transcription

# Main

### Legend:
- **Model Size**: Larger models are more accurate but slower and require more memory.
- **Compute Type**: float16 is faster, float32 is more precise, int8 is fastest but less accurate.
- **Beam Size**: Higher values may improve accuracy but increase processing time.
- **Condition on Previous Text**: If `True`, uses previous text to improve transcription continuity.
- **Word-level timestamps**: If `True`, provides timestamps for individual words instead of one line of text per segment.

In [None]:
# Parameters
# User-editable settings for the run; the main cell validates them and
# passes them to transcribe_file(). Allowed values are listed next to each one.
language = "en" # en, es, fr, de, it
model_size = "large-v3" # tiny, base, small, medium, large-v3
compute_type = "float16" # float16, float32, int8
beam_size = 10 # 1 - 10
# The two flags below must be real booleans (checked with isinstance in the main cell).
condition_on_previous_text = False # True, False
word_timestamps = False # True, False

In [None]:
# Main execution in Colab
if __name__ == "__main__":
    # Check parameters. Explicit raises are used instead of ``assert`` so the
    # validation still runs when Python is started with -O.
    if language not in ("en", "es", "fr", "de", "it"):
        raise ValueError("Invalid language")
    if model_size not in ("tiny", "base", "small", "medium", "large-v3"):
        raise ValueError("Invalid model size")
    if compute_type not in ("float16", "float32", "int8"):
        raise ValueError("Invalid compute type")
    if not 1 <= beam_size <= 10:
        raise ValueError("Invalid beam size")
    if not isinstance(condition_on_previous_text, bool):
        raise ValueError("Invalid condition_on_previous_text")
    if not isinstance(word_timestamps, bool):
        raise ValueError("Invalid word_timestamps")

    # File upload
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back (e.g. the upload dialog was cancelled).
        raise ValueError("No file was uploaded")
    # Only the first uploaded file is transcribed.
    file_path = next(iter(uploaded))

    # Transcribe the file
    transcription = transcribe_file(file_path, language, model_size, compute_type, beam_size,
                                    condition_on_previous_text, word_timestamps)

    # Output transcription
    print("\nTranscription:\n")
    print(transcription)

    # Download the transcript (transcribe_file writes it to this path)
    files.download('transcript.txt')