In [None]:
import math
import os
import tempfile
import time
from datetime import datetime

import torch
from pydub import AudioSegment
from transformers import pipeline

# Define the audio file path
audio_file = "audio/5760-Nano-L2.mp3"

# Check if Metal Performance Shaders (MPS) is available for GPU acceleration on macOS
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Use MPS device if available
    print("Using Metal GPU acceleration")
else:
    device = torch.device("cpu")  # Fallback to CPU if MPS is not available
    print("Metal not available, using CPU")

# Half precision only pays off on the GPU. On the CPU fallback, float16
# inference is slow and some ops may be unsupported, so use float32 there.
torch_dtype = torch.float16 if device.type == "mps" else torch.float32

# Set up the automatic speech recognition pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",  # Specify the Whisper model to use
    torch_dtype=torch_dtype,  # float16 on MPS, float32 on CPU (see above)
    device=device,  # Use the selected device (MPS or CPU)
    # For Whisper, True yields segment-level timestamps; pass "word" instead
    # if word-level timestamps are needed.
    return_timestamps=True,
)

# Build a per-run output name from the current local time (YYYYMMDD_HHMMSS)
# so that repeated runs do not overwrite earlier transcriptions.
timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
output_filename = "transcription_" + timestamp + ".txt"

# Define a function to process audio in chunks and write to file
def process_in_chunks(audio_file, chunk_length_s=30, output_file=output_filename):
    """Transcribe an MP3 file chunk by chunk, streaming the text to a file.

    The audio is cut into consecutive windows of ``chunk_length_s`` seconds.
    Each window is exported to a temporary WAV file, transcribed with the
    module-level ``pipe``, and its text is appended to ``output_file``
    followed by a single space.

    Args:
        audio_file: Path to the MP3 file to transcribe.
        chunk_length_s: Length of each chunk in seconds; must be positive.
        output_file: Path of the text file to (over)write. The default is
            bound once, when the function is defined, so every call that
            relies on it writes to the same file.

    Returns:
        The ``output_file`` path that was written.

    Raises:
        ValueError: If ``chunk_length_s`` is not positive.
    """
    if chunk_length_s <= 0:
        raise ValueError(f"chunk_length_s must be positive, got {chunk_length_s}")

    # Load the audio file using pydub
    audio = AudioSegment.from_mp3(audio_file)

    # len(audio) is the duration in milliseconds, so work in milliseconds
    chunk_length_ms = chunk_length_s * 1000
    chunks = math.ceil(len(audio) / chunk_length_ms)

    # Start measuring the transcription time
    start_time = time.time()

    # The temporary directory is deleted on exit from the `with` block, even
    # if exporting or transcribing raises, so no chunk file is left behind
    # and nothing is written into the current working directory.
    with open(output_file, 'w', encoding='utf-8') as f, \
            tempfile.TemporaryDirectory() as tmp_dir:
        # One scratch file is enough: every chunk overwrites the previous one.
        chunk_file = os.path.join(tmp_dir, "chunk.wav")

        for i in range(chunks):
            # Slice out the current window; pydub clamps the end of the
            # slice to the audio length, so the last chunk may be shorter.
            start_time_chunk = i * chunk_length_ms
            end_time_chunk = start_time_chunk + chunk_length_ms
            chunk = audio[start_time_chunk:end_time_chunk]

            chunk.export(chunk_file, format="wav")

            # Whisper's language is a generation option, so it has to be
            # forwarded through generate_kwargs rather than passed as a
            # direct keyword to the pipeline call.
            result = pipe(chunk_file, generate_kwargs={"language": "en"})

            # Write and flush right away so partial output survives a crash
            # later in a long transcription.
            f.write(result["text"] + " ")
            f.flush()

            # Print progress
            print(f"Processed chunk {i+1}/{chunks}")

    # Calculate the elapsed time
    elapsed_time = time.time() - start_time
    print(f"Transcription completed in {elapsed_time:.2f} seconds")

    return output_file

# Run the chunked transcription over the whole recording. Any failure is
# reported on stdout rather than propagated, so the diagnostics below
# still run.
try:
    output_file = process_in_chunks(audio_file)
    print(f"Transcription saved to '{output_file}'")
except Exception as err:
    print(f"An error occurred during transcription: {err}")

# Report which device the pipeline runs on and the model's data type
print(f"Device: {pipe.device}")
print(f"Model dtype: {pipe.model.dtype}")
