In [None]:
import torch
from transformers import pipeline
import time

# Path of the recording to transcribe
audio_file = "audio/5760-Nano-L2.mp3"

# Prefer the Metal Performance Shaders (MPS) backend for GPU acceleration on
# macOS; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Metal GPU acceleration")
else:
    device = torch.device("cpu")
    print("Metal not available, using CPU")

# Use half precision only on the accelerator. On CPU, float16 is typically
# slower than float32 and some PyTorch builds lack half-precision kernels for
# certain operators, so the CPU fallback runs in full precision.
torch_dtype = torch.float16 if device.type == "mps" else torch.float32

# Set up the automatic speech recognition pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",  # Whisper checkpoint to load
    torch_dtype=torch_dtype,  # Precision chosen above to match the device
    device=device,  # Selected device (MPS or CPU)
    # True yields segment-level timestamps; word-level timestamps would
    # require return_timestamps="word".
    return_timestamps=True,
)

# Time the transcription with a monotonic clock so the measurement is not
# affected by system clock adjustments.
start_time = time.perf_counter()

# Transcribe the whole file in one call. No language is passed here, so the
# multilingual model detects the language before transcribing.
result = pipe(audio_file)

elapsed_time = time.perf_counter() - start_time

# Print the transcription results
print(f"Transcription completed in {elapsed_time:.2f} seconds")
print("Transcription result:")
print(result["text"])

# Print the device and model data type actually in use
print(f"Device: {pipe.device}")
print(f"Model dtype: {pipe.model.dtype}")


# Define a function to process audio in chunks
def process_in_chunks(audio_file, chunk_length_s=30, language="en"):
    """Transcribe an audio file piece by piece and return the joined text.

    The audio is cut into consecutive, non-overlapping pieces of
    ``chunk_length_s`` seconds (the last piece may be shorter). Each piece is
    exported to a temporary WAV file, transcribed with the module-level
    ``pipe``, and the resulting texts are joined with single spaces.

    Args:
        audio_file: Path of the audio file to transcribe.
        chunk_length_s: Length of each piece in seconds; must be positive.
        language: Language hint forwarded to the model's ``generate`` call.
            Pass ``None`` to let the model detect the language itself.

    Returns:
        The concatenated transcription; an empty string if the audio has
        zero length.

    Raises:
        ValueError: If ``chunk_length_s`` is not positive.
    """
    import math
    import os
    import tempfile

    from pydub import AudioSegment

    if chunk_length_s <= 0:
        raise ValueError("chunk_length_s must be a positive number of seconds")

    # from_file lets the decoder detect the container, so inputs other than
    # MP3 are accepted as well.
    audio = AudioSegment.from_file(audio_file)

    # pydub measures and slices audio in milliseconds.
    chunk_length_ms = chunk_length_s * 1000
    chunks = math.ceil(len(audio) / chunk_length_ms)

    # The language must travel through generate_kwargs; the pipeline call
    # itself does not take a ``language`` keyword.
    generate_kwargs = {} if language is None else {"language": language}

    transcriptions = []

    # A private temporary directory avoids clobbering files in the working
    # directory and guarantees cleanup even if transcription raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i in range(chunks):
            start_ms = i * chunk_length_ms
            end_ms = start_ms + chunk_length_ms
            # Slicing past the end is safe: the final piece is just shorter.
            chunk = audio[start_ms:end_ms]

            chunk_file = os.path.join(tmp_dir, f"temp_chunk_{i}.wav")
            # export() hands back an open file object; close it right away
            # so the handle is not leaked and the file can be removed.
            chunk.export(chunk_file, format="wav").close()

            result = pipe(chunk_file, generate_kwargs=generate_kwargs)
            transcriptions.append(result["text"])

            # Drop the piece immediately so at most one is on disk at a time.
            os.remove(chunk_file)

    return " ".join(transcriptions)

# Transcribe the recording chunk by chunk and keep the combined text
full_transcription = process_in_chunks(audio_file)

# Persist the combined transcript as UTF-8 text, replacing any previous file
with open("transcription.txt", mode="w", encoding="utf-8") as transcript_file:
    transcript_file.write(full_transcription)

print("Transcription saved to 'transcription.txt'")


# # Process the entire audio file in chunks to handle long audio
# full_transcription = process_in_chunks(audio_file)
# print("Full transcription:")
# print(full_transcription)


  from .autonotebook import tqdm as notebook_tqdm


Using Metal GPU acceleration


Device set to use mps
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
