In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Drive folder path, written relative to the current working directory.
# NOTE(review): no later cell visible here reads `path` — confirm it is still needed.
path = 'drive/My Drive/1'

In [1]:
! pip install whisper
! pip install -U openai-whisper
!choco install ffmpeg
!pip install setuptools-rust

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=7ed786d50d9e8ffe2afcaced7d30822216d2c96cf3442e5bc979441df1c4e252
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149299b5a7d0
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m43.

In [4]:
import torch
import whisper
import gc
from tqdm import tqdm
import time
from datetime import datetime, timedelta

class OptimizedTranscriber:
    """Whisper speech-to-text wrapper that reports load and transcription timings.

    When CUDA is available the model is loaded onto the GPU, converted to half
    precision and run under autocast with TF32 allowed; otherwise it runs on
    the CPU in full precision.

    Attributes:
        start_time: ``time.time()`` captured when construction began.
        device: ``torch.device`` the model is placed on ("cuda" or "cpu").
        model: The loaded Whisper model.
        model_load_time: Seconds elapsed between ``start_time`` and the end of
            model loading.
    """

    def __init__(self, model_name="large"):
        """Load the Whisper checkpoint ``model_name`` onto the selected device.

        Args:
            model_name: Checkpoint name forwarded to ``whisper.load_model``.
        """
        self.start_time = time.time()
        print(f"Initializing transcriber at {datetime.now().strftime('%H:%M:%S')}")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        use_cuda = self.device.type == "cuda"

        # Allow TensorFloat-32 matmuls/convolutions (a speed-up on Ampere GPUs).
        if use_cuda:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

        # Load straight onto the target device instead of loading and then moving.
        self.model = whisper.load_model(model_name, device=self.device)

        if use_cuda:
            # Half-precision weights halve GPU memory use.
            self.model = self.model.half()

        self.model_load_time = time.time() - self.start_time
        print(f"Model loading took: {timedelta(seconds=self.model_load_time)}")

    def transcribe(self, audio_path):
        """Transcribe one audio file and time the call.

        Args:
            audio_path: Path of the audio file handed to the model's
                ``transcribe`` method.

        Returns:
            A ``(result, process_time)`` tuple: the value returned by the
            model's ``transcribe`` call and the elapsed wall-clock seconds.
        """
        start_time = time.time()
        print(f"\nStarting transcription of {audio_path} at {datetime.now().strftime('%H:%M:%S')}")

        use_cuda = self.device.type == "cuda"

        try:
            # Drain any pending GPU work so it is not billed to this call.
            if use_cuda:
                torch.cuda.synchronize()

            # Autocast is only enabled on the GPU. CPU autocast defaults to
            # bfloat16, which would silently lower precision even though
            # fp16=False is requested below.
            with torch.amp.autocast(self.device.type, enabled=use_cuda), \
                 torch.no_grad():

                result = self.model.transcribe(
                    audio_path,
                    fp16=use_cuda,
                    verbose=True
                )

            # Wait for the GPU to finish so the elapsed time is accurate.
            if use_cuda:
                torch.cuda.synchronize()

            process_time = time.time() - start_time
            print(f"Transcription completed in: {timedelta(seconds=process_time)}")

            return result, process_time

        finally:
            # Collect Python garbage first so the tensors it frees can then be
            # returned to the driver by empty_cache().
            gc.collect()
            if use_cuda:
                torch.cuda.empty_cache()

    def process_multiple(self, audio_files):
        """Process multiple audio files sequentially with progress bar.

        Args:
            audio_files: Iterable of audio file paths; it is materialized into
                a list so generators are accepted.

        Returns:
            List of transcription results in input order. An empty input
            returns ``[]`` without printing the batch summary.
        """
        audio_files = list(audio_files)
        if not audio_files:
            # Nothing to average over; avoid dividing by zero below.
            print("No audio files to process.")
            return []

        results = []
        start_batch = time.time()

        for audio_file in tqdm(audio_files):
            result, _ = self.transcribe(audio_file)
            results.append(result)

        batch_time = time.time() - start_batch
        print(f"\nBatch Processing Summary:")
        print(f"Total files processed: {len(audio_files)}")
        print(f"Total processing time: {timedelta(seconds=batch_time)}")
        print(f"Average time per file: {timedelta(seconds=batch_time/len(audio_files))}")

        return results

def format_time(seconds):
    """Return ``seconds`` rounded to a whole second, rendered via ``str(timedelta)``."""
    whole_seconds = round(seconds)
    duration = timedelta(seconds=whole_seconds)
    return str(duration)

# Usage
def main(audio_path="/content/or.mp3", model_name="large"):
    """Transcribe a single audio file and print the text plus timing details.

    Args:
        audio_path: Audio file to transcribe; defaults to the original demo file.
        model_name: Whisper checkpoint name passed to ``OptimizedTranscriber``.
    """
    total_start = time.time()

    # Initialize transcriber
    transcriber = OptimizedTranscriber(model_name=model_name)

    # Single file transcription
    result, process_time = transcriber.transcribe(audio_path)

    print("\n--- Full Transcription ---")
    print(result["text"])

    segments = result["segments"]
    # The end timestamp of the last segment approximates the audio length;
    # the segment count itself is not a duration.
    audio_seconds = segments[-1]["end"] if segments else 0

    print("\n--- Transcription Details ---")
    print(f"Language Detected: {result['language']}")
    print(f"Segments: {len(segments)}")
    print(f"Approximate Duration: {format_time(audio_seconds)}")
    print(f"Processing Time: {format_time(process_time)}")

    total_time = time.time() - total_start
    print(f"\nTotal execution time (including model loading): {format_time(total_time)}")

    # For processing multiple files:
    # audio_files = ["/content/1.mp3", "/content/2.mp3", ...]
    # results = transcriber.process_multiple(audio_files)

# Run the demo when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Initializing transcriber at 08:00:17
Using device: cuda


  checkpoint = torch.load(fp, map_location=device)


Model loading took: 0:00:24.214966

Starting transcription of /content/or.mp3 at 08:00:41
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.640 --> 00:04.400]  Albert Einstein grew up in a middle class family.
[00:04.400 --> 00:11.920]  When he was born in his family was scared that something was wrong with him and he had
[00:11.920 --> 00:19.120]  very large in miss shape and head.
[00:19.120 --> 00:27.000]  But fortunately within the first few weeks the shape of his head became normal.
[00:27.000 --> 00:32.620]  But their worries didn't stop there when he was very young.
[00:32.620 --> 00:40.760]  His parents thought he might be intellectually disabled because he was very slow to learn
[00:40.760 --> 00:49.680]  to talk and did not speak until he was 4 years old.
[00:49.680 --> 00:56.980]  At that time he often formed foody sentences in his throat but did not speak his use to
[00:56.980 --> 01:05.660]  prac