<a href="https://colab.research.google.com/github/AbhayMishraVK/CADETSHIP-INFOTECH-INTERNSHIP/blob/main/whisper_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install pydub
! pip install transformers
! pip install torch

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [5]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


Small Whisper Model

In [8]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from time import time
import csv
from pydub import AudioSegment


# Inference device for this experiment, fixed to the CPU. The dtype values
# being swept are defined further down, together with the other parameters.
device = "cpu"

# Function to get audio duration in minutes
def audio_duration_in_minutes(file_path):
    """Return the playing time, in minutes, of the audio file at `file_path`.

    The file is decoded with `AudioSegment.from_file`; the `len()` of the
    resulting segment (pydub reports it in milliseconds) is converted to
    minutes.
    """
    millis_per_minute = 60000.0
    segment = AudioSegment.from_file(file_path)
    return len(segment) / millis_per_minute

# Identifier of the pre-trained checkpoint passed to `from_pretrained`
# (NOTE(review): looks like a Hugging Face Hub id rather than a local path — confirm)
model_id = "openai/whisper-small"

# Load the pre-trained model weights in float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.float32
)

# Move the model to the selected device
model.to(device)

# Load the processor for the same checkpoint; reusing `model_id` keeps the
# model and its tokenizer / feature extractor in sync.
processor = AutoProcessor.from_pretrained(model_id)

# Parameters
dtype_precision_values = [torch.float32]
batch_size_values = [1, 8, 9]
# Batch size handed to the pipeline itself. It is fixed and independent of
# `batch_size_values`, which only controls how many files go into each call.
PIPELINE_BATCH_SIZE = 8

# INPUT FILES
audio_files = [
    "testing (1).mp3",
    "testing (2).mp3",
    "testing (3).mp3",
    "testing (4).mp3",
    "testing (5).mp3",
    "testing (6).mp3",
    "testing (7).mp3",
    "testing (8).mp3",
    "testing (9).mp3",
]

# Initialize a CSV file for recording experiment logs
csv_file_name = "inference_logs_SmallBatch_16_batch.csv"
with open(csv_file_name, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Dtype Precision', 'Batch Size', 'Inference Time', 'File(s) Processed'])

    for dtype_precision in dtype_precision_values:
        torch_dtype = dtype_precision
        print("===================================")

        # Build the pipeline once per dtype: none of its arguments depend on
        # the batch size swept below, so re-creating it for every batch size
        # was pure overhead.
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            device=device,
            chunk_length_s=30,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            batch_size=PIPELINE_BATCH_SIZE,
        )

        for batch_size in batch_size_values:
            # Print the current batch size being processed
            print(f"Processing batch size: {batch_size}")

            # Run inference on consecutive groups of `batch_size` files
            for i in range(0, len(audio_files), batch_size):
                batch_files = audio_files[i:i + batch_size]
                filenames = list(batch_files)  # Copy, so the labels can be edited safely

                # Only the pipeline call itself is timed
                inference_start_time = time()
                results = pipe(batch_files)
                inference_end_time = time()

                total_inference_time = inference_end_time - inference_start_time
                avg_inference_time_per_file = total_inference_time / len(batch_files) if batch_files else 0

                # Append audio duration for single-file batches
                if len(filenames) == 1:
                    filenames[0] += f" ({audio_duration_in_minutes(filenames[0]):.2f} mins)"

                # Record the experiment logs in the CSV file; the 'Inference Time'
                # column holds the average time per file of this call.
                csv_writer.writerow([torch_dtype, batch_size, avg_inference_time_per_file, ", ".join(filenames)])

                print("Result : ", results)
                print(f"Total Inference Time for the batch: {total_inference_time}")
                print("===================================")
            print("\n")

    print(f"Logs saved to {csv_file_name}")


# Free-form notes kept as a bare string literal: being the last expression of
# the cell, its value is echoed as the cell's output.
# NOTE(review): "fast then others" presumably means "faster than the others".
"""
Document :

(1) It is fast then others but accuracy low.
(2) Distil-Whisper is currently only available for English speech recognition.
"""



Processing batch size: 1
Result :  [{'text': ' Today we will tell you story. The topic is the crow and the fox. One day the crow was eating a piece of cheese. A cunning fox came there. He thought of asking the crow to sing a song. The foolish crow opened its mouth to sing. The cheese fell down. The cunning fox took the cheese and ran away. Moron, think before you act. Story from Panjatandra Tales. Thank you.'}]
Total Inference Time for the batch: 23.040893077850342
Result :  [{'text': ' Artificial Intelligence has stood up to the front line of a real-world problem solving and business transformation. While intelligent document processing become a vital component in the global effort to drive intelligent automation into cooperative worldwide. IDP solution reads the unstructured raw data in complicated document using various AI related technologies. Including RPA boards, Optical Character Recognition, Natural Language Processing, Computer Vision and Machine Learning. IDP then gathers the

'\nDocument :\n\n(1) It is fast then others but accuracy low.\n(2) Distil-Whisper is currently only available for English speech recognition.\n'

Distil Small

In [15]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from time import time
import csv
from pydub import AudioSegment

# Inference device: use the GPU when torch reports that CUDA is available,
# otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Function to get audio duration in minutes
def audio_duration_in_minutes(file_path):
    """Decode `file_path` with pydub and report its duration in minutes."""
    # len() of the decoded segment is divided by 60000 to obtain minutes
    # (pydub measures segment length in milliseconds).
    duration_ms = len(AudioSegment.from_file(file_path))
    return duration_ms / 60000.0

# Identifier of the pre-trained checkpoint passed to `from_pretrained`
# (NOTE(review): looks like a Hugging Face Hub id rather than a local path — confirm)
model_id = "distil-whisper/distil-small.en"


# Load the pre-trained model; it is cast to the precision under test further down
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.float32
)

# Move the model to the selected device
model.to(device)

# Load the processor (tokenizer + feature extractor) for the same checkpoint
processor = AutoProcessor.from_pretrained(
    model_id
)

# Parameters
# float16 is only used when running on CUDA. On the CPU fallback the run uses
# float32 instead, because half-precision inference is poorly supported there.
dtype_precision_values = [torch.float16 if device == "cuda" else torch.float32]
batch_size_values = [1, 8, 9]

# INPUT FILES
audio_files = [
    "testing (1).mp3",
    "testing (2).mp3",
    "testing (3).mp3",
    "testing (4).mp3",
    "testing (5).mp3",
    "testing (6).mp3",
    "testing (7).mp3",
    "testing (8).mp3",
    "testing (9).mp3",
]

# Initialize a CSV file for recording experiment logs
csv_file_name = "inference_logs_distilSmallBatch_16_gpu.csv"
with open(csv_file_name, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Dtype Precision', 'Batch Size', 'Inference Time', 'File(s) Processed'])

    for dtype_precision in dtype_precision_values:
        torch_dtype = dtype_precision
        print("===================================")

        # Cast the model and build the pipeline once per dtype: neither step
        # depends on the batch size swept below, so repeating them for every
        # batch size was pure overhead.
        model.to(dtype=torch_dtype)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            device=device,
            chunk_length_s=30,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype
        )

        for batch_size in batch_size_values:
            # Print the current batch size being processed
            print(f"Processing batch size: {batch_size}")

            # Run inference on consecutive groups of `batch_size` files.
            # NOTE: `batch_size` only sets how many files go into each call;
            # it is not passed to the pipeline itself.
            for i in range(0, len(audio_files), batch_size):
                batch_files = audio_files[i:i + batch_size]
                filenames = list(batch_files)  # Copy, so the labels can be edited safely

                # Only the pipeline call itself is timed
                inference_start_time = time()
                results = pipe(batch_files)
                inference_end_time = time()

                total_inference_time = inference_end_time - inference_start_time
                avg_inference_time_per_file = total_inference_time / len(batch_files) if batch_files else 0

                # Append audio duration for single-file batches
                if len(filenames) == 1:
                    filenames[0] += f" ({audio_duration_in_minutes(filenames[0]):.2f} mins)"

                # Record the experiment logs in the CSV file; the 'Inference Time'
                # column holds the average time per file of this call.
                csv_writer.writerow([torch_dtype, batch_size, avg_inference_time_per_file, ", ".join(filenames)])

                print("Result : ", results)
                print(f"Total Inference Time for the batch: {total_inference_time}")
                print("===================================")
            print("\n")

    print(f"Logs saved to {csv_file_name}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Processing batch size: 1
Result :  [{'text': ' The next day the crow was eating a piece of cheese. Today will tell you a story. The topic is the crow and the fox. One day the crow was eating a piece of cheese. A cunning fox came there. He thought of asking the crow to sing a song. The foolish crow opened its mouth to sing. The cheese fell down. The cunning fox took the cheese under away. More thing before you act. Story from Pangyadandratil. Thank you.'}]
Total Inference Time for the batch: 2.591956615447998
Result :  [{'text': ' Artificial intelligence has stayed up to the front line of a real world problem solving and business transformation. While intelligent document processing become a vital component in the global effort to drive intelligent automation into cooperative worldwide. IDP solution reads the undestructured raw data in complicated document using various AI related technologies including RPA about optical character recognition, natural language processing, computer visio

'\nDocument :\n\n(1) It is fast then others but accuracy low.\n(2) Distil-Whisper is currently only available for English speech recognition.\n'

Whisper Medium

In [16]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from time import time
import csv

# Inference device: use the GPU when torch reports that CUDA is available,
# otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Function to get audio duration in minutes
def audio_duration_in_minutes(file_path):
    """Return the duration, in minutes, of the audio file at `file_path`.

    Mirrors the implementation used by the other benchmark cells of this
    notebook: the file is decoded with pydub and the segment length (pydub
    reports it in milliseconds) is converted to minutes.

    Args:
        file_path: Path of the audio file to measure.

    Returns:
        The duration as a float number of minutes.
    """
    # Imported locally because this cell's import block does not bring in pydub.
    from pydub import AudioSegment

    audio = AudioSegment.from_file(file_path)
    return len(audio) / 60000.0

# Identifier of the pre-trained checkpoint passed to `from_pretrained`
# (NOTE(review): looks like a Hugging Face Hub id rather than a local path — confirm)
model_id = "openai/whisper-medium"

# Parameters
# float16 is only used when running on CUDA. On the CPU fallback the run uses
# float32 instead, because half-precision inference is poorly supported there.
dtype_precision_values = [torch.float16 if device == "cuda" else torch.float32]
batch_size_values = [1, 8, 9]

# Load the pre-trained model directly in the first precision under test
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=dtype_precision_values[0]
)

# Move the model to the selected device
model.to(device)

# Load the processor (tokenizer + feature extractor) for the same checkpoint
processor = AutoProcessor.from_pretrained(
    model_id
)

# INPUT FILES
audio_files = [
    "testing (1).mp3",
    "testing (2).mp3",
    "testing (3).mp3",
    "testing (4).mp3",
    "testing (5).mp3",
    "testing (6).mp3",
    "testing (7).mp3",
    "testing (8).mp3",
    "testing (9).mp3",
]

# Initialize a CSV file for recording experiment logs
csv_file_name = "inference_logs_mediumBatch_gpu.csv"
with open(csv_file_name, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Dtype Precision', 'Batch Size', 'Inference Time'])

    for dtype_precision in dtype_precision_values:
        torch_dtype = dtype_precision
        print("===================================")

        # Cast the model and build the pipeline once per dtype: neither step
        # depends on the batch size swept below, so repeating them for every
        # batch size was pure overhead.
        model.to(dtype=torch_dtype)
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            device=device,
            chunk_length_s=30,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype
        )

        for batch_size in batch_size_values:
            # Print the current batch size being processed
            print(f"Processing batch size: {batch_size}")

            # Run inference on consecutive groups of `batch_size` files.
            # NOTE: `batch_size` only sets how many files go into each call;
            # it is not passed to the pipeline itself.
            for i in range(0, len(audio_files), batch_size):
                batch_files = audio_files[i:i + batch_size]
                print("Batch Files: ", batch_files)

                # Only the pipeline call itself is timed
                inference_start_time = time()
                results = pipe(batch_files)
                inference_end_time = time()

                total_inference_time = inference_end_time - inference_start_time
                avg_inference_time_per_file = total_inference_time / len(batch_files) if batch_files else 0

                # Record the experiment logs in the CSV file; the 'Inference Time'
                # column holds the average time per file of this call.
                csv_writer.writerow([torch_dtype, batch_size, avg_inference_time_per_file])

                print('Total Inference Time for the batch: ', total_inference_time)
                print("===================================")
            print("\n")

print(f"Logs saved to {csv_file_name}")


Processing batch size: 1
Batch Files:  ['testing (1).mp3']
Total Inference Time for the batch:  1.9090971946716309
Batch Files:  ['testing (2).mp3']
Total Inference Time for the batch:  4.4130096435546875
Batch Files:  ['testing (3).mp3']
Total Inference Time for the batch:  6.836504697799683
Batch Files:  ['testing (4).mp3']
Total Inference Time for the batch:  3.924983024597168
Batch Files:  ['testing (5).mp3']
Total Inference Time for the batch:  4.520305633544922
Batch Files:  ['testing (6).mp3']
Total Inference Time for the batch:  2.4291632175445557
Batch Files:  ['testing (7).mp3']
Total Inference Time for the batch:  2.7300164699554443
Batch Files:  ['testing (8).mp3']
Total Inference Time for the batch:  1.4631083011627197
Batch Files:  ['testing (9).mp3']
Total Inference Time for the batch:  6.214992523193359


Processing batch size: 8
Batch Files:  ['testing (1).mp3', 'testing (2).mp3', 'testing (3).mp3', 'testing (4).mp3', 'testing (5).mp3', 'testing (6).mp3', 'testing (7).

Small Whisper Model with the pipeline batch size varied

In [5]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from time import time
import csv
from pydub import AudioSegment


# Inference device for this experiment, fixed to the CPU. The dtype values
# being swept are defined further down, together with the other parameters.
device = "cpu"

# Function to get audio duration in minutes
def audio_duration_in_minutes(file_path):
    """Measure how long the audio file at `file_path` plays, in minutes.

    Decoding is delegated to `AudioSegment.from_file`; the length of the
    decoded clip (milliseconds, per pydub) is divided by 60000.
    """
    clip = AudioSegment.from_file(file_path)
    clip_length = len(clip)
    minutes = clip_length / 60000.0
    return minutes

# Identifier of the pre-trained checkpoint passed to `from_pretrained`
# (NOTE(review): looks like a Hugging Face Hub id rather than a local path — confirm)
model_id = "openai/whisper-small"

# Load the pre-trained model weights in float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch.float32
)

# Move the model to the selected device
model.to(device)

# Load the processor for the same checkpoint; reusing `model_id` keeps the
# model and its tokenizer / feature extractor in sync.
processor = AutoProcessor.from_pretrained(model_id)

# Parameters
dtype_precision_values = [torch.float32]
batch_size_values = [1, 8, 9]

# INPUT FILES
audio_files = [
    "testing (1).mp3",
    "testing (2).mp3",
    "testing (3).mp3",
    "testing (4).mp3",
    "testing (5).mp3",
    "testing (6).mp3",
    "testing (7).mp3",
    "testing (8).mp3",
    "testing (9).mp3",
]

# Initialize a CSV file for recording experiment logs
csv_file_name = "try_SmallBatch_16.csv"
with open(csv_file_name, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Dtype Precision', 'Batch Size', 'Inference Time', 'File(s) Processed'])

    for dtype_precision in dtype_precision_values:
        torch_dtype = dtype_precision
        print("===================================")
        for batch_size in batch_size_values:
            # Print the current batch size being processed
            print(f"Processing batch size: {batch_size}")

            # Build the pipeline once per batch size: its configuration only
            # depends on `torch_dtype` and `batch_size`, so re-creating it for
            # every group of files was pure overhead.
            pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                device=device,
                chunk_length_s=30,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,
                batch_size=batch_size,
            )

            # Run inference on consecutive groups of `batch_size` files
            for i in range(0, len(audio_files), batch_size):
                batch_files = audio_files[i:i + batch_size]
                filenames = list(batch_files)  # Copy, so the labels can be edited safely

                # Only the pipeline call itself is timed
                inference_start_time = time()
                results = pipe(batch_files)
                inference_end_time = time()

                total_inference_time = inference_end_time - inference_start_time
                avg_inference_time_per_file = total_inference_time / len(batch_files) if batch_files else 0

                # Append audio duration for single-file batches
                if len(filenames) == 1:
                    filenames[0] += f" ({audio_duration_in_minutes(filenames[0]):.2f} mins)"

                # Record the experiment logs in the CSV file; the 'Inference Time'
                # column holds the average time per file of this call.
                csv_writer.writerow([torch_dtype, batch_size, avg_inference_time_per_file, ", ".join(filenames)])

                print("Result : ", results)
                print(f"Total Inference Time for the batch: {total_inference_time}")
                print("===================================")
            print("\n")

    print(f"Logs saved to {csv_file_name}")


# Free-form notes kept as a bare string literal: being the last expression of
# the cell, its value is echoed as the cell's output.
# NOTE(review): "fast then others" presumably means "faster than the others".
"""
Document :

(1) It is fast then others but accuracy low.
(2) Distil-Whisper is currently only available for English speech recognition.
"""



Processing batch size: 1
Result :  [{'text': ' Today we will tell you story. The topic is the crow and the fox. One day the crow was eating a piece of cheese. A cunning fox came there. He thought of asking the crow to sing a song. The foolish crow opened its mouth to sing. The cheese fell down. The cunning fox took the cheese and ran away. Moron, think before you act. Story from Panjatandra Tales. Thank you.'}]
Total Inference Time for the batch: 29.467626571655273
Result :  [{'text': ' Artificial Intelligence has stood up to the front line of a real-world problem solving and business transformation. While intelligent document processing become a vital component in the global effort to drive intelligent automation into cooperative worldwide. IDP solution reads the unstructured raw data in complicated document using various AI related technologies. Including RPA boards, Optical Character Recognition, Natural Language Processing, Computer Vision and Machine Learning. IDP then gathers the

'\nDocument :\n\n(1) It is fast then others but accuracy low.\n(2) Distil-Whisper is currently only available for English speech recognition.\n'