<a href="https://colab.research.google.com/github/AlirezaFazli29/Whisper/blob/main/Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whisper

## Libraries

In [None]:
! pip install --upgrade pip
! pip install torchinfo
! pip install pydub
! pip install hazm
! pip install jiwer

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import matplotlib.pyplot as plt
from torchinfo import summary
import numpy as np
import pandas as pd
from pydub import AudioSegment
import os
import zipfile
import gzip
import glob
from shutil import make_archive
import hazm
import string
from jiwer import wer

## Default Device

In [3]:
if torch.cuda.is_available(): print(torch.cuda.get_device_name())
else: print('cpu')
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'

Tesla T4


## Model

In [8]:
model_id = "openai/whisper-large-v3"
batch_size = 16
model = AutoModelForSpeechSeq2Seq.from_pretrained(pretrained_model_name_or_path=model_id).to(default_device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline("automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                max_new_tokens=128,
                chunk_length_s=30,
                batch_size=batch_size,
                return_timestamps=True,
                device=default_device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
summary(model)

Layer (type:depth-idx)                                  Param #
WhisperForConditionalGeneration                         --
├─WhisperModel: 1-1                                     --
│    └─WhisperEncoder: 2-1                              --
│    │    └─Conv1d: 3-1                                 492,800
│    │    └─Conv1d: 3-2                                 4,916,480
│    │    └─Embedding: 3-3                              (1,920,000)
│    │    └─ModuleList: 3-4                             629,637,120
│    │    └─LayerNorm: 3-5                              2,560
│    └─WhisperDecoder: 2-2                              --
│    │    └─Embedding: 3-6                              66,388,480
│    │    └─WhisperPositionalEmbedding: 3-7             573,440
│    │    └─ModuleList: 3-8                             839,557,120
│    │    └─LayerNorm: 3-9                              2,560
├─Linear: 1-2                                           66,388,480
Total params: 1,609,879,040
Trainable params

## Dataset

In [6]:
zip_file_path = './Recordings.zip'
extracted_folder_path = './Recordings'

In [9]:
if (os.path.exists(extracted_folder_path) and os.path.isdir(extracted_folder_path)):
    print(f'folder {extracted_folder_path} exists')
else:
    with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
        zip_ref.extractall(extracted_folder_path)
    print('files extracted')

files extracted


In [10]:
paths = glob.glob(extracted_folder_path+'/*')
len(paths)

106

In [13]:
texts = list(pd.read_csv('Transcriptions.csv', header=None)[1])

## Using the model

In [None]:
results = [None for _ in range(len(paths))]

for i, path in enumerate(paths):

    audio = AudioSegment.from_file(path)
    waveform = np.array(audio.normalize().set_channels(1).get_array_of_samples())
    waveform = waveform/max(waveform)
    sr = audio.frame_rate
    results[i] = pipe({'array': waveform, 'sampling_rate':sr})

## Save Transcriptions

In [15]:
transcriptions_folder = './Transcriptions'

In [16]:
if (os.path.exists(transcriptions_folder) and os.path.isdir(transcriptions_folder)):
    print(f'folder {transcriptions_folder} exists')
else:
    os.mkdir(transcriptions_folder)
    print('Folder created')

Folder created


In [17]:
transcription_paths = [transcriptions_folder+'/'+path.split('/')[-1].split('.')[0]+'.txt' for path in paths]

In [18]:
for i, path in enumerate(transcription_paths):
    text_file = open(path, "w")
    text_file.write(results[i]['text'])
    text_file.close()

In [19]:
make_archive(transcriptions_folder, 'zip', transcriptions_folder)

'/content/Transcriptions.zip'

## WER

In [20]:
normalizer = hazm.Normalizer()
punc = string.punctuation + '،' + '؛'

In [25]:
WER = [None for _ in range(len(transcription_paths))]
for i, path in enumerate(transcription_paths):
    num = int(path.split(' ')[-1].split('.')[0][1:-1]) - 1
    reference = ''.join([char for char in texts[num] if char not in punc])
    reference = normalizer.normalize(reference)
    hypothesis = ''.join([char for char in results[i]['text'] if char not in punc])
    hypothesis = normalizer.normalize(hypothesis)
    WER[i] = wer(reference, hypothesis)

print(f'The average WER is equal tt {sum(WER)/len(WER)}')

The average WER is equal tt 0.24018175581077722
