Automatic Speech Recognition

In [None]:
!pip install transformers
!pip install -U datasets
!pip install soundfile
!pip install librosa
!pip install gradio

In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("librispeech_asr",
                       split="train.clean.100",
                       streaming=True,
                       trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

In [4]:
example = next(iter(dataset))

In [5]:
dataset_head = dataset.take(5)
dataset_head

IterableDataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    n_shards: 1
})

In [6]:
list(dataset_head)[2]

{'file': '374-180298-0002.flac',
 'audio': {'path': '374-180298-0002.flac',
  'array': array([-2.44140625e-04, -2.44140625e-04, -1.83105469e-04, ...,
          1.83105469e-04,  3.05175781e-05, -1.52587891e-04]),
  'sampling_rate': 16000},
 'text': 'I WISHED ABOVE ALL NOT TO LEAVE MYSELF TIME TO THINK OVER THE POSITION I HAD ACCEPTED FOR IN SPITE OF MYSELF IT WAS A GREAT DISTRESS TO ME THUS MY LIFE GENERALLY SO CALM',
 'speaker_id': 374,
 'chapter_id': 180298,
 'id': '374-180298-0002'}

In [7]:
example

{'file': '374-180298-0000.flac',
 'audio': {'path': '374-180298-0000.flac',
  'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
         -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
  'sampling_rate': 16000},
 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
 'speaker_id': 374,
 'chapter_id': 180298,
 'id': '374-180298-0000'}

In [8]:
from IPython.display import Audio as IPythonAudio

IPythonAudio(example["audio"]["array"],
             rate=example["audio"]["sampling_rate"])

Build the pipeline

In [9]:
from transformers import pipeline

In [10]:
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")

config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/282k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [11]:
asr.feature_extractor.sampling_rate

16000

In [12]:
example['audio']['sampling_rate']

16000

In [13]:
example["text"]

'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED'

Build a shareable app with Gradio

In [14]:
import os
import gradio as gr

from datasets import load_dataset

In [15]:
demo = gr.Blocks()

In [16]:
def transcribe_speech(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""
    output = asr(filepath)
    return output["text"]

In [17]:
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never")

In [18]:
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never",
)

In [19]:
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

demo.launch(debug=True,share=True)

# demo.launch(share=True,
#             server_port=int(os.environ['PORT1']))

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://fd46ac32ac8b2746eb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://fd46ac32ac8b2746eb.gradio.live




In [23]:
demo.close()

Closing server running on port: 7860


Testing with a longer audio file

In [24]:
import soundfile as sf
import io

In [26]:
audio, sampling_rate = sf.read('harvard.wav')

In [27]:
sampling_rate

44100

In [28]:
asr.feature_extractor.sampling_rate

16000

In [29]:
asr(audio)

ValueError: We expect a single channel audio input for AutomaticSpeechRecognitionPipeline

We expect a single channel audio input for AutomaticSpeechRecognitionPipeline
Convert the audio from stereo to mono (Using librosa)

In [30]:
audio.shape


(809508, 2)

In [31]:
import numpy as np

audio_transposed = np.transpose(audio)

In [32]:
import librosa

In [33]:
audio_mono = librosa.to_mono(audio_transposed)

In [34]:
sampling_rate

44100

In [35]:
IPythonAudio(audio_mono,
             rate=sampling_rate)

In [36]:
asr(audio_mono)

AttributeError: 'GenerationConfig' object has no attribute 'lang_to_id'

In [37]:
asr.feature_extractor.sampling_rate

16000

In [38]:
audio_16KHz = librosa.resample(audio_mono,
                               orig_sr=sampling_rate,
                               target_sr=16000)

In [39]:
asr(
    audio_16KHz,
    chunk_length_s=30, # 30 seconds
    batch_size=4,
    return_timestamps=True,
)["chunks"]

AttributeError: 'GenerationConfig' object has no attribute 'lang_to_id'

Build the Gradio interface.

In [40]:
import gradio as gr
demo = gr.Blocks()

In [41]:
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""
    output = asr(
      filepath,
      max_new_tokens=256,
      chunk_length_s=30,
      batch_size=8,
    )
    return output["text"]

In [42]:
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never")

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription",
                       lines=3),
    allow_flagging="never",
)

In [43]:
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )
demo.launch(share=True,
            server_port=int(os.environ['PORT1']))

KeyError: 'PORT1'

Note: Please stop the demo before continuing with the rest of the lab.


In [44]:
#trying with new audio files

import soundfile as sf
import io

audio, sampling_rate = sf.read('harvard.wav')

In [47]:
sampling_rate

44100

In [48]:
asr.feature_extractor.sampling_rate

16000