In [1]:
import numpy as np
import IPython.display as ipd

import torch
import torchaudio
import torch.nn.functional as F

from datasets import load_dataset

from transformers import AutoModelForAudioClassification, AutoConfig, AutoFeatureExtractor


# Name of the audio data location. Not referenced by any cell visible in this
# notebook — NOTE(review): presumably a local dataset directory; confirm it is
# still needed.
PATH_TO_AUDIO = "google_speech_recognition_v2"

  from .autonotebook import tqdm as notebook_tqdm
2023-06-04 13:22:36.520105: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-04 13:22:37.163857: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64:
2023-06-04 13:22:37.163935: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64:


## Get the keyword label2id and id2label mappings

In [None]:
# Load Speech Commands v0.02 and read the class names from the "label" feature
# of the training split.
dataset = load_dataset("speech_commands", "v0.02")
labels = dataset["train"].features["label"].names

# Both mappings use the stringified class index, in both directions.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}


## Create the neural net

In [None]:
# Checkpoints: the base model supplies the feature extractor (and config),
# the fine-tuned checkpoint supplies the classification weights.
feature_extractor_checkpoint = "facebook/wav2vec2-base"
audio_classification_checkpoint = "wav2vec2-base-finetuned-ks-32/checkpoint-3315"

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

feature_extractor = AutoFeatureExtractor.from_pretrained(feature_extractor_checkpoint)
# Sampling rate (Hz) that the feature extractor reports for its inputs.
sampling_rate = feature_extractor.sampling_rate

# NOTE(review): this config is loaded from the base checkpoint, not the
# fine-tuned one, and is not used by any visible cell — confirm which is intended.
config = AutoConfig.from_pretrained(feature_extractor_checkpoint)

model = AutoModelForAudioClassification.from_pretrained(audio_classification_checkpoint)
model = model.to(device)

In [None]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and return it as a mono waveform at ``sampling_rate``.

    Args:
        path: Path of the audio file, passed to ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz of the returned waveform.

    Returns:
        numpy.ndarray: The waveform, resampled to ``sampling_rate`` when the
        file's native rate differs, with the channel dimension removed.
    """
    speech_array, _sampling_rate = torchaudio.load(path)

    # Average multi-channel recordings down to one channel so the result is
    # always a single waveform rather than one row per channel.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    # Resample to the rate the caller asked for. The target must be passed
    # explicitly: Resample's new_freq otherwise falls back to its own default
    # and the ``sampling_rate`` argument would be silently ignored.
    if _sampling_rate != sampling_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=_sampling_rate, new_freq=sampling_rate
        )
        speech_array = resampler(speech_array)

    # Drop only the channel dimension (a bare squeeze() would also collapse a
    # one-sample waveform to a 0-d array).
    return speech_array.squeeze(0).numpy()


def predict(path, sampling_rate):
    """Classify one audio file and return the softmax scores of the model.

    Args:
        path: Path of the audio file to classify.
        sampling_rate: Sampling rate in Hz handed to both the loader and the
            feature extractor.

    Returns:
        numpy.ndarray: Softmax over dimension 1 of the model logits, for the
        first item of the batch.
    """
    waveform = speech_file_to_array_fn(path, sampling_rate)
    encoded = feature_extractor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    model_input = encoded.input_values.to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_output = model(model_input)

    probabilities = F.softmax(model_output.logits, dim=1)
    return probabilities.detach().cpu().numpy()[0]

## Inference

In [None]:
# with pyaudio
import pyaudio
import wave
import tempfile
import os

# Microphone capture settings used by the recording loop.
RATE = 16000              # sampling rate in Hz
CHUNK = 320               # audio samples read per frame (320 / 16000 Hz = 20 ms)
CHANNELS = 1              # mono audio
FORMAT = pyaudio.paInt16  # audio sample format
RECORD_SECONDS = 1        # duration of each recording in seconds
FILE_NAME = "temp.wav"    # WAV file that every recording is written to

def record_audio(input_device_index=1):
    """Record from the microphone forever, yielding one WAV file path per clip.

    Each iteration reads ``RECORD_SECONDS`` of audio in ``CHUNK``-sample
    frames, writes it to ``FILE_NAME`` and yields that path. The same file is
    overwritten on every iteration, so the consumer must finish with it before
    asking for the next clip.

    Args:
        input_device_index: PyAudio index of the input device to record from.
            Defaults to 1, the value that was previously hard-coded.

    Yields:
        str: ``FILE_NAME``, after the newest clip has been written to it.
    """
    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                        input_device_index=input_device_index)

        frames_per_clip = int(RATE / CHUNK * RECORD_SECONDS)

        while True:
            # Collect the raw frames that make up one clip.
            frames = [stream.read(CHUNK) for _ in range(frames_per_clip)]

            # Write the clip to the WAV file; the context manager closes the
            # file even if writing fails.
            with wave.open(FILE_NAME, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(p.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))

            yield FILE_NAME

    except KeyboardInterrupt:
        # Interrupting the recording is the intended way to stop it.
        pass
    finally:
        # Runs on every exit path, including the generator being closed or
        # garbage-collected while suspended at the yield, so the audio device
        # is always released.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

In [None]:
# Continuously record clips and print the most likely keyword for each one.
# Stop the loop by interrupting the kernel.
recordings = record_audio()
try:
    for wav_path in recordings:
        # pass the WAV data to your keyword spotter here
        scores = predict(wav_path, sampling_rate)
        # Named best_idx rather than `max` so the builtin max() is not
        # shadowed for the rest of the kernel session.
        best_idx = int(np.argmax(scores))
        print(f"most confident keyword = {id2label[str(best_idx)]}, with a confiden of {scores[best_idx]}")
except KeyboardInterrupt:
    # An interrupt that lands while predict() is running never reaches the
    # generator, so it is handled here as a normal stop.
    pass
finally:
    # Explicitly close the generator instead of leaving it suspended.
    recordings.close()
    