In [5]:
## Initialization
## Install libraries : librosa torch torchvision torchaudio pydub
# conda install -c pytorch pytorch
# conda install -c pytorch torchvision
# conda install -c pytorch torchaudio 
# conda install -c conda-forge pydub

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
import os

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
from transformers import HubertModel, Wav2Vec2FeatureExtractor

from pydub import AudioSegment
from pydub.silence import split_on_silence

from flask import Flask, request, render_template, Response
import urllib

MIN_AUDIO_LEN = 0.6 # for splicing audio into 4 parts
MIN_THRESH = -50    # for splicing into 4 parts
device = 'cpu'
esp32_ip = '192.168.39.161' # esp32 ip address
filename = "recording.wav"

# helper functions

# load wav file and return spectrogram
def wav2melSpec(AUDIO_PATH):
    audio, sr = librosa.load(AUDIO_PATH)
    return librosa.feature.melspectrogram(y=audio, sr=sr)

#plot spectrogram
def imgSpec(ms_feature):
    fig, ax = plt.subplots()
    ms_dB = librosa.power_to_db(ms_feature, ref=np.max)
    print(ms_feature.shape)
    img = librosa.display.specshow(ms_dB, x_axis='time', y_axis='mel', ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set(title='Mel-frequency spectrogram');

# load and hear audio
def hear_audio(AUDIO_PATH):
    audio, sr = librosa.load(AUDIO_PATH)
    
    print("\t", end="")
    ipd.display(ipd.Audio(data=audio, rate=sr))
       
def get_audio_info(path, show_melspec=False, label=None):
    spec = wav2melSpec(path)
    if label is not None:
        print("Label:", label)
    if show_melspec is not False:
        imgSpec(spec)
    hear_audio(path)

# load wav file and return np array and sampling rate
def load_audio(AUDIO_PATH):
    audio, sr = librosa.load(AUDIO_PATH)
    return audio, sr

## Handle the testing audio
## Input to model should have similar sample rate and length
def resample(sample, sample_rate, new_sample_rate):
    return librosa.resample(sample, orig_sr=sample_rate, target_sr=16000)

def pad(sample, desired_length=16000):
    # Pad the audio tensor with zeros to a fixed length of 16000*1s
    if len(sample) < desired_length:
        padding = desired_length - len(sample)
        sample = np.pad(sample, (0, padding), 'constant')
    elif len(sample) > desired_length:
        sample = sample[:desired_length]
    return sample

# Possible bug: Somehow, if we load torch first the kernel dies due to lack of ram
# Solved for now
test = 'recording.wav'
spec = wav2melSpec(test)
print("Librosa initialization ok")

model_id = "facebook/hubert-large-ls960-ft"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
hubert_base = HubertModel.from_pretrained(model_id)
class HubertAudioModel(torch.nn.Module):
    def __init__(self, hubert_model=hubert_base):
        super().__init__()
        self.hubert = hubert_model
        self.fc1 = torch.nn.Linear(49*1024, 256)
        self.fc2 = torch.nn.Linear(256, 10)

    def forward(self, audio_array):
        # Resample the audio to the required sample rate (16kHz for Hubert)
        # audio_array = librosa.load(audio_file, sr=16000, mono=False)[0]
        # print(f"audio_array shape before Wav2Vec: {audio_array.shape}")
        input = feature_extractor(audio_array, 
                           sampling_rate=16000,
                           padding=True, 
                           return_tensors='pt').to(device)
        
        # print(f"input.input_values shape after Wav2Vec: {input.input_values.shape}")

        input = input.input_values.squeeze(dim=0)
        # print(f"input shape after squeeze: {input.shape}")

        # Pass the spectrogram through the Hubert model
        output = self.hubert(input)
        # print(f"output.last_hidden_state shape after hubert: {output.last_hidden_state.shape}")

        # Flatten the output of the Hubert model
        output = torch.flatten(output.last_hidden_state, start_dim=1)

        # print(f"output shape after flatten: {output.shape}")

        # Pass the flattened output through two dense layers
        output = torch.nn.functional.relu(self.fc1(output))
        output = self.fc2(output)

        return output

def make_predictions(model, data, device=device):
    pred_probs = []
    model.eval()
    with torch.inference_mode():
        for sample in data:
            # Prepare the sample (add a batch dimension and pass to target device)
            sample = torch.unsqueeze(sample, dim=0).to(device)

            # Forward pass (model outputs raw logits)
            pred_logits = model(sample)

            # Get prediction probability (logit -> prediction probability)
            pred_prob = torch.softmax(pred_logits.squeeze(), dim=0)

            # Get pred_probs off the GPU for further calculations
            pred_probs.append(pred_prob.cpu())
    
    # Stack the pred_probs to turn list into a tensor
    return torch.stack(pred_probs)

# Model initialization
model = HubertAudioModel().to(device)
model.load_state_dict(torch.load("model_hf.pth"))
print("All keys matched successfully")
print("Hubert Model initalization ok\n")

print("Evaluation mode for Hubert model ")
model.eval()

# Splice audio into 4 parts (4 numbers)
def audio_len_check(audio_chunks):
    for chunk in audio_chunks:
        if chunk.duration_seconds < MIN_AUDIO_LEN:
            return False
    return True

def splice_4parts(filename):
    sound_file = AudioSegment.from_wav(filename)
    chunks = 0
    silence_len = 200
    threshold = -20
    clear = True
    
    while True:  # adjust parameters until 4 slices are achieved
    # We limit grid search to  0.025s <= silence_len <= 0.2s , MIN_THRESH dBFS <= threshold <= - 20dBFS
        if threshold < MIN_THRESH:
            if silence_len == 25:
                print("Audio file Unclear... Please retry")
                clear = False
                break
            silence_len = max(25,silence_len-25)
            threshold = -20
        audio_chunks = split_on_silence(sound_file, 
            # must be silent for at least x millisecond
            min_silence_len=silence_len,

            # consider it silent if quieter than y dBFS
            silence_thresh=threshold,

            # keep 300 ms of leading/trailing silence                            
            keep_silence=300
        )
        chunks = len(audio_chunks)
        threshold -= 1
        print(silence_len,threshold)
        if chunks == 4 and audio_len_check(audio_chunks):
            break
        
    
    # output 4 wav files for classification
    for i, chunk in enumerate(audio_chunks):

        out_file = f"output_{i}.wav"
        print ("exporting", out_file)
        chunk.export(out_file, format="wav")
    return clear

def make_predictions_all():
    test_samples = []
    for i in range(0,4):
        sample_path =  f'output_{i}.wav'
        print(f'Adding {sample_path} to test sample')
        sample, sample_rate = load_audio(sample_path)
        if sample_rate != 16000:
            sample = resample(sample, sample_rate, 16000)
        sample = pad(sample)
        sample_tensor = torch.from_numpy(sample)
        #     print(sample_tensor.shape)
        test_samples.append(sample_tensor)
        #test_labels.append(label)
    pred_probs = make_predictions(model=model,data=test_samples)
    pred_classes = pred_probs.argmax(dim=1)
    pred_classes = pred_classes.tolist()
    return pred_classes


# Start Flask server
app = Flask(__name__)
# Create just a single route to read data from our ESP32
@app.route('/prompt', methods = ['GET'])
def addData():
    ''' The one and only route. It extracts the
    data from the request, converts to float if the
    data is not None, then calls the callback if it is set
    '''
    global _callback_
    
    datastr = request.args.get('data') 

    print("\nData from ESP32: ", datastr, "\n")
    if datastr == 'yes':
        urllib.request.urlretrieve(f"http://{esp32_ip}/recording.wav", filename=filename) # Download wav file
        success = splice_4parts(filename)
        print(success)
        if success:
            preds = make_predictions_all()
            print(preds)
            return f"Classification: {preds}", 200
        else:
            return f"Unclear Audio, try again", 200

def main():
    app.run(host = "0.0.0.0", port = '3237')


if __name__ == '__main__':
    main()


Librosa initialization ok


Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All keys matched successfully
Hubert Model initalization ok

Evaluation mode for Hubert model 
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:3237
 * Running on http://192.168.39.243:3237
Press CTRL+C to quit



Data from ESP32:  yes 

200 -21
200 -22
exporting output_0.wav
exporting output_1.wav
exporting output_2.wav
exporting output_3.wav
True
Adding output_0.wav to test sample
Adding output_1.wav to test sample
Adding output_2.wav to test sample
Adding output_3.wav to test sample


192.168.39.161 - - [03/Nov/2023 02:39:23] "GET /prompt?data=yes HTTP/1.1" 200 -


[5, 1, 9, 9]

Data from ESP32:  yes 

200 -21
200 -22
200 -23
200 -24
200 -25
200 -26
200 -27
200 -28
200 -29
200 -30
200 -31
200 -32
200 -33
200 -34
200 -35
200 -36
200 -37
200 -38
200 -39
200 -40
200 -41
200 -42
200 -43
200 -44
200 -45
200 -46
200 -47
200 -48
200 -49
200 -50
200 -51
175 -21
175 -22
175 -23
175 -24
175 -25
175 -26
175 -27
175 -28
175 -29
175 -30
175 -31
175 -32
175 -33
175 -34
175 -35
175 -36
175 -37
175 -38
175 -39
175 -40
175 -41
175 -42
175 -43
175 -44
175 -45
175 -46
175 -47
175 -48
175 -49
175 -50
175 -51
150 -21
150 -22
150 -23
150 -24
150 -25
150 -26
150 -27
150 -28
150 -29
150 -30
150 -31
150 -32
150 -33
150 -34
150 -35
150 -36
150 -37
150 -38
150 -39
150 -40
150 -41
150 -42
150 -43
150 -44
150 -45
150 -46
150 -47
150 -48
150 -49
150 -50
150 -51
125 -21
125 -22
125 -23
125 -24
125 -25
125 -26
125 -27
125 -28
125 -29
125 -30
125 -31
125 -32
125 -33
125 -34
125 -35
125 -36
125 -37
125 -38
125 -39
125 -40
125 -41
125 -42
125 -43
125 -44
125 -45
125 -46
125 -47
12

192.168.39.161 - - [03/Nov/2023 02:40:29] "GET /prompt?data=yes HTTP/1.1" 200 -


25 -43
25 -44
25 -45
25 -46
25 -47
25 -48
25 -49
25 -50
25 -51
Audio file Unclear... Please retry
exporting output_0.wav
False

Data from ESP32:  yes 

200 -21
exporting output_0.wav
exporting output_1.wav
exporting output_2.wav
exporting output_3.wav
True
Adding output_0.wav to test sample
Adding output_1.wav to test sample
Adding output_2.wav to test sample
Adding output_3.wav to test sample


192.168.39.161 - - [03/Nov/2023 02:41:11] "GET /prompt?data=yes HTTP/1.1" 200 -


[5, 0, 1, 9]

Data from ESP32:  yes 

200 -21
200 -22
200 -23
200 -24
200 -25
200 -26
200 -27
200 -28
200 -29
200 -30
200 -31
200 -32
200 -33
200 -34
200 -35
200 -36
200 -37
200 -38
200 -39
200 -40
200 -41
200 -42
200 -43
200 -44
200 -45
200 -46
200 -47
200 -48
200 -49
200 -50
200 -51
175 -21
175 -22
175 -23
175 -24
175 -25
175 -26
175 -27
175 -28
175 -29
175 -30
175 -31
175 -32
175 -33
175 -34
175 -35
175 -36
175 -37
175 -38
175 -39
175 -40
175 -41
175 -42
175 -43
175 -44
175 -45
175 -46
175 -47
175 -48
175 -49
175 -50
175 -51
150 -21
150 -22
150 -23
150 -24
150 -25
150 -26
150 -27
150 -28
150 -29
150 -30
150 -31
150 -32
150 -33
150 -34
150 -35
150 -36
150 -37
150 -38
150 -39
150 -40
150 -41
150 -42
150 -43
150 -44
150 -45
150 -46
150 -47
150 -48
150 -49
150 -50
150 -51
125 -21
125 -22
125 -23
125 -24
125 -25
125 -26
125 -27
125 -28
125 -29
125 -30
125 -31
125 -32
125 -33
125 -34
125 -35
125 -36
125 -37
125 -38
125 -39
125 -40
125 -41
125 -42
125 -43
125 -44
125 -45
125 -46
125 -47
12

192.168.39.161 - - [03/Nov/2023 02:41:52] "GET /prompt?data=yes HTTP/1.1" 200 -


25 -43
25 -44
25 -45
25 -46
25 -47
25 -48
25 -49
25 -50
25 -51
Audio file Unclear... Please retry
exporting output_0.wav
False

Data from ESP32:  yes 

200 -21
200 -22
200 -23
200 -24
200 -25
200 -26
200 -27
200 -28
200 -29
200 -30
200 -31
200 -32
200 -33
200 -34
200 -35
200 -36
200 -37
200 -38
200 -39
200 -40
200 -41
200 -42
200 -43
200 -44
200 -45
200 -46
200 -47
200 -48
200 -49
200 -50
200 -51
175 -21
175 -22
175 -23
175 -24
175 -25
175 -26
175 -27
175 -28
175 -29
175 -30
175 -31
175 -32
175 -33
175 -34
175 -35
175 -36
175 -37
175 -38
175 -39
175 -40
175 -41
175 -42
175 -43
175 -44
175 -45
175 -46
175 -47
175 -48
175 -49
175 -50
175 -51
150 -21
150 -22
150 -23
150 -24
150 -25
150 -26
150 -27
150 -28
150 -29
150 -30
150 -31
150 -32
150 -33
150 -34
150 -35
150 -36
150 -37
150 -38
150 -39
150 -40
150 -41
150 -42
150 -43
150 -44
150 -45
150 -46
150 -47
150 -48
150 -49
150 -50
150 -51
125 -21
125 -22
125 -23
125 -24
125 -25
125 -26
125 -27
125 -28
125 -29
125 -30
125 -31
125 -32
125 -33


192.168.39.161 - - [03/Nov/2023 02:42:30] "GET /prompt?data=yes HTTP/1.1" 200 -


25 -49
25 -50
25 -51
Audio file Unclear... Please retry
exporting output_0.wav
False

Data from ESP32:  yes 

200 -21
200 -22
200 -23
200 -24
200 -25
200 -26
200 -27
200 -28
200 -29
200 -30
200 -31
200 -32
200 -33
200 -34
200 -35
200 -36
200 -37
200 -38
200 -39
200 -40
200 -41
200 -42
200 -43
200 -44
200 -45
200 -46
200 -47
200 -48
200 -49
200 -50
200 -51
175 -21
175 -22
175 -23
175 -24
175 -25
175 -26
175 -27
175 -28
175 -29
175 -30
175 -31
175 -32
175 -33
175 -34
175 -35
175 -36
175 -37
175 -38
175 -39
175 -40
175 -41
175 -42
175 -43
175 -44
175 -45
175 -46
175 -47
175 -48
175 -49
175 -50
175 -51
150 -21
150 -22
150 -23
150 -24
150 -25
150 -26
150 -27
150 -28
150 -29
150 -30
150 -31
150 -32
150 -33
150 -34
150 -35
150 -36
150 -37
150 -38
150 -39
150 -40
150 -41
150 -42
150 -43
150 -44
150 -45
150 -46
150 -47
150 -48
150 -49
150 -50
150 -51
125 -21
125 -22
125 -23
125 -24
125 -25
125 -26
125 -27
125 -28
125 -29
125 -30
125 -31
125 -32
125 -33
125 -34
125 -35
125 -36
125 -37
125 -38
12

192.168.39.161 - - [03/Nov/2023 02:43:12] "GET /prompt?data=yes HTTP/1.1" 200 -


25 -50
25 -51
Audio file Unclear... Please retry
exporting output_0.wav
False

Data from ESP32:  yes 

200 -21
exporting output_0.wav
exporting output_1.wav
exporting output_2.wav
exporting output_3.wav
True
Adding output_0.wav to test sample
Adding output_1.wav to test sample
Adding output_2.wav to test sample
Adding output_3.wav to test sample


192.168.39.161 - - [03/Nov/2023 02:43:45] "GET /prompt?data=yes HTTP/1.1" 200 -


[6, 2, 9, 9]
