In [1]:
import torch
import numpy as np
import librosa
import os
import pandas as pd
import tensorflow as tf

import soundfile as sf

from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

from sklearn.preprocessing import StandardScaler




In [2]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [3]:
def getFiles(data, labels, direct):
    pathList = os.listdir(direct)
    for l in range(len(pathList)):
        if ".wav" in pathList[l]:
            data.append(direct + '/' +  pathList[l])
            continue
        elif ".txt" in pathList[l]:
            labels.append(direct + '/' +  pathList[l])
        else:
            temp = direct + '/' +  pathList[l]
            getFiles(data, labels, temp)
    return(data,labels)

In [4]:
t_data = []
t_label = []
test_data = []
test_label = []

folderName = 'Test'

t_data, t_label = getFiles(t_data,t_label ,folderName)

In [5]:
def extract_features(files):
    # Sample rate is set to 22050 by default
    X, sample_rate = librosa.load(files, sr=16000)
    # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series 
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    # Generates a Short-time Fourier transform (STFT) to use in the chroma_stft
    stft = np.abs(librosa.stft(X))
        # Computes a chromagram from a waveform or power spectrogram.
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    # Computes a mel-scaled spectrogram.
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    # Computes spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    # Computes the tonal centroid features (tonnetz)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
    sr=sample_rate).T,axis=0)
    return (mfccs, chroma, mel, contrast, tonnetz)

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.callbacks import EarlyStopping

In [7]:

# Build a simple dense model with early stopping and softmax for categorical classification, remember we have 30 classes
model = Sequential()
model.add(Dense(193, input_shape=(193,), activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(30, activation = 'softmax'))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')

In [8]:
for i in range(len(t_data)):
    file_name = t_data[i]
    data = wavfile.read(file_name)
    framerate = data[0]
    sounddata = data[1]
    time = np.arange(0,len(sounddata))/framerate
    input_audio1, _ = librosa.load(file_name, sr=16000)
    print(input_audio1)


    train_features = extract_features(file_name)

    features_train = []
    features_train.append(np.concatenate((
        train_features[0],
        train_features[1], 
        train_features[2], 
        train_features[3],
        train_features[4]), axis=0))
    input_audio2 = np.array(features_train)
    input_audio2 = torch.DoubleTensor(input_audio2)


    input_values = tokenizer(input_audio1, return_tensors="pt").input_values
    print(input_audio2)
    print(input_values)
    print(input_values[0][0])
    print(input_audio2)
    
    history = model.fit(input_values, input_values, batch_size=256, epochs=100, 
                        validation_data=(input_values, input_values),
                        callbacks=[early_stop])

    logits = model(input_audio2).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]
    print(transcription)

[0.09133911 0.09155273 0.09182739 ... 0.01727295 0.01834106 0.01950073]


  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)


tensor([[-3.5938e+02,  5.9476e+01,  1.7440e+01,  3.1372e+01,  5.0060e+00,
          1.1336e+01,  4.9002e+00,  4.9152e+00,  1.7015e+01,  1.0387e+00,
          8.9085e+00,  1.1553e+01,  1.6079e+01,  4.4754e+00,  1.1132e+01,
          1.0684e+01,  8.8060e-01,  3.8631e+00,  1.7999e+00, -1.3960e+00,
         -2.9160e+00,  2.6713e+00, -1.8214e+00,  4.1491e+00,  3.5940e+00,
          3.7886e+00,  4.9795e+00,  3.7504e+00,  4.1073e+00,  5.2912e+00,
          5.7745e+00,  6.2908e+00,  5.3984e+00,  6.0685e+00,  3.8477e+00,
          2.8128e+00,  1.7186e+00,  2.6213e+00,  1.6125e+00,  2.7341e+00,
          5.7337e-01,  5.2178e-01,  5.1428e-01,  5.0119e-01,  5.0786e-01,
          5.6085e-01,  6.0409e-01,  6.4776e-01,  6.6117e-01,  6.8109e-01,
          6.9889e-01,  6.4815e-01,  2.4067e+01,  2.9653e+00,  1.3074e-01,
          5.7556e-02,  8.3556e-01,  7.8879e+00,  1.5408e+01,  1.9627e+01,
          1.3277e+01,  3.6018e+00,  6.3347e-01,  7.3054e-01,  4.4866e-01,
          1.3265e-01,  4.0311e-01,  3.

TypeError: Expected DataType for argument 'Tout' not torch.float32.