In [1]:
import torch
import librosa
import numpy as np
import pandas as pd
import textdistance
import re
from collections import Counter
import soundfile as sf
from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import os

# Load data

In [2]:
dataset = pd.read_csv("snips/smart-lights_close_ASR.csv")

vocab_data = pd.read_csv("snips/merged_GT_data.csv")

# Load Pre-trained speech recognition models

In [101]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Check sampling rate of the audios

In [120]:
file_name = "snips/Close_Wav/audio/0.wav"

In [121]:
data = wavfile.read(file_name)
framerate = data[0]
sounddata = data[1]
time = np.arange(0,len(sounddata))/framerate
print('Sampling rate:',framerate,'Hz')

Sampling rate: 16000 Hz


# Speech Recognition

In [117]:
words = list(vocab_data["transcript"].explode().str.split(" ").explode())
V = set(words) # create a set with all the words present in Groudtruth data
word_freq = {}
word_freq = Counter(words) #calculate frequency of each word
probs = {}
Total = sum(word_freq.values())
for k in word_freq.keys():
    probs[k] = word_freq[k]/Total #calculate probability of each word

def autocorrection(input_word):
    """return the autocorrected input_word"""
    input_word = input_word.lower()
    if input_word in V:
        return input_word
    else:
        sim = [1 - (textdistance.Jaccard(qval = 2).distance(v, input_word)) for v in word_freq.keys()] #find the most similar word in set based in word distance
        auto_df = pd.DataFrame.from_dict(probs, orient = "index").reset_index()
        auto_df = auto_df.rename(columns = {"index":"Word", 0: "Prob"})
        auto_df["Similarity"] = sim
        output = auto_df.sort_values(["Similarity", "Prob"], ascending = False).reset_index()["Word"][0] #sort based in similarity and probability 
        return output

In [106]:
def _wav2vec(file_name):
    """transform wav file into string"""
    if not os.path.isfile(file_name):
        return None
    input_audio, _ = librosa.load(file_name, sr=15555)
    input_values = tokenizer(input_audio, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0].lower()

    trans_words = transcription.split(" ")
    new_words = []

    for word in trans_words:
        if len(word) > 1:
            new_words.append(autocorrection(word))
    
    new_transcription = " ".join(new_words)

    return new_transcription

In [107]:
_wav2vec('snips/Close_Wav/audio/1267.wav')

'turn the light on in the bedroom'

In [108]:
# get wave files with known label
dataset["WAV_FILE"]
_list = []
for name in dataset["WAV_FILE"]:
    _list.append("snips/Close_Wav/audio/"+ name)
df =pd.DataFrame(_list)
df = df.rename(columns = {0:"directory"})

In [109]:
df["transcript"] = df["directory"].apply(_wav2vec)

In [110]:
df["user_action"] = dataset["Intent"]

In [111]:
df = df[["transcript","user_action"]]
df

Unnamed: 0,transcript,user_action
0,activate spotlight like can the entire house,SwitchLightOn
1,activate basement lights,SwitchLightOn
2,Adjust the bedroom light in intensity of thirt...,SetLightBrightness
3,can you please change the light color to pink,SetLightColor
4,Isai the brightness to Nile,SetLightBrightness
...,...,...
1655,turn the large meeting room green,SetLightColor
1656,turn the laundry room lights to twenty two,SetLightBrightness
1657,don't the light intensity to level thirty nine,SetLightBrightness
1658,turned the flat on,SwitchLightOn


In [112]:
df.to_csv("snips/new_ASR_without_labels.csv") #save ASR

In [11]:
vocab_data = pd.read_csv("snips/new_ASR_data.csv")
vocab_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,transcript,user_action,user_action_num
0,0,0,active igtl like an the entire house,SwitchLightOn,1
1,1,1,activate basement lights,SwitchLightOn,1
2,2,2,a djust the bedroom light in tentity of thirty...,SetLightBrightness,4
3,3,3,can you please change the light color to pink,SetLightColor,5
4,4,4,said the rightness to file,SetLightBrightness,4
...,...,...,...,...,...
1655,1655,1655,turn the large meeting room green,SetLightColor,5
1656,1656,1656,turn the laundry room lights to twenty two,SetLightBrightness,4
1657,1657,1657,don't the light intensity to level thirty nine,SetLightBrightness,4
1658,1658,1658,turned the late on,SwitchLightOn,1


vocab_data = pd.read_csv("snips/merged_data_all.csv")