In [1]:
import pandas as pd
import numpy as np
import _helpers as hp
from jiwer import wer

# Load DATA

In [2]:
dataset_meta = pd.read_json("snips/metadata.json")

dataset_csv = pd.read_csv("snips/smart-lights_close_ASR.csv")

# Metadata

In [3]:
df_meta = dataset_meta.T
df_meta = df_meta.reset_index(drop=True)
df_meta

Unnamed: 0,keywords,language,filename,transcript,gender,age
0,"[turn off, living room]",en,record_b527de37-35e3-48d9-8c8e-3f9672ccdd79_16...,i want to turn off the lights for the living room,M,46
1,"[decrease, bedroom]",en,record_58ea6b9c-f9f4-43d2-bc6d-503a43f8da9d_16...,please decrease lights for the bedroom,M,30
2,"[increase, living room]",en,record_6224b626-12f4-48ff-ac9a-d28ee57ed70b_16...,could you increase the lights in the living room,M,29
3,"[increase, bedroom]",en,record_fa3339c0-22b3-4650-8c48-111411eebfeb_16...,please increase lights for the bedroom,M,24
4,"[turn off, bedroom]",en,record_5a2c371e-43f8-4be9-9ec5-7b542b075b3a_16...,i want to turn off the lights for the bedroom,F,28
...,...,...,...,...,...,...
559,"[increase, kitchen]",en,record_f02159ce-cae6-4e03-9c1a-9c17ddcf70bf_16...,please increase lights in the kitchen,M,46
560,"[turn on, bedroom]",en,record_1c94b803-396f-4ec5-8dd6-7c2ccc0eea38_16...,would you turn on the lights in the bedroom,M,30
561,"[turn off, living room]",en,record_9bcdf971-c5b2-4898-b511-58ecfed6826e_16...,turn off lights for the living room please,M,38
562,"[turn off, kitchen]",en,record_0ca383f2-9aaf-43df-bbcc-b40920072b63_16...,can you turn off the lights for the kitchen,F,36


In [4]:
# words in this dataset
df_meta["keywords"].explode().value_counts()

kitchen        193
brightness     192
bedroom        187
living room    184
increase       144
decrease       143
turn off       139
turn on        139
Name: keywords, dtype: int64

# CSV DATA

In [5]:
dataset_csv["WAV_FILE"]

0          0.wav
1          1.wav
2         10.wav
3        100.wav
4       1000.wav
          ...   
1655    1263.wav
1656    1264.wav
1657    1265.wav
1658    1266.wav
1659    1267.wav
Name: WAV_FILE, Length: 1660, dtype: object

In [6]:
df_csv_GT = dataset_csv[["GroundTruth_Sentence", "Intent"]]
df_csv_GT = df_csv_GT.rename(
    columns={"GroundTruth_Sentence": "transcript", "Intent": "user_action"}
)

df_csv_ASR = dataset_csv[["ASR_Sentence", "Intent"]]
df_csv_ASR = df_csv_ASR.rename(
    columns={"ASR_Sentence": "transcript", "Intent": "user_action"}
)

In [7]:
# words in this column dataframe
df_csv_GT["user_action"].value_counts()

SetLightBrightness    296
SetLightColor         294
SwitchLightOff        276
IncreaseBrightness    269
DecreaseBrightness    268
SwitchLightOn         257
Name: user_action, dtype: int64

# Label Meta Data and CSV Data

In [8]:
df_meta = hp.label_data(df_meta)
df_meta = df_meta[["transcript", "user_action", "user_action_num"]]
df_meta

Unnamed: 0,transcript,user_action,user_action_num
0,i want to turn off the lights for the living room,SwitchLightOff,0.0
1,please decrease lights for the bedroom,DecreaseBrightness,3.0
2,could you increase the lights in the living room,IncreaseBrightness,2.0
3,please increase lights for the bedroom,IncreaseBrightness,2.0
4,i want to turn off the lights for the bedroom,SwitchLightOff,0.0
...,...,...,...
559,please increase lights in the kitchen,IncreaseBrightness,2.0
560,would you turn on the lights in the bedroom,SwitchLightOn,1.0
561,turn off lights for the living room please,SwitchLightOff,0.0
562,can you turn off the lights for the kitchen,SwitchLightOff,0.0


In [9]:
# Label ground truth data
df_csv_GT["user_action_num"] = df_csv_GT["user_action"].apply(hp.action2index)
df_csv_GT

Unnamed: 0,transcript,user_action,user_action_num
0,Activate all the lights in the entire house.,SwitchLightOn,1
1,Activate basement lights,SwitchLightOn,1
2,Adjust the bedroom light intensity to thirty nine,SetLightBrightness,4
3,Can you please change the light color to pink,SetLightColor,5
4,Set the brightness to five.,SetLightBrightness,4
...,...,...,...
1655,Turn the large meeting room green.,SetLightColor,5
1656,Turn the laundry room lights to twenty two.,SetLightBrightness,4
1657,Turn the light intensity to level thirty nine,SetLightBrightness,4
1658,Turn the light on,SwitchLightOn,1


In [10]:
# label ASR data
df_csv_ASR["user_action_num"] = df_csv_ASR["user_action"].apply(hp.action2index)
df_csv_ASR

Unnamed: 0,transcript,user_action,user_action_num
0,w. lights in the in her house,SwitchLightOn,1
1,active impeachment light,SwitchLightOn,1
2,just the bedroom like intend to or thirty nine,SetLightBrightness,4
3,can you please changed the white collar being,SetLightColor,5
4,the do,SetLightBrightness,4
...,...,...,...
1655,turn the large meeting room green,SetLightColor,5
1656,don't don't diesel nights drinking too,SetLightBrightness,4
1657,don't like going to be eleven thirty nine,SetLightBrightness,4
1658,delayed on,SwitchLightOn,1


# New ASR without autocorrection

In [11]:
# Dataset based in improved function of speech recognition
df_csv_new_ASR = pd.read_csv("snips/new_ASR_without_labels.csv")
df_csv_new_ASR

Unnamed: 0.1,Unnamed: 0,transcript,user_action
0,0,active igtl like an the entire house,SwitchLightOn
1,1,activate basement lights,SwitchLightOn
2,2,a djust the bedroom light in tentity of thirty...,SetLightBrightness
3,3,can you please change the light color to pink,SetLightColor
4,4,said the rightness to file,SetLightBrightness
...,...,...,...
1655,1655,turn the large meeting room green,SetLightColor
1656,1656,turn the laundry room lights to twenty two,SetLightBrightness
1657,1657,don't the light intensity to level thirty nine,SetLightBrightness
1658,1658,turned the late on,SwitchLightOn


In [12]:
df_csv_new_ASR["user_action_num"] = df_csv_new_ASR["user_action"].apply(hp.action2index)
df_csv_new_ASR[["transcript", "user_action", "user_action_num"]]

Unnamed: 0,transcript,user_action,user_action_num
0,active igtl like an the entire house,SwitchLightOn,1
1,activate basement lights,SwitchLightOn,1
2,a djust the bedroom light in tentity of thirty...,SetLightBrightness,4
3,can you please change the light color to pink,SetLightColor,5
4,said the rightness to file,SetLightBrightness,4
...,...,...,...
1655,turn the large meeting room green,SetLightColor,5
1656,turn the laundry room lights to twenty two,SetLightBrightness,4
1657,don't the light intensity to level thirty nine,SetLightBrightness,4
1658,turned the late on,SwitchLightOn,1


# New ASR with Autocorrection

In [13]:
# Dataset based in improved function of speech recognition
df_csv_new_ASR_AC = pd.read_csv("snips/new_ASR_Autocorrection_without_labels.csv")
df_csv_new_ASR_AC

Unnamed: 0.1,Unnamed: 0,transcript,user_action
0,0,activate light all like and the entire house,SwitchLightOn
1,1,activate basement lights,SwitchLightOn
2,2,adjust the bedroom light intensity of thirty nine,SetLightBrightness
3,3,can you please change the light color to pink,SetLightColor
4,4,rid the brightness to toilet,SetLightBrightness
...,...,...,...
1655,1655,turn the large meeting room green,SetLightColor
1656,1656,turn the laundry room lights to twenty two,SetLightBrightness
1657,1657,don't the light intensity to level thirty nine,SetLightBrightness
1658,1658,turned the flat on,SwitchLightOn


In [14]:
df_csv_new_ASR_AC["user_action_num"] = df_csv_new_ASR_AC["user_action"].apply(
    hp.action2index
)
df_csv_new_ASR_AC[["transcript", "user_action", "user_action_num"]]

Unnamed: 0,transcript,user_action,user_action_num
0,activate light all like and the entire house,SwitchLightOn,1
1,activate basement lights,SwitchLightOn,1
2,adjust the bedroom light intensity of thirty nine,SetLightBrightness,4
3,can you please change the light color to pink,SetLightColor,5
4,rid the brightness to toilet,SetLightBrightness,4
...,...,...,...
1655,turn the large meeting room green,SetLightColor,5
1656,turn the laundry room lights to twenty two,SetLightBrightness,4
1657,don't the light intensity to level thirty nine,SetLightBrightness,4
1658,turned the flat on,SwitchLightOn,1


# Calculate word error rate (WER) 

In [15]:
# Turning transcripts in dataframes to list
gt_trancript_list = df_csv_GT["transcript"].tolist()
asr_transcript_list = df_csv_ASR["transcript"].tolist()
new_asr_transcript_list = df_csv_new_ASR["transcript"].tolist()
new_asr_ac_transcript_list = df_csv_new_ASR_AC["transcript"].tolist()


# Calculating the WER of the ASR and new ASR methods relative to the ground truth data
asr_error = wer(gt_trancript_list, asr_transcript_list)
new_asr_error = wer(gt_trancript_list, new_asr_transcript_list)
new_asr_ac_error = wer(gt_trancript_list, new_asr_ac_transcript_list)

In [16]:
asr_error

0.5630921809970778

In [17]:
new_asr_error

0.3605773488001417

In [18]:
new_asr_ac_error

0.3216151598335252

# Concat and save

In [19]:
df = pd.concat([df_meta, df_csv_GT])
df.to_csv("snips/merged_GT_data.csv")  # save Groudtruth data
df_csv_new_ASR.to_csv("snips/new_ASR_with_labels.csv")
df_csv_new_ASR_AC.to_csv("snips/new_ASR_Autocorrection_with_labels.csv")