### Read data

In [1]:
import ast

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Ground-truth table: one row per utterance (path, transcription, semantics).
snips = pd.read_csv(filepath_or_buffer='complete.csv')
# ASR hypotheses keyed by `uniqueId`; the first CSV column is used as the index.
asr = pd.read_csv(filepath_or_buffer='snips_ASRcomplete.csv', index_col=0)

In [3]:
# Peek at the first five ground-truth rows.
snips.head(n=5)

Unnamed: 0,path,transcription,semantics
0,/snips/wavs/snips_real/smart-lights-en-close-f...,Activate all the lights in the entire house.,"{'intent': 'SwitchLightOn', 'slots': [{'entity..."
1,/snips/wavs/snips_real/smart-lights-en-close-f...,Activate basement lights,"{'intent': 'SwitchLightOn', 'slots': [{'entity..."
2,/snips/wavs/snips_real/smart-lights-en-close-f...,Adjust the bedroom light intensity to thirty nine,"{'intent': 'SetLightBrightness', 'slots': [{'e..."
3,/snips/wavs/snips_real/smart-lights-en-close-f...,Can you please change the light color to pink,"{'intent': 'SetLightColor', 'slots': [{'entity..."
4,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness to five.,"{'intent': 'SetLightBrightness', 'slots': [{'e..."


In [4]:
# Peek at the first five ASR rows.
asr.head(n=5)

Unnamed: 0,asr_transcript,uniqueId
0,Activate all life and needs her house,sample0000-0000
1,Activate basement lights,sample0001-0001
2,I just the beggar blight intensity a thirty nine,sample0002-0010
3,Can you please changed the light coloured to t...,sample0003-0100
4,Said the mate mystifying,sample0004-1000


### Merge GT and ASR data

In [5]:
def _sample_id_from_path(path):
    """Return the numeric file stem of ``path`` zero-padded to four digits.

    The stem is the text between the last "/" and the first "." of the final
    path component; it is parsed as an int and re-padded.
    """
    stem = path.split("/")[-1].split(".")[0]
    return '{0:04}'.format(int(stem))


# Build the join key "sampleNNNN-MMMM" used by the ASR table:
#   NNNN = row label of the ground-truth frame, MMMM = numeric file stem.
# Both parts are built in a single pass each instead of chained `apply`
# calls and a row-wise `agg`, and no scratch 'index' column is needed.
speaker_ids = ['sample{0:04}'.format(row_label) for row_label in snips.index]
sample_ids = [_sample_id_from_path(wav_path) for wav_path in snips['path']]

snips = snips.reset_index(drop=True).assign(
    speakerId=speaker_ids,
    sampleId=sample_ids,
    uniqueId=[spk + '-' + smp for spk, smp in zip(speaker_ids, sample_ids)],
)

# Inner join: utterances without an ASR hypothesis are dropped.
# `validate` makes pandas raise if `uniqueId` is duplicated on either side,
# instead of silently multiplying rows.
snips_asr = pd.merge(snips, asr, on='uniqueId', validate='one_to_one')
snips_asr

Unnamed: 0,path,transcription,semantics,speakerId,sampleId,uniqueId,asr_transcript
0,/snips/wavs/snips_real/smart-lights-en-close-f...,Activate all the lights in the entire house.,"{'intent': 'SwitchLightOn', 'slots': [{'entity...",sample0000,0000,sample0000-0000,Activate all life and needs her house
1,/snips/wavs/snips_real/smart-lights-en-close-f...,Activate basement lights,"{'intent': 'SwitchLightOn', 'slots': [{'entity...",sample0001,0001,sample0001-0001,Activate basement lights
2,/snips/wavs/snips_real/smart-lights-en-close-f...,Adjust the bedroom light intensity to thirty nine,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample0002,0010,sample0002-0010,I just the beggar blight intensity a thirty nine
3,/snips/wavs/snips_real/smart-lights-en-close-f...,Can you please change the light color to pink,"{'intent': 'SetLightColor', 'slots': [{'entity...",sample0003,0100,sample0003-0100,Can you please changed the light coloured to t...
4,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness to five.,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample0004,1000,sample0004-1000,Said the mate mystifying
...,...,...,...,...,...,...,...
1655,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness level of light to twenty,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample1655,0995,sample1655-0995,Set the brightness level of white to twenty
1656,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness level to seventy-three,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample1656,0996,sample1656-0996,Said the brightness lovell two seventy three
1657,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness of the light bulbs to fifty.,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample1657,0997,sample1657-0997,Said the brightness of the light bulbs to fifty
1658,/snips/wavs/snips_real/smart-lights-en-close-f...,Set the brightness on the light bulbs to fifty.,"{'intent': 'SetLightBrightness', 'slots': [{'e...",sample1658,0998,sample1658-0998,Said the brightness of the light bulbs to fifty


### Split into train/valid/test (80/10/10)

In [6]:
# `semantics` holds the repr of a Python dict, e.g.
# "{'intent': 'SwitchLightOn', 'slots': [...]}".
# Parse it rather than slicing at a fixed character offset, so the intent is
# still found if key order or spacing in the repr ever changes.
intent_text_list = [
    ast.literal_eval(semantics)['intent'] for semantics in snips_asr['semantics']
]

snips_asr['intent'] = intent_text_list

# 80% train; the remaining 20% is halved into validation and test.
# NOTE(review): the split is not stratified by intent; left unchanged so the
# existing splits stay reproducible.
train, valtest = train_test_split(snips_asr, test_size=0.2, random_state=42)
valid, test = train_test_split(valtest, test_size=0.5, random_state=42)

In [7]:
# Row counts of the three splits, in train/valid/test order.
tuple(map(len, (train, valid, test)))

(1328, 166, 166)

### Save the original (this dataset serves as gt_gt)

In [8]:
# Columns written for every split; both transcript variants are kept so the
# later cells can derive the raw and ASR views from the same files.
export_columns = ['path', 'transcription', 'asr_transcript', 'semantics', 'intent']

for csv_name, split_frame in (
    ('train_data.csv', train),
    ('valid_data.csv', valid),
    ('test_data.csv', test),
):
    split_frame[export_columns].to_csv(csv_name)

### Merge the ground truth and ASR transcripts into ONE column

In [9]:
def _read_pair(csv_path):
    """Read ``csv_path`` twice and return two independent DataFrames.

    One copy becomes the ground-truth ("raw") view and the other the ASR view
    in the following cells.
    """
    return tuple(pd.read_csv(csv_path, index_col=0) for _ in range(2))


train_raw, train_asr = _read_pair("train_data.csv")
valid_raw, valid_asr = _read_pair("valid_data.csv")
test_raw, test_asr = _read_pair("test_data.csv")

In [10]:
# Ground-truth view: keep the reference transcription and tag rows as 'raw'.
train_raw = train_raw.drop(columns=['asr_transcript']).assign(type='raw')

# ASR view: the ASR hypothesis takes over the `transcription` column so both
# views share one schema. The rename is by name (not by position) and raises
# if `asr_transcript` is missing, so columns can never be silently mislabeled.
train_asr = (
    train_asr
    .drop(columns=['transcription'])
    .rename(columns={'asr_transcript': 'transcription'}, errors='raise')
    .assign(type='asr')
)

In [11]:
# Show the first five rows of the ground-truth and ASR training views.
display(train_raw.head(5), train_asr.head(5))

Unnamed: 0,path,transcription,semantics,intent,type
266,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn the bedroom light to seventy-three,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,raw
148,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn brightness down in bed room,"{'intent': 'DecreaseBrightness', 'slots': [{'e...",DecreaseBrightness,raw
567,/snips/wavs/snips_real/smart-lights-en-close-f...,make the living room lighter,"{'intent': 'IncreaseBrightness', 'slots': [{'e...",IncreaseBrightness,raw
744,/snips/wavs/snips_real/smart-lights-en-close-f...,Change the color of the lights to blue in the ...,"{'intent': 'SetLightColor', 'slots': [{'entity...",SetLightColor,raw
588,/snips/wavs/snips_real/smart-lights-en-close-f...,please give me some light,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,raw


Unnamed: 0,path,transcription,semantics,intent,type
266,/snips/wavs/snips_real/smart-lights-en-close-f...,During the bedroom light he seventy three,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,asr
148,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn brightness down and bedroom,"{'intent': 'DecreaseBrightness', 'slots': [{'e...",DecreaseBrightness,asr
567,/snips/wavs/snips_real/smart-lights-en-close-f...,Make the living room lighter,"{'intent': 'IncreaseBrightness', 'slots': [{'e...",IncreaseBrightness,asr
744,/snips/wavs/snips_real/smart-lights-en-close-f...,Change the colour of the lights to blue and th...,"{'intent': 'SetLightColor', 'slots': [{'entity...",SetLightColor,asr
588,/snips/wavs/snips_real/smart-lights-en-close-f...,Please give me some light,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,asr


In [12]:
# Ground-truth view: keep the reference transcription and tag rows as 'raw'.
valid_raw = valid_raw.drop(columns=['asr_transcript']).assign(type='raw')

# ASR view: the ASR hypothesis takes over the `transcription` column so both
# views share one schema. The rename is by name (not by position) and raises
# if `asr_transcript` is missing, so columns can never be silently mislabeled.
valid_asr = (
    valid_asr
    .drop(columns=['transcription'])
    .rename(columns={'asr_transcript': 'transcription'}, errors='raise')
    .assign(type='asr')
)

In [13]:
# Show the first five rows of the ground-truth and ASR validation views.
display(valid_raw.head(5), valid_asr.head(5))

Unnamed: 0,path,transcription,semantics,intent,type
664,/snips/wavs/snips_real/smart-lights-en-close-f...,Adjust the cella lights to twenty two,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,raw
361,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn the waiting room brightness to two,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,raw
1615,/snips/wavs/snips_real/smart-lights-en-close-f...,Put the lights on,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,raw
366,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn up the lighting,"{'intent': 'IncreaseBrightness', 'slots': []}",IncreaseBrightness,raw
1247,/snips/wavs/snips_real/smart-lights-en-close-f...,Kill the bathroom lights.,"{'intent': 'SwitchLightOff', 'slots': [{'entit...",SwitchLightOff,raw


Unnamed: 0,path,transcription,semantics,intent,type
664,/snips/wavs/snips_real/smart-lights-en-close-f...,I just the saw lights to twenty two,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,asr
361,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn on the waiting room brightness to too,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,asr
1615,/snips/wavs/snips_real/smart-lights-en-close-f...,Put the lights on,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,asr
366,/snips/wavs/snips_real/smart-lights-en-close-f...,Turn up the lighting,"{'intent': 'IncreaseBrightness', 'slots': []}",IncreaseBrightness,asr
1247,/snips/wavs/snips_real/smart-lights-en-close-f...,Kill the bathroom lights,"{'intent': 'SwitchLightOff', 'slots': [{'entit...",SwitchLightOff,asr


In [14]:
# Ground-truth view: keep the reference transcription and tag rows as 'raw'.
test_raw = test_raw.drop(columns=['asr_transcript']).assign(type='raw')

# ASR view: the ASR hypothesis takes over the `transcription` column so both
# views share one schema. The rename is by name (not by position) and raises
# if `asr_transcript` is missing, so columns can never be silently mislabeled.
test_asr = (
    test_asr
    .drop(columns=['transcription'])
    .rename(columns={'asr_transcript': 'transcription'}, errors='raise')
    .assign(type='asr')
)

In [15]:
# Show the first five rows of the ground-truth and ASR test views.
display(test_raw.head(5), test_asr.head(5))

Unnamed: 0,path,transcription,semantics,intent,type
1445,/snips/wavs/snips_real/smart-lights-en-close-f...,More light please,"{'intent': 'IncreaseBrightness', 'slots': []}",IncreaseBrightness,raw
1368,/snips/wavs/snips_real/smart-lights-en-close-f...,Make the kids bedroom brighter with more light,"{'intent': 'IncreaseBrightness', 'slots': [{'e...",IncreaseBrightness,raw
1093,/snips/wavs/snips_real/smart-lights-en-close-f...,I want to turn off the patio lamps,"{'intent': 'SwitchLightOff', 'slots': [{'entit...",SwitchLightOff,raw
1516,/snips/wavs/snips_real/smart-lights-en-close-f...,Can you make the brightness twelve?,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,raw
1253,/snips/wavs/snips_real/smart-lights-en-close-f...,Let there be light,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,raw


Unnamed: 0,path,transcription,semantics,intent,type
1445,/snips/wavs/snips_real/smart-lights-en-close-f...,More light please,"{'intent': 'IncreaseBrightness', 'slots': []}",IncreaseBrightness,asr
1368,/snips/wavs/snips_real/smart-lights-en-close-f...,Make the keats bedroom brighter with more light,"{'intent': 'IncreaseBrightness', 'slots': [{'e...",IncreaseBrightness,asr
1093,/snips/wavs/snips_real/smart-lights-en-close-f...,I want to turn off the patio laps,"{'intent': 'SwitchLightOff', 'slots': [{'entit...",SwitchLightOff,asr
1516,/snips/wavs/snips_real/smart-lights-en-close-f...,Can you make the brightness twelve,"{'intent': 'SetLightBrightness', 'slots': [{'e...",SetLightBrightness,asr
1253,/snips/wavs/snips_real/smart-lights-en-close-f...,Let there be lie,"{'intent': 'SwitchLightOn', 'slots': []}",SwitchLightOn,asr


In [16]:
# Stack the ground-truth rows on top of the ASR rows for every split.
# Row labels are kept as-is, so each label appears once per view.
train_combined, valid_combined, test_combined = (
    pd.concat([raw_view, asr_view])
    for raw_view, asr_view in (
        (train_raw, train_asr),
        (valid_raw, valid_asr),
        (test_raw, test_asr),
    )
)

In [17]:
# Row counts of the combined (raw + ASR) splits.
tuple(map(len, (train_combined, valid_combined, test_combined)))

(2656, 332, 332)

In [18]:
# Row counts of the ground-truth-only splits, for comparison.
tuple(map(len, (train_raw, valid_raw, test_raw)))

(1328, 166, 166)

In [19]:
# Shuffle each combined split with a fixed seed so raw and ASR rows are
# interleaved reproducibly.
train_combined_sh, valid_combined_sh, test_combined_sh = (
    shuffle(combined_frame, random_state=42)
    for combined_frame in (train_combined, valid_combined, test_combined)
)

### Save gt_asr

In [20]:
# Disabled export for the gt_asr variant: train/valid use the ground-truth
# rows (train_raw, valid_raw), test uses the ASR rows (test_asr).
# Uncomment to run. WARNING: this writes to train_data.csv / valid_data.csv /
# test_data.csv, the same files as the gt_gt export, and overwrites them.
# train_raw[['path','transcription','semantics', 'intent', 'type']].to_csv('train_data.csv')
# valid_raw[['path','transcription','semantics', 'intent', 'type']].to_csv('valid_data.csv')
# test_asr[['path','transcription','semantics', 'intent', 'type']].to_csv('test_data.csv')

### Save gtasr_gt

In [21]:
# Disabled export for the gtasr_gt variant: train/valid use the shuffled
# combined rows (train_combined_sh, valid_combined_sh), test uses the
# ground-truth rows (test_raw).
# Uncomment to run. WARNING: this writes to train_data.csv / valid_data.csv /
# test_data.csv, the same files as the gt_gt export, and overwrites them.
# train_combined_sh[['path','transcription','semantics', 'intent', 'type']].to_csv('train_data.csv')
# valid_combined_sh[['path','transcription','semantics', 'intent', 'type']].to_csv('valid_data.csv')
# test_raw[['path','transcription','semantics', 'intent', 'type']].to_csv('test_data.csv')

### Save gtasr_asr

In [22]:
# Disabled export for the gtasr_asr variant: train/valid use the shuffled
# combined rows (train_combined_sh, valid_combined_sh), test uses the ASR
# rows (test_asr).
# Uncomment to run. WARNING: this writes to train_data.csv / valid_data.csv /
# test_data.csv, the same files as the gt_gt export, and overwrites them.
# train_combined_sh[['path','transcription','semantics', 'intent', 'type']].to_csv('train_data.csv')
# valid_combined_sh[['path','transcription','semantics', 'intent', 'type']].to_csv('valid_data.csv')
# test_asr[['path','transcription','semantics', 'intent', 'type']].to_csv('test_data.csv')