### Read data

In [1]:
import pandas as pd
from sklearn.utils import shuffle

In [2]:
# Load the three dataset splits from CSV files in the working directory.
train, valid, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'valid_data.csv', 'test_data.csv')
)

In [3]:
# Load the ASR transcript tables; the first CSV column becomes the index.
train_asr, valid_asr, test_asr = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'fluentai_ASRtrain.csv',
        'fluentai_ASRvalid.csv',
        'fluentai_ASRtest.csv',
    )
)

In [4]:
# Preview the first five rows of the training split.
# NOTE(review): the stored output already lists sampleId/uniqueId, which the
# merge cell further down assigns - the output may come from an out-of-order
# run; confirm with Restart & Run All.
train.head(n=5)

Unnamed: 0,path,speakerId,transcription,action,object,location,sampleId,uniqueId
0,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Change language,change language,none,none,0a3129c0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-0a3129c0447411e9a9a55dbec3b8816a
1,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Resume,activate,music,none,0ee42a80447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-0ee42a80447411e9a9a55dbec3b8816a
2,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Turn the lights on,activate,lights,none,144d5be0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-144d5be0447411e9a9a55dbec3b8816a
3,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Switch on the lights,activate,lights,none,1811b6e0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-1811b6e0447411e9a9a55dbec3b8816a
4,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Switch off the lights,deactivate,lights,none,1d9f3920447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-1d9f3920447411e9a9a55dbec3b8816a


In [5]:
# Preview the first five rows of the ASR transcript table for the training split.
train_asr.head(n=5)

Unnamed: 0,asr_transcript,uniqueId
0,In gran'ther he'd being in their barbran,2BqVo8kVB2Skwgyb-029f6450447a11e9a9a55dbec3b8816a
1,It's than life all eh,2BqVo8kVB2Skwgyb-03592c80447c11e9a9a55dbec3b8816a
2,Dawn that bedroom life often,2BqVo8kVB2Skwgyb-063aa8f0447911e9a9a55dbec3b8816a
3,Length heath,2BqVo8kVB2Skwgyb-07858300447a11e9a9a55dbec3b8816a
4,Then they he'd up in the logs shortly,2BqVo8kVB2Skwgyb-0887f780447d11e9a9a55dbec3b8816a


### Merge GT and ASR data

In [6]:
def _sample_id_from_path(path):
    """Return the last component of ``path`` up to its first '.', with dashes removed."""
    return path.split("/")[-1].split(".")[0].replace("-", "")


def _add_unique_id(df):
    """Add ``sampleId`` and ``uniqueId`` columns to ``df`` in place.

    ``sampleId`` is derived from the ``path`` column; ``uniqueId`` is
    ``"<speakerId>-<sampleId>"``, the column the ASR tables are joined on.
    """
    df['sampleId'] = df['path'].map(_sample_id_from_path)
    # Vectorised string concatenation instead of a row-wise join.
    df['uniqueId'] = df['speakerId'] + '-' + df['sampleId']


# Same key construction for every split (previously three copy-pasted lines each).
for _split in (train, valid, test):
    _add_unique_id(_split)

# pd.merge defaults to an inner join: rows whose uniqueId is missing from
# either table are dropped from the result.
train_gt_asr = pd.merge(train, train_asr, on='uniqueId')
valid_gt_asr = pd.merge(valid, valid_asr, on='uniqueId')
test_gt_asr = pd.merge(test, test_asr, on='uniqueId')

In [7]:
# Preview the merged frame: the split's columns plus the matching asr_transcript.
train_gt_asr.head(n=5)

Unnamed: 0,path,speakerId,transcription,action,object,location,sampleId,uniqueId,asr_transcript
0,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Change language,change language,none,none,0a3129c0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-0a3129c0447411e9a9a55dbec3b8816a,Change languish
1,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Resume,activate,music,none,0ee42a80447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-0ee42a80447411e9a9a55dbec3b8816a,Assume
2,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Turn the lights on,activate,lights,none,144d5be0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-144d5be0447411e9a9a55dbec3b8816a,Don the lights on
3,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Switch on the lights,activate,lights,none,1811b6e0447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-1811b6e0447411e9a9a55dbec3b8816a,On the lay this
4,/fluent_speech_commands_dataset/wavs/speakers/...,2BqVo8kVB2Skwgyb,Switch off the lights,deactivate,lights,none,1d9f3920447411e9a9a55dbec3b8816a,2BqVo8kVB2Skwgyb-1d9f3920447411e9a9a55dbec3b8816a,This off the lights


### Save the merged GT + ASR data (this dataset serves as gt_gt)

In [8]:
# Columns written to disk for each merged split.
_merged_columns = ['path', 'transcription', 'asr_transcript', 'action', 'object', 'location']

# NOTE(review): these file names are the same ones read into train/valid/test
# at the top of the notebook, so running this cell replaces those inputs.
for _merged, _csv_name in (
    (train_gt_asr, 'train_data.csv'),
    (valid_gt_asr, 'valid_data.csv'),
    (test_gt_asr, 'test_data.csv'),
):
    _merged[_merged_columns].to_csv(_csv_name)

### Merge the ground-truth and ASR transcripts into ONE column

In [9]:
# Parse each saved split once; the *_asr frame starts out as an independent
# copy of the same data and is reshaped in the cells that follow.
# NOTE: this rebinds train_asr / valid_asr / test_asr, which earlier in the
# notebook held the tables read from the fluentai_ASR*.csv files.
train_raw = pd.read_csv("train_data.csv", index_col=0)
train_asr = train_raw.copy()

valid_raw = pd.read_csv("valid_data.csv", index_col=0)
valid_asr = valid_raw.copy()

test_raw = pd.read_csv("test_data.csv", index_col=0)
test_asr = test_raw.copy()

In [10]:
# Ground-truth variant: drop the ASR column and tag every row as 'raw'.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
train_raw = train_raw.drop(columns=['asr_transcript'], errors='ignore').assign(type='raw')

# ASR variant: the ASR text takes over the 'transcription' column. Renaming by
# name (rather than assigning a positional column list) cannot mislabel columns
# if their order changes, and the guard stops a re-run from dropping the
# already-renamed column.
if 'asr_transcript' in train_asr.columns:
    train_asr = (
        train_asr
        .drop(columns=['transcription'])
        .rename(columns={'asr_transcript': 'transcription'})
    )
train_asr = train_asr.assign(type='asr')

In [11]:
# Show the first five rows of the ground-truth and ASR variants, one after the other.
display(train_raw.head(), train_asr.head())

Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Change language,change language,none,none,raw
1,/fluent_speech_commands_dataset/wavs/speakers/...,Resume,activate,music,none,raw
2,/fluent_speech_commands_dataset/wavs/speakers/...,Turn the lights on,activate,lights,none,raw
3,/fluent_speech_commands_dataset/wavs/speakers/...,Switch on the lights,activate,lights,none,raw
4,/fluent_speech_commands_dataset/wavs/speakers/...,Switch off the lights,deactivate,lights,none,raw


Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Change languish,change language,none,none,asr
1,/fluent_speech_commands_dataset/wavs/speakers/...,Assume,activate,music,none,asr
2,/fluent_speech_commands_dataset/wavs/speakers/...,Don the lights on,activate,lights,none,asr
3,/fluent_speech_commands_dataset/wavs/speakers/...,On the lay this,activate,lights,none,asr
4,/fluent_speech_commands_dataset/wavs/speakers/...,This off the lights,deactivate,lights,none,asr


In [12]:
# Ground-truth variant: drop the ASR column and tag every row as 'raw'.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
valid_raw = valid_raw.drop(columns=['asr_transcript'], errors='ignore').assign(type='raw')

# ASR variant: the ASR text takes over the 'transcription' column. Renaming by
# name (rather than assigning a positional column list) cannot mislabel columns
# if their order changes, and the guard stops a re-run from dropping the
# already-renamed column.
if 'asr_transcript' in valid_asr.columns:
    valid_asr = (
        valid_asr
        .drop(columns=['transcription'])
        .rename(columns={'asr_transcript': 'transcription'})
    )
valid_asr = valid_asr.assign(type='asr')

In [13]:
# Show the first five rows of the ground-truth and ASR variants, one after the other.
display(valid_raw.head(), valid_asr.head())

Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Turn on the lights,activate,lights,none,raw
1,/fluent_speech_commands_dataset/wavs/speakers/...,Turn off the lights,deactivate,lights,none,raw
2,/fluent_speech_commands_dataset/wavs/speakers/...,Change language,change language,none,none,raw
3,/fluent_speech_commands_dataset/wavs/speakers/...,Pause the music,deactivate,music,none,raw
4,/fluent_speech_commands_dataset/wavs/speakers/...,Resume,activate,music,none,raw


Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Turn on the lights,activate,lights,none,asr
1,/fluent_speech_commands_dataset/wavs/speakers/...,Turn off the lights,deactivate,lights,none,asr
2,/fluent_speech_commands_dataset/wavs/speakers/...,Change language,change language,none,none,asr
3,/fluent_speech_commands_dataset/wavs/speakers/...,Caused the music,deactivate,music,none,asr
4,/fluent_speech_commands_dataset/wavs/speakers/...,Resumed,activate,music,none,asr


In [14]:
# Ground-truth variant: drop the ASR column and tag every row as 'raw'.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
test_raw = test_raw.drop(columns=['asr_transcript'], errors='ignore').assign(type='raw')

# ASR variant: the ASR text takes over the 'transcription' column. Renaming by
# name (rather than assigning a positional column list) cannot mislabel columns
# if their order changes, and the guard stops a re-run from dropping the
# already-renamed column.
if 'asr_transcript' in test_asr.columns:
    test_asr = (
        test_asr
        .drop(columns=['transcription'])
        .rename(columns={'asr_transcript': 'transcription'})
    )
test_asr = test_asr.assign(type='asr')

In [15]:
# Show the first five rows of the ground-truth and ASR variants, one after the other.
display(test_raw.head(), test_asr.head())

Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Turn on the lights,activate,lights,none,raw
1,/fluent_speech_commands_dataset/wavs/speakers/...,Turn off the lights,deactivate,lights,none,raw
2,/fluent_speech_commands_dataset/wavs/speakers/...,Change language,change language,none,none,raw
3,/fluent_speech_commands_dataset/wavs/speakers/...,Pause the music,deactivate,music,none,raw
4,/fluent_speech_commands_dataset/wavs/speakers/...,Resume,activate,music,none,raw


Unnamed: 0,path,transcription,action,object,location,type
0,/fluent_speech_commands_dataset/wavs/speakers/...,Turn on the lights,activate,lights,none,asr
1,/fluent_speech_commands_dataset/wavs/speakers/...,Turn off the lights,deactivate,lights,none,asr
2,/fluent_speech_commands_dataset/wavs/speakers/...,Change language,change language,none,none,asr
3,/fluent_speech_commands_dataset/wavs/speakers/...,Paused the music,deactivate,music,none,asr
4,/fluent_speech_commands_dataset/wavs/speakers/...,Resumed,activate,music,none,asr


In [16]:
# Stack the ground-truth rows on top of the ASR rows for every split.
# The index is not reset, so each index label appears once per variant.
train_combined, valid_combined, test_combined = (
    pd.concat(list(_pair))
    for _pair in (
        (train_raw, train_asr),
        (valid_raw, valid_asr),
        (test_raw, test_asr),
    )
)

In [17]:
# Row counts of the combined (ground truth + ASR) splits.
tuple(len(_frame) for _frame in (train_combined, valid_combined, test_combined))

(46264, 6236, 7586)

In [18]:
# Row counts of the ground-truth-only splits, for comparison with the combined ones.
tuple(len(_frame) for _frame in (train_raw, valid_raw, test_raw))

(23132, 3118, 3793)

In [19]:
# Shuffle the rows of each combined split with a fixed seed so the order is reproducible.
_shuffle_seed = 42
train_combined_sh, valid_combined_sh, test_combined_sh = (
    shuffle(_frame, random_state=_shuffle_seed)
    for _frame in (train_combined, valid_combined, test_combined)
)

### Save gt_asr (train/valid: ground truth; test: ASR)

In [20]:
# Disabled on purpose: uncommenting writes the ground-truth train/valid frames
# and the ASR test frame to train_data.csv / valid_data.csv / test_data.csv.
# The other commented-out save cells target the same file names, so enable at
# most one of them per run.
# train_raw[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('train_data.csv')
# valid_raw[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('valid_data.csv')
# test_asr[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('test_data.csv')

### Save gtasr_gt (train/valid: ground truth + ASR, shuffled; test: ground truth)

In [21]:
# Disabled on purpose: uncommenting writes the shuffled combined train/valid
# frames and the ground-truth test frame to train_data.csv / valid_data.csv /
# test_data.csv. The other commented-out save cells target the same file
# names, so enable at most one of them per run.
# train_combined_sh[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('train_data.csv')
# valid_combined_sh[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('valid_data.csv')
# test_raw[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('test_data.csv')

### Save gtasr_asr (train/valid: ground truth + ASR, shuffled; test: ASR)

In [22]:
# Disabled on purpose: uncommenting writes the shuffled combined train/valid
# frames and the ASR test frame to train_data.csv / valid_data.csv /
# test_data.csv. The other commented-out save cells target the same file
# names, so enable at most one of them per run.
# train_combined_sh[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('train_data.csv')
# valid_combined_sh[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('valid_data.csv')
# test_asr[['path', 'transcription', 'action', 'object', 'location', 'type']].to_csv('test_data.csv')