# Prepare `text` and `audio_paths` files

## preprocess audio 

In [43]:
import os
import pandas as pd
import random

# One seed drives both the `random` module and the pandas sampling
# (`random_state=seed`) used when the datasets are combined.
seed = 17
random.seed(seed)

# Root folder of the Midea recordings; adjust to your local layout.
data_dir = "data/midea_2173"   # your midea data directory

transcripts_csv = os.path.join(data_dir, 'transcripts.csv')
df_trans = pd.read_csv(transcripts_csv)
print(df_trans.shape)

# Map the raw export's column names onto the names used by the rest of the
# notebook. Columns that are already renamed are left untouched.
column_aliases = {'content': 'sentence', 'record_file_id': 'file_id'}
df_trans = df_trans.rename(columns=column_aliases)
df_trans.head()

(2173, 4)


Unnamed: 0,category_name,sentence,user_id,file_id
0,Model,CMSN 20 si,yeungchimkuen,2956c101-d82f-4f2c-8f1b-6119fa87694d.wav
1,Model,CMSRO 20di cr,yeungchimkuen,c124f9b4-38e1-4cee-8b41-49f57f069806.wav
2,Model,CMSRO 20di rd,yeungchimkuen,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.wav
3,Model,AF-52CS1TRHK(H),yeungchimkuen,5102309c-8fb3-4232-b257-2bd8ed6ef53e.wav
4,Model,AF-74CS1TRHK(H),yeungchimkuen,51ab811e-12a0-47b6-834d-939b8b4ec144.wav


## convert amr to wav

In [44]:
!pip install pydub

In [45]:
from pydub import AudioSegment
from tqdm import tqdm

# Convert every recording listed in the transcripts from AMR to WAV and make
# `file_id` point at the converted file.
amr_dir = os.path.join(data_dir, 'amrs')
wav_dir = os.path.join(data_dir, 'wavs')
# The export below does not create missing directories, so make sure the
# output folder exists before the loop starts.
os.makedirs(wav_dir, exist_ok=True)

for i, row in tqdm(df_trans.iterrows(), total=df_trans.shape[0]):
    # Strip the extension properly instead of assuming it is 3 characters long.
    stem = os.path.splitext(row['file_id'])[0]
    amr_audio = AudioSegment.from_file(os.path.join(amr_dir, f'{stem}.amr'), format="amr")
    amr_audio.export(os.path.join(wav_dir, f'{stem}.wav'), format="wav")
    df_trans.loc[i, 'file_id'] = f'{stem}.wav'

# NOTE: this overwrites the original transcripts file with the renamed
# columns and the updated `.wav` file ids.
df_trans.to_csv(os.path.join(data_dir, 'transcripts.csv'), index=False)
df_trans.head()

## combine different datasets to form a custom dataset

In [46]:
# Output root of the combined corpus
# (training portion: 5k MDCC + 5k Common Voice + 1303 Midea utterances).
custom_dir = 'data/custom_data_v0'

# Manifest locations directly under the output root.
text_path, audio_paths_path = (
    os.path.join(custom_dir, leaf) for leaf in ('text', 'audio_paths')
)

# Create the output root; a no-op when it already exists.
os.makedirs(custom_dir, exist_ok=True)

### MDCC data

In [47]:
import uuid
import shutil

# Sample utterances from MDCC and append them to the per-split manifests
# (`text`: "<id> <transcript>", `audio_paths`: "<id> <absolute wav path>").
mdcc_dir = 'data/MDCC'

splits = ['train', 'valid', 'test']

# Maximum number of utterances taken from MDCC per split (same order as `splits`).
counts = [5000, 500, 500]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    os.makedirs(save_dir, exist_ok=True)

    df = pd.read_csv(os.path.join(mdcc_dir, f'cnt_asr_{split}_metadata.csv'))
    print(split, df.shape)
    print(df.head())

    # Cap the sample size at the number of available rows so that a split
    # smaller than `count` does not make `sample` raise.
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)

    # NOTE: the manifests are opened in append mode because the other dataset
    # cells add to the same files; re-running this cell appends the samples
    # again, so delete `custom_dir` before a full re-run.
    # The transcripts contain Chinese text, so the encoding is set explicitly
    # instead of relying on the platform default.
    with open(os.path.join(save_dir, 'text'), 'a', encoding='utf-8') as fo1, \
         open(os.path.join(save_dir, 'audio_paths'), 'a', encoding='utf-8') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            # Metadata paths start with './'; drop it before joining onto mdcc_dir.
            # Assumes the transcription files are UTF-8 encoded -- TODO confirm.
            with open(os.path.join(mdcc_dir, row['text_path'][2:]), encoding='utf-8') as fi:
                trans = fi.read().strip()

            audio_path = os.path.realpath(mdcc_dir+'/'+row['audio_path'][2:])

            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")

train (65120, 4)
                                          audio_path  \
0     ./audio/447_1709011939_16988_371.88_376.92.wav   
1       ./audio/447_1810221351_22994_56.66_57.71.wav   
2     ./audio/447_1711162014_17384_806.04_809.18.wav   
3     ./audio/447_1711171151_23819_347.16_349.66.wav   
4  ./audio/447_1810221419_57894_113.52_121.42001.wav   

                                           text_path     sex  duration  
0  ./transcription/447_1709011939_16988_371.88_37...    male   5.04000  
1  ./transcription/447_1810221351_22994_56.66_57....    male   1.05000  
2  ./transcription/447_1711162014_17384_806.04_80...    male   3.14000  
3  ./transcription/447_1711171151_23819_347.16_34...    male   2.50000  
4  ./transcription/447_1810221419_57894_113.52_12...  female   7.90001  
valid (5663, 4)
                                         audio_path  \
0      ./audio/447_1709011939_75569_596.5_600.1.wav   
1  ./audio/447_1711171106_19828_2.19996_6.49001.wav   
2       ./audio/447_1707171

### Common Voice 17.0

In [48]:
import uuid
import shutil

# Sample utterances from Common Voice 17.0 (zh-HK) and append them to the
# per-split manifests (`text` and `audio_paths`).
common_dir = 'data/cv-corpus-17.0-2024-03-15/zh-HK'
splits = ['train', 'valid', 'test']

# Maximum number of utterances taken from Common Voice per split.
counts = [5000, 500, 500]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    # Create the split folder here as well so this cell does not depend on
    # another dataset cell having been run first.
    os.makedirs(save_dir, exist_ok=True)

    df = pd.read_csv(os.path.join(common_dir, f'{split}.tsv'), sep='\t')
    print(split, df.shape)

    # Cap the sample size at the number of available rows.
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)

    # Append mode: the manifests are shared with the other dataset cells.
    # The encoding is explicit because the sentences are Chinese text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(save_dir, 'text'), 'a', encoding='utf-8') as fo1, \
         open(os.path.join(save_dir, 'audio_paths'), 'a', encoding='utf-8') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            trans = row['sentence']
            audio_path = os.path.realpath(common_dir+'/clips/'+row['path'])

            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")

train (8429, 13)
valid (5595, 13)
test (5595, 13)


### Midea data

In [49]:
import uuid
import shutil

# Shuffle the Midea transcripts, cut them into train/valid/test by ratio and
# append each part to the per-split manifests (`text` and `audio_paths`).
midea_dir = 'data/midea_2173'
splits = ['train', 'valid', 'test']
ratios = [0.6, 0.2, 0.2]

df = pd.read_csv(os.path.join(midea_dir, 'transcripts.csv'))
# Shuffle once with a fixed seed, then reset the index so rows are numbered
# 0..n-1 in shuffled order.
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
print(df.shape)
print(df.head())

prev_count = 0
num = df.shape[0]
for split, ratio in zip(splits, ratios):
    save_dir = os.path.join(custom_dir, split)
    # Create the split folder here as well so this cell does not depend on
    # another dataset cell having been run first.
    os.makedirs(save_dir, exist_ok=True)

    # int() truncates, so a few trailing rows may not be assigned to any split.
    count = int(num*ratio)
    print(split, count)

    # Positional, end-exclusive slice. Label-based `.loc[a:b]` includes row
    # `b`, which would give every split one extra row and put the boundary
    # row into two neighbouring splits (train/valid and valid/test leakage).
    df_sample = df.iloc[prev_count:prev_count+count]
    prev_count += count

    # Append mode: the manifests are shared with the other dataset cells.
    # The encoding is explicit because the sentences contain Chinese text.
    with open(os.path.join(save_dir, 'text'), 'a', encoding='utf-8') as fo1, \
         open(os.path.join(save_dir, 'audio_paths'), 'a', encoding='utf-8') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            trans = row['sentence']
            audio_path = os.path.realpath(midea_dir+'/wavs/'+row['file_id'])

            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")

(2173, 4)
      category_name      sentence           user_id  \
0             Model  AW-7480H(IL)  ex_hangyee.kwong   
1           Surname             盛  ex_hangyee.kwong   
2             Model   RC-5SLIH(W)     yeungchimkuen   
3  Product Category           氣炸鍋     yeungchimkuen   
4             Model        TAS-X3     yeungchimkuen   

                                    file_id  
0  16193c60-6ddf-4647-a9f1-4c576ec73076.wav  
1  5aae676f-f5f6-4d85-8a54-33f3d98b4b83.wav  
2  d935c585-8b32-4abf-9306-081f0ca41256.wav  
3  e5c00f0c-81f4-43a1-8c00-987ee1952c89.wav  
4  ce627395-3fa6-4ec8-94dd-b42b1fc700b9.wav  
train 1303
valid 434
test 434


In [24]:
# # remove dirs

# shutil.rmtree(custom_dir)

# process data

In [50]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/test \
--output_data_dir data/${custom_dir}/test

Casting the dataset: 100%|██████████| 835/835 [00:00<00:00, 24586.98 examples/s]
Saving the dataset (1/1 shards): 100%|█| 835/835 [00:03<00:00, 242.64 examples/s
Data preparation done


In [51]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/valid \
--output_data_dir data/${custom_dir}/valid

Casting the dataset: 100%|█████████| 835/835 [00:00<00:00, 512098.82 examples/s]
Saving the dataset (1/1 shards): 100%|█| 835/835 [00:05<00:00, 159.94 examples/s
Data preparation done


In [52]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/train \
--output_data_dir data/${custom_dir}/train

Casting the dataset: 100%|██████| 3304/3304 [00:00<00:00, 1630734.34 examples/s]
Saving the dataset (2/2 shards): 100%|█| 3304/3304 [00:20<00:00, 160.18 examples
Data preparation done
