# prepare text and audio_paths files

## preprocess audio 

In [43]:
import os
import pandas as pd
import random

seed = 17
random.seed(seed)

data_dir = "data/midea_2173"   # your midea data directory

df_trans = pd.read_csv(os.path.join(data_dir, 'transcripts.csv'))

print(df_trans.shape)
df_trans.rename(columns={'content': 'sentence', 'record_file_id': 'file_id'}, inplace=True)
df_trans.head()

(2173, 4)


Unnamed: 0,category_name,sentence,user_id,file_id
0,Model,CMSN 20 si,yeungchimkuen,2956c101-d82f-4f2c-8f1b-6119fa87694d.wav
1,Model,CMSRO 20di cr,yeungchimkuen,c124f9b4-38e1-4cee-8b41-49f57f069806.wav
2,Model,CMSRO 20di rd,yeungchimkuen,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.wav
3,Model,AF-52CS1TRHK(H),yeungchimkuen,5102309c-8fb3-4232-b257-2bd8ed6ef53e.wav
4,Model,AF-74CS1TRHK(H),yeungchimkuen,51ab811e-12a0-47b6-834d-939b8b4ec144.wav


## convert amr to wav

In [44]:
!pip install pydub

In [45]:
from pydub import AudioSegment
from tqdm import tqdm

for i, row in tqdm(df_trans.iterrows(), total=df_trans.shape[0]):
    filename = row['file_id'][:-3]
    amr_audio = AudioSegment.from_file(data_dir+f'/amrs/{filename}amr', format="amr")
    amr_audio.export(data_dir+f'/wavs/{filename}wav', format="wav")
    df_trans.loc[i, 'file_id'] = f'{filename}wav'

df_trans.to_csv(data_dir+'/transcripts.csv', index=False)
df_trans.head()

## combine different datasets to form a custom dataset

In [46]:
# custom_dir = 'data/custom_data_v1' # 2k mdcc, 2k cmcc, 318 midea
# custom_dir = 'data/custom_data_v2' # 65120 mdcc, 8429 cmcc, 300 midea
# custom_dir = 'data/custom_data_v3' # 65120 mdcc, 8429 cmcc
# custom_dir = 'data/custom_data_v4' # 5k mdcc, 5k cmcc
# custom_dir = 'data/custom_data_v5' # 2k mdcc, 2k cmcc
# custom_dir = 'data/custom_data_v6' # 5k mdcc, 5k cmcc, 318 midea
custom_dir = 'data/custom_data_v7' # 5k mdcc, 5k cmcc, 1303 midea
# custom_dir = 'data/custom_data_v8' # 1k mdcc, 1k cmcc, 1303 midea

os.makedirs(custom_dir, exist_ok=True)

text_path = os.path.join(custom_dir, 'text')
audio_paths_path = os.path.join(custom_dir, 'audio_paths')

### MDCC data

In [47]:
import uuid
import shutil

mdcc_dir = 'data/MDCC'

splits = ['train', 'valid', 'test']
# counts = [2000, 200, 200]
# counts = [65120, 5663, 12492]
# counts = [65120, 5663, 12492]
# counts = [5000, 1000, 1000]
# counts = [2000, 200, 200]
# counts = [5000, 200, 200]
counts = [5000, 500, 500]
# counts = [1000, 200, 200]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    df = pd.read_csv(os.path.join(mdcc_dir, f'cnt_asr_{split}_metadata.csv'))
    print(split, df.shape)
    print(df.head())
    
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)
    with open(os.path.join(custom_dir, split, 'text'), 'a') as fo1, open(os.path.join(custom_dir, split, 'audio_paths'), 'a') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            with open(os.path.join(mdcc_dir, row['text_path'][2:])) as fi:
                trans = fi.read()
                trans = trans.strip()

            audio_path = os.path.realpath(mdcc_dir+'/'+row['audio_path'][2:])

            # shutil.copy2(audio_src, audio_dest)
            
            # if i < df_sample.shape[0] - 1:
            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")
            # else:
            #     fo1.write(f"{unique_id} {trans}\n")
            #     fo2.write(f"{unique_id} {audio_path}\n")

train (65120, 4)
                                          audio_path  \
0     ./audio/447_1709011939_16988_371.88_376.92.wav   
1       ./audio/447_1810221351_22994_56.66_57.71.wav   
2     ./audio/447_1711162014_17384_806.04_809.18.wav   
3     ./audio/447_1711171151_23819_347.16_349.66.wav   
4  ./audio/447_1810221419_57894_113.52_121.42001.wav   

                                           text_path     sex  duration  
0  ./transcription/447_1709011939_16988_371.88_37...    male   5.04000  
1  ./transcription/447_1810221351_22994_56.66_57....    male   1.05000  
2  ./transcription/447_1711162014_17384_806.04_80...    male   3.14000  
3  ./transcription/447_1711171151_23819_347.16_34...    male   2.50000  
4  ./transcription/447_1810221419_57894_113.52_12...  female   7.90001  
valid (5663, 4)
                                         audio_path  \
0      ./audio/447_1709011939_75569_596.5_600.1.wav   
1  ./audio/447_1711171106_19828_2.19996_6.49001.wav   
2       ./audio/447_1707171

## Common Voice 17.0

In [48]:
import uuid
import shutil

common_dir = 'data/cv-corpus-17.0-2024-03-15/zh-HK'
splits = ['train', 'valid', 'test']
# counts = [2000, 200, 200]
# counts = [8429, 5595, 5595]
# counts = [8429, 5595, 5595]
# counts = [5000, 1000, 1000]
# counts = [2000, 200, 200]
# counts = [5000, 200, 200]
counts = [5000, 500, 500]
# counts = [1000, 200, 200]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    
    df = pd.read_csv(os.path.join(common_dir, f'{split}.tsv'), sep='\t')
    print(split, df.shape)
    # print(df.head())
    
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)
    with open(os.path.join(custom_dir, split, 'text'), 'a') as fo1, open(os.path.join(custom_dir, split, 'audio_paths'), 'a') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            trans = row['sentence']
            audio_path = os.path.realpath(common_dir+'/clips/'+row['path'])

            # shutil.copy2(audio_src, audio_dest)

            # if i < df_sample.shape[0] - 1:
            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")
            # else:
            #     fo1.write(f"{unique_id} {trans}\n")
            #     fo2.write(f"{unique_id} {audio_path}\n")

train (8429, 13)
valid (5595, 13)
test (5595, 13)


## Midea Data

In [49]:
import uuid
import shutil

midea_dir = 'data/midea_2173'
splits = ['train', 'valid', 'test']
ratios = [0.6, 0.2, 0.2]

df = pd.read_csv(os.path.join(midea_dir, f'transcripts.csv'))
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
print(df.shape)
print(df.head())

prev_count = 0
num = df.shape[0]
for split, ratio in zip(splits, ratios):
    save_dir = os.path.join(custom_dir, split)
    
    count = int(num*ratio)
    print(split, count)

    df_sample = df.loc[prev_count:prev_count+count]
    prev_count += count
    
    with open(os.path.join(custom_dir, split, 'text'), 'a') as fo1, open(os.path.join(custom_dir, split, 'audio_paths'), 'a') as fo2:
        for i, row in df_sample.iterrows():
            unique_id = str(uuid.uuid4())
            trans = row['sentence']
            audio_path = os.path.realpath(midea_dir+'/wavs/'+row['file_id'])

            # shutil.copy2(audio_src, audio_dest)

            # if i < df_sample.shape[0] - 1:
            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")
            # else:
            #     fo1.write(f"{unique_id} {trans}")
            #     fo2.write(f"{unique_id} {audio_path}")

(2173, 4)
      category_name      sentence           user_id  \
0             Model  AW-7480H(IL)  ex_hangyee.kwong   
1           Surname             盛  ex_hangyee.kwong   
2             Model   RC-5SLIH(W)     yeungchimkuen   
3  Product Category           氣炸鍋     yeungchimkuen   
4             Model        TAS-X3     yeungchimkuen   

                                    file_id  
0  16193c60-6ddf-4647-a9f1-4c576ec73076.wav  
1  5aae676f-f5f6-4d85-8a54-33f3d98b4b83.wav  
2  d935c585-8b32-4abf-9306-081f0ca41256.wav  
3  e5c00f0c-81f4-43a1-8c00-987ee1952c89.wav  
4  ce627395-3fa6-4ec8-94dd-b42b1fc700b9.wav  
train 1303
valid 434
test 434


In [24]:
# # remove dirs

# shutil.rmtree(custom_dir)

# process data

In [50]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/test \
--output_data_dir data/${custom_dir}/test

Casting the dataset: 100%|██████████| 835/835 [00:00<00:00, 24586.98 examples/s]
Saving the dataset (1/1 shards): 100%|█| 835/835 [00:03<00:00, 242.64 examples/s
Data preparation done


In [51]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/valid \
--output_data_dir data/${custom_dir}/valid

Casting the dataset: 100%|█████████| 835/835 [00:00<00:00, 512098.82 examples/s]
Saving the dataset (1/1 shards): 100%|█| 835/835 [00:05<00:00, 159.94 examples/s
Data preparation done


In [52]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/train \
--output_data_dir data/${custom_dir}/train

Casting the dataset: 100%|██████| 3304/3304 [00:00<00:00, 1630734.34 examples/s]
Saving the dataset (2/2 shards): 100%|█| 3304/3304 [00:20<00:00, 160.18 examples
Data preparation done


# Try to load the processed data

In [None]:
# import argparse
# from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets

# train_datasets = ['data/custom_data_v1/train']
# eval_datasets = ['data/custom_data_v1/valid']

# def load_custom_dataset(split):
#     ds = []
#     if split == 'train':
#         for dset in train_datasets:
#             ds.append(load_from_disk(dset))
#     if split == 'eval':
#         for dset in eval_datasets:
#             ds.append(load_from_disk(dset))

#     ds_to_return = concatenate_datasets(ds)
#     ds_to_return = ds_to_return.shuffle(seed=22)
#     return ds_to_return

# ds_to_return = load_custom_dataset('eval')
# ds_to_return

# for ex in ds_to_return:
#     print(ex)

# def prepare_dataset(batch):
#     # load and (possibly) resample audio data to 16kHz
#     audio = batch["audio"]

#     # compute log-Mel input features from input audio array 
#     batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
#     # compute input length of audio sample in seconds
#     batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    
#     # optional pre-processing steps
#     transcription = batch["sentence"]
#     if do_lower_case:
#         transcription = transcription.lower()
#     if do_remove_punctuation:
#         transcription = normalizer(transcription).strip()
    
#     # encode target text to label ids
#     batch["labels"] = processor.tokenizer(transcription).input_ids
#     return batch

# max_label_length = 225 # model.config.max_length
# min_input_length = 0.0
# max_input_length = 30.0
# def is_in_length_range(length, labels):
#     return min_input_length < length < max_input_length and 0 < len(labels) < max_label_length


# print('DATASET PREPARATION IN PROGRESS...')
# raw_dataset = DatasetDict()
# # raw_dataset["train"] = load_custom_dataset('train')
# raw_dataset["eval"] = load_custom_dataset('eval')

# raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=args.sampling_rate))
# raw_dataset = raw_dataset.map(prepare_dataset, num_proc=args.num_proc)

# raw_dataset = raw_dataset.filter(
#     is_in_length_range,
#     input_columns=["input_length", "labels"],
#     num_proc=args.num_proc,
# )

# ###############################     DATA COLLATOR AND METRIC DEFINITION     ########################

# @dataclass
# class DataCollatorSpeechSeq2SeqWithPadding:
#     processor: Any

#     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
#         # split inputs and labels since they have to be of different lengths and need different padding methods
#         # first treat the audio inputs by simply returning torch tensors
#         input_features = [{"input_features": feature["input_features"]} for feature in features]
#         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

#         # get the tokenized label sequences
#         label_features = [{"input_ids": feature["labels"]} for feature in features]
#         # pad the labels to max length
#         labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

#         # replace padding with -100 to ignore loss correctly
#         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

#         # if bos token is appended in previous tokenization step,
#         # cut bos token here as it's append later anyways
#         if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
#             labels = labels[:, 1:]

#         batch["labels"] = labels

#         return batch

# data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# print('DATASET PREPARATION COMPLETED')

# separate different channels of wavs

In [32]:
# # %pip install scipy

# import numpy as np
# import scipy.io.wavfile as wavfile

# # Read the multi-channel WAV file
# input_filename = 'data/midea_0612/wavs/d5f2afaa-53af-4dcb-ac24-3827a99c748e.wav'
# sample_rate, data = wavfile.read(input_filename)

# # Ensure the data is two-dimensional (i.e., multiple channels)
# if len(data.shape) == 1:
#     raise ValueError("The provided WAV file is not multi-channel.")

# # Get the number of channels
# num_channels = data.shape[1]

# # Loop through each channel and save it as a separate WAV file
# for i in range(num_channels):
#     channel_data = data[:, i]
    
#     output_filename = input_filename[:-4].replace('wavs', 'wavs_1channel')+f'_c{i}.wav'
#     wavfile.write(output_filename, sample_rate, channel_data)
#     print(f'Channel {i+1} saved as {output_filename}')


# ## another way to separate different channels

In [None]:
# !pip install pydub

In [None]:
from pydub import AudioSegment

def split_channels(audio_path, output_prefix):
    # Load the multi-channel audio file
    audio = AudioSegment.from_file(audio_path)

    # Get the number of channels
    channels = audio.split_to_mono()
    print(len(channels))

    # Save each channel as a separate single-channel audio file
    for i, channel in enumerate(channels):
        output_path = f"{output_prefix}_channel_{i+1}.wav"
        channel.export(output_path, format="wav")
        print(f"Saved {output_path}")

# Example usage
audio_path = 'data/midea_0612/wavs/3ee4b9f4-7674-4978-9066-d89b46c9adb4.wav'
output_prefix = "data/midea_0612/wavs_1channel/3ee4b9f4-7674-4978-9066-d89b46c9adb4"
split_channels(audio_path, output_prefix)

In [2]:
from pydub import AudioSegment
import os
from tqdm import tqdm

# Function to split audio file into chunks of given duration
def split_audio(file_path, chunk_length_ms=10000):
    audio = AudioSegment.from_wav(file_path)
    total_length_ms = len(audio)
    
    chunks = []
    for start_ms in range(0, total_length_ms, chunk_length_ms):
        end_ms = min(start_ms + chunk_length_ms, total_length_ms)
        chunk = audio[start_ms:end_ms]
        chunks.append(chunk)
    
    return chunks

# Function to save audio chunks to files and separate channels
def save_chunks_with_channels(chunks, output_dir, base_filename="chunk", chunk_length=10):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for i, chunk in enumerate(chunks):
        # Separate channels
        left_channel = chunk.split_to_mono()[0]
        right_channel = chunk.split_to_mono()[1]
        
        # Save left and right channels separately
        left_filename = os.path.join(output_dir, f"{base_filename}_{i*chunk_length}_customer.wav")
        right_filename = os.path.join(output_dir, f"{base_filename}_{i*chunk_length}_agent.wav")
        
        left_channel.export(left_filename, format="wav")
        right_channel.export(right_filename, format="wav")
        
        # print(f"Saved chunk {i} left channel: {left_filename}")
        # print(f"Saved chunk {i} right channel: {right_filename}")

# Define the file path and output directory
data_dir = 'data/midea_dialogue/long'
filenames = [filename for filename in os.listdir('data/midea_dialogue/long') if filename.endswith('wav')]
print(len(filenames), filenames[:5])

chunk_length_ms = 30000 # ms

for filename in tqdm(filenames, total=len(filenames)):
    input_file_path = os.path.join(data_dir, filename)
    output_directory = "data/midea_dialogue/short_30s"

    # Split the audio file into 10-second chunks
    chunks = split_audio(input_file_path, chunk_length_ms=chunk_length_ms)

    basename = filename.split('.')[0]
    # Save the chunks with separate channels
    save_chunks_with_channels(chunks, output_directory, basename, chunk_length=(chunk_length_ms//1000))

18 ['f4857f5a-201d-4eea-ad3b-b8b048e72b68.wav', 'aec95ea8-e448-45a6-be21-63a6b6326b9f.wav', '8fcf8eb4-d4da-4146-a42c-d3ba711b4ed0.wav', 'd5f2afaa-53af-4dcb-ac24-3827a99c748e.wav', '9bdd9ccf-d94b-4f38-8a5d-35e4dd43aeee.wav']


100%|██████████| 18/18 [00:15<00:00,  1.18it/s]
