# prepare text and audio_paths files

## preprocess audio 

In [11]:
import os
import pandas as pd
import random

# One fixed seed shared by every sampling step in this notebook.
seed = 17
random.seed(seed)

# Folder holding the Midea recordings and their transcript table.
data_dir = "data/midea_data_500"
transcripts_csv = os.path.join(data_dir, 'transcripts.csv')

df_trans = pd.read_csv(transcripts_csv)
print(df_trans.shape)

# Use the column names the later cells read ('sentence', 'file_id').
column_aliases = {'content': 'sentence', 'record_file_id': 'file_id'}
df_trans = df_trans.rename(columns=column_aliases)
df_trans.head()

(531, 4)


Unnamed: 0,category_name,sentence,user_id,file_id
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.mp3
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.mp3
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.mp3
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.mp3
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.mp3


## convert amr to mp3

In [12]:
#!pip install pydub

In [13]:
# from pydub import AudioSegment
# from tqdm import tqdm

# for i, row in tqdm(df_trans.iterrows(), total=df_trans.shape[0]):
#     filename = row['file_id']
#     amr_audio = AudioSegment.from_file(data_dir+f'/amrs/{filename}', format="amr")
#     amr_audio.export(data_dir+f'/clips/{filename[:-3]}mp3', format="mp3")
#     df_trans.loc[i, 'file_id'] = f'{filename[:-3]}mp3'

# df_trans.to_csv(data_dir+'/transcripts.csv', index=False)
# df_trans.head()

## combine different datasets to form a custom dataset

In [14]:
# Dataset recipes built so far (only the last one is active):
#   data/custom_data_v1 -- 2k mdcc, 2k cmcc, 300 midea
#   data/custom_data_v2 -- 65120 mdcc, 8429 cmcc, 300 midea
#   data/custom_data_v3 -- 65120 mdcc, 8429 cmcc
custom_dir = 'data/custom_data_v4' # 5k mdcc, 5k cmcc

os.makedirs(custom_dir, exist_ok=True)

# Top-level manifest locations inside the active recipe folder.
text_path, audio_paths_path = (
    os.path.join(custom_dir, leaf) for leaf in ('text', 'audio_paths')
)

### MDCC data

In [15]:
import uuid
import shutil

mdcc_dir = 'data/MDCC'

splits = ['train', 'valid', 'test']
# Sampling budgets per split. Earlier recipes used [2000, 200, 200] and the
# full corpus [65120, 5663, 12492].
counts = [5000, 1000, 1000]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    os.makedirs(save_dir, exist_ok=True)

    df = pd.read_csv(os.path.join(mdcc_dir, f'cnt_asr_{split}_metadata.csv'))
    print(split, df.shape)
    print(df.head())

    # Never ask for more rows than the split has.
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)

    skipped = 0
    # Append mode lets the other corpora add to the same manifests, but it also
    # means re-running this cell duplicates its entries: delete custom_dir first.
    # UTF-8 is explicit because the transcripts are not ASCII and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(save_dir, 'text'), 'a', encoding='utf-8') as fo1, \
            open(os.path.join(save_dir, 'audio_paths'), 'a', encoding='utf-8') as fo2:
        for _, row in df_sample.iterrows():
            # Metadata paths are relative to mdcc_dir (e.g. './transcription/...');
            # joining and normalising handles the './' prefix without slicing.
            transcript_file = os.path.normpath(os.path.join(mdcc_dir, row['text_path']))
            with open(transcript_file, encoding='utf-8') as fi:
                # Collapse all whitespace so a transcript can never span several
                # manifest lines (the format is one "<id> <text>" pair per line).
                trans = ' '.join(fi.read().split())

            if not trans:
                # An entry without text cannot be used as a training target.
                skipped += 1
                continue

            unique_id = str(uuid.uuid4())
            audio_path = os.path.realpath(os.path.join(mdcc_dir, row['audio_path']))

            # shutil.copy2(audio_src, audio_dest)

            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")

    if skipped:
        print(f'{split}: skipped {skipped} rows with an empty transcript')

train (65120, 4)
                                          audio_path  \
0     ./audio/447_1709011939_16988_371.88_376.92.wav   
1       ./audio/447_1810221351_22994_56.66_57.71.wav   
2     ./audio/447_1711162014_17384_806.04_809.18.wav   
3     ./audio/447_1711171151_23819_347.16_349.66.wav   
4  ./audio/447_1810221419_57894_113.52_121.42001.wav   

                                           text_path     sex  duration  
0  ./transcription/447_1709011939_16988_371.88_37...    male   5.04000  
1  ./transcription/447_1810221351_22994_56.66_57....    male   1.05000  
2  ./transcription/447_1711162014_17384_806.04_80...    male   3.14000  
3  ./transcription/447_1711171151_23819_347.16_34...    male   2.50000  
4  ./transcription/447_1810221419_57894_113.52_12...  female   7.90001  
valid (5663, 4)
                                         audio_path  \
0      ./audio/447_1709011939_75569_596.5_600.1.wav   
1  ./audio/447_1711171106_19828_2.19996_6.49001.wav   
2       ./audio/447_1707171

## Common Voice 17.0

In [16]:
import uuid
import shutil

common_dir = 'data/cv-corpus-17.0-2024-03-15/zh-HK'
splits = ['train', 'valid', 'test']
# Sampling budgets per split. Earlier recipes used [2000, 200, 200] and
# [8429, 5595, 5595].
counts = [5000, 1000, 1000]

for split, count in zip(splits, counts):
    save_dir = os.path.join(custom_dir, split)
    # Create the split folder here too, so this cell does not depend on the
    # MDCC cell having been run first.
    os.makedirs(save_dir, exist_ok=True)

    df = pd.read_csv(os.path.join(common_dir, f'{split}.tsv'), sep='\t')
    print(split, df.shape)
    # print(df.head())

    # Never ask for more rows than the split has.
    df_sample = df.sample(n=min(count, df.shape[0]), random_state=seed)

    skipped = 0
    # Append mode: entries are added after whatever is already in the manifests,
    # so re-running this cell duplicates its rows unless custom_dir is removed first.
    with open(os.path.join(save_dir, 'text'), 'a', encoding='utf-8') as fo1, \
            open(os.path.join(save_dir, 'audio_paths'), 'a', encoding='utf-8') as fo2:
        for _, row in df_sample.iterrows():
            sentence = row['sentence']
            if pd.isna(sentence):
                # A missing sentence would otherwise be written as the text "nan".
                skipped += 1
                continue

            # Collapse whitespace so every utterance stays on one manifest line.
            trans = ' '.join(str(sentence).split())
            if not trans:
                skipped += 1
                continue

            unique_id = str(uuid.uuid4())
            audio_path = os.path.realpath(os.path.join(common_dir, 'clips', row['path']))

            # shutil.copy2(audio_src, audio_dest)

            fo1.write(f"{unique_id} {trans}\n")
            fo2.write(f"{unique_id} {audio_path}\n")

    if skipped:
        print(f'{split}: skipped {skipped} rows without a usable sentence')

train (8429, 13)
valid (5595, 13)
test (5595, 13)


## Midea Data

In [None]:
# Disabled cell: shuffles the Midea transcripts and appends a 60/20/20
# train/valid/test split of them to the manifests under custom_dir.
# NOTE(review): before re-enabling, change `df.loc[prev_count:prev_count+count]`
# to `df.iloc[prev_count:prev_count+count]`. `.loc` slices include the end label
# and the index was reset to 0..n-1, so each split would also contain the first
# row of the next split (train/valid/test would overlap).
# import uuid
# import shutil

# midea_dir = 'data/midea_data_500'
# splits = ['train', 'valid', 'test']
# ratios = [0.6, 0.2, 0.2]

# df = pd.read_csv(os.path.join(midea_dir, f'transcripts.csv'))
# df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
# print(df.shape)
# # print(df.head())

# prev_count = 0
# num = df.shape[0]
# for split, ratio in zip(splits, ratios):
#     save_dir = os.path.join(custom_dir, split)
    
#     count = int(num*ratio)
#     print(split, count)

#     df_sample = df.loc[prev_count:prev_count+count]
#     prev_count += count
    
#     with open(os.path.join(custom_dir, split, 'text'), 'a') as fo1, open(os.path.join(custom_dir, split, 'audio_paths'), 'a') as fo2:
#         for i, row in df_sample.iterrows():
#             unique_id = str(uuid.uuid4())
#             trans = row['sentence']
#             audio_path = os.path.realpath(midea_dir+'/clips/'+row['file_id'])

#             # shutil.copy2(audio_src, audio_dest)

#             fo1.write(f"{unique_id} {trans}\n")
#             fo2.write(f"{unique_id} {audio_path}\n") 

In [None]:
## remove dirs

# shutil.rmtree(custom_dir)

# process data

In [17]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/test \
--output_data_dir data/${custom_dir}/test

Casting the dataset: 100%|██████| 2000/2000 [00:00<00:00, 1047397.68 examples/s]
Saving the dataset (1/1 shards): 100%|█| 2000/2000 [00:12<00:00, 160.02 examples
Data preparation done


In [18]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/valid \
--output_data_dir data/${custom_dir}/valid

Casting the dataset: 100%|██████| 2000/2000 [00:00<00:00, 1046874.83 examples/s]
Saving the dataset (1/1 shards): 100%|█| 2000/2000 [00:13<00:00, 153.49 examples
Data preparation done


In [19]:
!python3 finetune/custom_data/data_prep.py \
--source_data_dir data/${custom_dir}/train \
--output_data_dir data/${custom_dir}/train

Casting the dataset: 100%|████| 10000/10000 [00:00<00:00, 3320379.99 examples/s]
Saving the dataset (5/5 shards): 100%|█| 10000/10000 [00:47<00:00, 210.93 exampl
Data preparation done


# Try to load the processed data

In [20]:
# 73549 = 65120 MDCC + 8429 Common Voice training rows (the custom_data_v2/v3 recipe).
# NOTE(review): the divisor 64 is presumably an effective batch size, which would
# make the result the number of optimisation steps per epoch -- confirm.
73549 // (64)

1149

In [None]:
import argparse
from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets

In [None]:
# Folders written by data_prep.py, one list per role; several folders may be mixed.
train_datasets = ['data/custom_data_v1/train']
eval_datasets = ['data/custom_data_v1/valid']

def load_custom_dataset(split):
    """Load, concatenate and shuffle the prepared datasets for one role.

    Args:
        split: 'train' to read the folders in ``train_datasets`` or 'eval' to
            read the folders in ``eval_datasets``.

    Returns:
        The concatenation of every listed dataset, shuffled with seed 22.

    Raises:
        ValueError: if ``split`` is not 'train' or 'eval', or if the matching
            folder list is empty. Previously both cases fell through to
            ``concatenate_datasets([])`` and failed with an unrelated error.
    """
    # Looked up at call time so later edits to the module-level lists are honoured.
    sources = {'train': train_datasets, 'eval': eval_datasets}
    if split not in sources:
        raise ValueError(f"unknown split {split!r}; expected 'train' or 'eval'")

    paths = sources[split]
    if not paths:
        raise ValueError(f"no dataset folders configured for split {split!r}")

    ds = [load_from_disk(dset) for dset in paths]

    ds_to_return = concatenate_datasets(ds)
    ds_to_return = ds_to_return.shuffle(seed=22)
    return ds_to_return

In [None]:
# Smoke test: load the prepared evaluation folder(s) and display the dataset summary.
ds_to_return = load_custom_dataset('eval')
ds_to_return

In [None]:
# Show only the first few records. Printing the whole split floods the notebook
# output, and iterating the full dataset is unnecessary for a sanity check.
PREVIEW_ROWS = 3
for position, ex in enumerate(ds_to_return):
    if position >= PREVIEW_ROWS:
        break
    print(ex)

In [None]:
def prepare_dataset(batch):
    """Add model inputs and label ids to one dataset example.

    Reads ``batch["audio"]`` (a mapping with "array" and "sampling_rate") and
    ``batch["sentence"]``, and stores three new keys on ``batch``:
    "input_features", "input_length" (duration in seconds) and "labels".
    Relies on the module-level ``processor``, ``do_lower_case``,
    ``do_remove_punctuation`` and ``normalizer``.
    """
    clip = batch["audio"]
    samples, rate = clip["array"], clip["sampling_rate"]

    # Feature extraction returns a batch; keep the single entry for this example.
    extracted = processor.feature_extractor(samples, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Number of samples divided by the sampling rate gives the duration in seconds.
    batch["input_length"] = len(samples) / rate

    # Optional text normalisation, controlled by the two module-level flags.
    text = batch["sentence"]
    if do_lower_case:
        text = text.lower()
    if do_remove_punctuation:
        text = normalizer(text).strip()

    # Tokenise the target text into label ids.
    batch["labels"] = processor.tokenizer(text).input_ids
    return batch

# Bounds applied by the filter below; all of them are exclusive.
max_label_length = 225 # model.config.max_length
min_input_length = 0.0
max_input_length = 30.0


def is_in_length_range(length, labels):
    """Return True when both the audio duration and the label count are within bounds.

    ``length`` must lie strictly between ``min_input_length`` and
    ``max_input_length``; ``labels`` must be non-empty and shorter than
    ``max_label_length``.
    """
    if not (min_input_length < length < max_input_length):
        return False
    return 0 < len(labels) < max_label_length


print('DATASET PREPARATION IN PROGRESS...')

# NOTE(review): `args` (sampling_rate, num_proc) and the names prepare_dataset
# relies on (`processor`, `do_lower_case`, `do_remove_punctuation`, `normalizer`)
# are not defined in any cell visible in this notebook -- confirm where they come from.
raw_dataset = DatasetDict()
# raw_dataset["train"] = load_custom_dataset('train')
raw_dataset["eval"] = load_custom_dataset('eval')

# 1) decode audio at the requested sampling rate, 2) build features and labels,
# 3) drop examples whose duration or label count falls outside the limits.
raw_dataset = (
    raw_dataset
    .cast_column("audio", Audio(sampling_rate=args.sampling_rate))
    .map(prepare_dataset, num_proc=args.num_proc)
    .filter(
        is_in_length_range,
        input_columns=["input_length", "labels"],
        num_proc=args.num_proc,
    )
)

###############################     DATA COLLATOR AND METRIC DEFINITION     ########################

# These names are used by the collator but were never imported anywhere in the
# notebook, so the cell failed with NameError on a fresh kernel.
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Audio features and label ids are padded separately because they have
    different lengths and use different padding methods.
    """

    # Object exposing `feature_extractor.pad`, `tokenizer.pad` and
    # `tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch.

        Args:
            features: examples that each carry "input_features" and "labels".

        Returns:
            The padded feature batch with an added "labels" tensor in which
            padded positions are set to -100.
        """
        # Audio side: hand the features to the extractor's pad, which returns torch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Label side: pad the tokenised sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence starts with the BOS token, cut it here because it is
        # appended again later.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Build the collator around the shared processor.
# NOTE(review): `processor` is not defined in any cell visible in this notebook -- confirm.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print('DATASET PREPARATION COMPLETED')

# finetuning on huggingface dataset

In [None]:
!pip install huggingface_hub

In [None]:
# !huggingface-cli login

# SECURITY: an access token used to be pasted here in plain text. Treat that token
# as leaked: revoke it at https://huggingface.co/settings/tokens and create a new one.
# The token is now read from the HF_TOKEN environment variable, with an interactive
# prompt as fallback, so it is never stored in the notebook file or its outputs.
import os
from getpass import getpass

from huggingface_hub import login

token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token)

In [None]:
%%writefile train_hf.sh

# Launches finetune/train/fine-tune_on_hf_dataset.py through torchrun.
# Training data: the `validation` split of mozilla-foundation/common_voice_17_0
# (config `yue`); evaluation data: its `test` split. Both read the `sentence` column.
# NOTE(review): --model_name is an absolute path on one machine; adjust it elsewhere.
# NOTE(review): a learning rate of 3e-3 looks high for fine-tuning a pretrained
# model -- confirm it is intentional.
# NOTE(review): --num_proc reuses ${ngpu}; confirm the two are meant to be tied.
ngpu=1  # number of GPUs to perform distributed training on.

torchrun --nproc_per_node=${ngpu} finetune/train/fine-tune_on_hf_dataset.py \
--model_name /home/ec2-user/SageMaker/efs/Models/whisper-large-v3 \
--language Cantonese \
--sampling_rate 16000 \
--num_proc ${ngpu} \
--train_strategy steps \
--learning_rate 3e-3 \
--warmup 1000 \
--train_batchsize 1 \
--eval_batchsize 1 \
--num_steps 10000 \
--resume_from_ckpt None \
--output_dir checkpoint \
--train_datasets mozilla-foundation/common_voice_17_0  \
--train_dataset_configs yue \
--train_dataset_splits validation \
--train_dataset_text_columns sentence \
--eval_datasets mozilla-foundation/common_voice_17_0 \
--eval_dataset_configs yue \
--eval_dataset_splits test \
--eval_dataset_text_columns sentence

In [None]:
!bash train_hf.sh

# finetuning on custom data

In [None]:
%%writefile train.sh

# Launches finetune/train/fine-tune_on_custom_dataset.py through torchrun on the
# splits prepared earlier in this notebook.
# The dataset arguments used to point both training and evaluation at
# data/midea_data, so evaluation ran on the training data, and no cell in this
# notebook creates that folder. They now use the prepared train/valid splits;
# keep the folder name in sync with `custom_dir`.
# NOTE(review): --model_name is an absolute path on one machine; adjust it elsewhere.
# NOTE(review): a learning rate of 3e-3 looks high for fine-tuning a pretrained
# model -- confirm it is intentional.
ngpu=1  # number of GPUs to perform distributed training on.

torchrun --nproc_per_node=${ngpu} finetune/train/fine-tune_on_custom_dataset.py \
--model_name /home/ec2-user/SageMaker/efs/Models/whisper-large-v3 \
--language Cantonese \
--sampling_rate 16000 \
--num_proc ${ngpu} \
--train_strategy epoch \
--learning_rate 3e-3 \
--warmup 1000 \
--train_batchsize 1 \
--eval_batchsize 1 \
--num_epochs 2 \
--resume_from_ckpt None \
--output_dir checkpoint \
--train_datasets data/custom_data_v4/train \
--eval_datasets data/custom_data_v4/valid

In [None]:
!bash train.sh