## data preprocessing 
* 先前以確認且處理過Rick的資料集（英文help與alarm）
* other 從yt找了多組的不同場域的聲音和真實人閒聊的影片音訊擷取 隨機採剪而成
* 這隻程式碼將進行資料的擴增

* This code is mainly based on Rick's fold 1 clean and check his data is okay.
* I found some problems in the v1. For example :the raw data is not the real raw, and some audio has a different preprocessing.

* We have a new target, so we need to make a new corrected data, and also need to make the automatic flowork data building and analysis. (if have time the EDA is also important)

**<font color=#808080> == notes == </font>**

| 類別               | 數量 (原始)      | train 數量 (擴增後的訓練集) | val 數量 (未擴增驗證集)(只有padding) | test 數量 (未擴增驗證集)(只有padding)      |
|--------------------|----------------|--------------------|----------------------- |----------------------- |
| 0: Environment     | 4381           | 8759              | 206                    | 11                    |
| 1: en_help         | 140            | 560                | 24                     | 10                     |
| 2: ch_help         | 412            | 1648               | 30                     | 10                     |
| 3: ja_help         | 79             | 316                | 20                     | 10                     |
| 4: tw_help         | 238            | 952                | 26                     | 10                     |
| 5: dog             | 517            | 2067               | 35                     | 15                     |
| 6: cat             | 219            | 876                | 36                     | 20                     |
| 7: flush           | 207            | 828                | 26                     | 10                     |
| 8: alarm           | 201            | 804                | 20                     | 4                     |
| 9: glass_breaking  | 140            | 560                | 24                     | 10                     |


In [1]:
DATA_PATH = '/home/sail/sound_project/DATA/using_data_v4/clip_raw'

seed = 1123
sr = 16000

In [2]:
from IPython.display import Audio
from pydub import AudioSegment
import librosa
import librosa.display
from scipy.io import wavfile

from collections  import Counter
import numpy as np
import random
import os

random.seed(seed)

In [3]:
class_dict = {
                'other':0, 'Environment':0, 'alarm': 8, 'glass_breaking':9,
                'en_help': 1, 'ch_help': 2, 'ja_help': 3, 'tw_help': 4, #'hk_help': 5, 'yue_help':6,
                'dog':5, 'cat':6, 'flush':7
              } 

clip_type = {'alarm': 'random', 'other': 'random',
             'en_help': 'start', 'ch_help': 'start', 
             'ja_help': 'start', 'tw_help': 'start', 
             'hk_help': 'start', 'yue_help':'start',
             'dog':'start', 'cat':'start', 'flush':'start',
             'alarm':'start', 'glass_breaking':'start'
             }

In [4]:
save_path = '/home/sail/sound_project/DATA/using_data_v4/v4_4_traindata_1s'

TEST_path = '/home/sail/sound_project/DATA/using_data_v4/clip_raw/TEST'

In [5]:
import os

folder_path = f'{save_path}/for_training/train'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

folder_path = f'{save_path}/for_training/test'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

folder_path = f'{save_path}/for_training/val'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

folder_path = f'{save_path}/no_padding_only_clip1s'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [6]:
def load_data(wav_path, sr=sr, type='librosa'):
    if type == 'librosa':
        return librosa.load(wav_path, sr=sr)[0]
    elif type == 'wavfile':
        return wavfile.read(wav_path)[1]

def clip_1s(audio, sr=sr, type='start'):
    if type =='start':
        return audio[:sr]
    elif type == 'end':
        return audio[-sr:]
    elif type == 'random':
        start = random.randint(0, len(audio) - sr)
        return audio[start:start+sr]
    else:
        # return audio[type:type+sr]
        raise ValueError('type must be start, end or random.')
    
def long_random_clip(audio, sr, count):
    audio_list = []
    random_time_list = [random.randint(0, len(audio)) for _ in range(count)]
    random_time_list = list(dict.fromkeys(random_time_list))
    for start in random_time_list:
        audio_list.append(audio[start:start+sr])
    return audio_list

def padding_zero(audio, sr=sr, secent=1, type='a'):
    if len(audio) < sr*secent:
        if type=='ab':
            total_padding = sr*secent - len(audio)
            return np.pad(audio, (total_padding // 2, total_padding - (total_padding // 2)), 'constant', constant_values=(0, 0))
        elif type=='a':
            total_padding = sr*secent - len(audio)
            return np.pad(audio, (0, total_padding), 'constant', constant_values=(0, 0))
    else:
        return clip_1s(audio)
    
def add_noise(audio, noise_factor=0.0005):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    return augmented_audio

def add_loader(audio, gain = 1.5):
    audio = audio * gain
    audio = np.clip(audio, -1.0, 1.0)
    return audio
    

In [7]:
def preprocess_audio(audio):
    if (audio.shape[0] >= sr) & (audio.shape[0] <= sr*2):
        audio_1 = clip_1s(audio, sr=sr, type='start')
        audio_2 = clip_1s(audio, sr=sr, type='end')
        audio_list = [audio_1, audio_2]
    elif (audio.shape[0] > sr*2):
        audio_list = long_random_clip(audio, sr,  int(audio.shape[0]/1.5//sr)+1)
    else:
        audio_list = [audio]
    return audio_list

def preprocess_audio_not_clip(audio):
    if (audio.shape[0] >= sr) & (audio.shape[0] <= sr*2):
        audio_1 = clip_1s(audio, sr=sr, type='start')
        # audio_2 = clip_1s(audio, sr=sr, type='end')
        audio_list = [audio_1]
        # audio_list = [audio_1, audio_2]
    # elif (audio.shape[0] > sr*2):
    #     audio_list = long_random_clip(audio, sr,  int(audio.shape[0]/1.5//sr)+1)
    else:
        audio_list = [audio]
    return audio_list


In [8]:
X, y = [],[]

def process_subfolder(subfolder_path, label):
    for wav_file in os.listdir(subfolder_path):
        if wav_file == 'xx' or wav_file[:2] == 'X_':
            continue
        wav_file_path = os.path.join(subfolder_path, wav_file)
        if wav_file.endswith(('.wav', '.mp3')):
            audio = load_data(wav_file_path, sr, type='librosa')
            X.extend(preprocess_audio(audio))
            y.extend([label]*len(preprocess_audio(audio)))              
        else:
            process_subfolder(wav_file_path, label)
            
for file in ['other_disastermovie_1s','other', 'alarm','dog', 'cat', 'flush', 'glass_breaking']:  # ############################################################
    print(file)
    folder_path = os.path.join(DATA_PATH, file)
    for wav_file_0 in os.listdir(folder_path):
        if wav_file_0 == 'xx'or wav_file_0[:2] == 'X_':
            continue
        wav_file_0_path = os.path.join(folder_path, wav_file_0)
        # print(wav_file_0_path)
        if wav_file_0_path.endswith(('.wav', '.mp3')):
            if file == 'other':
                audio = load_data(wav_file_0_path, sr, type='librosa')
                count = audio.shape[0] // sr // 10
                evn_audio = long_random_clip(audio, sr, count)
                X.extend(evn_audio)
                y.extend([class_dict[file]]*len(evn_audio))
            elif file == 'other_disastermovie_1s':
                audio = load_data(wav_file_0_path, sr, type='librosa')
                X.extend([audio])
                y.extend([class_dict['other']])
                # print(audio.shape)

            elif (wav_file_0.endswith(('.wav', '.mp3'))) and (file == 'help_data' or file == 'dog' or file == 'cat' or file == 'flush'or file == 'glass_breaking'):# ############################################################
                audio = load_data(wav_file_0_path, sr, type='librosa')
                X.extend(preprocess_audio_not_clip(audio))
                y.extend([class_dict[file]]*len(preprocess_audio_not_clip(audio)))
            elif file == 'alarm':
                audio = load_data(wav_file_0_path, sr, type='librosa')
                X.extend(preprocess_audio(audio))
                y.extend([class_dict[file]]*len(preprocess_audio(audio)))
        else:
            process_subfolder(wav_file_0_path, class_dict[wav_file_0])
            print("！！檢查子路徑！！")


HELP_DATA_PATH = '/home/sail/sound_project/DATA/using_data_v4/clip_raw/help_data/'
for file in ['en_help', 'ch_help', 'ja_help', 'tw_help']: #, 'hk_help': 5, 'yue_help':6,]:  
    folder_path = os.path.join(HELP_DATA_PATH, file)

    for wav_file_0 in os.listdir(folder_path):
        wav_file_0_path = os.path.join(folder_path,wav_file_0)
        for wav_file_1 in os.listdir(wav_file_0_path):
            if wav_file_0 == 'xx'or wav_file_0[:2] == 'X_' or wav_file_1 == 'xx'or wav_file_1[:2] == 'X_':
                continue
            else:
                wav_file_1_path = os.path.join(folder_path, wav_file_0, wav_file_1)
                audio = load_data(wav_file_1_path, sr, type='librosa')
                X.extend(preprocess_audio_not_clip(audio))
                y.extend([class_dict[file]]*len(preprocess_audio_not_clip(audio)))

print(len(X), len(y))            


other_disastermovie_1s
other
alarm
dog
cat
flush
glass_breaking
6141 6141


In [9]:
Counter(y) # 7585 7585


Counter({0: 3988,
         5: 517,
         2: 412,
         4: 238,
         6: 219,
         7: 207,
         8: 201,
         9: 140,
         1: 140,
         3: 79})

In [10]:
# # save the data for processing in the feature

for c,i in enumerate(y):
    wavfile.write(f'{save_path}/no_padding_only_clip1s/{[key for key, value in class_dict.items() if value == i][0]}_{Counter(y[:c])[i]}.wav', sr, X[c])


In [11]:
val_list = ['_1.wav', '_15.wav', '_161.wav', '_136.wav', '_280.wav', '_80.wav', '_60.wav', '_255.wav', '_44.wav', '_73.wav']
TEST_other_list = ['_1111.wav', '_1500.wav', '_1800.wav', '_1943.wav', '_2765.wav', '_5475.wav', '_3464.wav', '_4545.wav', '_2245.wav', '_3344.wav', '_4599.wav']

In [12]:
for i, audio_path in enumerate(os.listdir(os.path.join(save_path, 'no_padding_only_clip1s'))):
    audio_a = padding_zero(load_data(os.path.join(save_path, 'no_padding_only_clip1s', audio_path), sr, type='librosa'), secent=1, type='a')
    audio_ab = padding_zero(load_data(os.path.join(save_path, 'no_padding_only_clip1s', audio_path), sr, type='librosa'), secent=1, type='ab')

    # 先處理其他
    if ('other_' in audio_path):

        for test_name in TEST_other_list:
            if test_name in audio_path:
                wavfile.write(f'{TEST_path}/other/{audio_path[:-4]}_padA.wav', sr, audio_a)

        TF = False
        for _ in range(100):
            num, A_AB = random.randint(0, Counter(y)[0]), random.randint(1, 2)

            val_name_ = f'_{num}.wav'
            if val_name_ in audio_path:
                TF = True                

            if TF:
                if A_AB == 1: 
                    wavfile.write(f'{save_path}/for_training/val/{audio_path[:-4]}_padA.wav', sr, audio_a)
                elif A_AB == 2:
                    wavfile.write(f'{save_path}/for_training/val/{audio_path[:-4]}_padAB.wav', sr, audio_ab)
            else:
                if A_AB == 1: 
                    wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_padA.wav', sr, audio_a)
                elif A_AB == 2:
                    wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_padAB.wav', sr, audio_ab)


    else:
        # 先分出驗證集
        for val_name in val_list:
            A_AB = random.randint(1, 2)
            if val_name in audio_path:
                wavfile.write(f'{save_path}/for_training/val/{audio_path[:-4]}_padA.wav', sr, audio_a)
                wavfile.write(f'{save_path}/for_training/val/{audio_path[:-4]}_padAB.wav', sr, audio_ab)
            else:
                if A_AB == 1: 
                    wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_padA.wav', sr, audio_a)
                elif A_AB == 2:
                    wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_padAB.wav', sr, audio_ab)     

    if ('other_' not in audio_path):
        A_AB = random.randint(1, 2)
        audio_no = add_noise(load_data(os.path.join(save_path, 'no_padding_only_clip1s', audio_path), sr, type='librosa'))
        audio_no_a = padding_zero(audio_no, secent=1, type='a')
        audio_no_ab = padding_zero(audio_no, secent=1, type='ab')

        audio_lo = add_loader(load_data(os.path.join(save_path, 'no_padding_only_clip1s', audio_path), sr, type='librosa'))
        audio_lo_a = padding_zero(audio_lo, secent=1, type='a')
        audio_lo_ab = padding_zero(audio_lo, secent=1, type='ab')

        if A_AB == 1: 
            wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_no_padA.wav', sr, audio_no_a)
            wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_lo_padA.wav', sr, audio_lo_a)
        elif A_AB == 2:
            wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_no_padAB.wav', sr, audio_no_ab)
            wavfile.write(f'{save_path}/for_training/train/{audio_path[:-4]}_lo_padAB.wav', sr, audio_lo_ab)


In [13]:
for wav_file in os.listdir(TEST_path):
    for i, aud_name in enumerate(os.listdir(os.path.join(TEST_path, wav_file))):
        audio = padding_zero(clip_1s(load_data(os.path.join(TEST_path, wav_file,aud_name))), secent=1, type='ab')
        if wav_file[:2] != 'X_':
            wavfile.write(f'{save_path}/for_training/val/{wav_file}_TEST_{i}.wav', sr, audio)
            wavfile.write(f'{save_path}/for_training/test/{wav_file}_TEST_{i}.wav', sr, audio)

In [14]:
# save npz

sounds_train, sounds_test, sounds_val = [], [], []
labels_train, labels_test, labels_val = [], [], []

for file in os.listdir(os.path.join(save_path, 'for_training')):
    print(file)
    for wav_path in os.listdir(os.path.join(save_path, 'for_training',file)):
        path = os.path.join(save_path, 'for_training',file, wav_path)
        wav = load_data(path, sr, type='librosa')
        if file == 'train':
            sounds_train.append(wav)
            try:
                labels_train.append(class_dict[wav_path.split('_')[0]])
            except Exception as e:
                labels_train.append(class_dict[wav_path.split('_')[0]+'_'+wav_path.split('_')[1]])            

        elif file == 'val':
            sounds_val.append(wav)
            try:
                labels_val.append(class_dict[wav_path.split('_')[0]])
            except Exception as e:
                labels_val.append(class_dict[wav_path.split('_')[0]+'_'+wav_path.split('_')[1]])

        elif file == 'test':
            sounds_test.append(wav)
            try:
                labels_test.append(class_dict[wav_path.split('_')[0]])
            except Exception as e:
                labels_test.append(class_dict[wav_path.split('_')[0]+'_'+wav_path.split('_')[1]])            


print(len(sounds_train), len(labels_train), len(sounds_val), len(labels_val), len(sounds_test), len(labels_test))


val
train
test
16583 16583 438 438 110 110


In [15]:
val
train
test
17370 17370 447 447 110 110

SyntaxError: invalid syntax (1566311223.py, line 4)

In [16]:
# np.savez(r'/home/sail/sound_project/DATA/using_data_v4/data_v4_4.npz', sounds_train=sounds_train, labels_train=labels_train, sounds_val=sounds_val, 
#          labels_val=labels_val, sounds_test=sounds_test, labels_test=labels_test)

In [17]:
data = np.load('/home/sail/sound_project/DATA/using_data_v4/data_v4_4.npz', allow_pickle=True) 

In [18]:
labels_val = data['labels_val']
labels_train = data['labels_train']
labels_test = data['labels_test']

In [19]:
Counter(labels_val)

Counter({0: 197,
         6: 36,
         5: 35,
         2: 30,
         4: 26,
         7: 26,
         1: 24,
         9: 24,
         3: 20,
         8: 20})

In [20]:
Counter(labels_train)


Counter({0: 7973,
         5: 2068,
         2: 1648,
         4: 952,
         6: 876,
         7: 828,
         8: 804,
         1: 559,
         9: 559,
         3: 316})

In [21]:
Counter(labels_test)


Counter({6: 20, 5: 15, 0: 11, 2: 10, 4: 10, 7: 10, 1: 10, 9: 10, 3: 10, 8: 4})

In [22]:
max(np.concatenate(sounds_train)), min(np.concatenate(sounds_train))

(1.7263024, -1.5727631)

# other test

In [None]:
# Audio(np.concatenate(X),rate=sr)