本檔案將對資料進行前處理，可分為5個部份：
1. audio: 將音訊載入，並將其填充0（padding）到相同長度（3秒）
2. clinical: 病史資料的篩選與處理
3. MelSpectrogram: 將處理完的音訊進行梅爾頻譜（Mel-spectrogram）的轉換
4. MFCC: 將處理完的音訊進行梅爾頻率倒譜係數（Mel-Frequency Cepstral Coefficients）的轉換
5. label: 目標類別

下方為 Training Dataset 的資料處理；Public Dataset 與 Private Dataset 使用相同的處理流程，因此不再贅述。

## audio

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm, notebook
import librosa
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [2]:
# Sampling rate (Hz) applied to every recording when it is loaded.
sr = 16000
sample_rate = sr

# Dataset root, the folder holding the recordings, and the label sheet.
file_path = "/home/user8008/sdk/sail/data/Training_Dataset/"
wav_path = f"{file_path}training_voice_data/"
all_label = pd.read_csv(f"{file_path}training datalist.csv")

# Columns used later to align recordings with their target class.
ID = all_label["ID"]
label = all_label["Disease category"]
all_label

Unnamed: 0,ID,Sex,Age,Disease category,Narrow pitch range,Decreased volume,Fatigue,Dryness,Lumping,heartburn,...,Onset of dysphonia,Noise at work,Occupational vocal demand,Diabetes,Hypertension,CAD,Head and Neck Cancer,Head injury,CVA,Voice handicap index - 10
0,1202f15,2,39,1,1,1,1,1,1,0,...,2,3,1,0,0,0,0,0,0,22.0
1,0600ve0,1,69,2,1,1,1,1,0,0,...,2,1,3,0,0,0,0,0,1,19.0
2,1001o7l,2,59,2,1,1,1,1,0,0,...,2,3,4,0,0,0,0,0,0,18.0
3,1201c1t,2,47,1,1,0,1,1,1,0,...,3,1,1,0,0,0,0,0,0,27.0
4,0402jvt,1,87,1,0,0,0,0,0,0,...,1,1,4,0,1,0,0,0,0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0G00ftn,1,75,3,0,0,0,0,0,0,...,1,1,4,1,1,1,0,0,0,21.0
996,1201pkr,1,40,4,0,0,1,0,0,0,...,2,1,2,0,0,0,0,0,0,21.0
997,0202p64,2,68,3,0,1,1,0,0,0,...,5,2,1,0,0,0,0,0,0,27.0
998,12021au,2,42,2,0,0,1,1,1,0,...,4,1,2,0,0,0,0,0,0,12.0


In [3]:
# Collect every .wav file in the recordings folder, together with the ID
# taken from the first seven characters of its file name.
wav_names = [name for name in os.listdir(wav_path)
             if os.path.splitext(name)[1] == ".wav"]
data_path = [wav_path + name for name in wav_names]
wav_id = [name[0:7] for name in wav_names]

# Align ID, label and audio path so that position k of each list refers to
# the same patient.  The label sheet drives the order: every row of the CSV
# is looked up among the discovered wav files, and rows without a recording
# are skipped.
#
# The previous nested loop indexed `ID`/`label` with a counter bounded by the
# number of wav files and `wav_id`/`data_path` with a counter bounded by the
# number of labels, so it only worked when both counts were identical.  A
# dictionary lookup removes that coupling and the quadratic scan.
paths_by_id = {}
for rec_id, rec_path in zip(wav_id, data_path):
    # Keep a list per ID so duplicate file prefixes still yield one entry
    # per file, as before.
    paths_by_id.setdefault(rec_id, []).append(rec_path)

new_label = []
new_id = []
new_data_path = []
for rec_id, rec_label in tqdm(zip(ID, label), total=len(ID)):
    for rec_path in paths_by_id.get(rec_id, ()):
        new_label.append(rec_label)
        new_id.append(rec_id)
        new_data_path.append(rec_path)

# Load each aligned recording, normalise its amplitude and force it to
# exactly three seconds so that all clips stack into one 2-D array.
target_len = 3 * sample_rate  # 48000 samples at 16 kHz

train_audio = []
for wav_file in tqdm(new_data_path):
    data, _ = librosa.load(wav_file, sr=sample_rate)
    data = librosa.util.normalize(data)   # amplitude normalisation
    n_missing = target_len - data.shape[0]
    if n_missing > 0:
        # Zero-pad short clips at the end.
        data = np.concatenate((data, np.zeros(n_missing)))
    elif n_missing < 0:
        # Clips longer than three seconds used to crash on a negative
        # padding size; keep only the first three seconds instead.
        data = data[:target_len]
    train_audio.append(data)
train_audio = np.array(train_audio)

100%|██████████| 1000/1000 [00:02<00:00, 454.39it/s]
100%|██████████| 1000/1000 [00:34<00:00, 29.26it/s]


## clinical

In [4]:
# Re-read the label sheet so the clinical processing starts from raw values.
all_label = pd.read_csv(f"{file_path}training datalist.csv")

def medical_data_proccessing(df):
    """Select and rescale the clinical-history features.

    Rows whose 'Disease category' is not one of 1-5 are dropped, and only
    the ten selected clinical columns are kept.  The input frame is left
    untouched.

    Args:
        df: DataFrame containing 'Disease category' and the ten columns
            listed in ``medical_col``.

    Returns:
        A new DataFrame (original row index preserved) in which
        'Sex' is shifted down by 1, missing 'PPD' and
        'Voice handicap index - 10' values are replaced by 0, and
        'Age' is divided by 50.

    Raises:
        KeyError: if any required column is missing from ``df``.
    """
    # The ten clinical features kept after feature selection.  The trailing
    # space in 'Onset of dysphonia ' is part of the column name being
    # selected, so it must not be stripped here.
    medical_col = ['Sex', 'Age', 'Narrow pitch range', 'Choking', 'PPD', 'Drinking',
                   'Onset of dysphonia ', 'Noise at work', 'Occupational vocal demand',
                   'Voice handicap index - 10']
    valid_rows = df['Disease category'].isin([1, 2, 3, 4, 5])
    # Explicit copy: the assignments below must operate on an independent
    # frame, not on a slice of the caller's data (avoids chained-assignment
    # warnings and any ambiguity about what gets modified).
    selected = df.loc[valid_rows, medical_col].copy()
    selected['Sex'] = selected['Sex'] - 1
    selected['PPD'] = selected['PPD'].fillna(0)
    selected['Voice handicap index - 10'] = selected['Voice handicap index - 10'].fillna(0)
    selected['Age'] = selected['Age'] / 50
    return selected
# Build the clinical feature matrix as a plain NumPy array.
# NOTE(review): rows follow the CSV order, while the audio follows the
# ID-matching order — presumably identical when every row has exactly one
# recording; confirm before pairing the two.
clinical_df = medical_data_proccessing(all_label)
train_clinical = np.array(clinical_df)

## MFCC

In [6]:
from audioflux.display import fill_spec

In [7]:
import audioflux as af

# Compute one MFCC matrix (43 cepstral coefficients requested) per clip and
# stack the results along a new leading axis.
MFCC = []
for clip in train_audio:
    # af.mfcc returns a pair; only the first element is kept here.
    cc_arr, _ = af.mfcc(clip, samplate=sample_rate, cc_num=43)
    MFCC.append(cc_arr)
MFCC = np.array(MFCC)

## MelSpectrogram

In [8]:
import audioflux as af

# The extractor's configuration is the same for every clip, so build it once
# instead of re-creating it on every iteration.
spec_obj = af.MelSpectrogram(num=184, samplate=sample_rate, radix2_exp=10)

# Compute the mel spectrogram of each clip, convert it to decibels and stack
# the results along a new leading axis.
MelS = []
for clip in train_audio:
    spec_arr = spec_obj.spectrogram(clip)
    spec_dB_arr = af.utils.power_to_db(spec_arr)
    MelS.append(spec_dB_arr)
MelS = np.array(MelS)

## label

In [5]:
# Integer-encode the aligned disease categories.
labelencoder = LabelEncoder()
labelencoder.fit(new_label)
train_y = labelencoder.transform(new_label)

# One-hot version of the same labels; used to keep the class distribution
# balanced when the data is split.
label_column = np.array(new_label).reshape(-1, 1)
onehotencoder = OneHotEncoder()
train_y_bln = onehotencoder.fit(label_column).transform(label_column).toarray()