In [1]:
import numpy as np
import pandas as pd

import librosa
import pywt
from skimage.restoration import denoise_wavelet

from loess.loess_1d import loess_1d
import statsmodels.api as sm
import scipy

import matplotlib.pyplot as plt
from IPython.display import Audio
from tqdm import tqdm

import argparse
from os import listdir
from os.path import join

In [2]:

def read_audio_filenames(directory: str) -> list:
    '''

    '''
    return [ filename for filename in listdir(directory) if filename.endswith('.wav') ]


def read_audio_content(filepath: str, sample_rate=4000):
    '''
    '''
    data, sr = librosa.load(filepath, sr=sample_rate)
    return data, sr


def segment_audio_content(audio, sample_rate=4000, segment_length=5):
    ''''''
    segments = []
    audio_length = len(audio) / sample_rate

    # print('Audio length', audio_length)

    segment_index = 0
    while segment_index < audio_length:
        t_start = segment_index * sample_rate
        t_end = t_start + segment_length * sample_rate

        segment = audio[ t_start : t_end ]
        segments.append(np.pad(segment, (0, sample_rate * segment_length - len(segment)), 'constant'))
        segment_index += segment_length

    # print(segment_index / segment_length, 'segments read')
    # print([ len(s) for s in segments ])
    return segments


def preprocess(audio):
    ''''''
    processed = wavelet_denoise(audio)
    processed = apply_loess(processed)
    processed = zscore_normalize(processed)
    return processed


def wavelet_denoise(audio):
    '''Wavelet denoise
    '''
    return denoise_wavelet(audio, wavelet='db5', method='BayesShrink', mode='soft', wavelet_levels=4)
    # return pywt.wavedec(audio, 'db5', mode='zero', level=4)[0]


def apply_loess(audio, frac=0.1):
    '''
    '''
    lowess = sm.nonparametric.lowess
    l = lowess(audio, np.arange(0, len(audio), 1), frac=frac)[:, 1]
    return audio - l
    

def zscore_normalize(audio):
    '''
    '''
    return scipy.stats.zscore(audio)



In [2]:
LABELS_OF_INTEREST = ['Healthy', 'Asthma', 'Pneumonia', 'Bron', 'COPD', 'Heart failure']

SEGMENTS_LENGTH = 5

JOIN_DATASETS = True

ICBHI_SAVE_PATH = 'arr_icbhi.npy'
KING_ABDULLAH_SAVE_PATH = 'arr_king_abdullah.npy'

In [5]:
KING_ABDULLAH_DATASET_BASEDIR = join('Audio_Files')

king_abdullah_audio_filenames = read_audio_filenames(KING_ABDULLAH_DATASET_BASEDIR)


labels_map = {
    'N': 'Healthy',
    'Copd': 'COPD',
}

audios = []
for filename in tqdm(king_abdullah_audio_filenames):

    (id, diagnosis) = filename.split(',')[0].split('_')
    diagnosis = diagnosis.capitalize()
    audio, sample_rate = read_audio_content(join(KING_ABDULLAH_DATASET_BASEDIR, filename))
    segments = segment_audio_content(audio, sample_rate, SEGMENTS_LENGTH)

    for (seg_no, segment) in enumerate(segments):

        label = labels_map.get(diagnosis, diagnosis)

        processed_segment = preprocess(segment)

        row = {'ID': id, 'segment_no': seg_no, 'audio_data': processed_segment, 'Diagnosis': label}
        audios.append(row)

df_audio_diagnosis_ka = pd.DataFrame(audios)
df_audio_diagnosis_ka = df_audio_diagnosis_ka[df_audio_diagnosis_ka.Diagnosis.isin(LABELS_OF_INTEREST)]
df_audio_diagnosis_ka.groupby('Diagnosis')['Diagnosis'].count()

100%|██████████| 336/336 [1:19:59<00:00, 14.28s/it]


Diagnosis
Asthma           366
Bron              27
COPD             111
Healthy          418
Heart failure    183
Pneumonia         63
Name: Diagnosis, dtype: int64

In [7]:

np.save(KING_ABDULLAH_SAVE_PATH, df_audio_diagnosis_ka.to_numpy())

In [4]:
ICBHI_DATASET_BASEDIR = 'Respiratory_Sound_Database'
ICBHI_AUDIO_TXT_DIR = join(ICBHI_DATASET_BASEDIR, 'audio_and_txt_files')


def create_icbhi2017_audio_dataframe(basedir: str, patient_diagnosis: dict) -> pd.DataFrame:
    '''
    '''
    rows = []

    filenames = read_audio_filenames(basedir)

    for filename in tqdm(filenames):

        parts = filename.split('.')[0].split('_')
        id = int(parts[0])

        if id not in patient_diagnosis:
            continue


        diagnosis = patient_diagnosis[id]
        audio_content, sample_rate = read_audio_content(join(basedir, filename))
        segments = segment_audio_content(audio_content)

        for (segment_no, segment) in enumerate(segments):
            
            # Because wavelet denoise throws an ValueError when the array is only zeroes.
            # Also, a segment with only zeroes might be the end of the audio filled with zeroes.
            if np.all(segment == 0):
                continue

            processed_segment = preprocess(segment)

            row = { 'ID': str(id), 'segment_no': segment_no, 'audio_data': processed_segment, 
                'Diagnosis': diagnosis }
            rows.append(row)

    df = pd.DataFrame(rows)
    return df


df_diagnosis_icbhi = pd.read_table('ICBHI_Challenge_diagnosis.txt', header=None, names=['ID', 'Diagnosis'])

filter_bron_icbhi = df_diagnosis_icbhi['Diagnosis'].isin(['Bronchiectasis', 'Bronchiolitis'])
df_diagnosis_icbhi.loc[filter_bron_icbhi, 'Diagnosis'] = 'Bron'

filter_diagnosis_icbhi = df_diagnosis_icbhi['Diagnosis'].isin(LABELS_OF_INTEREST)
df_diagnosis_icbhi = df_diagnosis_icbhi[filter_diagnosis_icbhi]

patient_diagnosis_icbhi_dict = df_diagnosis_icbhi.set_index('ID').to_dict()['Diagnosis']

df_audio_diagnosis_icbhi = create_icbhi2017_audio_dataframe(ICBHI_AUDIO_TXT_DIR, patient_diagnosis_icbhi_dict)

100%|██████████| 920/920 [3:57:22<00:00, 15.48s/it]  


In [6]:
np.save(ICBHI_SAVE_PATH, df_audio_diagnosis_icbhi.to_numpy())

In [3]:
if JOIN_DATASETS == True:

    cols = ['ID', 'segment_no', 'audio_data', 'Diagnosis']

    try:
        df_audio_diagnosis_ka
    except NameError:
        df_audio_diagnosis_ka = pd.DataFrame(
            np.load(KING_ABDULLAH_SAVE_PATH, allow_pickle=True), columns=cols
        )

    try:
        df_audio_diagnosis_icbhi
    except NameError:
        df_audio_diagnosis_icbhi = pd.DataFrame(
            np.load(ICBHI_SAVE_PATH, allow_pickle=True), columns=cols
        )

    df_audio_diagnosis_all = pd.concat([
        df_audio_diagnosis_ka,
        df_audio_diagnosis_icbhi,
    ])

    DF_ALL_SAVE_PATH = 'arr_audio_all.npy'
    np.save(DF_ALL_SAVE_PATH, df_audio_diagnosis_all.to_numpy())