In [7]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.insert(1, module_path + '/src')

import utility

import librosa
import sktime
from sktime.utils.data_io import load_from_tsfile_to_dataframe
from sklearn.model_selection import train_test_split

from math import ceil
import soundfile

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import random


from scipy.signal import butter, lfilter
from scipy.fft import fft

from time import time

figure_path = module_path + '/figures/'

def znorm(timeseries):
    mean = np.mean(timeseries)
    sd = np.std(timeseries)
    return (timeseries - mean)/sd

target_rate = 44100

files_path = '/home/andrine/Desktop/dataTromsoFiltered/allFilteres/'

names = []
names_full = os.listdir(files_path)
for n in names_full:
    if '_lungelyd_' in n:
        names.append(n)
        
X = pd.read_stata('/home/andrine/Desktop/dataTromso/hasse_413_ut.dta')
X_no_nan = X.dropna(thresh=1, axis=1)

cols = []

for col in X_no_nan:
    if np.any(X_no_nan[col] != ''):
        cols.append(col)
    else:
        continue 
        
X_no_nan = X_no_nan[cols]

del X
del names_full
del cols

In [8]:
def zero_pad(ts, fixed_len):
    df_new = np.zeros(fixed_len)
    len_ts = len(ts)
    pad = (fixed_len - len_ts)//2
    df_new[pad:pad + len_ts] = ts
    return df_new


lowcut = 150
highcut = 2400
FRAME_RATE = 5000

def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

def bandpass_filter(buffer):
    return butter_bandpass_filter(buffer, lowcut, highcut, FRAME_RATE, order=12)

def get_sub_audios_no_overlap(audio, frac):
    base = len(audio)//frac
    sub_audio = []
    for n in range(frac):
        #print(n*base)
        if (n+1)*base < len(audio):
            sub = audio[n*base : (n+1)*base]
            sub_audio.append(sub)
            #print(len(sub))
        else:
            sub = audio[n*base :]
            #print(len(sub))
            diff = len(sub) - len(sub_audio[-1])
            if diff != 0:
                print(diff)
                sub = audio[n*base - diff :]
            sub_audio.append(sub)
            #print(len(sub))
            #print(diff)
    

    return sub_audio

map_wav = {
    '1': 'a',
    '2': 'b',
    '3': 'c',
    '4': 'd',
    '5': 'e',
    '6': 'f'
}

abnormalities = ['insp_wheeze', 'exp_wheeze', 'insp_crackle', 'exp_crackle']

def get_comments(X , idx, label,  wav):
    if label not in abnormalities:
        return None
    i = map_wav[wav]
    cols_1 = [f'lung{i}_comment_c_t72', f'lung{i}_comment_b_t72', f'lung{i}_comment_a_t72'] 
    c_1 = X.iloc[idx][cols_1].values[0]
    
    cols_2 = {'insp_wheeze' : [f'sub_{i}_ob2_i_wh_unc_comment_t72', f'sub_{i}_ob1_i_wh_unc_comment_t72'],
                'exp_wheeze': [f'sub_{i}_ob2_e_wh_unc_comment_t72', f'sub_{i}_ob1_e_wh_unc_comment_t72'],
                'insp_crackle': [f'sub_{i}_ob2_i_cr_unc_comment_t72', f'sub_{i}_ob1_i_cr_unc_comment_t72'] , 
                'exp_crackle' : [f'sub_{i}_ob2_e_cr_unc_comment_t72', f'sub_{i}_ob1_e_cr_unc_comment_t72']}
    
    c_2 = X.iloc[idx][cols_2[label]].values[0]
    
    total_len = len(c_1[0]) + len(c_1[1]) + len(c_1[2]) + len(c_2[0]) + len(c_2[1])
    if total_len == 0:
        return None
    c = np.append(c_1,c_2)
    return ' '.join(list(filter(len, c)))

def get_precence_columns(label, wav):
    map_wav = {
        '1': 'a',
        '2': 'b',
        '3': 'c',
        '4': 'd',
        '5': 'e',
        '6': 'f'
    }
    
    i = map_wav[wav]
    
    
    multiple_present = {'insp_wheeze' : [f'sub_{i}_ob2_i_wh_number_insp_t72', f'sub_{i}_ob1_i_wh_number_insp_t72'],
                    'exp_wheeze': [f'sub_{i}_ob2_e_wh_number_exp_t72', f'sub_{i}_ob1_e_wh_number_exp_t72'],
                    'insp_crackle': [f'sub_{i}_ob2_i_cr_number_insp_t72', f'sub_{i}_ob1_i_cr_number_insp_t72'] , 
                    'exp_crackle' : [f'sub_{i}_ob2_e_cr_number_exp_t72', f'sub_{i}_ob1_e_cr_number_exp_t72']}
    if label not in multiple_present.keys():
        return None
    return multiple_present[label]



def get_column_observer_wav_file(observer, wav):    
    i = map_wav[wav]
    classify_dict = {
        'a' : [f'lung{i}_abnormal_sound_a_t72',
        f'lung{i}_insp_wheeze_a_t72',
        f'lung{i}_exp_wheeze_a_t72',
        f'lung{i}_insp_crackle_a_t72',
        f'lung{i}_exp_crackle_a_t72',
        f'lung{i}_other_abnorm_sound_a_t72',
        f'lung{i}_not_classifiable_a_t72'],

        'b' :  [f'lung{i}_abnormal_sound_b_t72',
        f'lung{i}_insp_wheeze_b_t72',
        f'lung{i}_exp_wheeze_b_t72',
        f'lung{i}_insp_crackle_b_t72',
        f'lung{i}_exp_crackle_b_t72',
        f'lung{i}_other_abnorm_sound_b_t72',
        f'lung{i}_not_classifiable_b_t72'],


        'c' : [f'lung{i}_abnormal_sound_c_t72',
        f'lung{i}_insp_wheeze_c_t72',
        f'lung{i}_exp_wheeze_c_t72',
        f'lung{i}_insp_crackle_c_t72',
        f'lung{i}_exp_crackle_c_t72',
        f'lung{i}_other_abnorm_sound_c_t72',
        f'lung{i}_not_classifiable_c_t72']
    }
    
    return classify_dict[observer]

def get_second_round_eval(X, idx, label, wav):
    if label not in abnormalities: # Return True if there is no abnormality present
        return True
    i = map_wav[wav]
    
    second_round_res = {
        0 : 'not evaluated',
        1 : 'present',
        2 : 'absent', 
        3 : 'uncertain'
    }

    second_round_pres =  {'insp_wheeze' : [f'sub_{i}_ob2_i_wh_presence_t72', f'sub_{i}_ob1_i_wh_presence_t72'],
                    'exp_wheeze': [f'sub_{i}_ob2_e_wh_presence_t72', f'sub_{i}_ob1_e_wh_presence_t72'],
                    'insp_crackle': [f'sub_{i}_ob2_i_cr_presence_t72', f'sub_{i}_ob1_i_cr_presence_t72'] , 
                    'exp_crackle' : [f'sub_{i}_ob2_e_cr_presence_t72', f'sub_{i}_ob1_e_cr_presence_t72']}

    round2 = X.iloc[idx][second_round_pres[label]].values[0].astype(int)
    
    if (round2[0] == 2) or (round2[1] == 2): # Return False if second round of observers concluded absent 
        return False
    else:
        return True



def get_filename_label_map():
    annotation_list = ['abnormal_sound', 'insp_wheeze',
                       'exp_wheeze', 'insp_crackle', 'exp_crackle',
                      'other', 'not_classifiable']

    count = 0
    error_in_data = {}
    data = pd.DataFrame({'id':[] , 'nr': [],'label':[]})
    for n in names:
        i = n[0:8]
        try:
            wav = n.split('.')[0][-1]
        except IndexError as error:
            print(n)
            continue
        row_nr = np.where(X_no_nan['unikt_lopenr'] == i)[0] 
        a_obs = X_no_nan.iloc[row_nr][get_column_observer_wav_file('a', wav)].values[0]
        b_obs = X_no_nan.iloc[row_nr][get_column_observer_wav_file('b', wav)].values[0]

        if np.any(a_obs != b_obs):  # Check that observer a and observer b agree
            c_obs = X_no_nan.iloc[row_nr][get_column_observer_wav_file('c', wav)].values[0]
        else:
            c_obs = a_obs


        if (c_obs[0] == 0) or (np.isnan(c_obs[0])):
            label = 'normal'
        else:
            try:
                label_idx = np.where(c_obs == 1.)[0]
                label = annotation_list[label_idx[1]]
                if len(label_idx) > 2: # Do not include samples with more than type of abnormality
                    error_in_data[n] = 'more than one abnormality'
                    count = count + 1
                    continue
            except IndexError as error:
                error_in_data[n] = 'IndexError'

        if not get_second_round_eval(X_no_nan, row_nr, label, wav): # If the second round of observers have concluded absent
            count = count + 1
            error_in_data[n] = 'second round concluded absent'
            continue
            
        cols = get_precence_columns(label, wav)
        
        if cols != None: # Filtering to be sure that there appears more than one abnormality during the 15 s
            votes = X_no_nan.iloc[row_nr][cols].values[0]
            votes = votes[0] + votes[1]
            if votes < 3: 
                count = count + 1
                error_in_data[n] = 'only one abnormality appears during the whole 15 seconds'
                continue
        comments = get_comments(X_no_nan , row_nr, label,  wav)    
        if comments is not None:
            error_in_data[n] = comments
            continue
            
        audio_file = files_path + n        
        data = data.append(pd.DataFrame({'id': [audio_file],'nr': [i] ,'label': [label]}))
        
        
        #if count == 100:
        #    break
    return error_in_data , data, count

In [9]:
start = time()

error, data, count = get_filename_label_map()

total = time() - start
print(f'Time utilized: {total} ')

Time utilized: 5.694962501525879 


In [10]:
from scipy.io import wavfile
from sklearn.utils import shuffle

def get_k_fold_indices(indices, k):
    indices = shuffle(indices, random_state = 42)
    indices_per_fold = len(indices)//k
    return_indices = []
    last_i = 0
    for i in range(indices_per_fold, len(indices),indices_per_fold):
        return_indices.append(indices[last_i:i])
        last_i = i
    return_indices.append(indices[last_i:])
    return return_indices


data.sort_values(by=['nr'], inplace=True)

id_unique = data['nr'].unique()

data.reset_index(drop = False)

data['fold'] = np.zeros((len(data))).astype(int)


folds = get_k_fold_indices(id_unique, 5)

for f in range(len(folds)):
    curr_fold = folds[f]
    for f_i in curr_fold:
        data.loc[data['nr'] == f_i, 'fold'] = f
        
data.loc[data['label'] == 'exp_crackle' , 'label'] = 'crackle'
data.loc[data['label'] == 'insp_crackle' , 'label'] = 'crackle'

data.loc[data['label'] == 'exp_wheeze' , 'label'] = 'wheeze'
data.loc[data['label'] == 'insp_wheeze' , 'label'] = 'wheeze'

In [13]:
data

Unnamed: 0,id,nr,label,fold
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,10011811,wheeze,2
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,10014511,crackle,4
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,10017716,crackle,1
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,10018313,normal,2
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,10018414,normal,4
...,...,...,...,...
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,40163519,crackle,1
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,40163519,crackle,1
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,40164924,wheeze,1
0,/home/andrine/Desktop/dataTromsoFiltered/allFi...,40170921,wheeze,2


In [14]:
def create_new_dataset(data):
    error_in_data = {}
    count = 0
    data_info = pd.DataFrame({
        'filename':[],
        'fold': [],
        'label': []
    })
    for row in data.iterrows():
       
        audio_file = row[1]['id']
        label = row[1]['label']
        fold = row[1]['fold']
        
        
        try:
            sr, audio = utility.read_wav_file(audio_file, target_rate)

        except EOFError as error:
            error_in_data[audio_file] = 'EOFError'
            continue

        try:
            audio = utility.denoise_audio(audio)
            audio, sr = utility.downsample(audio, sr, 5000), 5000
            
            audio = np.apply_along_axis(bandpass_filter, 0, audio).astype('float64')
            #audio = znorm(audio)
    
            '''sub_audio = get_sub_audios_no_overlap(audio, 3)
            sub_count = 0
            for sub in sub_audio:  
                sub_name = audio_file.split('/')[-1].split('.')[0] + f'_{sub_count}.wav'
                if (count%2 == 0):
                    sub = make_synthetic(sub, sr)
                    label = 'synthetic'
                else:
                    label = 'normal'
                count = count + 1
                
                sub = zero_pad(sub, 80000)
                
                wavfile.write('/home/andrine/Desktop/synthetic_dataset/data/' + sub_name, 16000, sub)
                sub_count = sub_count + 1
                
                data_info = data_info.append(pd.DataFrame({'filename':[sub_name],
                                               'fold': [fold],
                                               'label': [label]}))'''
        
        
            name = audio_file.split('/')[-1] 
  


            audio = zero_pad(audio, 75000)

            #wavfile.write('/home/andrine/Desktop/normal_dataset_16hz_15s/data/' + name, 16000, audio)
            soundfile.write('/home/andrine/Desktop/normal_dataset_16hz_15s/data/' + name, audio, 5000, subtype='PCM_16')


            data_info = data_info.append(pd.DataFrame({'filename':[name],
                                           'fold': [fold],
                                           'label': [label]}))
            
                
                
                
            count = count + 1    
            print(f'Extracting file : {count}, of totally {len(data)}')    
            
        except ValueError as error:
            error_in_data[audio_file] = 'ValueError'
            continue
    data_info.to_csv('/home/andrine/Desktop/normal_dataset_16hz_15s/data_info.csv')
    return data_info
            
            
create_new_dataset(data)

Extracting file : 1, of totally 1094
Extracting file : 2, of totally 1094
Extracting file : 3, of totally 1094
Extracting file : 4, of totally 1094
Extracting file : 5, of totally 1094
Extracting file : 6, of totally 1094
Extracting file : 7, of totally 1094
Extracting file : 8, of totally 1094
Extracting file : 9, of totally 1094
Extracting file : 10, of totally 1094
Extracting file : 11, of totally 1094
Extracting file : 12, of totally 1094
Extracting file : 13, of totally 1094
Extracting file : 14, of totally 1094
Extracting file : 15, of totally 1094
Extracting file : 16, of totally 1094
Extracting file : 17, of totally 1094
Extracting file : 18, of totally 1094
Extracting file : 19, of totally 1094
Extracting file : 20, of totally 1094
Extracting file : 21, of totally 1094
Extracting file : 22, of totally 1094
Extracting file : 23, of totally 1094
Extracting file : 24, of totally 1094
Extracting file : 25, of totally 1094
Extracting file : 26, of totally 1094
Extracting file : 27,

Extracting file : 216, of totally 1094
Extracting file : 217, of totally 1094
Extracting file : 218, of totally 1094
Extracting file : 219, of totally 1094
Extracting file : 220, of totally 1094
Extracting file : 221, of totally 1094
Extracting file : 222, of totally 1094
Extracting file : 223, of totally 1094
Extracting file : 224, of totally 1094
Extracting file : 225, of totally 1094
Extracting file : 226, of totally 1094
Extracting file : 227, of totally 1094
Extracting file : 228, of totally 1094
Extracting file : 229, of totally 1094
Extracting file : 230, of totally 1094
Extracting file : 231, of totally 1094
Extracting file : 232, of totally 1094
Extracting file : 233, of totally 1094
Extracting file : 234, of totally 1094
Extracting file : 235, of totally 1094
Extracting file : 236, of totally 1094
Extracting file : 237, of totally 1094
Extracting file : 238, of totally 1094
Extracting file : 239, of totally 1094
Extracting file : 240, of totally 1094
Extracting file : 241, of

Extracting file : 429, of totally 1094
Extracting file : 430, of totally 1094
Extracting file : 431, of totally 1094
Extracting file : 432, of totally 1094
Extracting file : 433, of totally 1094
Extracting file : 434, of totally 1094
Extracting file : 435, of totally 1094
Extracting file : 436, of totally 1094
Extracting file : 437, of totally 1094
Extracting file : 438, of totally 1094
Extracting file : 439, of totally 1094
Extracting file : 440, of totally 1094
Extracting file : 441, of totally 1094
Extracting file : 442, of totally 1094
Extracting file : 443, of totally 1094
Extracting file : 444, of totally 1094
Extracting file : 445, of totally 1094
Extracting file : 446, of totally 1094
Extracting file : 447, of totally 1094
Extracting file : 448, of totally 1094
Extracting file : 449, of totally 1094
Extracting file : 450, of totally 1094
Extracting file : 451, of totally 1094
Extracting file : 452, of totally 1094
Extracting file : 453, of totally 1094
Extracting file : 454, of

Extracting file : 640, of totally 1094
Extracting file : 641, of totally 1094
Extracting file : 642, of totally 1094
Extracting file : 643, of totally 1094
Extracting file : 644, of totally 1094
Extracting file : 645, of totally 1094
Extracting file : 646, of totally 1094
Extracting file : 647, of totally 1094
Extracting file : 648, of totally 1094
Extracting file : 649, of totally 1094
Extracting file : 650, of totally 1094
Extracting file : 651, of totally 1094
Extracting file : 652, of totally 1094
Extracting file : 653, of totally 1094
Extracting file : 654, of totally 1094
Extracting file : 655, of totally 1094
Extracting file : 656, of totally 1094
Extracting file : 657, of totally 1094
Extracting file : 658, of totally 1094
Extracting file : 659, of totally 1094
Extracting file : 660, of totally 1094
Extracting file : 661, of totally 1094
Extracting file : 662, of totally 1094
Extracting file : 663, of totally 1094
Extracting file : 664, of totally 1094
Extracting file : 665, of

Extracting file : 851, of totally 1094
Extracting file : 852, of totally 1094
Extracting file : 853, of totally 1094
Extracting file : 854, of totally 1094
Extracting file : 855, of totally 1094
Extracting file : 856, of totally 1094
Extracting file : 857, of totally 1094
Extracting file : 858, of totally 1094
Extracting file : 859, of totally 1094
Extracting file : 860, of totally 1094
Extracting file : 861, of totally 1094
Extracting file : 862, of totally 1094
Extracting file : 863, of totally 1094
Extracting file : 864, of totally 1094
Extracting file : 865, of totally 1094
Extracting file : 866, of totally 1094
Extracting file : 867, of totally 1094
Extracting file : 868, of totally 1094
Extracting file : 869, of totally 1094
Extracting file : 870, of totally 1094
Extracting file : 871, of totally 1094
Extracting file : 872, of totally 1094
Extracting file : 873, of totally 1094
Extracting file : 874, of totally 1094
Extracting file : 875, of totally 1094
Extracting file : 876, of

Extracting file : 1062, of totally 1094
Extracting file : 1063, of totally 1094
Extracting file : 1064, of totally 1094
Extracting file : 1065, of totally 1094
Extracting file : 1066, of totally 1094
Extracting file : 1067, of totally 1094
Extracting file : 1068, of totally 1094
Extracting file : 1069, of totally 1094
Extracting file : 1070, of totally 1094
Extracting file : 1071, of totally 1094
Extracting file : 1072, of totally 1094
Extracting file : 1073, of totally 1094
Extracting file : 1074, of totally 1094
Extracting file : 1075, of totally 1094
Extracting file : 1076, of totally 1094
Extracting file : 1077, of totally 1094
Extracting file : 1078, of totally 1094
Extracting file : 1079, of totally 1094
Extracting file : 1080, of totally 1094
Extracting file : 1081, of totally 1094
Extracting file : 1082, of totally 1094
Extracting file : 1083, of totally 1094
Extracting file : 1084, of totally 1094
Extracting file : 1085, of totally 1094
Extracting file : 1086, of totally 1094


Unnamed: 0,filename,fold,label
0,10011811_lungelyd_1.wav,2.0,wheeze
0,10014511_lungelyd_3.wav,4.0,crackle
0,10017716_lungelyd_3.wav,1.0,crackle
0,10018313_lungelyd_5.wav,2.0,normal
0,10018414_lungelyd_3.wav,4.0,normal
...,...,...,...
0,40163519_lungelyd_3.wav,1.0,crackle
0,40163519_lungelyd_5.wav,1.0,crackle
0,40164924_lungelyd_4.wav,1.0,wheeze
0,40170921_lungelyd_1.wav,2.0,wheeze
