# Filtering procedure

Because a full 15-second recording is too long to yield useful information as a single sample, each recording is split into fixed-length windows of 500 ms.

In [10]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.insert(1, module_path + '/src/utils')

import utility

import librosa
import sktime
from sktime.utils.data_io import load_from_tsfile_to_dataframe

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd

import librosa
from scipy.stats import skew 

from scipy.signal import butter, lfilter
from scipy.fft import fft
from scipy.signal import stft
from scipy.stats import skew, kurtosis

from time import time
import sys
import os
figure_path = module_path + '/figures/'

import soundfile
import pygame
import pickle

from tkinter import *
import tkinter as tk

import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

from matplotlib.font_manager import FontProperties
# Two FontProperties objects built from the bundled CharterRegular.ttf file:
# `font` at size 10 and `font_small` at size 8, both with weight 1000.
font, font_small = (
    FontProperties(
        fname = module_path + '/src/visualization/CharterRegular.ttf',
        size = point_size,
        weight = 1000,
    )
    for point_size in (10, 8)
)


def znorm(timeseries):
    """Z-normalise a series: subtract its mean and divide by its standard deviation.

    Args:
        timeseries: Numeric array (anything ``np.mean`` / ``np.std`` accept
            and that supports element-wise arithmetic).

    Returns:
        ``(timeseries - mean) / std``, where ``std`` is ``np.std`` with its
        default arguments. No guard is applied for a zero standard deviation.
    """
    centred = timeseries - np.mean(timeseries)
    spread = np.std(timeseries)
    return centred / spread

# Sampling rate handed to utility.read_wav_file when the recordings are loaded.
target_rate = 44100

import os

# Directory with the raw recordings; keep only file names containing '_lungelyd_'.
files_path = module_path + '/data/all_files_UIT/'
names_full = os.listdir(files_path)
names = [file_name for file_name in names_full if '_lungelyd_' in file_name]

# Annotation table. First drop columns without a single non-missing value,
# then drop columns in which every entry equals the empty string.
X = pd.read_stata(module_path + '/data/hasse_413_ut.dta')
X_no_nan = X.dropna(thresh=1, axis=1)

cols = [column for column in X_no_nan if np.any(X_no_nan[column] != '')]
X_no_nan = X_no_nan[cols]

# Only `names`, `files_path` and `X_no_nan` are needed from here on.
del X, names_full, cols

In [5]:
# Recording digit (last character of the wav file stem) -> letter used inside
# the annotation column names ('1' -> 'a', ..., '6' -> 'f').
map_wav = dict(zip('123456', 'abcdef'))

# Labels that get the extra second-round / comment checks.
abnormalities = ['insp_wheeze', 'exp_wheeze', 'insp_crackle', 'exp_crackle']

# Band-pass corner frequencies and the sampling rate passed to the filter design.
lowcut, highcut = 150, 2000
FRAME_RATE = 8000

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter in transfer-function (b, a) form.

    Args:
        lowcut: Lower corner frequency, in the same unit as ``fs``.
        highcut: Upper corner frequency, in the same unit as ``fs``.
        fs: Sampling rate; the corners are normalised by ``fs / 2``.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``scipy.signal.butter``.
    """
    nyquist = fs * 0.5
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Apply a causal Butterworth band-pass filter to ``data``.

    The filter is designed and applied as second-order sections instead of a
    single (b, a) transfer function. SciPy's documentation warns that the
    (b, a) form suffers from numerical problems already at moderate orders,
    and this file calls the function with ``order=12``.

    Args:
        data: Signal to filter; filtering runs along the last axis.
        lowcut: Lower corner frequency, in the same unit as ``fs``.
        highcut: Upper corner frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Butterworth order passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal, same shape as ``data``.
    """
    # Local import: the file's top-level imports only bring in butter/lfilter.
    from scipy.signal import sosfilt

    nyq = 0.5 * fs
    sos = butter(order, [lowcut / nyq, highcut / nyq], btype='band', output='sos')
    return sosfilt(sos, data)

def bandpass_filter(buffer):
    """Band-pass ``buffer`` with the module-level settings.

    Uses ``lowcut``, ``highcut`` and ``FRAME_RATE`` from module scope and a
    fixed filter order of 12.

    Args:
        buffer: Signal handed to ``butter_bandpass_filter``.

    Returns:
        Whatever ``butter_bandpass_filter`` returns for ``buffer``.
    """
    return butter_bandpass_filter(
        buffer,
        lowcut,
        highcut,
        FRAME_RATE,
        order=12,
    )


def get_comments(X, idx, label, wav):
    """Collect the free-text comments attached to one recording.

    Args:
        X: Annotation table containing the comment columns.
        idx: Positional row selector passed to ``X.iloc``; the first selected
            row is used.
        label: Label of the recording; only labels listed in
            ``abnormalities`` are looked up.
        wav: Key of ``map_wav`` identifying the recording.

    Returns:
        ``None`` when ``label`` is not an abnormality or when all five
        comment fields are empty; otherwise the non-empty comments joined by
        single spaces (observer comments c, b, a first, then the two
        label-specific ``ob2`` / ``ob1`` comments).
    """
    if label not in abnormalities:
        return None

    loc = map_wav[wav]
    selected = X.iloc[idx]

    observer_cols = [f'lung{loc}_comment_{obs}_t72' for obs in ('c', 'b', 'a')]
    observer_comments = selected[observer_cols].values[0]

    # Column-name fragment used for each abnormality label.
    code_by_label = {
        'insp_wheeze': 'i_wh',
        'exp_wheeze': 'e_wh',
        'insp_crackle': 'i_cr',
        'exp_crackle': 'e_cr',
    }
    code = code_by_label[label]
    uncertainty_cols = [f'sub_{loc}_ob{nr}_{code}_unc_comment_t72' for nr in (2, 1)]
    uncertainty_comments = selected[uncertainty_cols].values[0]

    fields = (
        observer_comments[0], observer_comments[1], observer_comments[2],
        uncertainty_comments[0], uncertainty_comments[1],
    )
    if sum(len(text) for text in fields) == 0:
        return None

    merged = np.append(observer_comments, uncertainty_comments)
    return ' '.join(text for text in merged if len(text))

def get_precence_columns(label, wav):
    """Return the two ``..._number_..._t72`` column names for a label.

    Args:
        label: Recording label.
        wav: Recording digit '1'..'6'; any other value raises ``KeyError``,
            even when ``label`` is not an abnormality.

    Returns:
        ``[ob2 column, ob1 column]`` for the four wheeze/crackle labels,
        ``None`` for every other label.
    """
    loc = dict(zip('123456', 'abcdef'))[wav]

    # label -> (sound code, breathing phase) as they appear in the column names
    name_parts = {
        'insp_wheeze': ('i_wh', 'insp'),
        'exp_wheeze': ('e_wh', 'exp'),
        'insp_crackle': ('i_cr', 'insp'),
        'exp_crackle': ('e_cr', 'exp'),
    }
    if label not in name_parts:
        return None

    code, phase = name_parts[label]
    return [f'sub_{loc}_ob{observer}_{code}_number_{phase}_t72' for observer in (2, 1)]



def get_column_observer_wav_file(observer, wav):
    """Return one observer's classification column names for one recording.

    Args:
        observer: ``'a'``, ``'b'`` or ``'c'``; anything else raises ``KeyError``.
        wav: Key of the module-level ``map_wav``.

    Returns:
        List of seven column names in this order: abnormal_sound,
        insp_wheeze, exp_wheeze, insp_crackle, exp_crackle,
        other_abnorm_sound, not_classifiable.
    """
    loc = map_wav[wav]
    fields = (
        'abnormal_sound',
        'insp_wheeze',
        'exp_wheeze',
        'insp_crackle',
        'exp_crackle',
        'other_abnorm_sound',
        'not_classifiable',
    )
    columns_by_observer = {
        obs: [f'lung{loc}_{field}_{obs}_t72' for field in fields]
        for obs in ('a', 'b', 'c')
    }
    return columns_by_observer[observer]

def get_second_round_eval(X, idx, label, wav):
    """Tell whether the second-round presence columns allow keeping a label.

    The presence columns are coded 0 = not evaluated, 1 = present,
    2 = absent, 3 = uncertain.

    Args:
        X: Annotation table containing the ``..._presence_t72`` columns.
        idx: Positional row selector passed to ``X.iloc``; the first selected
            row is used.
        label: Recording label.
        wav: Key of the module-level ``map_wav``.

    Returns:
        ``True`` when ``label`` is not in ``abnormalities`` or when neither
        of the two presence values equals 2; ``False`` when at least one of
        them is 2 (absent).
    """
    if label not in abnormalities:
        return True

    loc = map_wav[wav]
    code_by_label = {
        'insp_wheeze': 'i_wh',
        'exp_wheeze': 'e_wh',
        'insp_crackle': 'i_cr',
        'exp_crackle': 'e_cr',
    }
    code = code_by_label[label]
    presence_cols = [f'sub_{loc}_ob{observer}_{code}_presence_t72' for observer in (2, 1)]

    round2 = X.iloc[idx][presence_cols].values[0].astype(int)

    concluded_absent = (round2[0] == 2) or (round2[1] == 2)
    return not concluded_absent


def get_filename_label_map():
    """Map every usable recording in ``names`` to a single label.

    Reads the module-level ``names``, ``files_path`` and ``X_no_nan``.
    For each file the first 8 characters are the patient number and the last
    character of the file stem is the recording digit. Observers a and b are
    compared; when they disagree, observer c's classification is used.

    A file is left out (and usually recorded in the error dict) when
      * the file stem is empty (the name is printed),
      * the classification flags an abnormal sound without a specific type,
      * more than one abnormality type is flagged,
      * a second-round presence value is "absent",
      * the two ``number`` columns sum to less than 3,
      * any free-text comment exists for the recording.

    Returns:
        Tuple ``(error_in_data, data, count)``:
        ``error_in_data`` maps file name -> reason or comment text,
        ``data`` is a DataFrame with columns ``id`` (full path), ``nr``
        (patient number) and ``label``, and ``count`` is the number of files
        dropped by the multiple-abnormality, second-round and vote filters.
    """
    annotation_list = ['abnormal_sound', 'insp_wheeze',
                       'exp_wheeze', 'insp_crackle', 'exp_crackle',
                       'other', 'not_classifiable']

    count = 0
    error_in_data = {}
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append copies the whole frame per call and was removed
    # in pandas 2.0.
    records = []

    for n in names:
        i = n[0:8]
        try:
            wav = n.split('.')[0][-1]
        except IndexError:
            print(n)
            continue

        row_nr = np.where(X_no_nan['unikt_lopenr'] == i)[0]
        rows = X_no_nan.iloc[row_nr]
        a_obs = rows[get_column_observer_wav_file('a', wav)].values[0]
        b_obs = rows[get_column_observer_wav_file('b', wav)].values[0]

        if np.any(a_obs != b_obs):  # Observers a and b disagree: use observer c
            c_obs = rows[get_column_observer_wav_file('c', wav)].values[0]
        else:
            c_obs = a_obs

        if (c_obs[0] == 0) or (np.isnan(c_obs[0])):
            label = 'normal'
        else:
            label_idx = np.where(c_obs == 1.)[0]
            if len(label_idx) < 2:
                # Abnormal sound flagged without a type. This used to fall
                # through and reuse the label of the previous file.
                error_in_data[n] = 'IndexError'
                continue
            if len(label_idx) > 2:  # Do not include samples with more than one type of abnormality
                error_in_data[n] = 'more than one abnormality'
                count = count + 1
                continue
            label = annotation_list[label_idx[1]]

        if not get_second_round_eval(X_no_nan, row_nr, label, wav):  # Second round of observers concluded absent
            count = count + 1
            error_in_data[n] = 'second round concluded absent'
            continue

        cols = get_precence_columns(label, wav)

        if cols is not None:  # Make sure more than one abnormality appears during the 15 s
            votes = rows[cols].values[0]
            votes = votes[0] + votes[1]
            if votes < 3:
                count = count + 1
                error_in_data[n] = 'only one abnormality appears during the whole 15 seconds'
                continue

        comments = get_comments(X_no_nan, row_nr, label, wav)
        if comments is not None:
            error_in_data[n] = comments
            continue

        records.append({'id': files_path + n, 'nr': i, 'label': label})

    data = pd.DataFrame(records, columns=['id', 'nr', 'label'])
    return error_in_data, data, count

In [11]:
# Build the file -> label table and report how long it took.
start = time()

error, data, count = get_filename_label_map()
# The annotation table, the file list and the counter are not used afterwards.
del X_no_nan, names, count

total = time() - start
print(f'Time utilized: {total} ')

Time utilized: 4.455421686172485 


In [12]:
# Positional row indices per label.
indices_1 = np.flatnonzero(data['label'] == 'normal')
indices_2 = np.flatnonzero(data['label'] == 'exp_wheeze')
indices_3 = np.flatnonzero(data['label'] == 'insp_crackle')
indices_4 = np.flatnonzero(data['label'] == 'insp_wheeze')
indices_5 = np.flatnonzero(data['label'] == 'exp_crackle')
indices_6 = np.flatnonzero(data['label'] == 'other')
indices_7 = np.flatnonzero(data['label'] == 'not_classifiable')

# Reproducible shuffle of the normal recordings so that the first 500 form a
# random subset.
np.random.seed(0)
np.random.shuffle(indices_1)

# Up to 500 normal recordings plus every wheeze/crackle recording;
# 'other' and 'not_classifiable' are left out.
indices = np.concatenate([indices_1[:500], indices_2, indices_3, indices_4, indices_5])

new_dataset = data.iloc[indices].copy()
id_unique = new_dataset['nr'].unique()

del data
del indices_1, indices_2, indices_3, indices_4, indices_5, indices_6, indices_7, indices

In [13]:
def play_soundfile_tkinter(s, root, audio, sr, grid_idx = 6, text = 'play sound', wav_file = 'test.wav'):
    """Add a play button and a stop button for sound ``s`` to one grid row.

    Args:
        s: Object with ``play(loops=...)`` and ``stop()`` methods (the caller
            passes a ``pygame.mixer.Sound``).
        root: Tk container that receives the buttons.
        audio: Not used by this function.
        sr: Not used by this function.
        grid_idx: Grid row for both buttons.
        text: Caption of the play button.
        wav_file: Not used by this function.

    Returns:
        The ``(root, s)`` pair that was passed in.
    """
    def start_playback():
        s.play(loops=0)

    def stop_playback():
        s.stop()

    # (caption, callback, background colour, grid column)
    button_specs = (
        (text, start_playback, 'white', 0),
        ('stop', stop_playback, 'gray', 2),
    )
    for caption, callback, colour, column in button_specs:
        button = tk.Button(root, text=caption, command=callback, height = 5, width = 20, bg = colour)
        button.grid(row=grid_idx, column = column, pady = 10)

    return root, s
    
    
def plot_lung_sound_tkinter(root, audio, sr, idx, marker = None):
    """Embed a waveform plot of ``audio`` in grid row ``idx`` of ``root``.

    Args:
        root: Tk container that receives the matplotlib canvas.
        audio: Samples to plot.
        sr: Sampling rate of ``audio``; used to build the time axis.
        idx: Grid row for the canvas (it spans three columns).
        marker: Optional ``(start, end)`` pair in seconds; when given, two
            red vertical lines are drawn at those positions.

    Returns:
        ``(root, figure)`` so the caller can close the figure later.
    """
    # Sample k sits at k / sr seconds. The previous linspace-based axis put
    # the last sample at the full duration (one sample too late) and needed a
    # length fix-up that could still disagree with len(audio).
    x = np.arange(len(audio)) / sr

    figure, ax = plt.subplots(1, 1)
    figure.set_size_inches(15, 3)

    chart_type = FigureCanvasTkAgg(figure, root)
    chart_type.get_tk_widget().grid(row = idx, columnspan=3)

    # The waveform is drawn once, in black (it used to be drawn twice).
    ax.plot(x, audio, linewidth = 0.3, c = 'k')

    if marker is not None:
        ax.axvline(marker[0], c = '#F94144')
        ax.axvline(marker[1], c = '#F94144')

    ax.set_title('Lung sound plot')

    return root, figure


def make_window(patient_id, sub, audio, sr, label, marker = None, id_counts = None):
    """Show the labelling window for one segment and return the annotator's choice.

    The segment and the full recording are written to ``test1.wav`` and
    ``test.wav`` in the current working directory, loaded into pygame and
    removed again before returning. The window blocks until it is closed.

    Args:
        patient_id: Shown in the window header.
        sub: Samples of the segment to label.
        audio: Samples of the full recording.
        sr: Sampling rate used for both wav files and both plots.
        label: Original label of the recording; must be one of
            'insp_crackle', 'insp_wheeze', 'exp_crackle', 'exp_wheeze',
            'normal' (otherwise ``KeyError``).
        marker: Optional ``(start, end)`` of the segment in seconds. The full
            recording is played on opening when ``marker`` is ``None`` or
            starts at 0; otherwise the segment is played.
        id_counts: Optional ``(current, total)`` pair shown in the title.

    Returns:
        Tuple ``(label, label_flag, stop_flag, skip_flag)``. ``label`` is the
        class chosen with a label button, or the original label collapsed to
        'crackle' / 'wheeze' / 'normal' when no label button was pressed.
        ``label_flag`` is True only when a label button was pressed,
        ``stop_flag`` only for STOP SESSION and ``skip_flag`` only for
        SKIP LUNG SOUND.
    """
    label_map = {
        'insp_crackle': 'crackle',
        'insp_wheeze': 'wheeze',
        'exp_crackle': 'crackle',
        'exp_wheeze': 'wheeze',
        'normal': 'normal'
    }
    old_label = label

    # One-element lists so the button callbacks can modify the values.
    label = [label_map[label]]
    label_flag = [False]
    stop_flag = [False]
    skip_flag = [False]

    full_wav = os.path.join(os.getcwd(), 'test.wav')
    segment_wav = os.path.join(os.getcwd(), 'test1.wav')
    figures = []

    try:
        soundfile.write(full_wav, audio, sr, subtype='PCM_16')
        soundfile.write(segment_wav, sub, sr, subtype='PCM_16')

        pygame.mixer.init()
        s1 = pygame.mixer.Sound(segment_wav)
        s = pygame.mixer.Sound(full_wav)

        # `marker` defaults to None, so it must be checked before indexing.
        if marker is None or marker[0] == 0:
            s.play(loops=0)
        else:
            s1.play(loops=0)

        def continue_loop():
            """Stop playback and close the window, which ends mainloop()."""
            s1.stop()
            s.stop()
            root.destroy()

        def choose_label(name):
            """Record the chosen class and close the window."""
            label_flag[0] = True
            label[0] = name
            continue_loop()

        def next_audio():
            """Move on without storing a label for this segment."""
            label_flag[0] = False
            continue_loop()

        def raise_flag(flag):
            """Set a stop/skip flag and close the window."""
            flag[0] = True
            continue_loop()

        root = Tk()
        if id_counts is not None:
            root.title(f'Lung sound, {id_counts[0]} of {id_counts[1]}')
        else:
            root.title('Lung sound')
        root.geometry("1000x1000")

        title = Label(root, text="Play lung sound with following info")
        title.grid(row = 0, columnspan=3)

        ab = Label(root, text=f"Abnormality : {old_label}, patient id : {patient_id}")
        ab.grid(row = 1, columnspan=3)

        play_soundfile_tkinter(s1, root, sub, sr, grid_idx = 6, text = 'play segment', wav_file = 'test1.wav')
        play_soundfile_tkinter(s, root, audio, sr, grid_idx = 7, text = 'play 15 s', wav_file = 'test.wav')

        normal_button = tk.Button(root, text="NORMAL", command=lambda: choose_label('normal'), bg = '#90BE6D', height = 2, width = 10)
        crackle_button = tk.Button(root, text="CRACKLE", command=lambda: choose_label('crackle'), bg = '#277DA1', height = 2, width = 10)
        wheeze_button = tk.Button(root, text="WHEEZE", command=lambda: choose_label('wheeze'), bg = '#F3722C', height = 2, width = 10)

        crackle_button.grid(row = 3, column=0, pady = 10)
        wheeze_button.grid(row = 3, column=2, pady = 10)
        normal_button.grid(row = 3, column=1, pady = 10)

        next_audio_button = tk.Button(root, text="NEXT", command=next_audio, bg = '#F9C74F', height = 2, width = 10)
        next_audio_button.grid(row = 4, columnspan=3, pady = 10)

        stop_session_button = tk.Button(root, text="STOP SESSION", command=lambda: raise_flag(stop_flag), bg = '#F94144', height = 2, width = 10)
        stop_session_button.grid(row = 5, columnspan=3, pady = 10)

        root, f1 = plot_lung_sound_tkinter(root, sub, sr, idx = 8)
        figures.append(f1)

        root, f2 = plot_lung_sound_tkinter(root, audio, sr, idx = 9, marker = marker)
        figures.append(f2)

        skip_session_button = tk.Button(root, text="SKIP LUNG SOUND", command=lambda: raise_flag(skip_flag), bg = 'gray', height = 2, width = 10)
        skip_session_button.grid(row = 10, columnspan=3, pady = 10)

        root.mainloop()
    finally:
        # Runs on errors too, so no figure, mixer or temp file is left behind.
        for fig in figures:
            plt.close(fig)  # Otherwise the figures are shown when the program ends
        pygame.mixer.quit()
        for path in (full_wav, segment_wav):
            if os.path.exists(path):
                os.remove(path)

    return label[0], label_flag[0], stop_flag[0], skip_flag[0]

In [15]:
import soundfile
import pygame
import pickle

from tkinter import *
import tkinter as tk
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg


# Interactive labelling session.
# For every patient not yet in info.csv and not in the skip list, the first
# recording of that patient is denoised, downsampled to 8 kHz and band-passed.
# It is then presented in 500 ms windows until three windows are labelled,
# the recording is skipped, or the session is stopped.

# Column layout of the bookkeeping table (re-read from info.csv per patient).
file_info = pd.DataFrame({
    'filepath_orig':[],
    'filepath_new': [],
    'patient_id': [],
    'recording_loc': [],
    'label':[],
    'start': []
})

new_file_path = module_path + '/data/minimal_dataset/data/'
error_in_data = {}
id_count = 0

# NOTE: pickle.load can execute arbitrary code; only load a skip list that
# this notebook wrote itself.
with open(module_path + '/data/minimal_dataset/skipped_sessions.pkl', 'rb') as f:
    skip_session_ids = pickle.load(f)

for nr_id in id_unique:
    id_count = id_count + 1
    # Only the first recording of each patient is used.
    row = new_dataset[new_dataset['nr'] == nr_id].values[0]

    audio_file = row[0]
    label = row[2]
    patient_id = row[1]
    stop_session = False
    skip_flag = False
    # Re-read the table so progress saved in earlier iterations is included.
    file_info = pd.read_csv(module_path + '/data/minimal_dataset/info.csv')
    ids = file_info['patient_id'].values

    if (int(nr_id) in ids) or (nr_id in skip_session_ids):
        continue

    try:
        sr, audio = utility.read_wav_file(audio_file, target_rate)
        audio = utility.denoise_audio(audio)
        audio, sr = utility.downsample(audio, sr, 8000), 8000
        audio = np.apply_along_axis(bandpass_filter, 0, audio).astype('float64')

    except EOFError:
        error_in_data[audio_file] = 'EOFError'
        continue

    try:
        count = 0  # number of windows labelled for this recording

        subs = np.linspace(0,15,31)  # window start times: 0, 0.5, ..., 15 s
        for i in subs:
            new_name = audio_file.split('.')[0].split('/')[-1] + f'_{int(i*10)}.wav'
            new_name = module_path + '/data/minimal_dataset/data/' + new_name
            begin = int(i*sr)

            s = (begin, (begin + int(0.5*sr)))
            sub = audio[s[0]:s[1]]

            # The last start time (15.0 s) can lie at or past the end of the
            # recording; there is nothing to play or label in that case.
            if len(sub) == 0:
                break

            # Both signals are scaled by 30 for playback and plotting only;
            # the unscaled window is what gets written to disk.
            new_label, label_flag, stop_flag, skip_flag = make_window(
                patient_id, 30*sub, 30*audio,
                sr, label,
                (i, (i + 0.5)), (id_count, len(id_unique)))

            info = pd.DataFrame({
                'filepath_orig':[audio_file],
                'filepath_new': [new_name],
                'patient_id': [patient_id],
                'recording_loc': [audio_file.split('_')[-1].split('.')[0]],
                'label':[new_label],
                'start': [i]
            })

            if stop_flag:
                stop_session = True
                print('Session Stopped')
                break

            elif label_flag:
                count = count + 1
                # pd.concat replaces DataFrame.append (removed in pandas 2.0).
                file_info = pd.concat([file_info, info])
                soundfile.write(new_name, sub, 8000, subtype='PCM_16')
                label_flag = False

            if skip_flag:
                break

            elif count == 3:
                file_info.to_csv(module_path + '/data/minimal_dataset/info.csv', index = False)
                break

            del sub
        del subs
        del audio

        if stop_session:
            file_info.to_csv(module_path + '/data/minimal_dataset/info.csv', index = False)
            break

        # NOTE(review): this used to read `skip_flag or len(info) == 0`, but
        # `info` always holds exactly one row, so the second test never fired.
        # If "no window labelled" (count == 0) was intended, add it here.
        if skip_flag:
            skip_session_ids.append(nr_id)
            with open(module_path + '/data/minimal_dataset/skipped_sessions.pkl', 'wb') as f:
                pickle.dump(skip_session_ids,f)
            continue

        file_info.to_csv(module_path + '/data/minimal_dataset/info.csv', index = False)

    except ValueError:
        error_in_data[audio_file] = 'ValueError'
        continue

Session Stopped
