# Code for writing the lung sound data to .ts file

Below is the code for writing the small segments to .ts files

In [20]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
sys.path.insert(1, module_path + '/src')

import utility

import librosa
import sktime
from sktime.utils.data_io import load_from_tsfile_to_dataframe
from sklearn.model_selection import train_test_split

from math import ceil

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import librosa
from scipy.stats import skew 

from time import time
import sys
import os
figure_path = module_path + '/figures/'

sys.path.insert(1, module_path + '/src/')

from matplotlib.font_manager import FontProperties
font = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 10, weight = 1000)
font_small = FontProperties(fname = module_path + '/src/visualization/CharterRegular.ttf', size = 8, weight = 1000)
colors = ['#F94144', '#90BE6D', '#577590','#F3722C', '#F8961E', '#F9844A', '#F9C74F', '#43AA8B', '#4D908E', '#277DA1']
colors_opa = ['#F9414466', '#90BE6D66', '#57759066','#F3722C66', '#F8961E66',
              '#F9844A66', '#F9C74F66', '#43AA8B66', '#4D908E66', '#277DA166']

sns.set_palette(sns.color_palette(colors))

target_rate = 44100
ds_target_rate = 8000

files_path = module_path + '/data/minimal_dataset/data/'
info = pd.read_csv(module_path + '/data/minimal_dataset/info.csv')

def zero_pad(ts, fixed_len):
    df_new = np.zeros(fixed_len)
    len_ts = len(ts)
    pad = (fixed_len - len_ts)//2
    df_new[pad:pad + len_ts] = ts
    return df_new

Splitting dataset into train, test and validation, while maintaining an unbiased devide (one patient should only appear in one of the datasets)

In [21]:
# If classifying between wheeze and crackle

'''indices = np.concatenate([np.where(info['label'] == 'wheeze')[0][:580],
                          np.where(info['label'] == 'crackle')[0]])'''

# Three class classification problem
indices = np.concatenate([np.where(info['label'] == 'normal')[0],
                          np.where(info['label'] == 'wheeze')[0][:580],
                          np.where(info['label'] == 'crackle')[0]])

info = info.iloc[indices]


info.sort_values(by=['patient_id'], inplace=True)


id_unique = info['patient_id'].unique()

id_train , id_test = train_test_split(id_unique, test_size=0.2, random_state=42)
id_train , id_val = train_test_split(id_unique, test_size=0.25, random_state=42)

def get_indices(id_list, data):
    indices = np.array([])
    for i in id_list:
        idx = np.where(data['patient_id'] == i)[0]
        indices = np.append([list(idx)] , [indices])
        
    return indices


indices_train = get_indices(id_train, info).astype(int)
indices_test = get_indices(id_test, info).astype(int)
indices_val = get_indices(id_val, info).astype(int)

print('Train set split \n')
print(info.iloc[indices_train]['label'].value_counts())

print('Test set split \n')
print(info.iloc[indices_test]['label'].value_counts())

print('Val set split \n')
print(info.iloc[indices_val]['label'].value_counts())

Train set split 

crackle    438
wheeze     431
normal     418
Name: label, dtype: int64
Test set split 

wheeze     128
crackle    111
normal     105
Name: label, dtype: int64
Val set split 

wheeze     149
crackle    144
normal     136
Name: label, dtype: int64


In [22]:
file_splits = {
    '/data/ts_files/minimal_dataset_3class_TRAIN.ts' : indices_train,
    '/data/ts_files/minimal_dataset_3class_TEST.ts' : indices_test,
    '/data/ts_files/minimal_dataset_3class_VAL.ts' : indices_val
    
}


def write_feature_extracted_dataset_to_ts(filename, data):

    w = open(module_path + filename, 'w+')
    row = w.read()

    w.write('@problemName Minimal UiT Lung Sound \n')
    w.write('@timeStamps false \n')
    w.write('@missing false \n')
    w.write('@univariate true \n')
    w.write('@equalLength true \n')
    w.write('@classLabel true crackle wheeze \n')
    w.write('@data \n')

    error_in_data = {}
    count = 0
    for row in data.iterrows():
        audio_file = row[1]['filepath_new']
        label = row[1]['label']

        try:
            sr, audio = utility.read_wav_file(audio_file, 8000)

        except EOFError as error:
            error_in_data[audio_file] = 'EOFError'
            continue

        try:
            audio = zero_pad(audio, 4000)
            new_row = str(list(audio))[1:-1].replace(' ', '') + ':' + label + '\n'
            w.write(new_row)


        except ValueError as error:
            error_in_data[audio_file] = 'ValueError'
            continue
             
for name, i in file_splits.items():
    write_feature_extracted_dataset_to_ts(name, info.iloc[i])