In [213]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.insert(1, module_path + '/src')

import utility
target_rate = 44100
import librosa
import sktime
from sktime.utils.data_io import load_from_tsfile_to_dataframe
from sklearn.model_selection import train_test_split

from math import ceil

In [224]:
none_path = module_path + '/data/crackleWheeze/none/'
crackle_path = module_path + '/data/crackleWheeze/crackle/'
both_path = module_path + '/data/crackleWheeze/both/'
wheeze_path = module_path + '/data/crackleWheeze/wheeze/'


none = [file_name for file_name in os.listdir(none_path) if '.wav' in file_name]
crackle = [file_name for file_name in os.listdir(crackle_path) if '.wav' in file_name]
wheeze = [file_name for file_name in os.listdir(wheeze_path) if '.wav' in file_name]
both= [file_name for file_name in os.listdir(both_path) if '.wav' in file_name]


class_lists = {'crackle' : crackle,
    'wheeze': wheeze,
    'none': none, 
    'both': both
}

class_paths = {'crackle' : crackle_path,
    'wheeze': wheeze_path,
    'none': none_path, 
    'both': both_path
}

frame_len = round(4000*0.001)
hop_len = ceil(frame_len/1.5)

# Creating the .ts file for the four class classification problem


@problemName <problem name>
@timeStamps <true/false>
@univariate <true/false>
@classLabel <true/false> <space delimited list of possible class values>
@data
    
classes: 
   0: crackle
   1: wheeze
   2: none 
   3: both

In [160]:
def write_class(writer, class_list, class_name, class_path, fixed= True, fixed_len = 2000):
    classes = {
    'crackle': '0',
    'wheeze': '1',
    'none': '2',
    'both': '3'
    }
    classes_2_class = {
    'crackle': '0',
    'wheeze': '1',
    'none': '1',
    'both': '0'
    }
    for name in class_list: 
        audio_file = class_path + name
        sr, audio = utility.read_wav_file(audio_file, target_rate)
        audio = utility.denoise_audio(audio)
        audio, sr = utility.downsample(audio, sr, 4000)

        rms_new = librosa.feature.rms(audio, frame_length = frame_len, hop_length = hop_len)[0]
        
        if fixed: 
            if len(rms_new) > fixed_len:
                    rms_new = rms_new[0:fixed_len]
            df_new = np.zeros(fixed_len)
            df_new[0:len(rms_new)] = rms_new
            new_row = str(list(df_new))[1:-1].replace(' ', '') + ':' + classes[class_name] + '\n'
        else: 
            new_row = str(list(rms_new))[1:-1].replace(' ', '') + ':' + classes[class_name] + '\n'
        writer.write(new_row)

def write_ts(filename, fixed = True, fixed_len = 2000):
    w = open(filename, 'w+')
    row = w.read()

    w.write('@problemName LungSounds \n')
    w.write('@timeStamps false \n')
    w.write('@missing false \n')
    w.write('@univariate true \n')
    if fixed: 
        w.write('@equalLength true \n')
        w.write(f'@seriesLength {str(fixed_len)} \n')
    else:
        w.write('@equalLength false \n')
    w.write('@classLabel true crackle wheeze none both \n')
    w.write('@data \n')

    for class_name, class_list in class_lists.items():
        if fixed:
            write_class(w , class_list, class_name, class_paths[class_name], True, fixed_len)
        else: 
            write_class(w , class_list, class_name, class_paths[class_name], False)
    #os.close()

In [161]:
#write_ts(module_path + '/data/ts_files/crackleWheeze.ts', False)

# Creating the .ts files for the two class classification problem

In [207]:
def write_class2(writer, class_list, class_name, class_path, fixed= True, fixed_len = 2000):
    classes = {
    'crackle': '0',
    'wheeze': '1',
    'none': '1',
    'both': '0'
    }
    for name in class_list: 
        audio_file = class_path + name
        sr, audio = utility.read_wav_file(audio_file, target_rate)
        audio = utility.denoise_audio(audio)
        audio, sr = utility.downsample(audio, sr, 4000)

        rms_new = librosa.feature.rms(audio, frame_length = frame_len, hop_length = hop_len)[0]
        
        if fixed: 
            if len(rms_new) > fixed_len:
                    rms_new = rms_new[0:fixed_len]
            df_new = np.zeros(fixed_len)
            df_new[0:len(rms_new)] = rms_new
            new_row = str(list(df_new))[1:-1].replace(' ', '') + ':' + classes[class_name] + '\n'
        else: 
            new_row = str(list(rms_new))[1:-1].replace(' ', '') + ':' + classes[class_name] + '\n'
        writer.write(new_row)

def write_ts2(filename, fixed = True, fixed_len = 2000):
    w = open(filename, 'w+')
    row = w.read()

    w.write('@problemName LungSounds \n')
    w.write('@timeStamps false \n')
    w.write('@missing false \n')
    w.write('@univariate true \n')
    if fixed: 
        w.write('@equalLength true \n')
        w.write(f'@seriesLength {str(fixed_len)} \n')
    else:
        w.write('@equalLength false \n')
    w.write('@classLabel true crackle no_crackle \n')
    w.write('@data \n')

    for class_name, class_list in class_lists.items():
        if fixed:
            write_class2(w , class_list, class_name, class_paths[class_name], True, fixed_len)
        else: 
            write_class2(w , class_list, class_name, class_paths[class_name], False)
    #os.close()

In [225]:
write_ts2(module_path + '/data/ts_files/crackleNoCrackleSamleLength3000.ts', True, 3000)

# Classification using sktime

In [226]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sktime.classification.compose import TimeSeriesForestClassifier

import time as time



X, y = load_from_tsfile_to_dataframe(module_path + '/data/ts_files/crackleNoCrackleSamleLength3000.ts')
X_train, X_test, y_train, y_test = train_test_split(X, y)

labels, counts = np.unique(y_train, return_counts=True)
print(labels, counts)

['0' '1'] [1777 3396]


# Interval Based Methods

## Testing out time series forest classifier (TSF)

In [228]:
y_pred_2000 = y_pred  # time 352.05756402015686

In [229]:
start = time.time()
tsf = TimeSeriesForestClassifier()
tsf.fit(X_train, y_train)
tsf_y_pred = tsf.predict(X_test)
print(f'Number of mislabeled points out of a total {X_test.shape[0]} points : {(y_test != tsf_y_pred).sum()}')
print(classification_report(tsf_y_pred, y_test))
end = time.time()
print('Time: ', str(end - start))

Number of mislabeled points out of a total 1725 points : 566
              precision    recall  f1-score   support

           0       0.24      0.55      0.34       263
           1       0.90      0.69      0.78      1462

    accuracy                           0.67      1725
   macro avg       0.57      0.62      0.56      1725
weighted avg       0.80      0.67      0.71      1725

Time:  542.2675955295563


## Test of interval based classifier (RISE)

In [180]:
from sktime.classification.interval_based import RandomIntervalSpectralForest

rise = RandomIntervalSpectralForest(n_estimators=10)
rise.fit(X_train, y_train)
rise.score(X_test, y_test)

0.5136231884057971

# Dictionary based 

## BOSS

In [None]:
from sktime.classification.dictionary_based._boss import BOSSEnsemble

start = time.time()
boss = BOSSEnsemble()
boss.fit(X_train, y_train)
boss_y_pred = boss.predict(X_test)
print(f'Number of mislabeled points out of a total {X_test.shape[0]} points : {(y_test != boss_y_pred).sum()}')
print(classification_report(boss_y_pred, y_test))

end = time.time()
print('Time: ', str(end - start))

## cBOSS

In [None]:
from sktime.classification.dictionary_based import ContractableBOSS
from time import time 


indices = np.random.RandomState(0).permutation(10)

cboss = ContractableBOSS(
    n_parameter_samples=50, max_ensemble_size=10, random_state=0
)

start = time()
cboss.fit(X_train, y_train)
end = time()
score_cboss = cboss.score(X_test.iloc[indices], y_test[indices])

print('Time used to train: ', str(end - start))

# Shapelet Based

## MR SQL

In [None]:
from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.classification.shapelet_based import ROCKETClassifier
from numpy import testing

sax_clf = MrSEQLClassifier(seql_mode='fs', symrep=['sax'])
sfa_clf = MrSEQLClassifier(seql_mode='fs', symrep=['sfa'])
ss_clf = MrSEQLClassifier(seql_mode='fs', symrep=['sax', 'sfa'])

# fit training data
sax_clf.fit(X_train, y_train)
sfa_clf.fit(X_train, y_train)
ss_clf.fit(X_train, y_train)

# prediction
sax_predicted = sax_clf.predict(X_test)
sfa_predicted = sfa_clf.predict(X_test)
ss_predicted = ss_clf.predict(X_test)

# test feature space dimension
# the multi-domain classifier (ss_clf) should produce as many features
# as the others (sax_clf and sfa_clf) combine
np.testing.assert_equal(ss_clf.ots_clf.coef_.shape[1],
                        sfa_clf.ots_clf.coef_.shape[1] +
                        sax_clf.ots_clf.coef_.shape[1])

# test number of correct predictions
np.testing.assert_equal((sax_predicted == y_test).sum(), 148)
np.testing.assert_equal((sfa_predicted == y_test).sum(), 150)
np.testing.assert_equal((ss_predicted == y_test).sum(), 150)


## ROCKET

In [None]:
indices = np.random.RandomState(0).permutation(100)

# train ROCKET
rocket = ROCKETClassifier(num_kernels=1000, random_state=0)
rocket.fit(X_train, y_train)

score_rocket = rocket.score(X_test.iloc[indices], y_test[indices])

# Hybrid

## Catch22

In [None]:
from sktime.classification.hybrid._catch22_forest_classifier import (
    Catch22ForestClassifier,
)

indices = np.random.RandomState(0).permutation(20)

c22f = Catch22ForestClassifier(random_state=0)
c22f.fit(X_train.iloc[indices], y_train[indices])

score_c22f = c22f.score(X_test.iloc[indices], y_test[indices])