# Import Packages

In [1]:
import os

from packages import *
%matplotlib inline

# Read Dataset

In [2]:
def label_from_path(path):
    """Return the class label encoded in the first three characters of the file name.

    Both '/' and '\\' are accepted as directory separators, so the notebook
    behaves the same on Windows and POSIX systems (splitting on '\\' alone
    raises IndexError for paths such as 'virufy_clean/neg-...wav').
    """
    file_name = path.replace('\\', '/').rsplit('/', 1)[-1]
    return file_name[:3].strip()


# glob gives no ordering guarantee; sort so the table is reproducible across runs
all_files = sorted(glob.glob('virufy_clean/*'))
df = pd.DataFrame({'fname': all_files,
                   'label': [label_from_path(path) for path in all_files]})
df.head()

Unnamed: 0,fname,label
0,virufy_clean\neg-0421-083-cough-m-53-1.wav,neg
1,virufy_clean\neg-0421-083-cough-m-53-10.wav,neg
2,virufy_clean\neg-0421-083-cough-m-53-11.wav,neg
3,virufy_clean\neg-0421-083-cough-m-53-12.wav,neg
4,virufy_clean\neg-0421-083-cough-m-53-13.wav,neg


# Get class distribution

In [3]:
# Index the table by file path. The guard keeps this cell re-runnable: an
# unconditional set_index('fname') raises KeyError once 'fname' is the index.
if df.index.name != 'fname':
    df = df.set_index('fname')

# Duration of each clip in seconds = number of samples / sample rate.
# sr=None makes librosa keep the file's native sample rate instead of
# resampling every clip to its default rate just to measure its length.
for f in df.index:
    signal, rate = librosa.load(f, sr=None)
    df.at[f, 'length'] = signal.shape[0]/rate

df.head()

Unnamed: 0_level_0,label,length
fname,Unnamed: 1_level_1,Unnamed: 2_level_1
virufy_clean\neg-0421-083-cough-m-53-1.wav,neg,0.473469
virufy_clean\neg-0421-083-cough-m-53-10.wav,neg,0.438277
virufy_clean\neg-0421-083-cough-m-53-11.wav,neg,0.511202
virufy_clean\neg-0421-083-cough-m-53-12.wav,neg,0.462132
virufy_clean\neg-0421-083-cough-m-53-13.wav,neg,0.537914


In [4]:
# distinct class names (np.unique returns them sorted)
classes = list(np.unique(df['label']))

# per-class sampling weights: mean clip duration of each class,
# normalised so that the weights sum to 1
class_dist = df.groupby('label')['length'].mean()
prob_dist = class_dist / class_dist.sum()

# number of random crops to draw: twice the number of 1/10 s windows
# that fit into the total duration of all audio files
total_seconds = df['length'].sum()
n_samples = int(2 * (1/0.1) * total_seconds)
n_samples

1494

# Study pywt library

In [5]:
# get types of filters
# kind='discrete' restricts the listing to wavelets for the discrete transform;
# family=None means the listing is not limited to a single wavelet family
print(pywt.wavelist(family=None, kind='discrete'))

['bior1.1', 'bior1.3', 'bior1.5', 'bior2.2', 'bior2.4', 'bior2.6', 'bior2.8', 'bior3.1', 'bior3.3', 'bior3.5', 'bior3.7', 'bior3.9', 'bior4.4', 'bior5.5', 'bior6.8', 'coif1', 'coif2', 'coif3', 'coif4', 'coif5', 'coif6', 'coif7', 'coif8', 'coif9', 'coif10', 'coif11', 'coif12', 'coif13', 'coif14', 'coif15', 'coif16', 'coif17', 'db1', 'db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9', 'db10', 'db11', 'db12', 'db13', 'db14', 'db15', 'db16', 'db17', 'db18', 'db19', 'db20', 'db21', 'db22', 'db23', 'db24', 'db25', 'db26', 'db27', 'db28', 'db29', 'db30', 'db31', 'db32', 'db33', 'db34', 'db35', 'db36', 'db37', 'db38', 'dmey', 'haar', 'rbio1.1', 'rbio1.3', 'rbio1.5', 'rbio2.2', 'rbio2.4', 'rbio2.6', 'rbio2.8', 'rbio3.1', 'rbio3.3', 'rbio3.5', 'rbio3.7', 'rbio3.9', 'rbio4.4', 'rbio5.5', 'rbio6.8', 'sym2', 'sym3', 'sym4', 'sym5', 'sym6', 'sym7', 'sym8', 'sym9', 'sym10', 'sym11', 'sym12', 'sym13', 'sym14', 'sym15', 'sym16', 'sym17', 'sym18', 'sym19', 'sym20']


In [6]:
# get filter values and length
# (inspects a single wavelet, selected by `name`, as an example)
name = 'haar'
wavelet = pywt.Wavelet(name)
print('Filter name:', name)
# dec_len: length of the decomposition filters
print('Length of filter:', wavelet.dec_len)
# dec_lo / dec_hi: low-pass and high-pass decomposition filter coefficients
print('Filter weights:', wavelet.dec_lo, wavelet.dec_hi)

Filter name: haar
Length of filter: 2
Filter weights: [0.7071067811865476, 0.7071067811865476] [-0.7071067811865476, 0.7071067811865476]


In [7]:
# get types of padding
# (signal-extension modes; one of these names is passed as `mode=` to
# pywt.wavedec in the feature-extraction step below)
print(pywt.Modes.modes)

['zero', 'constant', 'symmetric', 'periodic', 'smooth', 'periodization', 'reflect', 'antisymmetric', 'antireflect']


# Feature Extraction

In [8]:
############ function to calculate the entropy value of an input signal ################
# entropy values can be taken as a measure of complexity of the signal
def calculate_entropy(list_values):
    """Return the entropy of the empirical value distribution of `list_values`.

    Every distinct value is one outcome whose probability is its relative
    frequency in the input; the probabilities (most frequent first) are passed
    to `entropy` (presumably scipy.stats.entropy - confirm in `packages`).
    """
    total = len(list_values)
    probabilities = [count / total
                     for _, count in Counter(list_values).most_common()]
    return entropy(probabilities)

############## function to calculate some statistics ################
def calculate_statistics(list_values):
    """Return NaN-ignoring summary statistics of `list_values`.

    The result is the list
    [25th percentile, median, 75th percentile, mean, standard deviation, variance].
    """
    quartiles = [np.nanpercentile(list_values, q) for q in (25, 50, 75)]
    moments = [
        np.nanmean(list_values),  # mean
        np.nanstd(list_values),   # standard deviation
        np.nanvar(list_values),   # variance
    ]
    return quartiles + moments

############# function to calculate the ZCR and mean CR ############
def calculate_crossings(list_values):
    """Count how often the signal changes side relative to 0 and to its mean.

    Returns [no_zero_crossings, no_mean_crossings]: the number of adjacent
    sample pairs for which `value > 0` (respectively `value > nanmean`)
    differs between the two samples.
    """
    values = np.array(list_values)

    # a change in the boolean "above threshold" mask between neighbours is a crossing
    above_zero = values > 0
    no_zero_crossings = int(np.count_nonzero(np.diff(above_zero)))

    above_mean = values > np.nanmean(list_values)
    no_mean_crossings = int(np.count_nonzero(np.diff(above_mean)))

    return [no_zero_crossings, no_mean_crossings]

############# function to combine the results of these three functions above ###########
def get_features(list_values):
    """Return the 9 features of one signal as a flat list.

    Layout: [entropy, zero crossings, mean crossings,
             25th percentile, median, 75th percentile, mean, std, variance].
    """
    return (
        [calculate_entropy(list_values)]
        + calculate_crossings(list_values)
        + calculate_statistics(list_values)
    )

In [9]:
# get maximum number of decomposition levels for all audio files
#
# The maximum level depends only on the crop length (1/10 s worth of samples,
# i.e. int(rate/10)) and on the wavelet. So every file is visited exactly once
# instead of drawing random crops: the result is deterministic and no file
# with an unusual sample rate can be missed.

dlevels = []

for file in df.index:
    rate, _ = wavfile.read(file)
    step = int(rate/10)  # number of samples in a 1/10 s crop
    dlevels.append(pywt.dwt_max_level(step, 'db4'))

min(dlevels),max(dlevels)

(7, 7)

In [10]:
def build_random_feat(seed=None):
    """Build a table of wavelet features from random 1/10 s crops of the audio files.

    For each of `n_samples` iterations a class is drawn according to
    `prob_dist`, a file of that class is drawn uniformly, and a random 1/10 s
    window of it is decomposed with `pywt.wavedec` (wavelet 'db4',
    `min(dlevels)` levels). Every coefficient list contributes the 9 values
    returned by `get_features`.

    Relies on notebook globals: `df` (indexed by file path, with a 'label'
    column), `classes`, `class_dist`, `prob_dist`, `n_samples` and `dlevels`.

    Parameters
    ----------
    seed : int or None, optional
        When given, NumPy's global random generator is seeded with it first,
        which makes the drawn crops reproducible. The default (None) leaves
        the generator untouched.

    Returns
    -------
    pandas.DataFrame
        One row per crop: integer-named feature columns followed by a 'label'
        column holding the position of the class in `classes`.
    """
    if seed is not None:
        np.random.seed(seed)

    # loop-invariant values, computed once instead of on every iteration
    level = min(dlevels)  # no. of decomposition levels
    files_by_class = {c: df[df.label == c].index for c in class_dist.index}

    X = []
    y = []

    for _ in range(n_samples):

        # choose a random audio file: first a class (weighted by prob_dist),
        # then uniformly one file of that class
        rand_class = np.random.choice(class_dist.index, p=prob_dist)
        file = np.random.choice(files_by_class[rand_class])
        rate, wav = wavfile.read(file)
        # NOTE(review): assumes mono files, i.e. `wav` is 1-D - confirm

        # crop a random duration of 1/10 sec from the chosen audio file.
        # randint's upper bound is exclusive, hence the +1 so the last full
        # window (starting at len(wav) - step) can be drawn; max(..., 0) lets a
        # clip no longer than the window be used whole instead of raising
        # ValueError.
        step = int(rate/10)
        last_start = max(wav.shape[0] - step, 0)
        rand_index = np.random.randint(0, last_start + 1)
        sample = wav[rand_index:rand_index+step]

        # list_coeff is the list of approximation_coeffs and detail_coeffs
        # list_coeff = [approximation_coeffs_list (An), detail_coeffs_list (Dn),...,detail_coeffs_list (D1)]
        # length of list_coeff = no. of levels + 1
        list_coeff = pywt.wavedec(data = sample,
                                  wavelet = 'db4', # filter
                                  mode = 'constant', # duplication padding
                                  level = level
                                 )

        # for each list in list_coeff, get 9 features.
        features = []
        for coeff in list_coeff:
            features += get_features(coeff)

        X.append(features)
        # the file was drawn from `rand_class`, so that is its label
        y.append(classes.index(rand_class))

    features_df = pd.DataFrame(X)
    features_df['label'] = y

    return features_df

In [11]:
# Build the feature table: one row per random 1/10 s crop, feature columns plus 'label'.
# NOTE: this rebinds `df`, which until here held the per-file table (file-path
# index, label, length). Once this cell has run, the earlier cells that read
# that table - and build_random_feat itself - only work again after the
# notebook is re-run from the top.
df = build_random_feat()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,label
0,2.944439,8,2,0.001317,0.006354,0.999421,0.467338,0.662373,0.438738,2.944439,...,6.688355,587,587,-0.018317,3.1e-05,0.017297,5.963507e-05,0.030143,0.0009086033,0
1,2.944439,13,1,-0.045173,-0.001142,0.000642,-0.019399,0.030064,0.000904,2.944439,...,6.688355,486,486,-0.000264,1.9e-05,0.000277,5.867482e-07,0.000674,4.546379e-07,1
2,2.944439,10,1,-1.34284,-0.010283,0.000819,-0.60067,0.928463,0.862043,2.944439,...,6.688355,602,602,-0.003761,2.6e-05,0.003804,-1.86727e-05,0.006238,3.891848e-05,0
3,2.944439,10,1,-0.015038,0.021864,1.563694,0.741711,1.070447,1.145858,2.944439,...,6.688355,448,446,-0.01376,-0.000268,0.012652,-3.762953e-05,0.025086,0.0006293154,0
4,2.944439,12,2,-0.242719,-0.011091,0.002345,-0.095093,0.12894,0.016626,2.944439,...,6.688355,529,529,-0.00794,0.000602,0.008097,-2.656144e-06,0.013116,0.0001720174,1


In [12]:
# sanity check: (number of crops, number of feature columns + 1 label column)
df.shape

(1494, 73)

# Save data

In [13]:
# Write the feature table to disk without the row index.
# The output folder is created first so that a fresh checkout without a
# 'data' directory does not make to_csv fail.
os.makedirs('data', exist_ok=True)
df.to_csv('data/Wavelet_features.csv',index=False)