In [None]:
import os
import glob
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
import librosa
import librosa.display
import cv2
import soundfile as sf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow_io as tfio

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): this seeds only NumPy's legacy global generator. The train_test_split
# calls in this notebook pass their own random_state, and the tfio masking used later
# presumably draws from TensorFlow's RNG, which is not seeded here -- confirm whether
# the generated masks need to be reproducible.
np.random.seed(42)

In [None]:
# Load metadata
# data_dir is the root of the source dataset; it is reused further down to locate
# the 'wavs-silence-removed' recordings folder.
data_dir = '../coughvid-clean-silence-removed'
meta_data_path = os.path.join(data_dir, 'meta_data.csv')
# One row per recording; the cells below read the uuid, cough_detected, age,
# gender and status columns from this frame.
meta_data = pd.read_csv(meta_data_path)

In [None]:
# Keep only recordings whose cough_detected score is at least 0.7, and only the
# columns that the rest of the notebook uses.
columns = ['uuid', 'cough_detected', 'age', 'gender', 'status']
mask = meta_data['cough_detected'].ge(0.7)
meta_data = meta_data.loc[mask, columns].reset_index(drop=True)

In [None]:
# Distribution of the status column. value_counts is used instead of np.unique
# because np.unique has to sort the values and raises TypeError on an object column
# that mixes strings with missing values; dropna=False makes missing statuses
# visible instead of hiding them.
meta_data['status'].value_counts(dropna=False)

In [None]:
# Binary target: 1 when status is 'COVID-19' or 'symptomatic', 0 for every other
# value of status.
is_positive = meta_data['status'].isin(['COVID-19', 'symptomatic'])
meta_data['label'] = is_positive.astype(int)

In [None]:
# Parallel arrays of recording ids and binary labels, used as input to the splits.
ids, labels = (meta_data[column].values for column in ('uuid', 'label'))
ids.shape, labels.shape

### Split sets

In [None]:
def make_dir(path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op. Creation is done
    in a single ``os.makedirs(..., exist_ok=True)`` call, so there is no window
    between checking for the directory and creating it.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

In [None]:
# Root of the derived dataset, with one sub-directory per split.
new_data_dir = '../coughvid_attention'
train_dir, valid_dir, test_dir = (
    os.path.join(new_data_dir, split) for split in ('train', 'valid', 'test')
)

for split_dir in (train_dir, valid_dir, test_dir):
    make_dir(split_dir)

In [None]:
# Two-stage stratified split giving a 60:20:20 train:valid:test ratio.
# Stage 1: hold out 20% of all recordings as the test set.
ids_trainval, ids_test, labels_trainval, labels_test = train_test_split(
    ids,
    labels,
    test_size=0.2,
    stratify=labels,
    shuffle=True,
    random_state=42,
)
# Stage 2: 25% of the remaining 80% (i.e. 20% overall) becomes the validation set.
ids_train, ids_valid, labels_train, labels_valid = train_test_split(
    ids_trainval,
    labels_trainval,
    test_size=0.25,
    stratify=labels_trainval,
    shuffle=True,
    random_state=41,
)

ids_train.shape, ids_valid.shape, ids_test.shape

In [None]:
# Print the class counts of every split.
# The loop variable is named set_labels (not labels) on purpose: a module-level loop
# variable stays bound after the loop, and reusing the name `labels` would replace
# the full label array with the test labels, breaking any re-run of the split cell.
for set_name, set_labels in zip(['train', 'valid', 'test'], [labels_train, labels_valid, labels_test]):
    print(f'{set_name:<5} :: {np.unique(set_labels, return_counts=True)}')

In [None]:
def extract_set(metadata:pd.DataFrame,
                source_dir:str,
                data_dir:str,
                set_name:str,
                set_ids):
    """Copy one split's recordings into its own folder and save its metadata.

    For every id in ``set_ids`` the file ``<source_dir>/<uuid>.wav`` is copied to
    ``<data_dir>/<set_name>/recordings/<uuid>.wav``. Ids whose source file does not
    exist are reported with a ``Missing :: ...`` message and skipped.

    The rows of ``metadata`` belonging to the copied recordings are written to
    ``<data_dir>/<set_name>/<set_name>_metadata.csv``. Skipped ids are left out of
    that file so that every uuid listed in it has a recording on disk; later steps
    load ``recordings/<uuid>.wav`` for each metadata row.

    Args:
        metadata: frame with (at least) a ``uuid`` column.
        source_dir: directory holding the source ``.wav`` files.
        data_dir: root directory of the derived dataset.
        set_name: name of the split; used for the folder and the CSV file name.
        set_ids: iterable of uuids belonging to this split.
    """
    set_dir = os.path.join(data_dir, set_name)
    target_dir = os.path.join(set_dir, 'recordings')

    make_dir(set_dir)
    make_dir(target_dir)

    # Track what was really copied so the metadata never references absent files.
    copied_ids = []
    for uuid in tqdm(set_ids):
        source_path = os.path.join(source_dir, f'{uuid}.wav')

        if not os.path.exists(source_path):
            print(f'Missing :: {uuid}.wav')
            continue

        target_path = os.path.join(target_dir, f'{uuid}.wav')
        shutil.copy(source_path, target_path)
        copied_ids.append(uuid)

    # Save metadata for set (rows keep the order they have in `metadata`)
    mask = metadata['uuid'].isin(copied_ids)
    set_metadata = metadata[mask].copy().reset_index(drop=True)
    metadata_path = os.path.join(set_dir, f'{set_name}_metadata.csv')
    set_metadata.to_csv(metadata_path, index=False)

In [None]:
# Copy the recordings of every split out of the silence-removed source folder.
coughvid_dir = os.path.join(data_dir, 'wavs-silence-removed')
for set_name, set_ids in (('train', ids_train), ('valid', ids_valid), ('test', ids_test)):
    extract_set(meta_data, coughvid_dir, new_data_dir, set_name, set_ids)

In [None]:
def pitch_shift_set(data_dir, set_name):
    """Re-write one split's recordings, adding a pitch-shifted copy of each positive.

    Reads ``<data_dir>/<set_name>/<set_name>_metadata.csv`` and loads
    ``recordings/<uuid>.wav`` for every row with ``librosa.load``. Each recording is
    written to ``augmented/sample<k>_<label>.wav`` as 24-bit PCM, where ``k`` is a
    running counter over all files written. Recordings with a truthy label are
    followed by a second file holding the same signal pitch-shifted with
    ``n_steps=-4``.
    """
    set_root = os.path.join(data_dir, set_name)
    recordings_dir = os.path.join(set_root, 'recordings')
    augmented_dir = os.path.join(set_root, 'augmented')
    make_dir(augmented_dir)

    set_metadata = pd.read_csv(os.path.join(set_root, f'{set_name}_metadata.csv'))

    written = 0

    def _write_sample(audio, rate, class_tag):
        # File name encodes the running index and the class; the counter advances
        # once per written file.
        nonlocal written
        sf.write(os.path.join(augmented_dir, f'sample{written}_{class_tag}.wav'), audio, rate, 'PCM_24')
        written += 1

    for uuid, label in tqdm(set_metadata[['uuid', 'label']].values):
        signal, sr = librosa.load(os.path.join(recordings_dir, f'{uuid}.wav'))

        if not label:
            _write_sample(signal, sr, 0)
            continue

        # Positive recording: unchanged copy first, then the pitch-shifted variant.
        _write_sample(signal, sr, 1)
        shifted = librosa.effects.pitch_shift(signal, sr=sr, n_steps=-4)
        _write_sample(shifted, sr, 1)

In [None]:
# Build the 'augmented' folder of every split.
for set_name in ('train', 'valid', 'test'):
    pitch_shift_set(new_data_dir, set_name)

In [None]:
def _save_spectrogram_png(spectrogram, path, **imshow_kwargs):
    """Render a 2-D array with ``plt.imshow`` (origin at the bottom) and save it without axes."""
    plt.imshow(spectrogram, origin='lower', **imshow_kwargs)
    plt.axis('off')
    plt.savefig(path, bbox_inches='tight')
    # Close every open figure so figures do not pile up across loop iterations.
    plt.close('all')


def _fit_to_length(signal, target_length):
    """Zero-pad on both sides, or crop around the centre, to exactly ``target_length`` samples."""
    s_len = len(signal)
    if s_len < target_length:
        pad_total = target_length - s_len
        pad_left = pad_total // 2
        # When the padding is odd, the extra sample goes to the end.
        return np.pad(signal, (pad_left, pad_total - pad_left), 'constant', constant_values=0)

    start = (s_len - target_length) // 2
    return signal[start:start + target_length]


def spect_augment_set(data_dir, set_name, param_masking=30, target_length=156027):
    """Render mel-spectrogram images (plain and masked) for one split.

    Every ``*.wav`` file in ``<data_dir>/<set_name>/augmented`` is loaded with
    ``librosa.load``, padded or cropped to ``target_length`` samples and converted
    to a dB-scaled mel spectrogram. For each file the following images are written
    to ``<data_dir>/<set_name>/melspec/<k>.png``, with ``k`` a running counter:

    1. the unmasked spectrogram;
    2. only when the label is ``'1'``: a frequency- and time-masked spectrogram;
    3. a frequency- and time-masked spectrogram (for every file).

    One label row is recorded per written image, in the same order, and saved to
    ``<data_dir>/<set_name>/<set_name>_labels.csv``. The label is the text after
    the underscore in the file name (``sample<k>_<label>.wav``).

    Args:
        data_dir: root directory of the derived dataset.
        set_name: name of the split.
        param_masking: ``param`` passed to ``tfio.audio.freq_mask`` and
            ``tfio.audio.time_mask``.
        target_length: number of samples every signal is padded/cropped to. The
            default of 156027 samples is about 7.08 s at 22050 Hz, which is the
            rate ``librosa.load`` resamples to when no ``sr`` is given.
    """
    # Collect files to augment. glob returns names in arbitrary filesystem order,
    # so sort them to make the image numbering reproducible between runs.
    aug_dir = os.path.join(data_dir, set_name, 'augmented')
    files_regex = os.path.join(aug_dir, r'*.wav')
    files = sorted(glob.glob(files_regex))

    # Create directory for melspectrograms
    mels_path = os.path.join(data_dir, set_name, 'melspec')
    make_dir(mels_path)

    # Path to save labels
    labels_path = os.path.join(data_dir, set_name, f'{set_name}_labels.csv')

    y = []
    count = 0
    for fn in tqdm(files):
        label = os.path.splitext(os.path.basename(fn))[0].split('_')[1]
        signal, sr = librosa.load(fn)

        # Zero-pad short signals and centre-crop long ones to a common length.
        signal = _fit_to_length(signal, target_length)

        mel_spectrogram = librosa.feature.melspectrogram(y=signal,
                                                         sr=sr,
                                                         n_mels=128,
                                                         hop_length=512,
                                                         fmax=8000,
                                                         n_fft=512,
                                                         center=True)

        dbscale_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max, top_db=80)

        # Unmasked image.
        # NOTE(review): only this image is rendered with interpolation='nearest';
        # the masked images use matplotlib's default. Kept as is so regenerated
        # images match earlier runs -- confirm whether the difference is intended.
        _save_spectrogram_png(dbscale_mel_spectrogram,
                              os.path.join(mels_path, f'{count}.png'),
                              interpolation='nearest')
        count += 1
        y.append(label)

        # Positives ('1', i.e. COVID-19) get two independently masked images,
        # negatives get one.
        n_masked = 2 if label == '1' else 1
        for _ in range(n_masked):
            # NOTE(review): if tfio fills masked regions with 0, that equals the
            # peak of this dB scale (ref=np.max puts the maximum at 0 dB) -- confirm
            # that masking with the loudest value is intended.
            freq_mask = tfio.audio.freq_mask(dbscale_mel_spectrogram, param=param_masking)
            time_mask = tfio.audio.time_mask(freq_mask, param=param_masking)
            _save_spectrogram_png(time_mask, os.path.join(mels_path, f'{count}.png'))
            count += 1
            y.append(label)

    # Save labels
    y = pd.DataFrame(data={'label': y})
    y.to_csv(labels_path, index=False)

In [None]:
# This cell stops execution on purpose: per the message below, running the
# spectrogram rendering inside the notebook crashes it. An explicit raise is used
# instead of `assert False` because assert statements are removed when Python runs
# with -O / PYTHONOPTIMIZE, which would let the calls below execute.
raise RuntimeError("Don't run this in the notebook as it will crash. Run the spec_augment_sets.py script.")
# Kept as a reference of what the external script is expected to run.
spect_augment_set(new_data_dir, 'train')
spect_augment_set(new_data_dir, 'valid')
spect_augment_set(new_data_dir, 'test')

In [None]:
def save_set(data_dir, set_name, pixel_scale=225.0):
    """Pack one split's spectrogram images and labels into a single ``.npz`` file.

    Images are read from ``<data_dir>/<set_name>/melspec`` in numeric file-name
    order (``0.png``, ``1.png``, ...), resized, converted from BGR to RGB, cast to
    float32 and divided by ``pixel_scale``. Labels come from
    ``<data_dir>/<set_name>/<set_name>_labels.csv``; the label of ``<k>.png`` is
    row ``k`` of that file. The result is saved to
    ``<data_dir>/<set_name>/<set_name>_coughvid_melspec.npz`` under the keys
    ``images`` and ``covid_status``.

    Images that OpenCV cannot read are reported and skipped together with their
    label row, so images and labels always stay aligned.

    Args:
        data_dir: root directory of the derived dataset.
        set_name: name of the split.
        pixel_scale: divisor applied to the pixel values.
            NOTE(review): 8-bit images are normally scaled by 255.0, so the
            default of 225.0 looks like a typo. It is kept as the default so
            that newly generated features stay consistent with ones produced
            earlier -- confirm, and pass 255.0 if a [0, 1] range is wanted.

    Raises:
        IndexError: if an image index has no matching row in the labels file.
    """
    # Gather melspec files
    path = os.path.join(data_dir, set_name, 'melspec')
    names = sorted(os.listdir(path), key=lambda x: int(os.path.splitext(x)[0]))

    # Load images
    img_array_size = (88,39)  # cv2.resize takes (width, height) -> arrays of shape (39, 88, 3)
    images = []
    kept_rows = []
    for filename in tqdm(names):
        img = cv2.imread(os.path.join(path, filename))
        # cv2.imread signals failure by returning None instead of raising, so the
        # check has to happen before the image is used.
        if img is None:
            print(f'Unreadable :: {filename}')
            continue
        img = cv2.resize(img, img_array_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = np.asarray(img, dtype=np.float32)
        img = img/pixel_scale
        images.append(img)
        kept_rows.append(int(os.path.splitext(filename)[0]))

    # np.asarray keeps the leading batch axis even for a single image
    # (np.squeeze would drop it and change the array rank).
    images = np.asarray(images, dtype=np.float32)

    # Load labels
    labels_path = os.path.join(data_dir, set_name, f'{set_name}_labels.csv')
    labels = pd.read_csv(labels_path)

    # Save features; pick the label rows of the images that were actually loaded.
    features_path = os.path.join(data_dir, set_name, f'{set_name}_coughvid_melspec.npz')
    covid_status = labels.label.values[np.asarray(kept_rows, dtype=int)]
    features = {
        'images': images,
        'covid_status': covid_status
    }
    np.savez(features_path, **features)
    

In [None]:
# Pack the rendered spectrograms and labels of every split into its .npz file.
for set_name in ('train', 'valid', 'test'):
    save_set(new_data_dir, set_name)

In [None]:
# Shuffle the set-wise augmented train and valid sets together and split them again afterwards.
# This was done because the test set is still held out this way, while there is a larger variety
# in the data for training, which is more representative of the dataset used for the original
# model.

def load_set(data_dir, set_name):
    """Load the packed features of one split.

    Reads ``<data_dir>/<set_name>_coughvid_melspec.npz`` and returns its
    ``images`` and ``covid_status`` arrays as the tuple ``(X, y)``.

    The archive is opened in a ``with`` block so its file handle is closed as
    soon as both arrays have been read, instead of staying open for the life of
    the kernel.
    """
    path = os.path.join(data_dir, f'{set_name}_coughvid_melspec.npz')
    with np.load(path) as features:
        X = features['images']
        y = features['covid_status']

    return X, y

# Pool the packed train and valid sets, reshuffle them and split them again for
# the workshop.
X_train, y_train = load_set(os.path.join(new_data_dir, 'train'), 'train')
X_valid, y_valid = load_set(os.path.join(new_data_dir, 'valid'), 'valid')

X = np.concatenate((X_train, X_valid))
y = np.concatenate((y_train, y_valid))

# Stratified 75:25 re-split of the pooled data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# Write train.npz first, then valid.npz, directly under the dataset root.
for split_name, split_images, split_status in (('train', X_train, y_train),
                                               ('valid', X_valid, y_valid)):
    features = {
        'images': split_images,
        'covid_status': split_status
    }
    np.savez(os.path.join(new_data_dir, f'{split_name}.npz'), **features)