In [66]:
import h5py
import librosa
import pydub # faster than librosa
import numpy      as np
import pandas     as pd
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [73]:
# Root folder of the COUGHVID public dataset; the metadata CSV is read from here.
# NOTE(review): `dir` shadows the Python builtin of the same name. The name is
# kept because the audio-loading cell builds its file paths from it.
dir = 'C:/COUGHVID_public_dataset/public_dataset/'

# read in the compiled metadata (one row per recording)
data = pd.read_csv(dir+'metadata_compiled.csv')

# Keep only rows that have a COVID status label and a confidently detected cough.
# Loading all the files takes too long.
status = np.isin(data['status'],['healthy','symptomatic','COVID-19'])
COUGH_DETECTED_THRESHOLD = 0.8 # recommended threshold from https://www.nature.com/articles/s41597-021-00937-4
cough_detected = data['cough_detected'] > COUGH_DETECTED_THRESHOLD

# Explicit copy: later cells add columns to `labelled_data`, so make it an
# independent frame instead of relying on what the mask selection returns.
labelled_data = data[ np.logical_and(status,cough_detected) ].copy()

# Unique identifiers of the selected recordings. Plain indexing raises a clear
# KeyError if the column is missing (`.get` would return None and fail later).
uuids = labelled_data['uuid'].to_list()

print(str(len(uuids)) + " matching records found.")

10817 matching records found.


In [74]:
# WARNING: THIS CELL TAKES AWHILE TO RUN
#
# Decode the recording of every selected uuid into a 1-D int64 sample array.
# `sounds` ends up index-aligned with `uuids`; an entry is None when no file
# with any of the candidate extensions could be opened for that uuid.
extensions = ['.webm','.ogg']

sounds = []
#sample_rates = []

for i,uuid in enumerate(uuids):
    if (i+1) % 50 == 0: print(str(i+1) + " of " + str(len(uuids)) + " records processed. This may take awhile...")

    sound = None
    #sample_rate = 0
    for ext in extensions:
        path = dir+uuid+ext
        try:
            # pydub is used because the author found it much faster than
            # librosa.load(path, sr=None)
            sound = np.array(pydub.AudioSegment.from_file(path).get_array_of_samples(),dtype='int64' )
            break
        except FileNotFoundError:
            # NOTE(review): a missing ffmpeg executable may surface as the same
            # exception type - confirm if every file is reported missing.
            print("File " + path + " not found. Trying next extension...")
    else:
        # Runs only when the loop never hit `break`, i.e. every extension failed.
        # Make the gap visible instead of silently storing None.
        print("WARNING: no audio file found for " + uuid + "; storing None for this record.")

    sounds.append(sound)
    #sample_rates += [sample_rate]
            

File C:/COUGHVID_public_dataset/public_dataset/005887c9-4bb1-4f13-86b2-1c7b3cee0881.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/00a31212-7c64-4649-b78b-4c7d9ce3156e.webm not found. Trying next extension...
50 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/02877709-f7ea-4789-9526-1e1f47cafb3f.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/029685b5-d06b-44ea-9ad9-6f6134438c67.webm not found. Trying next extension...
100 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/02def005-fb2b-46aa-97ce-66262fb6f93d.webm not found. Trying next extension...
150 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/043e6bf3-8924-4d51-9109-0074d5afa9e1.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/045364d1-30c3-4784-a51f-48f791b913a1

1150 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/1b444b5c-7013-4e3c-88db-b41f3f2c7ad8.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/1c26ef49-17b3-47fe-b700-295373fdedb8.webm not found. Trying next extension...
1200 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/1d0dab8f-ab22-4b7b-865d-4700befaa68d.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/1d3efa17-4f8b-48ef-9dd8-47ac6b0247f7.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/1d52ac12-0361-4154-a9f7-e526491112a9.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/1d53487c-eb9d-430a-a3ee-74cfc1622faf.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/1d6cb7bf-b926-433a-bf6e-0d8ce43d02c9.webm not found. Trying next extension...
1250 of 108

File C:/COUGHVID_public_dataset/public_dataset/333ca4a1-557a-4be5-b2e3-9e572b3132ff.webm not found. Trying next extension...
2150 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/33a2d6f8-da51-4de6-9293-771e879b6352.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/33d54956-31de-4a8e-86a2-9fa3838c2374.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/34191fb6-7fb8-4cbf-a5cf-ea458bf4107b.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/341ff20f-b75f-4d5a-8ee9-38ba198ed6ec.webm not found. Trying next extension...
2200 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/34ed92ba-3dbd-4f66-9ded-3f6e6261b14b.webm not found. Trying next extension...
2250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/35cdb63c-cc79-46b6-b052-c144ff1d

File C:/COUGHVID_public_dataset/public_dataset/4afa9f79-e299-4fc7-b3b3-74c3a1f5f35e.webm not found. Trying next extension...
3200 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/4bc00277-2024-48cd-bce4-6203766f0a94.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/4c08d0fa-e52f-4e57-a69e-f66153d0a7c1.webm not found. Trying next extension...
3250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/4c9d483e-e4f4-470d-94eb-8b83bf1aacf8.webm not found. Trying next extension...
3300 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/4d608f75-dd73-427f-95e8-3e6a2e08393f.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/4d86b352-ff7b-47c2-a289-c4fe8621eaf0.webm not found. Trying next extension...
3350 of 10817 records processed. This may take awhile...
File C:/COUGHVID_publi

4300 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/65be7822-e7c7-4fdc-9549-bf3fac30f919.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/65c9a70c-232d-41b1-a2a7-279d65d3971d.webm not found. Trying next extension...
4350 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/66e030f3-fa93-4144-9bba-3965814aec89.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/66ed6834-a618-40e8-9e84-b17261b69e1d.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/67087ffc-29e7-4ced-9846-1fcc939e6fbe.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/67462ca9-c109-4597-a427-5d53cf60c0f6.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/6752ac49-1c88-40d6-af9d-e16dfb81bf51.webm not found. Trying next extension...
4400 of 108

5400 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/7f2d8035-5f1b-4f3a-b251-95387389c8ea.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/800baf89-e68b-4022-bf75-3897b3740f64.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/80647f3d-a24e-4963-82c3-e31cf5181929.webm not found. Trying next extension...
5450 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/81a2f1c1-dce1-48e8-955f-98770431c505.webm not found. Trying next extension...
5500 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/82b527a0-661e-49aa-8d06-08f3ad7a9d84.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/82efd73f-05ed-4572-a562-e416b03a81e1.webm not found. Trying next extension...
5550 of 10817 records processed. This may take awhile...
File C:/COUGHVID_publi

6400 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/97a23af1-65b5-467b-afa6-344e2607b362.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/97c8e35e-be60-4ce6-8935-ccc1e7846a18.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/97d54cf3-852b-46ca-9082-b8e54b2b29ab.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/98022df0-fb48-499b-a0d4-957648ff718e.webm not found. Trying next extension...
6450 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/9838a7c1-bf48-4005-8424-92324ca4412c.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/986cdb5d-3e6d-460e-847b-a3bff1b9ef17.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/9877a438-1cca-46dd-bd66-2ceecb717558.webm not found. Trying next extension...
File C:/COU

7200 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/abbf44b9-24fd-42f6-82cf-babe390a5dcc.webm not found. Trying next extension...
7250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/abcb0f36-df6d-4316-8006-c1f881262fc2.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/abd6c883-84ef-461f-825e-d22412524d2e.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/ac3c7943-2127-4def-8439-43232be4e776.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/acc05d8f-5e86-44f9-bedc-c6904d2d1b3f.webm not found. Trying next extension...
7300 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/adc81ade-acd3-496d-9bbd-58487f68c9ac.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/ae0ff134-0218-4d7e-804d-c48f5794

File C:/COUGHVID_public_dataset/public_dataset/c2ade3ae-a5f3-41dc-b5af-ed06dfe51722.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/c33f4619-cf67-4fd0-9071-92295bb71688.webm not found. Trying next extension...
8250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/c3872d24-2748-4e06-9518-79464ba26f19.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/c388d50a-8fd3-4070-a0cc-bdf17c88f919.webm not found. Trying next extension...
8300 of 10817 records processed. This may take awhile...
8350 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/c576b1e0-cb74-4560-bea4-0ba89b862273.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/c5901b26-cf9f-4235-be9b-8b900cc77a8b.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/c591921d-6476-4ec6-81f3-0cb647c1

File C:/COUGHVID_public_dataset/public_dataset/db044a11-1d57-4365-8364-1f73f5fb1bdf.webm not found. Trying next extension...
9250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/db1408d4-4cba-4a28-8d88-5cdd52bdba90.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/db80b5c3-814b-4d64-956a-cf5eb3837c64.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/dbcc6cfe-045c-4b9f-b890-23f598a9e6f2.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/dc825027-5f65-442a-81a5-ab554934990b.webm not found. Trying next extension...
9300 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/dcc64b75-8e16-45bd-8a25-7ec94e160f44.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/dcde1965-12d0-4975-9b67-a633f73aca24.webm not found. Trying next extension...
File C:/COU

File C:/COUGHVID_public_dataset/public_dataset/f1471243-2511-4f2a-b3ac-98e955d795ef.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/f1563159-1350-4e0d-a007-ddc91504489e.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/f1761c24-c5cb-4d5e-9546-d95fdb6c6f44.webm not found. Trying next extension...
10200 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/f24a7779-d819-4fa5-98d8-bb7384d4d475.webm not found. Trying next extension...
10250 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/f2b8d4c0-b7ba-49a6-8e91-9f20be33520a.webm not found. Trying next extension...
File C:/COUGHVID_public_dataset/public_dataset/f35337eb-96f3-47dd-b1d4-e13248f4cc20.webm not found. Trying next extension...
10300 of 10817 records processed. This may take awhile...
File C:/COUGHVID_public_dataset/public_dataset/f3d50342-5592-4c58-8856-bc951

In [75]:
# Attach the decoded audio to the metadata frame as column 'sounds' at position 2.
# DataFrame.insert raises ValueError when the column already exists, so drop any
# column left over from a previous run of this cell first. This makes the cell
# safe to re-run. `drop` returns a new frame, so the insert below never touches
# the object the selection was made from.
#sounds = pad_to_dense(sounds)
labelled_data = labelled_data.drop(columns='sounds', errors='ignore')
labelled_data.insert(2,'sounds',sounds)
#labelled_data.insert(3,'sample_rates',sample_rates)


In [76]:
'''NONE OF THESE METHODS WORKED. LOOKS LIKE LOADING WILL HAVE TO TAKE PLACE EACH TIME'''

# Attempts to cache the loaded data on disk so the audio would not have to be
# decoded again on every run. All of them are left disabled.

# Attempt 1 - pickle: for me this takes a very long time and generates a 17.2 GB file.
# NOTE(review): this dumps `data`, not `labelled_data` - confirm which frame was intended.
#import pickle as pi
#pi.dump(data,open('all_data.pickle','wb'))

# Attempt 2 - HDF5: this didn't work either, as the sounds column holds jagged
# (variable-length) arrays that cannot be serialized. Padding didn't work either.
#labelled_data.to_hdf('labelled_data.h5','df',mode='w',format='table',data_columns=True)

#h = h5py.File('all_data.h5')

TypeError: Cannot serialize the column [sounds]
because its data contents are not [string] but [mixed] object dtype

In [77]:
# Display the frame as the cell's output, e.g. to inspect the 'sounds' column.
labelled_data

Unnamed: 0,uuid,datetime,sounds,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9609,16.151433,31.3,34.8,15.0,male,False,...,,,,,,,,,,
3,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,2020-04-12T04:02:18.159383+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9301,20.146058,40.0,-75.1,34.0,male,True,...,,,,,,,,,,
5,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,2020-04-13T22:23:06.997578+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9968,13.146502,,,21.0,male,False,...,,,,,,,,,,
10,0028b68c-aca4-4f4f-bb1d-cb4ed5bbd952,2020-05-24T12:12:46.394647+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.8937,13.477551,,,28.0,female,False,...,,,,,,,,,,
11,00291cce-36a0-4a29-9e2d-c1d96ca17242,2020-04-13T15:10:58.405156+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9883,14.603851,39.4,67.2,15.0,male,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27535,ffd42893-4119-4855-9aad-c67d8d392cc1,2020-04-11T12:44:37.495743+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9414,28.530965,,,26.0,male,False,...,,,,,,,,,,
27539,ffe0658f-bade-4654-ad79-40a468aabb03,2020-04-14T01:58:32.200245+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0000,21.960583,41.6,60.9,22.0,male,True,...,poor,unknown,False,False,False,False,False,False,,unknown
27540,ffe13fcf-c5c2-4a6a-a9fc-e010f4f033c1,2020-04-13T21:08:50.708320+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9485,9.966762,41.1,28.8,31.0,male,False,...,,,,,,,,,,
27542,ffedc843-bfc2-4ad6-a749-2bc86bdac84a,2020-06-05T03:41:37.481463+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0000,33.661082,-34.5,-58.5,23.0,male,False,...,good,dry,False,False,False,False,False,True,healthy_cough,pseudocough


Unnamed: 0,uuid,datetime,sounds,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9609,16.151433,31.3,34.8,15.0,male,False,...,,,,,,,,,,
3,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,2020-04-12T04:02:18.159383+00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.9301,20.146058,40.0,-75.1,34.0,male,True,...,,,,,,,,,,


In [None]:
# Disabled draft: the ResNet50 construction below is wrapped in a string
# literal, so no model is built when this cell runs.
# NOTE(review): presumably a placeholder for a later modelling step - confirm or remove.
'''model = tf.keras.applications.resnet50.ResNet50(
    include_top=True, weights='imagenet', input_tensor=None,
    input_shape=None, pooling=None, classes=1000
)'''