# Mini-Project - Covid-19 Cough Audio Classification

1. Explore the dataset through code
    * How many samples does the dataset contain?
    * How many classes are there, and how many samples per class? Show a histogram of the number of instances per class.
    * Play a random sample from each class
    * Describe whether, and how, you think the data distribution will affect the training of a classifier.
    * Decide which part of the dataset to use: all of it, only some classes, or only some samples. Motivate your choice.

In [None]:
from matplotlib import pyplot as plt
from glob import glob
import pandas as pd
import numpy as np
import os

# Read the compiled metadata table.
data_path = r"metadata_compiled.csv"
data = pd.read_csv(data_path, sep=",")

# Keep only the columns of interest, restrict to rows whose cough_detected
# value is at least 0.5, and discard rows with a missing value in any of the
# kept columns.
kept_columns = ["uuid","cough_detected","SNR","age","gender","status"]
cough_mask = data['cough_detected'] >= 0.5
data = data[kept_columns].loc[cough_mask].dropna()

# Per-class sample counts and the total number of remaining samples.
status_counts = data["status"].value_counts()
print(status_counts)
print("Total samples",len(data))

# Save the filtered table so it can be inspected outside the notebook.
data.to_csv("status_check.csv")

# Bar chart of the number of samples per status class.
plt.figure(figsize=(6,4))
plt.bar(status_counts.index, status_counts)
plt.title("Histogram of Patient Status")
plt.xlabel('Class', fontsize=8)
plt.ylabel('Frequency', fontsize=8)
plt.xticks(rotation=20, ha='right', fontsize=8)
plt.show()

In [None]:
# Directory that holds the audio recordings.
data_dir_path = r"../Dataset/"
# Directory listing; kept as a module-level name although this cell does not
# use it further.
t = os.listdir(data_dir_path)

# Leftover counter; not used below but kept as a module-level name.
c = 0

# For every uuid in the filtered metadata, record the path of its recording:
# the .webm file when it exists, otherwise the .ogg file when that exists.
webm_data = []
ogg_data = []
for sample_id in data['uuid']:
    webm_path = os.path.join(data_dir_path, f'{sample_id}.webm')
    ogg_path = os.path.join(data_dir_path, f'{sample_id}.ogg')
    if os.path.exists(webm_path):
        webm_data.append(webm_path)
    elif os.path.exists(ogg_path):
        ogg_data.append(ogg_path)

# Persist both path lists as single-column CSV files.
webm_data = pd.DataFrame(webm_data)
webm_data.to_csv("webm_data.csv")
ogg_data = pd.DataFrame(ogg_data)
ogg_data.to_csv("ogg_data.csv")

In [None]:
from torch.utils.data import Dataset
import soundfile as sf
import librosa
import torchaudio
import torch

class AudioDataset(torch.utils.data.Dataset):
    """Map-style dataset of ``.webm`` recordings listed in a metadata CSV.

    The metadata is reduced to the columns in ``_COLUMNS``, restricted to rows
    whose ``cough_detected`` value is at least ``_MIN_COUGH_DETECTED``, and
    rows with a missing value in any kept column are dropped. A row becomes a
    sample only when ``<data_dir>/<uuid>.webm`` exists on disk.

    Args:
        data_dir: Directory containing the audio files.
        data: Path to the metadata CSV file (comma separated).
        label_encoder: Optional fitted encoder exposing ``transform`` on a
            1-D sequence of labels (for example scikit-learn's
            ``LabelEncoder``). When ``None``, the raw ``status`` string is
            returned as the label.

    Attributes:
        data: The filtered metadata ``DataFrame``.
        data_paths: Paths of the audio files that were found, in metadata order.
        labels: ``status`` value for each entry of ``data_paths``.
    """

    # Metadata columns kept after loading.
    _COLUMNS = ["uuid", "cough_detected", "SNR", "age", "gender", "status"]
    # Rows with a cough_detected value below this threshold are discarded.
    _MIN_COUGH_DETECTED = 0.5

    def __init__(self, data_dir, data, label_encoder = None):
        self.data_dir = data_dir
        metadata = pd.read_csv(data, sep=",")
        keep = metadata["cough_detected"] >= self._MIN_COUGH_DETECTED
        self.data = metadata[self._COLUMNS].loc[keep].dropna()
        self.label_encoder = label_encoder

        self.data_paths = []
        self.labels = []
        # Walk uuid and status together so each label comes straight from its
        # own row; re-scanning the whole frame per file made construction
        # quadratic in the number of rows.
        for uuid, status in zip(self.data["uuid"], self.data["status"]):
            path = os.path.join(self.data_dir, f"{uuid}.webm")
            if os.path.exists(path):
                self.data_paths.append(path)
                self.labels.append(status)

    def __len__(self):
        """Return the number of recordings that were found on disk."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(audio_sample, audio_label)`` for the sample at ``idx``.

        ``audio_sample`` is the first element returned by ``torchaudio.load``
        (the second element is discarded). ``audio_label`` is the encoded
        label when a ``label_encoder`` was given, otherwise the raw status
        string.
        """
        audio_path = self.data_paths[idx]
        # No ``format`` hint is passed: the previous value "FFmpeg" is not a
        # container format name, so the format is left to be detected from
        # the file. NOTE(review): if a specific decoding backend is required,
        # select it through the mechanism of the installed torchaudio version.
        audio_sample, _ = torchaudio.load(audio_path)
        audio_label = self.labels[idx]
        if self.label_encoder is not None:
            # transform() expects a 1-D sequence, so wrap the single label
            # and unwrap the single result.
            audio_label = self.label_encoder.transform([audio_label])[0]

        return audio_sample, audio_label

In [1]:
from matplotlib import pyplot as plt
from glob import glob
import pandas as pd
import numpy as np
import os

def preprocess_data(data_path, data_dir_path):
    """Collect ``(path, label)`` pairs for the ``.webm`` recordings on disk.

    The metadata CSV is reduced to the columns uuid, cough_detected, SNR, age,
    gender and status, restricted to rows whose ``cough_detected`` value is at
    least 0.5, and rows with a missing value in any kept column are dropped.

    Args:
        data_path: Path to the metadata CSV file (comma separated).
        data_dir_path: Directory containing the audio files.

    Returns:
        A list of ``(webm_path, status)`` tuples, in metadata order, one for
        every remaining row whose ``<data_dir_path>/<uuid>.webm`` file exists.
        Rows without a ``.webm`` file are skipped.
    """
    data = pd.read_csv(data_path, sep=",")
    data = data[["uuid","cough_detected","SNR","age","gender","status"]].loc[data['cough_detected'] >= 0.5].dropna()

    webm_data = []
    # Iterate uuid and status together: the label comes from the row itself
    # rather than from a full-frame scan per file, which was quadratic.
    for uuid, label in zip(data["uuid"], data["status"]):
        webm_path = os.path.join(data_dir_path, f'{uuid}.webm')
        if os.path.exists(webm_path):
            webm_data.append((webm_path, label))

    return webm_data
    
# Path of the metadata CSV and directory of the audio files handed to the
# dataset. Note that `data` now names the CSV path, not a DataFrame.
data = r"metadata_compiled.csv"
data_dir_path = r"../Dataset/MP3/"

from sklearn.preprocessing import LabelEncoder

# Encoder fitted on the three status classes; fit() returns the encoder.
le = LabelEncoder().fit(["healthy", "symptomatic", "COVID-19"])

# Build the dataset; construction checks the disk for every metadata row.
test = AudioDataset(data_dir_path, data, le)

KeyboardInterrupt: 

In [None]:
# Fetch the sample at index 1 through the dataset's indexing protocol and show it.
print(test[1])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)