In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

#for dirname, _, filenames in os.walk(dir):
#    for i, filename in enumerate(filenames):
#        print(os.path.join(dirname, filename))
#        if i == 5: break

#if not os.path.exists("/kaggle/working/models"):
#    os.makedirs('models')
       
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Setup and define custom dataset class

The custom dataset class finds each raw audio sample and corresponding label, encodes the label and returns the raw audio sample as mono-channel as well as the label.

In [2]:
from pydub.silence import split_on_silence

def remove_silence(audio_object, min_silence_ms=100, threshold_dBFS=-40, keep_silence=100, seek_step=1):
    # Check for loudness (DEBUGGING)
    #loudness_dBFS = audio_object.dBFS
    #print("Loudness (dBFS):", loudness_dBFS)

    # Attempt to split and remove silence from the audio signal
    audio_segments = split_on_silence(audio_object, min_silence_ms, threshold_dBFS, seek_step)

    # Check if audio_segments is empty if yes return the original audio object as numpy array
    if not audio_segments:

        # Get the array of samples from the audio segment
        org_audio = np.array(audio_object.get_array_of_samples(), dtype=np.float32)

        # Normalize the samples if needed
        org_audio /= np.max(np.abs(org_audio))

        return org_audio

    # Add the different audio segments together
    audio_processed = sum(audio_segments)

    # Return the samples from the processed audio, save as numpy array, and normalize it
    audio_processed = np.array(audio_processed.get_array_of_samples(), dtype=np.float32)
    audio_processed /= np.max(np.abs(audio_processed))
    #print("audio_processed",audio_processed)
    #print("audio_processed.shape",audio_processed.shape)

    return audio_processed

def encode_age(age):
    # Define age mapping
    age_mapping = {"child": 0, "teen": 1, "adult": 2, "senior": 3}

    # Determine age range
    if age <= 12:  # Children from ages 0-12
        return age_mapping["child"]
    elif age <= 19:  # Teenagers from ages 13-19
        return age_mapping["teen"]
    elif age <= 50:  # Adults from ages 20-50
        return age_mapping["adult"]
    else:  # Seniors (age > 50)
        return age_mapping["senior"]

In [3]:
from pydub import AudioSegment
import torchaudio
import torch
import numpy as np

class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, data, args, label_encoder=None):
        # Initialize attributes
        self.data = data["uuid"]
        self.label = data["status"]
        self.age = data["age"]
        self.gender = data["gender"]
        self.SNR = data["SNR"]
        self.label_encoder = label_encoder
        self.min_silence = args[0]
        self.threshold = args[1]
        self.keep_silence = args[2]
        self.sample_rate = {}

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        # Extract audio sample from idx
        audio_path = self.data[idx]
        #print("Audio path", audio_path)

        # Load in audio
        #audio_samples, sample_ratess = torchaudio.load(audio_path)
        #print("original signal", audio_samples)
        #print("original signal.shape", audio_samples.shape)
        #self.sample_rate[idx] = sample_rate
        audio_object = AudioSegment.from_file(audio_path)
        audio_sample = remove_silence(audio_object, self.min_silence, self.threshold, self.keep_silence)
        #print("processed signal", audio_sample)
        #print("processed signal.shape", audio_sample.shape)
        self.sample_rate[idx] = audio_object.frame_rate


        # Extract audio label from idx and transform
        audio_label = [self.label[idx]]
        audio_label = self.label_encoder.transform(audio_label)

        # Extract age, gender, and SNR from idx and encode the necessary features
        gender_mapping = {"male": 0, "female": 1}
        gender = np.array([gender_mapping[self.gender[idx]]])
        age = np.array(encode_age(self.age[idx]))
        snr = np.array([self.SNR[idx]])

        # Check if audio sample is stereo -> convert to mono (remove_silence already turns it into 1 channel)
        #if len(audio_sample.shape) > 1 and audio_sample.shape[1] > 1:
            # Convert stereo audio to mono
            #audio_sample = audio_sample.mean(dim=0, keepdim=True)

        return torch.tensor(audio_sample, dtype=torch.float32), torch.tensor(audio_label, dtype=torch.int64), torch.tensor(gender), torch.tensor(age), torch.tensor(snr)

    def __get_sample_rate__(self, idx):
        # If needed extract sample rate
        return self.sample_rate.get(idx)

# Custom collate function

The following collate function will take batches of raw audio samples and zero pad them to match the largest sized audio sample.

In [4]:
def pad_sequence(batch):
    # Make all tensor in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.0)
    #print("batch", batch)
    #print("batch shape", batch.shape)
    return batch.unsqueeze(1) # Add channel dimension for MFCC input
    #return batch.permute(0, 2, 1)


def collate_fn(batch):
    # A data tuple has the form:
    # waveform, label

    # Separate audio samples and labels
    waveforms, labels, genders, ages, snrs = zip(*batch)

    # Pad the audio samples (if needed)
    #padded_waveforms = pad_sequence(waveforms)

    # Convert features and labels to tensor
    labels = torch.tensor(labels)
    genders = torch.tensor(genders)
    ages = torch.tensor(ages)
    snrs = torch.tensor(snrs)

    # Extract tensors from tuples
    #waveforms = [item[0] for item in waveforms]
    #original = [item[0] for item in original]
    #print("collate waveform",waveforms)
    #print("collate original waveform",original)

    # Convert to tensors
    #waveforms = np.array(waveforms, dtype=np.float32)
    #waveforms = torch.tensor(waveforms, dtype=torch.float32)
    #print("collate waveform tensor",waveforms)
    #original = torch.tensor(original, dtype=torch.float32)

    #waveforms = torch.tensor(waveforms, dtype=torch.float32)
    #original = torch.tensor(original, dtype=torch.float32)

    #print("collate waveform",waveforms)
    #print("collate waveform.shape",waveforms.shape)

    #print("collate original waveform",original)
    #print("collate original waveform.shape",original[0].shape)


    #return waveforms, labels, original
    return waveforms, labels, genders, ages, snrs
    #return padded_waveforms, labels, genders, ages, snrs

# Miscellaneous functions

The following code block contains miscellaneous functions such as plotting of waveforms, spectograms, fbank, and preprocessing of the data.

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import librosa

def waveform_plot(signal, sr, title, threshold=None, plot=None):
    # Calculate time axis
    time = np.arange(0, len(signal)) / sr

    plt.figure(figsize=(10, 8))

    # Plot standard waveform
    plt.subplot(3,1,1)
    plt.plot(time, signal, color='b')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title(title)
    plt.grid(True)
    plt.show()

    if plot:
        # Calculate dBFS values
        if np.any(signal != 0):
            db_signal = 20 * np.log10(np.abs(signal) / np.max(np.abs(signal)))
        else:
            db_signal = -60

        plt.subplot(3,1,2)
        # Plot waveform in dB scale
        plt.plot(time, db_signal, color='b')

        # Plot threshold level
        if threshold:
            plt.axhline(y=threshold, color='r', linestyle='--', label=f'{threshold} dBFS Threshold')
            plt.legend()

        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude (dBFS)')
        plt.title(title)
        plt.grid(True)

        n_fft = 2048  # Length of the FFT window
        hop_length = 512  # Hop length for FFT
        S = np.abs(librosa.stft(signal.astype(float), n_fft=n_fft, hop_length=hop_length))

        # Convert amplitude to dB scale (sound pressure level)
        S_db = librosa.amplitude_to_db(S, ref=np.max)

        # Get frequency bins corresponding to FFT
        freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

        # Step 3: Plot the SPL values over frequency
        plt.subplot(3,1,3)
        plt.plot(freqs, np.mean(S_db, axis=1), color='b')
        plt.title('Sound Pressure Level (SPL) vs. Frequency')
        plt.xlabel('Frequency (Hz)')
        plt.ylabel('SPL (dB)')
        plt.grid(True)
        plt.xlim([20, 25000])  # Set frequency range for better visualization
        plt.xscale('log')  # Use log scale for frequency axis


        plt.tight_layout()
        plt.show()

# Stolen from pytorch tutorial xd
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", batch=0, idx=0, ax=None):
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel(ylabel)
    im = ax.imshow(
        librosa.power_to_db(specgram),
        origin="lower",
        aspect="auto",
        interpolation="nearest",
    )
    plt.colorbar(im, ax=ax, label='dB')
    #plt.close()
    plt.savefig(f"test_outputs/batch{batch}_idx{idx}_{title}.png")

def plot_fbank(fbank, title=None):
    fig, axs = plt.subplots(1, 1)
    axs.set_title(title or "Filter bank")
    axs.imshow(fbank, aspect="auto")
    axs.set_ylabel("frequency bin")
    axs.set_xlabel("mel bin")

def preprocess_data(data_meta_path, data_dir_path, output_dir):
    # Read data file then remove every column other than the specified columns
    # Removes empty samples and filters through cough probability
    data = pd.read_csv(data_meta_path, sep=",")
    data = (
        data[["uuid", "cough_detected", "SNR", "age", "gender", "status"]]
        .loc[data["cough_detected"] >= 0.8]
        .dropna().reset_index(drop=True).sort_values(by='cough_detected')
    )
    data = data[(data["gender"] != "other")]

    #Count the occurrences of each age value
    age_counts = data['age'].value_counts()

    # Filter out ages with fewer than 100 samples
    ages_to_keep = age_counts.index[age_counts >= 100]

    # Filter the DataFrame based on the selected ages
    data = data[data['age'].isin(ages_to_keep)]

    # Check if the following MP3 with uuid exists
    mp3_data = []
    non_exist = []
    for file in data["uuid"]:
        if os.path.exists(os.path.join(data_dir_path, f"{file}.mp3")):
            mp3_data.append(os.path.join(data_dir_path, f"{file}.mp3"))
        else:
            non_exist.append(file)
        # elif os.path.exists(os.path.join(data_dir_path, f'{file}.ogg')):
        #    ogg_data.append(os.path.join(data_dir_path, f'{file}.ogg'))

    # Remove entries with missing MP3 files from the original data
    data = data[~data["uuid"].isin(non_exist)]

    # Replace the uuids with the path to uuid
    data["uuid"] = mp3_data

    # Save the data as csv
    data.to_csv(os.path.join(output_dir, "filtered_audio_data.csv"), index=False)

    print("Finished processing!")

In [6]:
"""
data_path = r"misc_data/metadata_compiled.csv"
data_dir_path = r"../Dataset/MP3/"
output_dir = r"misc_data/"
preprocess_data(data_path, data_dir_path, output_dir)
"""

'\ndata_path = r"misc_data/metadata_compiled.csv"\ndata_dir_path = r"../Dataset/MP3/"\noutput_dir = r"misc_data/"\npreprocess_data(data_path, data_dir_path, output_dir)\n'

# Dataset specific functions

The following codeblock contains functions specially related to the dataset preprocessing.

In [7]:
from sklearn.model_selection import train_test_split

def preprocess_dataset(data, test_size):
    # Extract audio samples and labels
    X = data.drop(columns=["status"])
    y = data["status"]


    # Perform a stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    # Combine audio samples and target labels for training and validation sets
    train_data = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test_data = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

    return train_data, test_data

def weighted_sample(data):
    # Find class distribution
    class_counts = data["status"].value_counts()
    # print(class_counts)

    # Check class weights
    class_weights = 1 / class_counts
    # print(class_weights)

    # Adjust weighting to each sample
    sample_weights = [1 / class_counts[i] for i in data["status"].values]
    # print("len sample weights:",len(sample_weights))

    return sample_weights

def undersample(data, n, normalize=False):
    # Step 1: Identify majority class
    class_counts = data["status"].value_counts()
    majority_class = class_counts.idxmax()

    # Step 2: Calculate desired class distribution (e.g., balanced distribution)
    desired_class_count = n  # Target number of samples for each class

    # Step 3: Select subset from majority class
    undersampled_data_majority = data[data["status"] == majority_class].sample(
        n=desired_class_count
    )

    # Combine with samples from minority classes
    undersampled_data_minority = data[~(data["status"] == majority_class)]

    # Combine undersampled majority class with minority classes
    undersampled_data = pd.concat(
        [undersampled_data_majority, undersampled_data_minority]
    )

    # Shuffle the undersampled dataset
    undersampled_data = undersampled_data.sample(frac=1).reset_index(drop=True)

    return undersampled_data

def visualize_dataset(data, normalize, title):
    print(f"{title} Distribution")
    print(data["status"].value_counts(normalize=normalize))
    print("Total samples", len(data))

    plt.figure(figsize=(6, 4))
    plt.title(f"Histogram of Patient Status\n- {title}")
    plt.bar(data["status"].value_counts().index, data["status"].value_counts())
    plt.xticks(rotation=20, ha="right", fontsize=8)
    plt.xlabel("Class", fontsize=8)
    plt.ylabel("Frequency", fontsize=8)
    plt.show()

# Initialization of dataset and dataset loader

This codeblock includes the initialization of the dataset as well as any processing needed, such as splitting it into training/testing datasets, as well as different sampling techniques, such as undersampling/weighted sampling.

In [8]:
from torch.utils.data import WeightedRandomSampler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import os

# Set seed for reproducibility
torch.manual_seed(42)

# Load data
#data = pd.read_csv("/kaggle/input/covid-19-audio-classification/filtered_audio_data.csv")
#data = pd.read_csv("/kaggle/input/filtered-csv/filtered_audio_data.csv")
##data['uuid'] = data['uuid'].apply(lambda x: x.replace('../Dataset/MP3/', "/kaggle/input/covid-19-audio-classification/MP3/"))
#data['uuid'] = data['uuid'].str.replace('../Dataset/MP3/',"/kaggle/input/covid-19-audio-classification/MP3/")

#print(data.head())

data = pd.read_csv("misc_data/filtered_audio_data.csv")

# Initialize LabelEncoder
le = LabelEncoder()

# Fit and transform labels into encoded form
labels = ["healthy", "symptomatic", "COVID-19"]
encoded_labels = le.fit_transform(labels)

# Prepare standard dataset
train_data, test_data = preprocess_dataset(data, 0.3) # First split the original dataset into 70% training
val_data, test_data = preprocess_dataset(test_data, 0.5) # Second split the "test_data" into 50/50 validation and test (or technically 15/15)

# Prepare and create undersampled version
#undersampled_data = undersample(data, 2000, True)
#visualize_dataset(undersampled_data, None, "Standard")
#visualize_dataset(undersampled_data, "normalize", "Normalized")
#train_undersampled_data, test_undersampled_data = preprocess_dataset(undersampled_data, 0.3)
#val_undersampled_data, test_undersampled_data = preprocess_dataset(test_undersampled_data, 0.5)
#visualize_dataset(train_undersampled_data, None, "Train")
#visualize_dataset(train_undersampled_data, "normalize", "Train Normalized")
#visualize_dataset(val_undersampled_data, None, "Validation")
#visualize_dataset(val_undersampled_data, "normalize", "Validation Normalized")

# Prepare and create weighted sampler
train_sample_weights = weighted_sample(train_data)
val_sample_weights = weighted_sample(val_data)
test_sample_weights = weighted_sample(test_data)

train_weighted_Sampler = WeightedRandomSampler(weights=train_sample_weights, num_samples=len(train_data), replacement=True)
val_weighted_Sampler = WeightedRandomSampler(weights=val_sample_weights, num_samples=len(val_data), replacement=True)
test_weighted_Sampler = WeightedRandomSampler(weights=test_sample_weights, num_samples=len(test_data), replacement=True)

# Create AudioDataset instances for training and validation sets
# Standard dataset
min_silence = 500
threshold = -40
keep_silence = 250
args = [min_silence, threshold, keep_silence]

train_dataset = AudioDataset(train_data, args, le)
val_dataset = AudioDataset(val_data, args, le)
test_dataset = AudioDataset(test_data, args, le)

# Undersampled dataset
#train_undersampled_dataset = AudioDataset(train_undersampled_data, le)
#test_undersampled_dataset = AudioDataset(test_undersampled_data, le)

# Create training and test dataloader instances
batch = 8
workers = 0
pin_memory = True

#train_dataloader = DataLoader(train_dataset, batch_size=batch, shuffle=False, num_workers=workers, collate_fn=collate_fn, pin_memory=pin_memory)
#val_dataloader = DataLoader(train_dataset, batch_size=batch, shuffle=True, num_workers=workers, collate_fn=collate_fn, pin_memory=pin_memory)
#test_dataloader = DataLoader(test_dataset, batch_size=batch, shuffle=False, num_workers=workers, collate_fn=collate_fn, pin_memory=pin_memory)

"""
train_undersampled_dataloader = DataLoader(
    train_dataset,
    batch_size=batch,
    shuffle=True,
    num_workers=workers,
    collate_fn=collate_fn,
    pin_memory=True,
)
val_undersampled_dataloader = DataLoader(
    val_dataset,
    batch_size=batch,
    shuffle=False,
    num_workers=workers,
    collate_fn=collate_fn,
    pin_memory=True,
)
test_undersampled_dataloader = DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    num_workers=workers,
    collate_fn=collate_fn,
    pin_memory=True,
)
"""

train_weighted_dataloader = DataLoader(train_dataset, sampler=train_weighted_Sampler, batch_size=batch, num_workers=workers, collate_fn=collate_fn, pin_memory=True)
val_weighted_dataloader = DataLoader(val_dataset, sampler=val_weighted_Sampler, batch_size=batch, num_workers=workers, collate_fn=collate_fn, pin_memory=True)
test_weighted_dataloader = DataLoader(test_dataset, sampler=test_weighted_Sampler, batch_size=batch, num_workers=workers, collate_fn=collate_fn, pin_memory=True)

# Initialize and define MFCC feature extractor

In the following codeblock the MFCC specific parameters are defined and initialized. The codeblock also includes a function that pads the extracted MFCC features in order to pass it to the model.

In [9]:
from torchaudio.transforms import MFCC
from torchvision.transforms import Resize
import torch.nn.functional as F

def MFCC_Features(data, padding=3000, normalize=False, resize=False, batch=0):
    """
    Args:
    data: Input audio waveform
    max_length: Maximum length for padding
    normalize: Normalize the channel layer
    resize: Resize the spectrogram
    target_size: Target size for resizing
    """
    # Extract MFCC features
    #features = mfcc(data)
    #print("features.shape",features.shape)
    #features = [mfcc(waveform) for waveform in data]
    features = [torch.unsqueeze(mfcc(waveform), 0) for waveform in data] # Adding channels
    #features = [torch.unsqueeze(torch.unsqueeze(mfcc(waveform), 0), 0) for waveform in data] # Adding batch size and channels


    #for feature in features:
    #        print("Feature shape after MFCC:", feature.shape)

    #for i, feature in enumerate(features):
    #    x = torch.unsqueeze(torch.unsqueeze(feature,0),0)
    #    #x = torch.unsqueeze(feature,0)
    #    features[i] = x

    #for feature in features:
            #print("Feature shape after adding channels:", feature.shape)

    # Hardcoded padding
    if padding:
        features = F.pad(features, (0, padding - features.shape[3]), "constant", 0)


    # Normalize the features for each sample
    if normalize == True:
        #for j, i in enumerate(features):
        #    plot_spectrogram(i[0], "Before normalization", batch = batch, idx = j)
        #features = (features - features.mean()) / features.std()

        #print("features.shape during normalization step", features.shape)
        for j, feature in enumerate(features):
            mean = feature.mean(dim=[1 ,2], keepdim=True)
            std = feature.std(dim=[1, 2], keepdim=True)
            features[j] = (feature - mean) / std
        #features = [(feature - feature.mean()) / feature.std() for feature in features]
        #for j, i in enumerate(features):
        #    #print("shape after normalization i",i.shape)
        #    plot_spectrogram(i[0], "After normalization", batch = batch, idx = j)

    #print("len normal", len(features[0]))

    # Add two artificial channels filled with zeros
    if resize == True:
        #artificial_channels = torch.zeros(features.shape[0], 2, features.shape[2], features.shape[3])
        #features = torch.cat([features, artificial_channels], dim=1)
        #for j, i in enumerate(features):
        #    plot_spectrogram(i[0], "Before resizing", batch = batch, idx = j)
        #features = Resize((224,224), antialias=True)(features)

        # Print out the shape of each feature tensor
        #for feature in features:
        #    print("Feature shape before resizing:", feature[0].shape)

        features = [Resize((224, 224), antialias=True)(feature) for feature in features]
        #for feature in features:
        #    print("Feature shape after resizing:", feature.shape)

        #for j, i in enumerate(features):
        #    plot_spectrogram(i[0], "After resizing", batch = batch, idx = j)

    features = torch.stack(features)
    #print("features after stacking", features.shape)
    return features

# Settings for MelSpectrogram computation
melkwargs = {
    "n_mels": 80,  # How many mel frequency filters are used
    "n_fft": 350,  # How many fft components are used for each feature
    "win_length": 350,  # How many frames are included in each window
    "hop_length": 100,  # How many frames the window is shifted for each component
    "center": False,  # Whether frams are padded such that the component of timestep t is centered at t
    "f_max": 11000,  # Maximum frequency to consider
    "f_min": 0,
}

# Instantiate MFCC feature extractor
mfcc = MFCC(
    n_mfcc=22,  # Number of cepstrum components
    sample_rate=22000,  # Sample rate of input audio
    melkwargs=melkwargs)  # Keyword arguments for MelSpectogram



# Initializing and defining model

The following codeblock contains the initialization of the ResNet50 model from the PyTorch library.

In [10]:
from torchvision import models
import torch.nn as nn
import torch

# Load in the pre-trained resnet model
#model = models.vgg16_bn(weights=None, num_classes=3)
model = models.resnet18(weights=None, num_classes=3)
#model = models.resnet18()
#model = models.resnet50()

# Modifying the first layer to be able to pass 1-channel image (spectrogram) for ResNet model
#model.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1,1), padding=(1, 1)) VGG
#model.conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1,1), padding=(1, 1)) # RESNET

# Set the model to training mode and put it on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device running on: {device}")

model.to(device) # add ";" to keep from printing the network architecture
print(model)
# Wrap your model with DataParallel if multiple GPUs are available
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
else:
    print("Only 1 GPU available!")

Device running on: cuda
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu

# Setup weights and bias logging

In [11]:
import wandb

# Initialize wandb
#!wandb login --relogin 9be53a0c7076cae09612be80ee5e0e80d9dac79c

# Defining training variables
lr = 0.1
step = 5
decay = 0.1
optim = "adam"
gamma = 0.1
epochs = 50


## Defining weights and biases config
#wandb.init(
#    # set the wandb project where this run will be logged
#    project="mini-project",
#    config={
#    "architecture": "ResNet18",
#    "description": "Weighted, Normalized, and resized mel spectrograms to 224x224 + removing silent audio parts",
#    "dataset": "COVID-19 Audio Classification",
#    "learning_rate": lr,
#    "step_size": step,
#    "weight_decay": decay,
#    "optimizer": optim,
#    "gamma": gamma,
#    "epochs": epochs
#    }
#)


# Training loop

In [12]:
from sklearn.utils.class_weight import compute_class_weight
train_labels = train_data["status"]
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(train_labels), y=train_labels)

print("COVID-19","Healthy", "Symptomatic")
print(class_weights)

COVID-19 Healthy Symptomatic
[5.88954635 0.43635832 1.85696517]


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import GradScaler
from tqdm import tqdm
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import torch
import os

if optim == "adam":
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=decay)
else:
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=decay)
scheduler = StepLR(optimizer, step_size=step, gamma=0.5)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
#scaler = GradScaler()
log_interval = 20
best_vloss = float("inf")
model_no = 0

print("Currently: Training")
for epoch in range(epochs):
    model.train() # Initiate training mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # Training loop
    for i, (inputs, targets, *_) in tqdm(enumerate(
       train_weighted_dataloader),
       total=len(train_weighted_dataloader),
       leave=True,
       desc=f"Epoch {epoch+1}/{epochs} | Training"
        ):
        #print(f"========== BATCH {i} ========== ")
        #for idx, f in enumerate(features):
        #    print(f"{idx+1} | {f.shape}")
        #print(f"========== BATCH {i} ========== ")

        features = MFCC_Features(inputs, padding=False, normalize=True, resize=True) # Compute the MFCC features
        features, targets = features.to(device), targets.to(device) # Load them onto GPU
        optimizer.zero_grad() # Zero the parameters
        outputs = model(features) # Retrieve the output from the model
        loss = criterion(outputs, targets) # Compute the loss
        #loss = F.cross_entropy(outputs, targets, reduction='mean')
        loss.backward() # Compute gradients of the loss
        optimizer.step() # Update weights

        running_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == targets).sum().item()
        total_predictions += targets.size(0)

        if i % log_interval == 0:
            print(f"Epoch {epoch+1}/{epochs} | Batch {i}/{len(train_weighted_dataloader)} | Training Loss: {loss.item():.4f}")

    # Compute accuracy
    accuracy = correct_predictions / total_predictions

    print(f"Training Accuracy: {accuracy:.4f}")

    # Compute average training loss for the epoch
    avg_loss = running_loss / len(train_weighted_dataloader)

    # Validation loop
    running_vloss = 0.0
    vcorrect_predictions = 0
    vtotal_predictions = 0
    model.eval()
    with torch.no_grad(): # Disable gradient computation
        for j, (vinputs, vtargets, *_) in tqdm(enumerate(
            val_weighted_dataloader),
            total=len(val_weighted_dataloader),
            leave=True,
            desc=f"Epoch {epoch+1}/{epochs} | Validating"):
            vfeatures = MFCC_Features(vinputs, padding=False, normalize=True, resize=True) # Compute the MFCC features
            vfeatures, vtargets = vfeatures.to(device), vtargets.to(device) # Load them onto GPU
            voutputs = model(vfeatures)
            vloss = criterion(voutputs.to(device), vtargets)
            #vloss = F.cross_entropy(voutputs, vtargets, reduction='mean')
            running_vloss += vloss.item()

            # Calculate accuracy
            _, vpredicted = torch.max(voutputs, 1)
            vcorrect_predictions += (vpredicted == vtargets).sum().item()
            vtotal_predictions += vtargets.size(0)

    # Compute average validation loss for the epoch
    avg_vloss = running_vloss / len(val_weighted_dataloader)

    # Compute accuracy
    vaccuracy = vcorrect_predictions / vtotal_predictions

    # Compute precision, recall, F1 score
    precision = precision_score(vtargets.cpu(), vpredicted.cpu(), average='macro',zero_division=0.0)
    recall = recall_score(vtargets.cpu(), vpredicted.cpu(), average='macro',zero_division=0.0)
    f1 = f1_score(vtargets.cpu(), vpredicted.cpu(), average='macro',zero_division=0.0)

    # Log metrics to wandb
    wandb.log({"precision": precision, "recall": recall, "f1_score": f1})
    wandb.log({"epoch": epoch+1, "train_loss": avg_loss,"train_acc": accuracy, "val_loss": avg_vloss, "val_accuracy": vaccuracy})

    print(f"Epoch #{epoch+1} | Training Loss: {avg_loss:.4f}  |  Training Accuracy: {accuracy:.4f} | Validation Loss: {avg_vloss:.4f} | Validation Accuracy: {vaccuracy:.4f}\n           Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")

    # Update learning rate
    #print_lr(is_verbose, group, lr, epoch=None)
    scheduler.step()

    print(f"Epoch {epoch+1}, Learning Rate: {scheduler.get_last_lr()}")

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_no += 1
        model_path = f"/kaggle/working/models/ResNet18_weighted_model_no_{model_no}_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), model_path)

# Finish the run
wandb.finish()

Currently: Training


Epoch 1/50 | Training:   0%|          | 0/747 [00:05<?, ?it/s]


RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[8, 1, 224, 224] to have 3 channels, but got 1 channels instead

In [None]:
wandb.finish()