In [3]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import torchaudio.transforms as T
import torchvision.models as models
import torchaudio
import os
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
import torch.optim.lr_scheduler as lr_scheduler
# Implement Stratified K-Folds Cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

import config


In [4]:
class MyPipeline(torch.nn.Module):
    """Convert a raw waveform into a (optionally augmented) log-mel spectrogram.

    Processing order in ``forward``: random crop / zero-pad to a fixed
    duration, mel spectrogram, SpecAugment-style masking (training mode only),
    amplitude-to-dB conversion, and optional replication to three channels
    when ``config.channels == 3``.

    Args:
        input_freq: Accepted for interface compatibility; not used by this class.
        resample_freq: Accepted for interface compatibility; not used by this
            class (no resampling is performed, the rate is fixed at 32000 Hz).
        device: Device the torchaudio transforms are moved to.
        n_fft: FFT size passed to ``T.MelSpectrogram``.
        hop_length: Hop length passed to ``T.MelSpectrogram``.
        n_mels: Number of mel bins passed to ``T.MelSpectrogram``.
        win_length: Window length passed to ``T.MelSpectrogram``.
        window: Accepted for interface compatibility; a Hann window is always used.
        desired_length_in_seconds: Duration every waveform is cropped/padded to.
        train: If true, the module starts in training mode and ``forward``
            applies the augmentation stack; otherwise it starts in eval mode.

    Note:
        The training flag is stored in ``nn.Module``'s own ``training``
        attribute. Assigning a bool to ``self.train`` (as this class used to)
        shadows ``nn.Module.train()`` and makes ``.train()`` / ``.eval()``
        raise ``TypeError``.
    """

    def __init__(
        self,
        input_freq,
        resample_freq,
        device,
        n_fft=2048,
        hop_length = 512,
        n_mels=80,
        win_length = 2048,
        window = 'hann',
        desired_length_in_seconds=5,
        train=True,
    ):
        super().__init__()

        self.desired_length_in_seconds = desired_length_in_seconds
        self.sample_rate = 32000
        self.mel_spectrogram = T.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            win_length=win_length,
            window_fn=torch.hann_window,  # This corresponds to the 'hann' window
        ).to(device)

        # Kept as an attribute, but currently not applied in forward().
        self.PS = T.PitchShift(self.sample_rate, n_steps=1.1).to(device)

        self.amplitude = T.AmplitudeToDB().to(device)

        # NOTE(review): torchaudio documents TimeStretch(hop_length, n_freq,
        # fixed_rate); here the random factor lands in `hop_length` and
        # `fixed_rate=True` behaves like a rate of 1, so this stage appears to
        # leave the spectrogram unchanged. TimeStretch is also documented for
        # complex spectrograms, not mel power spectrograms. Left as-is because
        # a real fix changes output shapes -- confirm the intended behaviour.
        self.spec_aug = torch.nn.Sequential(
            T.TimeStretch(random.uniform(0.8, 1.2), fixed_rate=True).to(device),
            T.FrequencyMasking(freq_mask_param=15).to(device),
            T.TimeMasking(time_mask_param=90).to(device),
        )

        # Use nn.Module's own mode flag instead of overwriting the train() method.
        self.train(bool(train))

    @staticmethod
    def random_crop_or_pad(waveform, sample_rate, desired_length_in_seconds=5):
        """Return a randomly positioned window of fixed duration.

        Waveforms shorter than the target are zero-padded first; the padding
        is placed on both sides, only on the left, or only on the right,
        chosen at random. Longer waveforms are cropped at a random offset.

        Args:
            waveform: Tensor whose second dimension (``shape[1]``) is time.
            sample_rate: Samples per second used to convert seconds to samples.
            desired_length_in_seconds: Target duration; may be int or float.

        Returns:
            ``waveform[:, start:start + n]`` with
            ``n = round(desired_length_in_seconds * sample_rate)``.
        """
        # Sample counts must be integers: padding sizes, randint bounds and
        # slice indices all reject floats such as 2.5 * 32000 == 80000.0.
        desired_length = int(round(desired_length_in_seconds * sample_rate))
        current_length = waveform.shape[1]

        # Drawn unconditionally (as before) so the `random` stream consumed
        # per call does not depend on the clip length.
        side = random.randint(0, 2)

        if current_length < desired_length:
            padding_needed = desired_length - current_length
            if side == 0:
                left_pad = padding_needed // 2   # split across both sides
            elif side == 1:
                left_pad = padding_needed        # all padding on the left
            else:
                left_pad = 0                     # all padding on the right
            right_pad = padding_needed - left_pad
            waveform = torch.nn.functional.pad(waveform, (left_pad, right_pad))

        # Random crop offset; 0 is the only choice when lengths already match.
        start_idx = random.randint(0, waveform.shape[1] - desired_length)
        return waveform[:, start_idx:start_idx + desired_length]

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """Run the full pipeline on one waveform and return its dB mel spectrogram."""

        # Apply pitch shift
        #waveform = self.PS(waveform)

        # Crop or pad to the configured duration
        waveform = self.random_crop_or_pad(waveform, self.sample_rate, self.desired_length_in_seconds)

        # Convert to power spectrogram
        spec = self.mel_spectrogram(waveform)

        # Apply SpecAugment only in training mode
        if self.training:
            spec = self.spec_aug(spec)

        # Convert to decibel; squeeze(0) drops the leading dim when it has size 1
        spec = self.amplitude(spec).squeeze(0)

        # Replicate to three identical channels.
        # NOTE(review): assumes mono input so that `spec` is 2-D here -- TODO confirm.
        if config.channels == 3:
            spec = torch.stack([spec, spec, spec])

        return spec

In [5]:
# Read the label table (first CSV column becomes the index) and discard the
# pre-computed 'fold' column.
labels_file = pd.read_csv(
    './data/labeled_ADSMI/labels_int.csv', index_col=0
).drop(columns=['fold'])

# Hold out 20% of the rows for validation, stratified on the integer label.
train_df, val_df = train_test_split(
    labels_file,
    test_size=0.2,
    stratify=labels_file['Label_int'],
    random_state=42,
)

# Report the size of each split.
for split_label, split_frame in (("Train size: ", train_df), ("Val size: ", val_df)):
    print(split_label, len(split_frame))


Train size:  3776
Val size:  944


In [6]:



class MyDataset_finetune(Dataset,):
    """Dataset yielding ``(mel_spectrogram, class_id)`` pairs for fine-tuning.

    Rows are selected from the module-level ``labels_file`` DataFrame by
    *position* (``.iloc``); the ``filename`` column is resolved against
    ``self.root`` and the ``Label_int`` column supplies the class id.

    Args:
        train_indices: Positional row indices into ``labels_file`` for training.
        test_indices: Positional row indices into ``labels_file`` for
            evaluation. Takes precedence when both index arguments are given.
        train: Forwarded to ``MyPipeline``; enables augmentation when true.
        sample_rate: Forwarded to ``MyPipeline`` as both frequency arguments.
        desired_length_in_seconds: Clip duration forwarded to ``MyPipeline``.

    Raises:
        FileNotFoundError: If the audio root directory does not exist.
    """

    def __init__(self, train_indices=None, test_indices=None, train=True, sample_rate=32000, desired_length_in_seconds=10):
        self.root = './data/labeled_ADSMI/labeled_data_2013-535/'
        self.train = train

        # Fail fast on a missing data directory (the previous, otherwise unused
        # os.listdir() call had the same effect).
        if not os.path.isdir(self.root):
            raise FileNotFoundError(f"Audio directory not found: {self.root}")

        self.file_names = []
        self.class_ids = []

        # test_indices win when both are supplied, matching the previous
        # "assign train first, then overwrite with test" behaviour.
        selected = test_indices if test_indices is not None else train_indices
        if selected is not None:
            rows = labels_file.iloc[selected]
            self.file_names = rows["filename"].values
            self.class_ids = rows["Label_int"].values

        # Fall back to the CPU when no GPU is visible instead of hard-coding
        # 'cuda', mirroring the device selection used for the model.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # The train/eval distinction lives entirely in the `train` flag, so a
        # single construction path covers both cases.
        self.pipeline = MyPipeline(
            sample_rate,
            sample_rate,
            self.device,
            desired_length_in_seconds=desired_length_in_seconds,
            train=self.train,
        )
        self.pipeline.to(device=self.device, dtype=torch.float32)

    def __len__(self):
        """Number of selected audio files."""
        return len(self.file_names)

    def __getitem__(self, index):
        """Load one file, run it through the pipeline and return ``(mel_spec, class_id)``."""
        file_name = self.file_names[index]
        path = os.path.join(self.root, file_name)

        # NOTE(review): the file's own sample rate is read but not checked or
        # resampled -- presumably all files are already at the pipeline's rate;
        # verify against the data.
        waveform, sample_rate = torchaudio.load(path)
        waveform = waveform.to(device=self.device, dtype=torch.float32)

        mel_spec = self.pipeline(waveform)

        class_id = self.class_ids[index]

        return mel_spec, class_id

def create_generators_finetune(train_indices=None, test_indices=None):
    """Build the training and evaluation ``DataLoader`` pair for one split.

    Args:
        train_indices: Row positions handed to ``MyDataset_finetune`` for the
            training dataset (augmentation enabled).
        test_indices: Row positions handed to ``MyDataset_finetune`` for the
            evaluation dataset (augmentation disabled).

    Returns:
        ``(train_loader, test_loader)``, both batched with ``config.batch_size``
        and clip length ``config.desired_length_in_seconds``.
    """
    train_dataset = MyDataset_finetune(
        train_indices=train_indices,
        train=True,
        desired_length_in_seconds=config.desired_length_in_seconds,
    )
    test_dataset = MyDataset_finetune(
        test_indices=test_indices,
        train=False,
        desired_length_in_seconds=config.desired_length_in_seconds,
    )

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=0, drop_last=False)

    # Evaluation data is not shuffled: ordering has no effect on accuracy, and
    # a fixed order keeps per-batch evaluation losses comparable across epochs.
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=0, drop_last=False)

    return train_loader, test_loader

In [7]:
# Show the configured number of input channels (`config.channels`) that the classifier below is built for.
print(config.channels)

class Resnet50_Classifier(nn.Module):
    """ResNet-50 backbone (pretrained weights) with a new classification head.

    The stem convolution is replaced only when its input channel count differs
    from ``config.channels``; otherwise the pretrained stem is kept. The final
    fully connected layer is always replaced to output ``num_classes`` logits.

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes):
        super(Resnet50_Classifier, self).__init__()
        self.resnet50 = models.resnet50(pretrained=True)

        # Swapping conv1 unconditionally would throw away the pretrained stem
        # weights even when the channel count already matches, so only build a
        # fresh (randomly initialised) stem when a different count is required.
        if self.resnet50.conv1.in_channels != config.channels:
            self.resnet50.conv1 = nn.Conv2d(config.channels, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # New head sized for this task's label set.
        num_features = self.resnet50.fc.in_features
        self.resnet50.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet-50."""
        return self.resnet50(x)

3


In [8]:

# ------ Stratified K-fold splitter used for cross-validation
n_folds = 8
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One output unit per distinct value in the "Label_int" column.
num_classes = len(set(labels_file["Label_int"]))
model = Resnet50_Classifier(num_classes)
# Alternative backbones (their classes are not defined in the cells shown here):
#model = ModifiedResnet50_Classifier(num_classes)
#model = ResNet101_Classifier(num_classes)

# Prefer the GPU when one is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
model.to(device)

# Loss, optimiser and plateau-based learning-rate schedule.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0004,
    weight_decay=1e-4,  # Adjust the value as needed
)
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=10,
    factor=0.8,
    verbose=True,
)

# Per-batch loss histories plus best-checkpoint bookkeeping kept on `config`.
train_losses, test_losses = [], []
config.best_accuracy = 0
config.model_path = "./results_standalone/test_checkpoint.pth"


# skf.split(train_df, ...) yields row positions *within train_df*, whereas
# MyDataset_finetune selects rows with labels_file.iloc[...]. Translate the
# positions so every fold only touches rows of the training split; using them
# unchanged would pick unrelated rows and leak hold-out validation rows.
if not labels_file.index.is_unique:
    raise ValueError("labels_file index must be unique to map fold positions back to it")
train_positions_in_labels_file = labels_file.index.get_indexer(train_df.index)

# NOTE(review): model, optimizer and scheduler are created once and shared by
# all folds, so later folds continue training a model that has already seen
# their evaluation rows. Re-create them per fold if independent fold scores
# are wanted -- confirm the intent.
for fold, (train_indices, test_indices) in enumerate(skf.split(train_df, train_df['Label_int'])):

    train_loader, test_loader = create_generators_finetune(
        train_indices=train_positions_in_labels_file[train_indices],
        test_indices=train_positions_in_labels_file[test_indices],
    )

    # Training loop
    print("Fold: ", fold)
    num_epochs = 1  # Adjust this as needed
    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (spectrograms, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
            spectrograms = spectrograms.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            train_losses.append(loss.item())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        correct_predictions = 0
        total_samples = 0
        epoch_test_loss_sum = 0.0  # sum of per-sample losses for this epoch

        # Set the model to evaluation mode (important for dropout and batch normalization)
        model.eval()

        # Iterate through the test set
        with torch.no_grad():  # Disable gradient computation during testing
            for spectrograms, labels in test_loader:
                # Move data to the testing device
                spectrograms = spectrograms.to(device)
                labels = labels.to(device)

                # Forward pass
                outputs = model(spectrograms)
                loss = criterion(outputs, labels)
                test_losses.append(loss.item())
                # Weight by batch size so a short final batch does not skew the mean
                epoch_test_loss_sum += loss.item() * labels.size(0)

                # Compute the predicted labels
                _, predicted = torch.max(outputs, 1)

                # Update evaluation metrics
                total_samples += labels.size(0)
                correct_predictions += (predicted == labels).sum().item()

        accuracy = correct_predictions / total_samples
        epoch_test_loss = epoch_test_loss_sum / total_samples

        # If the new test accuracy is better than the previous best, save the model
        if accuracy > config.best_accuracy:
            config.best_accuracy = accuracy
            # torch.save does not create missing parent directories
            os.makedirs(os.path.dirname(config.model_path) or ".", exist_ok=True)
            torch.save(model, config.model_path)

        # Step the learning rate scheduler on the epoch's mean evaluation loss
        # rather than on whichever batch happened to come last.
        scheduler.step(epoch_test_loss)

        # `loss` here is the last evaluation batch's loss
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
        print(f"Test Accuracy: {accuracy * 100:.2f}%")







cuda
Fold:  0


 13%|█▎        | 7/52 [00:13<01:21,  1.82s/it]