<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/Ravdess%20Inference%20try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download and unzip the dataset (RAVDESS)

In [1]:
# Download the RAVDESS speech archive from Google Drive by file id; per the cell
# output it is saved as /content/Audio_Speech_Actors_01-24.zip.
# NOTE(review): newer gdown releases reportedly accept the id without `--id` -- confirm
# against the installed version before changing the flag.
!gdown --id 1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY

Downloading...
From: https://drive.google.com/uc?id=1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [00:01<00:00, 186MB/s]


In [2]:
import zipfile

# Archive downloaded by the previous cell.
dataset_directory = '/content/Audio_Speech_Actors_01-24.zip'

# The context manager closes the archive even when extraction raises, so the
# file handle is never leaked.
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('Unzipped_Data')

Create custom Dataset

In [14]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn.functional import normalize


class CustomDataset(Dataset):
    """Dataset that turns annotated audio files into fixed-length feature tensors.

    Every item is loaded with ``torchaudio.load``, moved to ``device``, resampled
    to ``target_sample_rate`` when its own rate differs, averaged down to one
    channel, cut or zero-padded on the right to exactly ``num_samples`` samples
    and finally passed through ``transformation``. The first- and second-order
    deltas of the transformed signal are added element-wise to it.

    Args:
        annotations_file: Path of a CSV file read with ``pandas.read_csv``.
            Column 0 is joined onto ``audio_dir`` to locate the audio file and
            column 2 is returned as the label.
        audio_dir: Directory prefix joined with the path stored in column 0.
        transformation: Callable module applied to the prepared waveform (the
            notebook passes a ``torchaudio.transforms.MFCC``). It is moved to
            ``device`` on construction.
        target_sample_rate: Sample rate every waveform is resampled to.
        num_samples: Exact length (second dimension) of the waveform handed to
            ``transformation``.
        device: Device the waveform, the resampler and the transformation
            are placed on.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the annotation row at ``index``.

        ``features`` is the transformed waveform plus its delta and
        delta-delta; ``label`` is the raw value stored in column 2.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)

        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        features = self.transformation(signal)

        # The deltas are summed into the features (not concatenated), so the
        # returned tensor keeps the shape produced by the transformation.
        delta = torchaudio.functional.compute_deltas(features)
        delta2 = torchaudio.functional.compute_deltas(delta)
        return features + delta + delta2, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` along dimension 1 to at most ``num_samples``."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of the last dimension when dimension 1 is shorter than ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            # Use the dataset's own device: relying on a notebook-level
            # ``device`` global breaks when it is undefined or different.
            resampler = resampler.to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average over dimension 0 (kept as size 1) when it has more than one entry."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Return ``audio_dir`` joined with the path stored in column 0 of row ``index``."""
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the label stored in column 2 of row ``index``."""
        return self.annotations.iloc[index, 2]

Instantiate the dataset

In [15]:
if __name__ == "__main__":
    # Audio paths in the CSV files are used as-is (empty directory prefix).
    AUDIO_DIR = ""
    SAMPLE_RATE = 48000
    # Every clip is cut or padded to three seconds' worth of samples.
    NUM_SAMPLES = 3 * SAMPLE_RATE

    train_csv_file = '/content/train_csv.csv'
    test_csv_file = '/content/test_csv.csv'
    val_csv_file = '/content/val_csv.csv'

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device {device}")

    # One MFCC transform instance is shared by both datasets.
    mfcc = torchaudio.transforms.MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=20,
        melkwargs={"n_fft": 2048, "hop_length": 512, "power": 2},
    )

    # From here on the *_csv_file names refer to the dataset objects rather
    # than the CSV paths; later cells rely on that rebinding.
    test_csv_file = CustomDataset(
        annotations_file=test_csv_file,
        audio_dir=AUDIO_DIR,
        transformation=mfcc,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
        device=device,
    )
    print(f"There are {len(test_csv_file)} samples in the dataset.")

    val_csv_file = CustomDataset(
        annotations_file=val_csv_file,
        audio_dir=AUDIO_DIR,
        transformation=mfcc,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
        device=device,
    )
    print(f"There are {len(val_csv_file)} samples in the dataset.")

Using device cuda
There are 269 samples in the dataset.
There are 108 samples in the dataset.


In [7]:
# Download the trained model checkpoint from Google Drive by file id; per the
# cell output it is saved as /content/Resnet50_Ravdess_71acc.pth.
!gdown --id 1NiKtbbebxL8kNPw7bsbCr85lprAomcNw

Downloading...
From: https://drive.google.com/uc?id=1NiKtbbebxL8kNPw7bsbCr85lprAomcNw
To: /content/Resnet50_Ravdess_71acc.pth
100% 94.4M/94.4M [00:00<00:00, 203MB/s]


In [16]:
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints obtained from a trusted source.
# map_location remaps the stored tensors onto the device chosen earlier, so a
# checkpoint saved on a GPU machine also loads on a CPU-only runtime.
model = torch.load('/content/Resnet50_Ravdess_71acc.pth', map_location=device)


In [17]:
def create_data_loader(train_data, batch_size):
    """Wrap ``train_data`` in a ``DataLoader`` yielding batches of ``batch_size``.

    Args:
        train_data: Dataset (or any indexable, sized collection) to iterate.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` over ``train_data`` with all other options left at
        their defaults.
    """
    return DataLoader(dataset=train_data, batch_size=batch_size)

In [28]:
# NOTE(review): despite its name, this loader wraps the dataset built from
# val_csv.csv (val_csv_file), not the test CSV -- confirm that is intended.
test_dataloader = create_data_loader(train_data=val_csv_file, batch_size=32)

In [29]:
def test_single_epoch(model, dataloader, device):
  correct = 0
  size = len(dataloader.dataset)

  model.eval()
  for input,target in dataloader:
        input, target = input.to(device), target.to(device)
        # calculate loss
        prediction = model(input)
        correct += (prediction.argmax(1) == target).type(torch.float).sum().item()
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

In [30]:
# Evaluate the loaded checkpoint on the loader built above and print its accuracy.
test_single_epoch(model, test_dataloader, device)

Test Error: 
 Accuracy: 87.0% 

