<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/Ravdess%20Inference%20Skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download the pretrained model and the RAVDESS dataset, then unzip the archives

# Download And Extract

In [1]:
# Download the pretrained model checkpoint from Google Drive by file ID
# (the recorded output shows it saved as /content/Resnet50_Ravdess_71acc.pth).
!gdown --id 1NiKtbbebxL8kNPw7bsbCr85lprAomcNw

Downloading...
From: https://drive.google.com/uc?id=1NiKtbbebxL8kNPw7bsbCr85lprAomcNw
To: /content/Resnet50_Ravdess_71acc.pth
100% 94.4M/94.4M [00:00<00:00, 116MB/s] 


In [2]:
# Download the RAVDESS speech audio archive (Audio_Speech_Actors_01-24.zip).
!gdown --id 1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
# Download the CSV archive (Ravdess_csv.zip).
!gdown --id 1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg

Downloading...
From: https://drive.google.com/uc?id=1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [00:02<00:00, 99.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg
To: /content/Ravdess_csv.zip
100% 6.76k/6.76k [00:00<00:00, 11.8MB/s]


In [4]:
import zipfile

# Extract the audio archive into ./Unzipped_Data. The context manager closes
# the archive even if extraction raises part-way through.
dataset_directory = '/content/Audio_Speech_Actors_01-24.zip'
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('Unzipped_Data')

# Extract the CSV archive directly into /content
# (presumably this provides the *_csv.csv split files read later -- verify).
dataset_directory = '/content/Ravdess_csv.zip'
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('/content')

# Custom Dataset Class

Create a custom `Dataset` that loads each audio clip and converts it to MFCC-based features

In [5]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn.functional import normalize


class CustomDataset(Dataset):
    """Dataset of audio clips listed in a CSV annotations file.

    Each item is loaded with torchaudio, resampled to ``target_sample_rate``,
    mixed down to one channel, cut or zero-padded to exactly ``num_samples``
    samples, passed through ``transformation`` and finally summed element-wise
    with its first- and second-order deltas.

    Args:
        annotations_file: Path of a CSV file readable by ``pandas.read_csv``.
            Column 0 is used as the audio path and column 2 as the label.
        audio_dir: Directory prefix joined onto the path taken from the CSV.
        transformation: Module applied to the fixed-length waveform; it is
            moved to ``device`` on construction.
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is cut or padded to.
        device: Device on which the signal processing runs.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations file."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the clip at row ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)

        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)

        # Delta and delta-delta are *added* to the transformed signal rather
        # than concatenated, so the feature tensor keeps the same shape.
        delta = torchaudio.functional.compute_deltas(signal)
        delta2 = torchaudio.functional.compute_deltas(delta)
        features = signal + delta + delta2

        return features, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` along dim 1 to at most ``num_samples``."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad ``signal`` on the right of its last dim up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            # Use the dataset's own device; relying on a module-level `device`
            # global breaks as soon as the class is used outside this notebook.
            resampler = resampler.to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average all channels (dim 0) into one, keeping the channel dim."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Return the audio path for row ``index`` (CSV column 0 joined to ``audio_dir``)."""
        return os.path.join(self.audio_dir, self.annotations.iloc[index, 0])

    def _get_audio_sample_label(self, index):
        """Return the label for row ``index`` (CSV column 2)."""
        return self.annotations.iloc[index, 2]

Instantiate the train, test, and validation datasets

In [6]:
# --- Audio / feature configuration -------------------------------------
AUDIO_DIR = ""  # prefix joined onto the audio paths stored in the CSV files
SAMPLE_RATE = 48000
# Every clip is cut or zero-padded to this length: 4 seconds at SAMPLE_RATE.
NUM_SAMPLES = SAMPLE_RATE*4
train_csv_file = '/content/train_csv.csv'
test_csv_file = '/content/test_csv.csv'
val_csv_file = '/content/val_csv.csv'

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

# One MFCC transform instance is shared by all three datasets.
mfcc = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=20,
    melkwargs={"n_fft": 2048, "hop_length": 512, "power": 2},
)

train_dataset = CustomDataset(
    annotations_file=train_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(train_dataset)} samples in the train dataset.")

test_dataset = CustomDataset(
    annotations_file=test_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(test_dataset)} samples in the test dataset.")

val_dataset = CustomDataset(
    annotations_file=val_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(val_dataset)} samples in validation  dataset.")


Using device cpu
There are 896 samples in the train dataset.
There are 224 samples in the test dataset.
There are 224 samples in validation  dataset.


In [7]:
# Pull one validation item to check the shape of the feature tensor.
signal, label = val_dataset[0]
print(signal.shape)

# Show how many rows each Emotion_ID has in the train, test and validation CSVs.
for split_csv in (train_csv_file, test_csv_file, val_csv_file):
    df = pd.read_csv(split_csv)
    print(df['Emotion_ID'].value_counts())


torch.Size([1, 20, 376])
6    128
5    128
4    128
3    128
2    128
1    128
0    128
Name: Emotion_ID, dtype: int64
6    32
5    32
4    32
3    32
2    32
1    32
0    32
Name: Emotion_ID, dtype: int64
6    32
5    32
4    32
3    32
2    32
1    32
0    32
Name: Emotion_ID, dtype: int64


# Model Loading and Testing

In [8]:
# Batch size used for every DataLoader created below.
batch_size = 32
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source. The result is used directly as a
# callable model later, so the file presumably stores a whole pickled model
# rather than a state_dict -- recent PyTorch releases may then need an explicit
# weights_only=False (TODO confirm against the installed version).
model = torch.load('/content/Resnet50_Ravdess_71acc.pth',map_location=device)


In [9]:
def create_data_loader(train_data, batch_size, shuffle=True):
    """Wrap a dataset in a ``DataLoader``.

    Args:
        train_data: Dataset to iterate over (any split, despite the name).
        batch_size: Number of items per batch.
        shuffle: Whether to reshuffle the data on every pass. Defaults to
            ``True`` to keep the previous behaviour; pass ``False`` for a
            deterministic order, e.g. during evaluation.

    Returns:
        A ``DataLoader`` over ``train_data``.
    """
    return DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)

In [10]:
# Build one loader per split, all sharing the same batch size.
test_dataloader, val_dataloader, train_dataloader = (
    create_data_loader(split_dataset, batch_size)
    for split_dataset in (test_dataset, val_dataset, train_dataset)
)

In [11]:
def test_single_epoch(model, dataloader, device):
  correct = 0
  size = len(dataloader.dataset)

  model.eval()
  for input,target in dataloader:
        input, target = input.to(device), target.to(device)
        # calculate loss
        prediction = model(input)
        correct += (prediction.argmax(1) == target).type(torch.float).sum().item()
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

In [12]:
# Print the loaded model's accuracy on the test split.
test_single_epoch(model,test_dataloader,device)


Test Error: 
 Accuracy: 81.2% 



In [13]:
# Sanity check: print the accuracy on the validation split as well.
test_single_epoch(model,val_dataloader,device)


Test Error: 
 Accuracy: 81.7% 



In [14]:
# Print the accuracy on the training split for comparison with the held-out splits.
test_single_epoch(model,train_dataloader,device)

Test Error: 
 Accuracy: 84.3% 

