<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/Ravdess%20Inference%20Skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Unzip Dataset (RAVDESS)

# Download And Extract

In [2]:
#Audio_Speech_Actors_01-24.zip
!gdown --id 1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
!gdown --id 1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg

Downloading...
From: https://drive.google.com/uc?id=1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [00:01<00:00, 191MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg
To: /content/Ravdess_csv.zip
100% 6.76k/6.76k [00:00<00:00, 10.5MB/s]


In [3]:
import zipfile
# Extract both downloaded archives. Opening each ZipFile as a context manager
# guarantees the file handle is closed even if extraction raises.
dataset_directory = '/content/Audio_Speech_Actors_01-24.zip'
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('Unzipped_Data')

dataset_directory = '/content/Ravdess_csv.zip'
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('/content')

# Custom Dataset Class

Create a custom `Dataset` that loads each audio clip listed in an annotations CSV and applies a transformation to it.

In [4]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn.functional import normalize


class CustomDataset(Dataset):
    """Audio dataset driven by an annotations CSV.

    Each item is loaded with torchaudio, moved to ``device``, resampled to
    ``target_sample_rate``, mixed down to one channel, cut or zero-padded to
    exactly ``num_samples`` samples and finally passed through
    ``transformation``.

    Args:
        annotations_file: Path of the CSV read with ``pandas.read_csv``.
            Column 0 is used as the audio path and column 2 as the label.
        audio_dir: Directory prefix joined onto each path from the CSV.
        transformation: Module applied to the prepared waveform; it is moved
            to ``device`` once, at construction time.
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Exact number of samples per clip after cut/pad.
        device: Device on which the signal processing runs.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(transformed_signal, label)`` for the row at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)

        return signal, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` along dim 1 to at most ``num_samples`` samples."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad ``signal`` on the right of its last dim up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            # Use the dataset's own device rather than a module-level global,
            # so the class works regardless of what the caller's scope defines.
            resampler = resampler.to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average over dim 0 when there is more than one channel, keeping that dim."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Return ``audio_dir`` joined with column 0 of the annotation row."""
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return column 2 of the annotation row, used as the label."""
        return self.annotations.iloc[index, 2]

Instantiate the dataset

In [50]:
# --- Configuration ---------------------------------------------------------
AUDIO_DIR = ""                  # prefix joined onto the paths stored in the CSVs
SAMPLE_RATE = 16300             # every clip is resampled to this rate
NUM_SAMPLES = 4 * SAMPLE_RATE   # clips are cut/padded to four seconds' worth of samples
train_csv_file = '/content/train_csv.csv'
test_csv_file = '/content/test_csv.csv'
val_csv_file = '/content/val_csv.csv'

# Prefer the GPU when one is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=128
)

# The three splits differ only in their annotation file.
shared_args = (AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)

train_dataset = CustomDataset(train_csv_file, *shared_args)
print(f"There are {len(train_dataset)} samples in the train dataset.")

test_dataset = CustomDataset(test_csv_file, *shared_args)
print(f"There are {len(test_dataset)} samples in the test dataset.")

val_dataset = CustomDataset(val_csv_file, *shared_args)
print(f"There are {len(val_dataset)} samples in validation  dataset.")

# Sanity check: load one item and show the shape of the transformed signal.
signal, label = val_dataset[0]
print(signal.shape)

Using device cpu
There are 896 samples in the train dataset.
There are 224 samples in the test dataset.
There are 224 samples in validation  dataset.
torch.Size([1, 128, 128])


In [51]:
# Show the shape of one transformed validation item, then the number of rows
# per Emotion_ID in the train, test and validation CSVs (in that order).
signal, label = val_dataset[0]
print(signal.shape)
for split_csv in (train_csv_file, test_csv_file, val_csv_file):
    df = pd.read_csv(split_csv)
    print(df['Emotion_ID'].value_counts())


torch.Size([1, 128, 128])
6    128
5    128
4    128
3    128
2    128
1    128
0    128
Name: Emotion_ID, dtype: int64
6    32
5    32
4    32
3    32
2    32
1    32
0    32
Name: Emotion_ID, dtype: int64
6    32
5    32
4    32
3    32
2    32
1    32
0    32
Name: Emotion_ID, dtype: int64


# Model Loading and Testing

In [52]:
from torch import nn
class CNNNetwork(nn.Module):
    """Seven-stage strided CNN mapping a 1-channel 2-D input to 7 class scores.

    Every stage is ``Conv2d(stride=2, padding=1) -> ReLU -> BatchNorm2d``.
    The flattened output of the last stage must have 128 features to match
    ``linear1``; a ``(N, 1, 128, 128)`` input satisfies this.

    ``forward`` returns raw logits. ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so feeding it already-softmaxed outputs would
    apply softmax twice and flatten the gradients. Use ``self.softmax`` on
    the logits when probabilities are needed.
    """

    def __init__(self):
        super().__init__()

        # Arguments are (in_channels, out_channels, kernel_size).
        self.conv1 = self._conv_block(1, 16, 7)
        self.conv2 = self._conv_block(16, 32, 5)
        self.conv3 = self._conv_block(32, 32, 3)
        self.conv4 = self._conv_block(32, 64, 3)
        self.conv5 = self._conv_block(64, 64, 3)
        self.conv6 = self._conv_block(64, 128, 3)
        self.conv7 = self._conv_block(128, 128, 3)

        self.flatten = nn.Flatten()
        self.linear1 = nn.Sequential(nn.Linear(128, 64), nn.Dropout(0.25))
        self.linear2 = nn.Linear(64, 7)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with ``model.softmax(model(x))``.
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel_size):
        """Build one ``Conv2d -> ReLU -> BatchNorm2d`` stage with stride 2, padding 1."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(kernel_size, kernel_size),
                stride=(2, 2),
                padding=1
            ),
            nn.ReLU(),
            nn.BatchNorm2d(out_channels)
        )

    def forward(self, input_data):
        """Return unnormalised class scores (logits) of shape ``(N, 7)``."""
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.flatten(x)

        x = self.linear1(x)
        logits = self.linear2(x)
        return logits
# Instantiate the network and move its parameters and buffers to `device`.
model = CNNNetwork().to(device)

In [61]:
from torch import nn
# Batch size, loss function, optimizer and learning-rate schedule.
BATCH_SIZE = 32
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# StepLR multiplies the learning rate by `gamma` every `step_size` scheduler steps.
step_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.1)

def create_data_loader(train_data, batch_size, shuffle=True):
    """Wrap a dataset in a ``DataLoader``.

    Args:
        train_data: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            evaluation loaders that should iterate in a fixed order.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)

# One DataLoader per split.
train_dataloader = create_data_loader(train_dataset, BATCH_SIZE)
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE)
val_dataloader = create_data_loader(val_dataset, BATCH_SIZE)

dataloaders = {
  "train": train_dataloader,
  "test": test_dataloader,
  "val": val_dataloader,
}
# Number of samples per split. len(dataloader) would be the number of
# batches, not the dataset size, so measure the wrapped dataset instead.
dataset_sizes = {
  split: len(loader.dataset) for split, loader in dataloaders.items()
}

In [66]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches.
        model: Module being trained; put into training mode here.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The same ``model`` instance, after the epoch's updates.
    """
    model.train()
    size = len(dataloader.dataset)
    # Send batches to wherever the model lives instead of relying on a
    # notebook-global `device`. A model with no parameters leaves batches as-is.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    for batch, (X, y) in enumerate(dataloader):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every 30 batches.
        if batch % 30 == 0:
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")
    return model

In [63]:
def test_loop(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for (X,y) in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [67]:
import math
import time
# Train for EPOCHS epochs, evaluating on the validation split after each one.
EPOCHS = 30
start = time.time()
for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, criterion, optimizer)
    test_loop(val_dataloader, model, criterion)
    # Advance the LR schedule once per epoch. Without this call the StepLR
    # scheduler never changes the learning rate.
    step_lr_scheduler.step()
final = (time.time() - start)/60
print(f"Done for all {EPOCHS} epochs in {math.ceil(final)} minutes\n")

Epoch 1
-------------------------------
tensor([[3.0635e-09, 1.9883e-12, 1.0000e+00, 2.8209e-14, 1.7850e-10, 4.7215e-06,
         4.3982e-09],
        [1.0000e+00, 1.8995e-10, 4.3100e-12, 4.6553e-09, 7.5132e-12, 7.6356e-10,
         1.2670e-15],
        [1.4645e-08, 1.1507e-05, 3.5182e-05, 2.9555e-08, 8.9617e-08, 1.8673e-04,
         9.9977e-01],
        [7.7914e-10, 2.2137e-07, 1.0219e-08, 9.9995e-01, 6.6936e-08, 4.6222e-05,
         2.2241e-06],
        [1.2547e-05, 2.0972e-06, 9.9998e-01, 1.6490e-09, 1.2399e-06, 6.7762e-08,
         9.9681e-08],
        [4.5020e-09, 3.9196e-07, 3.2492e-06, 6.1354e-10, 8.0208e-08, 1.5915e-07,
         1.0000e+00],
        [6.3140e-06, 2.2224e-09, 1.2216e-07, 1.4827e-06, 9.9999e-01, 5.6963e-08,
         2.3102e-06],
        [8.3021e-06, 1.1144e-07, 9.9999e-01, 5.4512e-10, 6.5514e-07, 4.4414e-07,
         1.6734e-07],
        [1.4992e-07, 4.9414e-05, 2.2747e-05, 9.9988e-01, 1.3584e-06, 3.6860e-05,
         4.5064e-06],
        [1.3147e-07, 2.3822e-06, 

KeyboardInterrupt: ignored