<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/Ravdess%20Inference%20try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Unzip Dataset (RAVDESS)

In [1]:
!gdown --id 1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY

Downloading...
From: https://drive.google.com/uc?id=1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [00:01<00:00, 186MB/s]


In [2]:
import zipfile
dataset_directory = '/content/Audio_Speech_Actors_01-24.zip'
zip_ref = zipfile.ZipFile(dataset_directory, 'r')
zip_ref.extractall('Unzipped_Data')
zip_ref.close()

Create custom Dataset

In [14]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn.functional import normalize


class CustomDataset(Dataset):
    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        #print(audio_sample_path)
        label = self._get_audio_sample_label(index)
        #print(label)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        # produce delta and delta-delta and add to mfcc
        delta = torchaudio.functional.compute_deltas(signal)
        delta2 = torchaudio.functional.compute_deltas(delta)
        #signal = torch.cat((signal, delta, delta2), 1) # concatenating deltas
        
        signal = signal + delta + delta2
        avg_mfcc_deltas = signal
        #columns = list(signal[0][1].shape)
        #columns = int(columns[0])
        #avg_mfcc_deltas = (torch.sum(signal, 2))
        #avg_mfcc_deltas /= 60
        
        #means = avg_mfcc_deltas.mean(dim=1, keepdim=True)
        #stds = avg_mfcc_deltas.std(dim=1, keepdim=True)
        #avg_mfcc_deltas = (avg_mfcc_deltas - means) / stds #NORMALIZED
        #avg_mfcc_deltas = normalize(avg_mfcc_deltas, p=2.0)

        return  avg_mfcc_deltas,label

    def _cut_if_necessary(self, signal):
        if signal.shape[1] > self.num_samples:
            # print(signal.shape[1]) # print sample size
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            resampler = resampler.to(device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    # for any audio index this function returns the audio path
    def _get_audio_sample_path(self, index):
        path = os.path.join(self.audio_dir, self.annotations.iloc[index,0])
        return path
    
    # for specified audio index this will return the label from the data csv
    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 2]

Instantiate the dataset

In [15]:
if __name__ == "__main__":
    AUDIO_DIR = ""
    SAMPLE_RATE = 48000
    NUM_SAMPLES = SAMPLE_RATE*3  # either reduce the samples to 1/4 of number of samples or increase sample size 4 times of sample rate to adjust to 4 second audio
    train_csv_file = '/content/train_csv.csv'
    test_csv_file = '/content/test_csv.csv'
    val_csv_file = '/content/val_csv.csv'

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device {device}")

    mfcc = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=20, melkwargs={"n_fft": 2048, "hop_length": 512, "power": 2})

    test_csv_file = CustomDataset(test_csv_file,
                            AUDIO_DIR,
                            mfcc,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    print(f"There are {len(test_csv_file)} samples in the dataset.")
   

    val_csv_file = CustomDataset(val_csv_file,
                            AUDIO_DIR,
                            mfcc,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    
    print(f"There are {len(val_csv_file)} samples in the dataset.")
  
    #print(signal.size())  
    #print(len(usd))

Using device cuda
There are 269 samples in the dataset.
There are 108 samples in the dataset.


In [7]:
!gdown --id 1NiKtbbebxL8kNPw7bsbCr85lprAomcNw

Downloading...
From: https://drive.google.com/uc?id=1NiKtbbebxL8kNPw7bsbCr85lprAomcNw
To: /content/Resnet50_Ravdess_71acc.pth
100% 94.4M/94.4M [00:00<00:00, 203MB/s]


In [16]:
model = torch.load('/content/Resnet50_Ravdess_71acc.pth')


In [17]:
def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

In [28]:
test_dataloader = create_data_loader(val_csv_file,32)

In [29]:
def test_single_epoch(model, dataloader, device):
  correct = 0
  size = len(dataloader.dataset)

  model.eval()
  for input,target in dataloader:
        input, target = input.to(device), target.to(device)
        # calculate loss
        prediction = model(input)
        correct += (prediction.argmax(1) == target).type(torch.float).sum().item()
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

In [30]:
test_single_epoch(model,test_dataloader,device)

Test Error: 
 Accuracy: 87.0% 



In [None]:
BATCH_SIZE = 64
EPOCHS = 100
LEARNING_RATE = 0.001
import torch.utils.data as data

def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def test_single_epoch(model, dataloader, loss_fn, optimiser, device):
  correct = 0
  size = len(dataloader.dataset)
  with torch.no_grad():
    for input, target in dataloader:
          input, target = input.to(device), target.to(device)

          # calculate loss
          prediction = model(input)
          loss = loss_fn(prediction, target)

          # backpropagate error and update weights
          #optimiser.zero_grad()
          #loss.backward()
          #optimiser.step()
          correct += (prediction.argmax(1) == target).type(torch.float).sum().item()

    print(f"loss: {loss.item()}")
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

# original train function
def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


def train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, train_dataloader, loss_fn, optimiser, device)
        test_single_epoch(model, test_dataloader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


# initialise loss funtion + optimiser
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(),
                              lr=LEARNING_RATE)

# train model
#train(model, train_dataloader, loss_fn, optimiser, device, EPOCHS)

# save model
#torch.save(cnn.state_dict(), "cnn.pth")
#print("Trained Convolutional Neural Network saved at cnn.pth")

Optimiser: Adam
LR = 0.0001
Epochs = 100
cnn params = 2.5M


In [None]:
train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, EPOCHS)

Epoch 1
loss: 2.033282995223999
loss: 2.033538579940796
Test Error: 
 Accuracy: 11.5% 

---------------------------
Epoch 2
loss: 2.0070557594299316
loss: 2.0512685775756836
Test Error: 
 Accuracy: 22.9% 

---------------------------
Epoch 3
loss: 2.0062015056610107
loss: 2.0284602642059326
Test Error: 
 Accuracy: 24.3% 

---------------------------
Epoch 4
loss: 1.9975597858428955
loss: 2.0138556957244873
Test Error: 
 Accuracy: 27.8% 

---------------------------
Epoch 5
loss: 1.986736536026001
loss: 2.0612738132476807
Test Error: 
 Accuracy: 32.3% 

---------------------------
Epoch 6
loss: 1.9401371479034424
loss: 2.0289509296417236
Test Error: 
 Accuracy: 31.9% 

---------------------------
Epoch 7
loss: 1.9318186044692993
loss: 2.045159339904785
Test Error: 
 Accuracy: 30.9% 

---------------------------
Epoch 8
loss: 1.9668943881988525
loss: 2.0384926795959473
Test Error: 
 Accuracy: 31.9% 

---------------------------
Epoch 9
loss: 1.9367958307266235
loss: 2.0194029808044434
Te

In [None]:
torch.save(model.state_dict(), "cnn.pth")

In [None]:
EPOCHS = 50
LEARNING_RATE = 0.001
train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, EPOCHS)

Epoch 1
loss: 2.0654027462005615
loss: 2.0706677436828613
Test Error: 
 Accuracy: 11.1% 

---------------------------
Epoch 2
loss: 2.0616683959960938
loss: 2.0481722354888916
Test Error: 
 Accuracy: 12.8% 

---------------------------
Epoch 3
loss: 2.057406187057495
loss: 2.0335593223571777
Test Error: 
 Accuracy: 15.3% 

---------------------------
Epoch 4
loss: 2.054110288619995
loss: 2.0349717140197754
Test Error: 
 Accuracy: 14.6% 

---------------------------
Epoch 5
loss: 2.0524582862854004
loss: 1.988764762878418
Test Error: 
 Accuracy: 16.0% 

---------------------------
Epoch 6
loss: 2.050995111465454
loss: 1.9794433116912842
Test Error: 
 Accuracy: 15.3% 

---------------------------
Epoch 7
loss: 2.0471460819244385
loss: 1.980089783668518
Test Error: 
 Accuracy: 16.0% 

---------------------------
Epoch 8
loss: 2.045097827911377
loss: 1.978513240814209
Test Error: 
 Accuracy: 14.9% 

---------------------------
Epoch 9
loss: 2.043210506439209
loss: 1.9772216081619263
Test E

KeyboardInterrupt: ignored

In [None]:
from torchvision import models
from torch import nn

model = models.resnet50(pretrained=True)

model = model.to(device)
model.conv1=nn.Conv2d(1, model.conv1.out_channels, 
                      kernel_size=model.conv1.kernel_size[0], 
                      stride=model.conv1.stride[0], 
                      padding=model.conv1.padding[0])
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(*[nn.Dropout(p=0.25), nn.Linear(num_ftrs, 8)])
model = model.to(device)

In [None]:
EPOCHS = 50
#LEARNING_RATE = 0.001
train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, EPOCHS)

Epoch 1
loss: 2.132601499557495
loss: 2.125936508178711
Test Error: 
 Accuracy: 10.8% 

---------------------------
Epoch 2
loss: 2.105767250061035
loss: 2.1930460929870605
Test Error: 
 Accuracy: 11.8% 

---------------------------
Epoch 3
loss: 2.117488384246826
loss: 2.220412492752075
Test Error: 
 Accuracy: 11.5% 

---------------------------
Epoch 4
loss: 2.1412150859832764
loss: 2.1393489837646484
Test Error: 
 Accuracy: 11.8% 

---------------------------
Epoch 5
loss: 2.0891730785369873
loss: 2.19270920753479
Test Error: 
 Accuracy: 14.2% 

---------------------------
Epoch 6
loss: 2.112936019897461
loss: 2.1531589031219482
Test Error: 
 Accuracy: 14.9% 

---------------------------
Epoch 7
loss: 2.1354024410247803
loss: 2.127596378326416
Test Error: 
 Accuracy: 13.2% 

---------------------------
Epoch 8
loss: 2.109525442123413
loss: 2.170292615890503
Test Error: 
 Accuracy: 11.5% 

---------------------------
Epoch 9
loss: 2.154937982559204
loss: 2.152895212173462
Test Error:

In [None]:
EPOCHS = 30
LEARNING_RATE = 0.0001
train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, EPOCHS)

Epoch 1
loss: 0.7167951464653015
loss: 2.1231791973114014
Test Error: 
 Accuracy: 45.5% 

---------------------------
Epoch 2
loss: 0.7439258098602295
loss: 2.2900640964508057
Test Error: 
 Accuracy: 43.8% 

---------------------------
Epoch 3
loss: 0.7316758632659912
loss: 2.0084497928619385
Test Error: 
 Accuracy: 47.2% 

---------------------------
Epoch 4
loss: 0.824337363243103
loss: 1.9670825004577637
Test Error: 
 Accuracy: 44.1% 

---------------------------
Epoch 5
loss: 0.6440131068229675
loss: 1.8499634265899658
Test Error: 
 Accuracy: 45.1% 

---------------------------
Epoch 6
loss: 0.42822885513305664
loss: 2.2577555179595947
Test Error: 
 Accuracy: 39.9% 

---------------------------
Epoch 7
loss: 0.4221033453941345
loss: 2.2379703521728516
Test Error: 
 Accuracy: 43.1% 

---------------------------
Epoch 8
loss: 0.43903622031211853
loss: 2.2819457054138184
Test Error: 
 Accuracy: 43.1% 

---------------------------
Epoch 9
loss: 0.4059475064277649
loss: 2.07771968841552