<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/augmenting%20tried.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown --id 1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
!gdown --id 1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg

Downloading...
From: https://drive.google.com/uc?id=1UBZLKSiAJyKoIy_cvIH94ad0EhJLO1vY
To: /content/Audio_Speech_Actors_01-24.zip
100% 208M/208M [00:01<00:00, 151MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mfPTTkRtBPl1pUfby5G9CPFj6ml_YvDg
To: /content/Ravdess_csv.zip
100% 6.76k/6.76k [00:00<00:00, 11.3MB/s]


In [2]:
import zipfile

# (archive path, extraction directory) for every downloaded zip file.
archives_to_extract = [
    ('/content/Audio_Speech_Actors_01-24.zip', 'Unzipped_Data'),
    ('/content/Ravdess_csv.zip', '/content'),
]

for dataset_directory, extract_dir in archives_to_extract:
    # The context manager closes the archive even if extraction fails part-way,
    # so a broken download cannot leave an open file handle in the kernel.
    with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Google Drive file id of the EmoDB archive (not downloaded by this notebook):
# 1SQ7-bhKan1gyBoRJiAMOXorgYoRUJwh2

In [55]:
import os
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from torch.utils.data import DataLoader
from torch.nn.functional import normalize
import math

class CustomDataset(Dataset):
    """Audio dataset that serves every annotated clip twice: once clean, once augmented.

    The annotations CSV is read with pandas; column 0 holds the audio path (joined onto
    ``audio_dir``) and column 2 holds the label.

    Indexing layout, with ``N = len(annotations)``:

    * ``0 .. N-1``   -> row ``index``, no augmentation
    * ``N .. 2N-1``  -> row ``index - N``, sox augmentation applied

    Each item is ``(features, label)`` where ``features`` is the output of
    ``transformation`` concatenated with its first and second order deltas along
    dim 1 and standardised along that same dimension.

    Args:
        annotations_file: Path of the CSV file describing the clips.
        audio_dir: Directory prefix joined in front of every path from the CSV.
        transformation: Callable applied to the fixed-length waveform
            (e.g. a ``torchaudio.transforms.MFCC`` instance).
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Exact number of samples per clip after cutting / zero padding.
        device: Stored on the instance; not used by the dataset itself.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # Whether the most recently fetched item was augmented (informational only).
        self.augment = False

    def __len__(self):
        # Every annotated clip is exposed twice: clean and augmented.
        return len(self.annotations) * 2

    def __getitem__(self, index):
        """Load, optionally augment, and featurise the clip behind ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        row, augment = self._resolve_index(index)
        # Kept in sync for any code that inspects the attribute; the decision itself
        # is carried in the local variable so it cannot go stale between calls.
        self.augment = augment

        audio_sample_path = self._get_audio_sample_path(row)
        label = self._get_audio_sample_label(row)
        signal, sr = torchaudio.load(audio_sample_path)

        if augment:
            effects = [
                ["lowpass", "-1", "300"],  # apply single-pole lowpass filter
                ["speed", "0.8"],  # reduce the speed
                # `speed` only changes the sample rate, so a `rate` effect with the
                # original sample rate has to follow it.
                ["rate", f"{sr}"],
                ["reverb", "-w"],  # wet-only reverberation
            ]
            signal, sr = torchaudio.sox_effects.apply_effects_tensor(signal, sr, effects)

        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        delta = torchaudio.functional.compute_deltas(signal)
        delta2 = torchaudio.functional.compute_deltas(delta)
        signal = torch.cat((signal, delta, delta2), 1)  # stack features and deltas on dim 1
        signal = self._standardize(signal)

        return signal, label

    def _resolve_index(self, index):
        """Map a dataset index to ``(annotation_row, augment)``.

        The previous mapping derived the augment flag from ``index >= N`` but then
        halved the index, so the first half of the rows was only ever served clean
        (twice each) and the second half only ever augmented (twice each).
        """
        n_rows = len(self.annotations)
        total = 2 * n_rows
        if index < 0:
            index += total  # usual Python negative-index convention
        if not 0 <= index < total:
            raise IndexError(f"index out of range for dataset of length {total}")
        return index % n_rows, index >= n_rows

    @staticmethod
    def _standardize(features, eps=1e-8):
        """Zero-mean / unit-std normalisation along dim 1.

        The standard deviation is clamped to ``eps`` so a constant slice yields
        zeros instead of NaN from a division by zero.
        """
        means = features.mean(dim=1, keepdim=True)
        stds = features.std(dim=1, keepdim=True).clamp_min(eps)
        return (features - means) / stds

    def _cut_if_necessary(self, signal):
        # Truncate clips longer than num_samples (dim 1 is the sample axis).
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        # Zero-pad the end of clips shorter than num_samples.
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        # Bring the clip to target_sample_rate when the file uses a different rate.
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        # Average multi-channel audio down to a single channel (dim 0 is the channel axis).
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    # for any annotation row this function returns the audio path
    def _get_audio_sample_path(self, index):
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 0])
        return path

    # for the specified annotation row this returns the label from the data csv
    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 2]

In [56]:
# --- Audio / path configuration -------------------------------------------------
# Empty prefix: the paths stored in the CSV files are used as they are.
AUDIO_DIR = ""
SAMPLE_RATE = 48000
# Every clip is cut / padded to four seconds of audio at SAMPLE_RATE.
NUM_SAMPLES = SAMPLE_RATE*4
train_csv_file = '/content/train_csv.csv'
test_csv_file = '/content/test_csv.csv'
val_csv_file = '/content/val_csv.csv'

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

# Feature extractor shared by all three splits.
mfcc = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=44,
    melkwargs={"n_fft": 1500, "hop_length": 500, 'power': 2},
)

# --- Dataset splits -------------------------------------------------------------
train_dataset = CustomDataset(
    annotations_file=train_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(train_dataset)} samples in the train dataset.")

test_dataset = CustomDataset(
    annotations_file=test_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(test_dataset)} samples in the test dataset.")

val_dataset = CustomDataset(
    annotations_file=val_csv_file,
    audio_dir=AUDIO_DIR,
    transformation=mfcc,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(val_dataset)} samples in validation  dataset.")

# Sanity check: pull one item and show the feature tensor shape fed to the model.
signal, label = val_dataset[0]
print(signal.shape)

Using device cuda
There are 1792 samples in the train dataset.
There are 448 samples in the test dataset.
There are 448 samples in validation  dataset.
torch.Size([1, 132, 385])


In [57]:
from torch import nn
from torchsummary import summary

from torchvision import models
# Start from the pretrained torchvision ResNet-50.
model = models.resnet50(pretrained=True)

# Swap the stem convolution for a single-input-channel one, reusing the original
# layer's output channels, kernel size, stride and padding. The new layer is
# freshly initialised, so the pretrained stem weights are not carried over.
model.conv1 = nn.Conv2d(
    1,
    model.conv1.out_channels,
    kernel_size=model.conv1.kernel_size[0],
    stride=model.conv1.stride[0],
    padding=model.conv1.padding[0],
)

# Replace the classifier head with dropout + a 7-way linear layer
# (presumably one output per emotion class -- confirm against the label CSV).
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(nn.Dropout(p=0.25), nn.Linear(num_ftrs, 7))

# Move to the device only AFTER the layer surgery: moving first left the new
# conv1 / fc on the CPU until some later cell called .to(device) again.
model = model.to(device)

In [59]:
# Make sure every layer of the model lives on the selected device before training.
model = model.to(device)
# Optional architecture printout (torchsummary), disabled:
#summary(model, signal.shape)

In [61]:
# Training hyper-parameters: samples per batch, passes over the data, Adam step size.
BATCH_SIZE, EPOCHS = 32, 10
LEARNING_RATE = 1e-4
import torch.utils.data as data

def create_data_loader(train_data, batch_size, shuffle=True):
    """Wrap a dataset in a ``DataLoader``.

    Args:
        train_data: Dataset (any object accepted by ``DataLoader``) to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the data every epoch. Defaults to ``True`` (the previous
            hard-coded behaviour); pass ``False`` for validation / test loaders
            when a deterministic order is wanted.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)

def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module to train; switched to training mode here.
        data_loader: Iterable yielding ``(inputs, target)`` tensor batches.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optimiser: Optimiser that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The sample-weighted mean loss over the epoch, or ``nan`` when the loader
        produced no batches (previously that case raised ``NameError`` because the
        loss variable was never assigned).
    """
    model.train()
    loss_sum = 0.0
    samples_seen = 0
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    for inputs, target in data_loader:
        inputs, target = inputs.to(device), target.to(device)
        # calculate loss
        prediction = model(inputs)
        loss = loss_fn(prediction, target)
        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        # Weight by batch size so a short final batch does not skew the mean.
        batch_len = target.size(0)
        loss_sum += loss.item() * batch_len
        samples_seen += batch_len
    # Report the epoch mean instead of whatever the last batch happened to give.
    mean_loss = loss_sum / samples_seen if samples_seen else float("nan")
    print(f"loss: {mean_loss}")
    return mean_loss


def test_single_epoch(model, dataloader, loss_fn, optimiser, device):
  correct = 0
  size = len(dataloader.dataset)
  model.eval()
  with torch.no_grad():
    for input,target in dataloader:
          input, target = input.to(device), target.to(device)
          # calculate loss
          prediction = model(input)
          loss = loss_fn(prediction, target)
          correct += (prediction.argmax(1) == target).type(torch.float).sum().item()
    correct /= size
    
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

  return 100*correct


def train_test(model, train_dataloader, test_dataloader, loss_fn, optimiser, device, epochs,best_acc=0):
    """Alternate training and evaluation for ``epochs`` rounds, checkpointing on improvement.

    After each epoch the accuracy returned by ``test_single_epoch`` is compared with
    the best value so far; a strictly better result stores the whole model object
    in ``model.pth``.

    Args:
        model: Module being trained.
        train_dataloader: Batches used for the optimisation pass.
        test_dataloader: Batches used for the accuracy check.
        loss_fn: Loss callable forwarded to both epoch helpers.
        optimiser: Optimiser forwarded to both epoch helpers.
        device: Device forwarded to both epoch helpers.
        epochs: Number of train/evaluate rounds.
        best_acc: Accuracy to beat before the first checkpoint is written.

    Returns:
        The highest accuracy seen, or ``best_acc`` unchanged if it was never beaten.
    """
    separator = "---------------------------"
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}")
        train_single_epoch(model, train_dataloader, loss_fn, optimiser, device)
        accuracy = test_single_epoch(model, test_dataloader, loss_fn, optimiser, device)
        # (an LR scheduler step on `accuracy` would go here; currently disabled)
        if accuracy > best_acc:
            # The message shows the previous best, i.e. the value being replaced.
            print("saving model now", best_acc)
            best_acc = accuracy
            torch.save(model, "model.pth")
        print(separator)
    print("Finished training")
    return best_acc
# One loader per split, all with the same batch size.
train_dataloader, test_dataloader, val_dataloader = (
    create_data_loader(split, BATCH_SIZE)
    for split in (train_dataset, test_dataset, val_dataset)
)
model = model.to(device)

# initialise loss function + optimiser
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Plateau-based LR schedule, currently disabled:
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser, 'max',factor=0.1, patience=10,verbose=True )


In [48]:
#optimiser = torch.optim.SGD(model.parameters(),lr=.000001,momentum=0.9, weight_decay=0.01)


In [None]:
# Train against the validation split. Re-running this cell resumes from the best
# accuracy of the previous run; on a fresh kernel `best_acc` does not exist yet
# (it used to raise NameError here), so fall back to 0.
best_acc = train_test(model, train_dataloader, val_dataloader, loss_fn, optimiser, device, EPOCHS,
                      best_acc=globals().get("best_acc", 0))

Epoch 1
