<a href="https://colab.research.google.com/github/Faisal-NSU/CSE465/blob/main/skeleton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the dataset archive from Google Drive by file ID; the recorded output
# below shows it being saved as /content/SUBESCO.zip.
# NOTE(review): recent gdown releases reportedly deprecate the `--id` flag and
# accept the bare file ID -- confirm against the installed gdown version.
!gdown --id 1k-afEJdwz5Tf4-bsuKOJzP7xn-KQTlkM

Downloading...
From: https://drive.google.com/uc?id=1k-afEJdwz5Tf4-bsuKOJzP7xn-KQTlkM
To: /content/SUBESCO.zip
100% 1.65G/1.65G [00:11<00:00, 149MB/s]


In [None]:
from google.colab import drive


import zipfile
# Unpack the downloaded archive into /content. The context manager closes the
# archive handle even if extraction raises part-way through.
dataset_directory = '/content/SUBESCO.zip'
with zipfile.ZipFile(dataset_directory, 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
import os

import pandas as pd
import torch
import torch.optim.lr_scheduler as lr_scheduler
import torchaudio
from torch import nn
from torch.nn.functional import normalize
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    """Audio dataset yielding a fixed-length, transformed clip and its label.

    Every row of the annotations CSV describes one clip: column 0 is read as the
    file path relative to ``audio_dir`` and column 4 as the label.

    Args:
        annotations_file: Path of the CSV file listing the clips.
        audio_dir: Directory that the paths in the CSV are joined onto.
        transformation: Module applied to the prepared waveform; it is moved
            to ``device`` on construction.
        target_sample_rate: Clips with a different sample rate are resampled
            to this rate.
        num_samples: Every waveform is cut or zero-padded to this many samples.
        device: Device on which the waveform is prepared and transformed.
    """

    # Positional CSV columns used by the lookups below.
    _PATH_COLUMN = 0
    _LABEL_COLUMN = 4

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load clip ``index`` and return ``(transformed_signal, label)``.

        The waveform is resampled, mixed down to one channel, cut or padded to
        ``num_samples`` and finally passed through ``self.transformation``.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)

        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)

        # Disabled experiment, kept for reference: stack first/second-order
        # deltas onto the features and standardise them.
        #   delta = torchaudio.functional.compute_deltas(signal)
        #   delta2 = torchaudio.functional.compute_deltas(delta)
        #   avg_mfcc_deltas = torch.cat((signal, delta, delta2), 0)
        #   means = avg_mfcc_deltas.mean(dim=0, keepdim=True)
        #   stds = avg_mfcc_deltas.std(dim=0, keepdim=True)
        #   avg_mfcc_deltas = (avg_mfcc_deltas - means) / stds

        return signal, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` along dim 1 to at most ``num_samples`` entries."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad ``signal`` on the right of the last dim up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            # Use the dataset's own device rather than a notebook-level global,
            # so the resampler always lives where the signal was moved to.
            resampler = resampler.to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average over dim 0 (keeping the dim) when it holds more than one entry."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Return the audio file path for row ``index`` of the annotations."""
        return os.path.join(self.audio_dir, self.annotations.iloc[index, self._PATH_COLUMN])

    def _get_audio_sample_label(self, index):
        """Return the label stored for row ``index`` of the annotations."""
        return self.annotations.iloc[index, self._LABEL_COLUMN]

In [None]:
# --- Audio settings -------------------------------------------------------
SAMPLE_RATE = 16000
NUM_SAMPLES = 4 * SAMPLE_RATE  # every clip is cut/padded to this many samples

# Class index -> emotion name.
classes = dict(enumerate(
    ["ANGRY", "DISGUST", "FEAR", "HAPPY", "NEUTRAL", "SAD", "SURPRISE"]
))

# --- Dataset locations (annotation CSV + audio folder per split) ----------
TRAIN_CSV = '/content/SUBESCO/train/train.csv'
TEST_CSV = '/content/SUBESCO/test/test.csv'
VALID_CSV = '/content/SUBESCO/valid/valid.csv'

TRAIN_PATH = '/content/SUBESCO/train/'
TEST_PATH = '/content/SUBESCO/test/'
VALID_PATH = '/content/SUBESCO/valid/'

# --- Compute device: use CUDA when it is available ------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

# Feature extractor shared by all three splits. Despite the variable name this
# is a mel-spectrogram transform, not an MFCC one.
mfcc = torchaudio.transforms.MelSpectrogram(
    # signal / STFT framing
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    win_length=None,
    center=True,
    pad_mode="reflect",
    onesided=True,
    power=2.0,
    # mel filter bank
    n_mels=224,
    mel_scale="htk",
    norm='slaney',
)


def _make_split(csv_path, audio_root):
    """Build a CustomDataset for one split using the shared notebook settings."""
    return CustomDataset(csv_path, audio_root, mfcc, SAMPLE_RATE, NUM_SAMPLES, device)


train_dataset = _make_split(TRAIN_CSV, TRAIN_PATH)
print(f"There are {len(train_dataset)} samples in the dataset.")

# Peek at the first training example to see the feature shape. The variable
# name is kept as-is because a later cell passes its shape to `summary`.
avg_mfcc_deltas, label = train_dataset[0]
print(avg_mfcc_deltas.shape)

val_dataset = _make_split(VALID_CSV, VALID_PATH)
print(f"There are {len(val_dataset)} samples in the dataset.")

test_dataset = _make_split(TEST_CSV, TEST_PATH)
print(f"There are {len(test_dataset)} samples in the dataset.")

Using device cuda
There are 4900 samples in the dataset.
torch.Size([1, 224, 126])
There are 700 samples in the dataset.
There are 1400 samples in the dataset.


In [None]:
import torch
import torch.nn.functional as F
class ANN(nn.Module):
  """Fully connected baseline classifier over flattened features.

  Args:
    in_features: Number of values per example after flattening. The default
      (22560) is the size this network was originally written for; the
      feature tensor printed earlier in this notebook has shape (1, 224, 126),
      i.e. 28224 values, so pass ``in_features=28224`` for those features.
    num_classes: Number of output classes (default 7).

  ``forward`` returns softmax probabilities of shape ``(batch, num_classes)``.
  """

  def __init__(self, in_features=22560, num_classes=7):
    super().__init__()
    self.flatten = nn.Flatten()
    self.dense_layers = nn.Sequential(
      nn.Linear(in_features, 4096),
      nn.Dropout(p=0.5),
      nn.ReLU(),
      nn.Linear(4096, 1024),
      nn.Dropout(p=0.5),
      nn.ReLU(),
      nn.Linear(1024, 256),
      nn.Dropout(p=0.5),
      nn.ReLU(),
      nn.Linear(256, num_classes),
    )
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_data):
    """Flatten ``input_data`` per example and return class probabilities."""
    flattened_input_data = self.flatten(input_data)
    logits = self.dense_layers(flattened_input_data)
    # NOTE(review): nn.CrossEntropyLoss expects raw logits; if this model is
    # trained with that loss, the softmax here is applied twice -- confirm.
    prediction = self.softmax(logits)
    return prediction

In [None]:
from torchsummary import summary

# Instantiate the dense baseline on the selected device and print a per-layer
# summary for an input shaped like the first training example.
# NOTE(review): ANN's first Linear layer takes 22560 flattened features, while
# the sample shape printed above is (1, 224, 126) = 28224 values -- confirm the
# two agree before re-running this cell.
model = ANN().to(device)
summary(model, avg_mfcc_deltas.shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                [-1, 22560]               0
            Linear-2                 [-1, 4096]      92,409,856
           Dropout-3                 [-1, 4096]               0
              ReLU-4                 [-1, 4096]               0
            Linear-5                 [-1, 1024]       4,195,328
           Dropout-6                 [-1, 1024]               0
              ReLU-7                 [-1, 1024]               0
            Linear-8                  [-1, 256]         262,400
           Dropout-9                  [-1, 256]               0
             ReLU-10                  [-1, 256]               0
           Linear-11                    [-1, 7]           1,799
          Softmax-12                    [-1, 7]               0
Total params: 96,869,383
Trainable params: 96,869,383
Non-trainable params: 0
-------------------------

In [None]:
from torchsummary import summary
from torchvision import models
from torch import nn

# Use CUDA when it is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device {device}")

# Start from a pretrained ResNet-50.
# NOTE(review): newer torchvision releases reportedly prefer `weights=` over
# `pretrained=` -- confirm against the installed version.
model = models.resnet50(pretrained=True)

# Swap the first convolution for a freshly initialised one with a single input
# channel, reusing the original layer's output channels, kernel size, stride
# and padding.
stem = model.conv1
model.conv1 = nn.Conv2d(
    1,
    stem.out_channels,
    kernel_size=stem.kernel_size[0],
    stride=stem.stride[0],
    padding=stem.padding[0],
)

# Replace the classification head: dropout followed by a 7-way linear layer.
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(nn.Dropout(p=0.25), nn.Linear(num_ftrs, 7))

# Move the whole network, including the new layers, to the selected device.
model = model.to(device)

Using device cuda


In [None]:
BATCH_SIZE = 64


def create_data_loader(train_data, batch_size, shuffle=True, num_workers=0):
    """Wrap a dataset in a ``DataLoader``.

    Args:
        train_data: Dataset to iterate over (any split, despite the name).
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle every epoch. Defaults to ``True`` to keep
            existing calls unchanged; pass ``False`` for reproducible
            validation/test iteration order.
        num_workers: Number of loader worker processes (0 loads in the main
            process).

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

train_dataloader = create_data_loader(train_dataset, BATCH_SIZE)
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE)
val_dataloader = create_data_loader(val_dataset, BATCH_SIZE)

dataloaders = {
  "train": train_dataloader,
  "test": test_dataloader,
  "val": val_dataloader,
}

# Number of *samples* per split. len(dataloader) would be the number of
# batches, which makes per-sample loss/accuracy averages come out far too large.
dataset_sizes = {
  split: len(loader.dataset) for split, loader in dataloaders.items()
}

Let's do this in style.


In [None]:
import time
import copy

def train_model(model, criterion, optimizer, scheduler, num_epochs=25,
                data_loaders=None, target_device=None):
    """Train ``model`` and return it with the best validation weights loaded.

    Each epoch runs a ``'train'`` phase followed by a ``'val'`` phase. The
    weights with the highest validation accuracy seen so far are kept and
    restored into ``model`` at the end.

    Args:
        model: Network to optimise.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training epoch.
        num_epochs: Number of epochs to run.
        data_loaders: Mapping with ``'train'`` and ``'val'`` loaders yielding
            ``(inputs, labels)`` batches. Defaults to the notebook-level
            ``dataloaders``.
        target_device: Device batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        ``model`` with the best-validation-accuracy weights loaded.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if data_loaders is None:
        data_loaders = dataloaders
    if target_device is None:
        target_device = device

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            model.train(is_training)  # train(False) is evaluation mode

            running_loss = 0.0
            running_corrects = 0
            num_seen = 0

            for inputs, labels in data_loaders[phase]:
                inputs = inputs.to(target_device)
                labels = labels.to(target_device)

                if is_training:
                    # Clear stale gradients before computing new ones.
                    optimizer.zero_grad()

                # Track gradient history only while training.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    preds = outputs.argmax(dim=1)
                    loss = criterion(outputs, labels)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Statistics are accumulated per sample, not per batch.
                batch_count = inputs.size(0)
                running_loss += loss.item() * batch_count
                running_corrects += (preds == labels).sum().item()
                num_seen += batch_count

            if is_training:
                scheduler.step()

            # Average over the samples actually seen; the max() guard keeps an
            # empty loader from dividing by zero.
            denominator = max(num_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Remember the weights of the best validation epoch so far.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Optimisation hyper-parameters, named so they are easy to find and tune.
# NOTE(review): a learning rate of 1.1 is unusually large for SGD, and the
# recorded training output shows a loss of ~197 -- confirm it is intended.
LEARNING_RATE = 1.1
LR_STEP_SIZE = 7
LR_GAMMA = 0.1

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
step_lr_scheduler = lr_scheduler.StepLR(
    optimizer, step_size=LR_STEP_SIZE, gamma=LR_GAMMA
)

In [None]:
# Train for 10 epochs; train_model returns the same model with the weights of
# its best validation epoch loaded, so `model` is rebound to that result.
model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=10)

Epoch 0/9
----------
train Loss: 196.8878 Acc: 10.9091


KeyboardInterrupt: ignored

In [None]:
def test_single_epoch(model, dataloader, loss_fn, optimiser, device):
  correct = 0
  size = len(dataloader.dataset)

  model.eval()
  for sr, target, input in dataloader:
        input, target = input.to(device), target.to(device)
        # calculate loss
        prediction = model(input)
        correct += (prediction.argmax(1) == target).type(torch.float).sum().item()
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}% \n")

# Rebuild the test loader (same call as in the loader-setup cell earlier).
test_dataloader = create_data_loader(test_dataset, BATCH_SIZE)

Previously, the ANN gave 39% accuracy without averaged MFCCs.

In [None]:
test_single_epoch(model, test_dataloader, criterion, optimizer, device)

Test Error: 
 Accuracy: 72.9% 



# SUBESCO Now
