In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns 
from sklearn.ensemble import RandomForestClassifier
from google.colab import drive
import os
from datetime import datetime
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import torch
import math, random
import torch.nn.functional as F
from torch.nn import init
import torch.nn as nn
from torchaudio import transforms
from IPython.display import Audio
from torchvision import models
from torchsummary import summary
from tqdm import tqdm
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored there is readable.
drive.mount('/content/drive')
import time
# Record the start time so the duration of the extraction below can be reported.
start_time = time.time()
# Extract the SUBESCO archive from Drive into /content (7z "x" keeps the archive's folder layout).
!7z x /content/drive/MyDrive/SUBESCO.zip -o'/content'
print("--------------- %s seconds -----------------" % (time.time() - start_time))

Mounted at /content/drive

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/drive/MyDrive/                                 1 file, 1651629165 bytes (1576 MiB)

Extracting archive: /content/drive/MyDrive/SUBESCO.zip
 58% 4096 Open              --
Path = /content/drive/MyDrive/SUBESCO.zip
Type = zip
Physical Size = 1651629165

  0%      0% 50 - SUBESCO/test/F_02_MONIKA_S_1_FEAR_4.wav                                                   1% 98 - SUBESCO/test/F_02_MONIKA_S_2_SAD_2.wav                                                

# Showing Data Graphs

In [None]:
def plot_audio(filename):
    """Load an audio file, report its shape and sample rate, and plot it.

    Args:
        filename: Path handed directly to ``torchaudio.load``.

    Returns:
        The ``(waveform, sample_rate)`` pair produced by ``torchaudio.load``.
    """
    signal, rate = torchaudio.load(filename)

    shape_msg = "Shape of waveform: {}".format(signal.shape)
    rate_msg = "Sample rate of waveform: {}".format(rate)
    print(shape_msg)
    print(rate_msg)

    # The waveform is transposed before plotting
    # (assumes a 2-D (channels, samples) tensor - TODO confirm).
    transposed = signal.t().numpy()
    plt.figure()
    plt.plot(transposed)

    return signal, rate


def show_waveform(waveform, sample_rate):
    """Print the waveform's size and sample rate, then plot row 0 of the tensor.

    Args:
        waveform: 2-D tensor; only ``waveform[0, :]`` is drawn.
        sample_rate: Value echoed in the printed summary.
    """
    description = "Waveform: {}\nSample rate: {}\n".format(waveform.shape, sample_rate)
    print(description)

    first_row = waveform[0, :].numpy()
    plt.figure()
    plt.plot(first_row)

def show_spectrogram(waveform):
    """Compute a default ``torchaudio`` spectrogram and show it on a log2 scale.

    Prints the spectrogram's size and renders the first slice along the
    leading dimension as a grayscale image.

    Args:
        waveform: Tensor passed to ``torchaudio.transforms.Spectrogram()``.
    """
    to_spectrogram = torchaudio.transforms.Spectrogram()
    spec = to_spectrogram(waveform)
    print("Shape of spectrogram: {}".format(spec.shape))

    # Only index 0 of the leading dimension is displayed.
    log_image = spec.log2()[0, :, :].numpy()
    plt.figure()
    plt.imshow(log_image, cmap='gray')

def show_melspectrogram(waveform, sample_rate):
    """Compute a mel spectrogram and show it on a log2 scale.

    Prints the result's size and renders the first slice along the leading
    dimension as a grayscale image.

    Args:
        waveform: Tensor passed to the ``MelSpectrogram`` transform.
        sample_rate: Sample rate used to configure the transform.
    """
    to_mel = torchaudio.transforms.MelSpectrogram(sample_rate)
    mel = to_mel(waveform)
    print("Shape of spectrogram: {}".format(mel.shape))

    # Only index 0 of the leading dimension is displayed.
    log_image = mel.log2()[0, :, :].numpy()
    plt.figure()
    plt.imshow(log_image, cmap='gray')

def show_mfcc(waveform, sample_rate):
    """Compute MFCCs for a waveform and show them as an image and a line plot.

    Prints the MFCC tensor's size, then opens two figures: a grayscale image
    of the first slice along the leading dimension, and a line plot of the
    same 2-D array.

    Args:
        waveform: Tensor passed to ``torchaudio.transforms.MFCC``.
        sample_rate: Sample rate used to configure the transform.
    """
    mfcc_spectrogram = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)
    print("Shape of spectrogram: {}".format(mfcc_spectrogram.size()))

    # MFCCs are already log-scaled and routinely negative, so applying log2()
    # again (as this function used to) turns much of the array into NaN.
    # Plot the coefficients as they are.
    coefficients = mfcc_spectrogram[0, :, :].numpy()

    plt.figure()
    plt.imshow(coefficients, cmap='gray')

    plt.figure()
    plt.plot(coefficients)
    plt.draw()

def create_images(trainloader, label_dir):
    """Save one log2 spectrogram PNG per batch yielded by ``trainloader``.

    Images are written to ``./data/spectrograms/<label_dir>/spec_img<i>.png``.
    If that directory already exists nothing is generated and "Data exists"
    is printed instead.

    Args:
        trainloader: Iterable of batches; element 0 of each batch is the
            waveform tensor. Only the first item of each batch is saved.
        label_dir: Sub-directory name (typically the class label).
    """
    directory = f'./data/spectrograms/{label_dir}/'
    if os.path.isdir(directory):
        print("Data exists")
        return

    os.makedirs(directory, mode=0o777, exist_ok=True)

    # The transform has no per-sample state, so build it once outside the loop.
    to_spectrogram = torchaudio.transforms.Spectrogram()

    for i, data in enumerate(trainloader):
        waveform = data[0]
        spectrogram_tensor = to_spectrogram(waveform)

        # plt.imsave writes the array straight to disk. No figure is needed,
        # so none is opened; the previous per-iteration plt.figure() was never
        # closed and leaked one figure per batch.
        plt.imsave(f'{directory}spec_img{i}.png',
                   spectrogram_tensor[0].log2()[0, :, :].numpy(),
                   cmap='gray')

def create_mfcc_images(trainloader, label_dir):
    """Save one MFCC image per batch yielded by ``trainloader``.

    Images are written to
    ``./data/mfcc_spectrograms/<label_dir>/spec_img<i>.png`` at 100 dpi.

    Args:
        trainloader: Iterable of batches where element 0 is the waveform
            tensor and element 1 holds the sample rate(s). Only the first
            item of each batch is rendered.
        label_dir: Sub-directory name (typically the class label).
    """
    directory = f'./data/mfcc_spectrograms/{label_dir}/'
    os.makedirs(directory, mode=0o777, exist_ok=True)

    for i, data in enumerate(trainloader):
        waveform = data[0]
        # A collated batch may deliver the sample rate as a 0-dim tensor;
        # int() gives MFCC a plain Python int either way.
        sample_rate = int(data[1][0])

        mfcc_spectrogram = torchaudio.transforms.MFCC(sample_rate=sample_rate)(waveform)

        fig = plt.figure()
        # MFCCs are already log-scaled and often negative; a further log2()
        # would turn them into NaN, so the raw coefficients are drawn.
        plt.imshow(mfcc_spectrogram[0][0, :, :].numpy(), cmap='gray')
        fig.savefig(f'{directory}spec_img{i}.png', dpi=100)
        # Close the figure so a long loader does not accumulate open figures.
        plt.close(fig)


# AudioUtil class - ensuring data preprocessing

In [None]:
# Sanity check on a single clip: build a mel spectrogram, convert it to
# decibels, and print shape/max/min/mean before and after the conversion.
PATH = '/content/drive/MyDrive/CSE465/SUBESCO/F_01_OISHI_S_10_ANGRY_3.wav'
wave , sr = torchaudio.load(PATH)
# Keep only the first channel (the leading dimension is kept, with size 1).
wave = wave[:1, :]
top_db = 80
spec = transforms.MelSpectrogram(sr, n_fft=1024, hop_length=None, n_mels=64)(wave)
# Statistics of the raw mel spectrogram.
print(spec.shape,spec.max(),spec.min(),spec.mean())
spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
# Statistics after the decibel conversion.
print(spec.shape,spec.max(),spec.min(),spec.mean())

torch.Size([1, 64, 347]) tensor(98175.4844) tensor(4.1404e-12) tensor(1057.6481)
torch.Size([1, 64, 347]) tensor(49.9200) tensor(-30.0800) tensor(-4.0887)


In [None]:
class AudioUtil():
  """Stateless audio pre-processing helpers.

  Every helper that takes ``aud`` expects a ``(signal, sample_rate)`` tuple,
  where ``signal`` is a 2-D tensor indexed as ``[channel, sample]``.
  """

  # ----------------------------
  # Load an audio file. Return the signal as a tensor and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file):
    """Load ``audio_file`` with torchaudio and return ``(signal, sample_rate)``."""
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)

  # ----------------------------
  # Convert the given audio to the desired number of channels
  # ----------------------------
  @staticmethod
  def rechannel(aud, new_channel):
    """Return the audio with ``new_channel`` channels.

    If the channel count already matches, ``aud`` is returned unchanged.
    For ``new_channel == 1`` only the first channel is kept. Otherwise the
    signal is concatenated with itself along the channel axis, which turns
    mono into stereo.
    """
    sig, sr = aud

    if (sig.shape[0] == new_channel):
      # Nothing to do
      return aud

    if (new_channel == 1):
      # Convert from stereo to mono by selecting only the first channel
      resig = sig[:1, :]
    else:
      # Convert from mono to stereo by duplicating the first channel
      resig = torch.cat([sig, sig])

    return ((resig, sr))

  # ----------------------------
  # Since Resample applies to a single channel, we resample one channel at a time
  # ----------------------------
  @staticmethod
  def resample(aud, newsr):
    """Resample the audio to ``newsr`` Hz and return ``(signal, newsr)``.

    ``aud`` is returned unchanged when it already has the requested rate.
    """
    sig, sr = aud

    if (sr == newsr):
      # Nothing to do
      return aud

    num_channels = sig.shape[0]
    # Resample first channel
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      # Resample the remaining channel(s) and merge them back with the first
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))

  # ----------------------------
  # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
  # ----------------------------
  @staticmethod
  def pad_trunc(aud, max_ms):
    """Force the signal to last exactly ``max_ms`` milliseconds.

    Longer signals are cut at the end. Shorter signals are zero-padded, with
    the padding split at a random point between the start and the end.
    """
    sig, sr = aud
    num_rows, sig_len = sig.shape
    # Multiply before dividing: `sr//1000 * max_ms` discarded the sub-kHz part
    # of the sample rate (22050 Hz was treated as 22000 samples per second).
    max_len = sr * max_ms // 1000

    if (sig_len > max_len):
      # Truncate the signal to the given length
      sig = sig[:,:max_len]

    elif (sig_len < max_len):
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

  # ----------------------------
  # Shifts the signal to the left or right by some percent. Values at the end
  # are 'wrapped around' to the start of the transformed signal.
  # ----------------------------
  @staticmethod
  def time_shift(aud, shift_limit):
    """Circularly shift the signal by a random fraction of its length.

    The shift is drawn uniformly from ``[0, shift_limit * length)`` samples.
    """
    sig,sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    # Roll along the sample axis only. Without `dims`, roll() flattens the
    # tensor first, so samples of one channel wrapped into the next.
    return (sig.roll(shift_amt, dims=1), sr)

  # ----------------------------
  # Generate a Spectrogram
  # ----------------------------
  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    """Return the mel spectrogram of ``aud`` converted to decibels (top_db=80)."""
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  # ----------------------------
  # Augment the Spectrogram by masking out some sections of it in both the frequency
  # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
  # overfitting and to help the model generalise better. The masked sections are
  # replaced with the mean value.
  # ----------------------------
  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency and time masking to a 3-D spectrogram.

    Each mask is at most ``max_mask_pct`` of the corresponding axis, and the
    masked cells are filled with the mean of the input spectrogram.
    """
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

# SoundDS class for the Custom Dataset and Splitting the Dataset

In [None]:
class SoundDS(Dataset):
  """Dataset that turns audio files listed in a CSV into mel spectrograms.

  Args:
    df: Path (or file-like object) of a CSV readable by ``pd.read_csv``. It
      must contain a ``filename`` and a ``label_id`` column.
    data_path: Prefix that is concatenated with each ``filename``.
    augment: When True (the default, matching the previous behaviour) a
      random time shift and frequency/time masking are applied to every
      item. Pass False for validation/test data so that evaluation is not
      run on randomly perturbed inputs. Padding of short clips is still
      positioned randomly by ``AudioUtil.pad_trunc`` in either mode.
  """

  def __init__(self, df, data_path, augment=True):
    self.df = pd.read_csv(df)
    self.data_path = str(data_path)
    self.duration = 4000   # clip length in milliseconds
    self.sr = 8000         # sample rate every clip is resampled to
    self.channel = 1       # number of channels every clip is converted to
    self.shift_pct = 0.1   # upper bound of the random time shift (fraction of length)
    self.augment = augment

  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.df)

  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    """Return ``(spectrogram, class_id)`` for row ``idx`` of the CSV."""
    # Absolute file path of the audio file - concatenate the audio directory with
    # the relative path
    audio_file = self.data_path + self.df.loc[idx, 'filename']
    # Get the Class ID
    class_id = int(self.df.loc[idx, 'label_id'])

    aud = AudioUtil.open(audio_file)
    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
    if self.augment:
      # Waveform-level augmentation: random circular time shift
      dur_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)

    sgram = AudioUtil.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    if self.augment:
      # Spectrogram-level augmentation: frequency and time masking
      sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return sgram, class_id

In [None]:
# CSV index files, one per split (each lists the clips and their label ids).
TRAIN_CSV = '/content/SUBESCO/train/train.csv'
TEST_CSV = '/content/SUBESCO/test/test.csv'
VALID_CSV = '/content/SUBESCO/valid/valid.csv'

# Directory prefixes that are prepended to the filenames found in the CSVs.
TRAIN_PATH = '/content/SUBESCO/train/'
TEST_PATH = '/content/SUBESCO/test/'
VALID_PATH = '/content/SUBESCO/valid/'

batch_size = 32

# Map label id -> emotion name; ids follow the order of the tuple.
classes = dict(enumerate((
  "ANGRY",
  "DISGUST",
  "FEAR",
  "HAPPY",
  "NEUTRAL",
  "SAD",
  "SURPRISE",
)))

# Build the three datasets in the order train, test, valid.
train_set, test_set, valid_set = (
  SoundDS(csv_file, audio_dir)
  for csv_file, audio_dir in (
    (TRAIN_CSV, TRAIN_PATH),
    (TEST_CSV, TEST_PATH),
    (VALID_CSV, VALID_PATH),
  )
)




In [None]:
# Number of clips in the train, test and validation splits, one per line.
print(len(train_set), len(test_set), len(valid_set), sep="\n")

4900
1400
700


In [None]:
# One shuffled DataLoader per split, all sharing the same batch size.
train_dl, test_dl, val_dl = (
    DataLoader(split, batch_size=batch_size, shuffle=True)
    for split in (train_set, test_set, valid_set)
)

# Number of batches per epoch for train, validation and test (in that order).
print(len(train_dl), len(val_dl), len(test_dl), sep="\n")

154
22
44


# Fine-Tuning ResNet18

In [None]:
# Build an ImageNet-pretrained ResNet18 and adapt it to single-channel
# spectrogram input and to the emotion classes defined in `classes`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = models.resnet18(pretrained=True)

# Replace the 3-channel RGB stem with a 1-channel convolution of the same
# geometry. This layer is freshly initialised; the pretrained stem weights
# are not reused.
model.conv1=nn.Conv2d(1, model.conv1.out_channels,
                      kernel_size=model.conv1.kernel_size[0],
                      stride=model.conv1.stride[0],
                      padding=model.conv1.padding[0])

# Replace the ImageNet head with dropout + a linear layer sized to the number
# of emotion classes (previously a hard-coded 7).
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(*[nn.Dropout(p=0.25), nn.Linear(num_ftrs, len(classes))])

# Move the model only after the new layers are attached. Calling .to(device)
# before the replacement left conv1 and fc on the CPU.
model = model.to(device)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [None]:
# Choose the compute device as a torch.device object (not a bare int ordinal),
# so `device` has the same type whether or not CUDA is available and matches
# the torch.device created when the model was built.
device = torch.device('cuda', torch.cuda.current_device()) if torch.cuda.is_available() else torch.device('cpu')
print('Device to use: {}'.format(device))
model = model.to(device)

Device to use: 0


In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Puts the model in training mode, then for every batch computes the loss,
    back-propagates and applies one optimizer step. The loss is printed every
    20 batches.

    Args:
        dataloader: Yields ``(inputs, targets)`` batches and exposes ``.dataset``.
        model: Module to train; batches are moved to the device of its parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
    """
    size = len(dataloader.dataset)

    # Follow the model's own device instead of a notebook-global `device`,
    # so the function does not depend on which cells were run beforehand.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Make sure Dropout/BatchNorm are in training behaviour, even if an
    # evaluation pass left the model in eval mode.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X, y = X.to(target_device), y.to(target_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 20 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Training driver: SGD with momentum and cross-entropy loss for a fixed number
# of epochs, with one evaluation pass after every epoch.
epochs = 50
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum = 0.9)
criterion = nn.CrossEntropyLoss()

# Wall-clock start time (hours:minutes:seconds).
print("Started Training at : ",datetime.now().strftime("%H:%M:%S"))

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dl, model, criterion, optimizer)
    # NOTE(review): the per-epoch evaluation uses test_dl; val_dl is not used
    # in this cell - confirm that monitoring on the test split is intended.
    test_loop(test_dl, model, criterion)
print(f"Done for all {epochs} epochs\n")
print("Ended Training at : ",datetime.now().strftime("%H:%M:%S"))

Started Training at :  07:33:08
Epoch 1
-------------------------------
loss: 2.581007  [    0/ 4900]
loss: 1.800059  [  640/ 4900]
loss: 2.108917  [ 1280/ 4900]
loss: 1.661596  [ 1920/ 4900]
loss: 1.651484  [ 2560/ 4900]
loss: 1.892682  [ 3200/ 4900]
loss: 1.687457  [ 3840/ 4900]
loss: 1.733246  [ 4480/ 4900]
Test Error: 
 Accuracy: 31.6%, Avg loss: 1.776557 

Epoch 2
-------------------------------
loss: 1.292117  [    0/ 4900]
loss: 1.986362  [  640/ 4900]
loss: 1.839953  [ 1280/ 4900]
loss: 1.758464  [ 1920/ 4900]
loss: 1.643806  [ 2560/ 4900]
loss: 1.382904  [ 3200/ 4900]
loss: 1.737454  [ 3840/ 4900]
loss: 1.428741  [ 4480/ 4900]
Test Error: 
 Accuracy: 37.8%, Avg loss: 1.717422 

Epoch 3
-------------------------------
loss: 1.326558  [    0/ 4900]
loss: 1.467501  [  640/ 4900]
loss: 1.485544  [ 1280/ 4900]
loss: 1.565834  [ 1920/ 4900]
loss: 1.490774  [ 2560/ 4900]
loss: 1.329419  [ 3200/ 4900]
loss: 1.237284  [ 3840/ 4900]
loss: 1.086777  [ 4480/ 4900]
Test Error: 
 Accuracy: 

# Save model

In [None]:
# Bundle the epoch counter with the model and optimizer state, then write the
# bundle to Drive as a single file.
checkpoint = dict(
    epoch=epochs + 1,
    state_dict=model.state_dict(),
    optimizer=optimizer.state_dict(),
)
torch.save(checkpoint, "/content/drive/MyDrive/CSE465/checkpoint.pth")