In [1]:
# Mount Google Drive in the Colab VM so project files are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd 'gdrive/MyDrive/SER/implementations/pepino21_interspeech/'

/content/gdrive/MyDrive/SER/implementations/pepino21_interspeech


In [3]:
# Remove the preinstalled torch packages; the next cell installs a pinned set in their place.
!pip uninstall torch torchaudio torchvision torchtext -y

Found existing installation: torch 1.12.0+cu113
Uninstalling torch-1.12.0+cu113:
  Successfully uninstalled torch-1.12.0+cu113
Found existing installation: torchaudio 0.12.0+cu113
Uninstalling torchaudio-0.12.0+cu113:
  Successfully uninstalled torchaudio-0.12.0+cu113
Found existing installation: torchvision 0.13.0+cu113
Uninstalling torchvision-0.13.0+cu113:
  Successfully uninstalled torchvision-0.13.0+cu113
Found existing installation: torchtext 0.13.0
Uninstalling torchtext-0.13.0:
  Successfully uninstalled torchtext-0.13.0


In [4]:
# Install a pinned torch / torchaudio / torchvision / torchtext combination.
!pip install torch==1.11.0 torchaudio==0.11.0 torchvision==0.12.0 torchtext==0.12.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.11.0
  Downloading torch-1.11.0-cp37-cp37m-manylinux1_x86_64.whl (750.6 MB)
[K     |████████████████████████████████| 750.6 MB 12 kB/s 
[?25hCollecting torchaudio==0.11.0
  Downloading torchaudio-0.11.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 61.0 MB/s 
[?25hCollecting torchvision==0.12.0
  Downloading torchvision-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (21.0 MB)
[K     |████████████████████████████████| 21.0 MB 1.2 MB/s 
[?25hCollecting torchtext==0.12.0
  Downloading torchtext-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 28.7 MB/s 
Installing collected packages: torch, torchvision, torchtext, torchaudio
Successfully installed torch-1.11.0 torchaudio-0.11.0 torchtext-0.12.0 torchvision-0.12.0


In [5]:
!pip install opensmile

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opensmile
  Downloading opensmile-2.4.1-py3-none-any.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 7.6 MB/s 
[?25hCollecting audobject>=0.6.1
  Downloading audobject-0.7.5-py3-none-any.whl (24 kB)
Collecting audinterface>=0.7.0
  Downloading audinterface-0.9.1-py3-none-any.whl (30 kB)
Collecting audformat<2.0.0,>=0.12.1
  Downloading audformat-0.14.3-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 7.5 MB/s 
[?25hCollecting audresample<2.0.0,>=1.1.0
  Downloading audresample-1.1.0-py3-none-any.whl (635 kB)
[K     |████████████████████████████████| 635 kB 65.7 MB/s 
Collecting oyaml
  Downloading oyaml-1.0-py2.py3-none-any.whl (3.0 kB)
Collecting iso3166
  Downloading iso3166-2.1.1-py3-none-any.whl (9.8 kB)
Collecting audiofile>=0.4.0
  Downloading audiofile-1.1.0-py3-none-any.whl (11 kB)
Collecting audeer<2.0.0,>=1.18.0
  Downloading

In [6]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence

from dataset_classes.ravdess import MergeCalmAndNeutralRAVDESS, MyResample, RavdessAudio
from models.fusion.pt_all_eGeMAPS_subset_global import PreTrainedAllLayersSubsetOfLLDsGlobalNorm

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Make newly created float tensors default to CUDA tensors.
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

device

device(type='cuda')

In [8]:
def my_collate_fn(batch):
    """
    Collate a list of samples into batched tensors.

    Every sample is indexed at positions 0-4: position 0 is a tensor that is
    padded along its first dimension to the longest item in the batch, and
    positions 1-4 are per-sample values gathered into 1-D tensors
    (lengths, sampling rates and speaker ids as int, labels as long).

    Returns the tuple (batch_tensor, lengths, samp_rates, spkr_ids, labels).
    """
    def _gather(position, dtype):
        # Collect the value at `position` from every sample into one tensor.
        return torch.tensor([item[position] for item in batch], dtype=dtype)

    padded = pad_sequence([item[0] for item in batch], batch_first=True)
    return (
        padded,
        _gather(1, torch.int),
        _gather(2, torch.int),
        _gather(3, torch.int),
        _gather(4, torch.long),
    )

In [9]:
def train_epoch(dataloader, model, loss_fn, optimizer, print_every):
    """
    Run one optimisation pass over `dataloader`.

    Each batch is indexed at positions 0-4: positions 0-3 are moved to the
    global `device` and handed to `model` as a single tuple, position 4 holds
    the targets given to `loss_fn`.

    print_every: after how many batches to print loss (on the last processed batch)
    """
    total_samples = len(dataloader.dataset)
    seen = 0
    for batch_number, batch in enumerate(dataloader, start=1):
        inputs = tuple(batch[pos].to(device) for pos in range(4))
        targets = batch[4].to(device)

        loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The first input's leading dimension is the number of samples in this batch.
        seen += inputs[0].size(0)
        if batch_number % print_every == 0:
            print(f"loss: {loss:.5f}    [{seen:4d}/{total_samples:4d}]")

In [10]:
def test_epoch(dataloader, num_classes, model, loss_fn):
    num_batches = len(dataloader)
    num_samples = len(dataloader.dataset)
    
    loss = 0
    acc = 0
    tp = torch.zeros((num_classes))
    gnd_trth_p = torch.zeros((num_classes))

    model.eval()
    for batch in dataloader:
        model_inp = tuple(batch[i].to(device) for i in range(4))
        labels = batch[4].to(device)
        
        with torch.no_grad():
            preds = model(model_inp)
            loss += loss_fn(preds, labels).item()
            preds = preds.argmax(dim=1)
            acc += (preds == labels).sum().item()
            for i in range(num_classes):
                tp[i] += torch.logical_and(preds==i, labels==i).sum().item()
                gnd_trth_p[i] += (labels == i).sum().item()
    model.train()

    loss /= num_batches
    acc /= num_samples
    recall = tp/gnd_trth_p
    avg_recall = torch.mean(recall)
    print("Test Error:")
    print(f"    loss: {loss:.5f}, acc: {100*acc:.2f} %\n    avg recall: {100*avg_recall:.2f} %")
    return loss

In [11]:
def _build_split(csv_path):
    """Create a RavdessAudio dataset for one split CSV, with a fresh resampler and label transform."""
    return RavdessAudio(dir_path="../../datasets/",
                        csv_path=csv_path,
                        transform=MyResample(48000, 16000),
                        target_transform=MergeCalmAndNeutralRAVDESS())


# The three splits differ only in the CSV file they read.
train_dataset = _build_split("../../datasets/RAVDESS/my_stuff/train_csv.csv")
val_dataset = _build_split("../../datasets/RAVDESS/my_stuff/val_csv.csv")
test_dataset = _build_split("../../datasets/RAVDESS/my_stuff/test_csv.csv")

# NOTE(review): presumably the RAVDESS emotion classes with calm and neutral merged — confirm.
num_classes = 7

In [12]:
batch_size = 64

# Each loader gets its own generator created on `device`.
# NOTE(review): presumably required because the default tensor type is switched
# to CUDA when a GPU is available — confirm before removing.

# Only the training data is shuffled.
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               collate_fn=my_collate_fn,
                                               shuffle=True,
                                               generator=torch.Generator(device))

# Evaluation loaders keep a fixed order: the evaluation loss is averaged per
# batch, so shuffling would change which samples land in the smaller last
# batch and make the reported loss vary from run to run.
val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             collate_fn=my_collate_fn,
                                             shuffle=False,
                                             generator=torch.Generator(device))

test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              collate_fn=my_collate_fn,
                                              shuffle=False,
                                              generator=torch.Generator(device))

In [13]:
# Load the precomputed normalization statistics (mean, std_dev) and move them to `device`.
# Column indices of the eGeMAPS features that are kept.
eGeMAPS_feat_idxs = [10, 11, 16, 17, 19, 20, 22, 23]

# wav2vec 2.0 statistics
norm_dict = torch.load('./normalization_tensors/ravdess/pt_w2v2/all_layers_global.pt')
w2v2_mean, w2v2_std_dev = (
    norm_dict[key].to(device) for key in ('mean', 'std_dev')
)

# eGeMAPS statistics, restricted to the selected feature columns
norm_dict = torch.load('./normalization_tensors/ravdess/eGeMAPS/global.pt')
eGeMAPS_mean, eGeMAPS_std_dev = (
    norm_dict[key][:, eGeMAPS_feat_idxs].to(device) for key in ('mean', 'std_dev')
)

In [14]:
# Validation results recorded for different values of `dim1`:
#
#        dim:      16,      32,      64,     128
#   val_loss: 0.52205, 0.51733, 0.61896, 0.58098
#    val_acc:   78.85,   80.77,   74.52,   78.37
# val_recall:   76.34,   78.57,   72.62,   75.89
#
# (dim 16 run as printed: loss: 0.52205, acc: 78.85 %, avg recall: 76.34 %)
# NOTE(review): dim 32 has the best numbers in this table but 16 is selected — confirm intent.
dim1 = 16

# Normalization statistics handed to the model by keyword.
norm_kwargs = {
    "norm_means_w2v2": w2v2_mean,
    "norm_std_devs_w2v2": w2v2_std_dev,
    "norm_means_eGeMAPS": eGeMAPS_mean,
    "norm_std_devs_eGeMAPS": eGeMAPS_std_dev,
}

# NOTE(review): the meaning of the first positional argument (250) is defined by the model class — confirm.
model = PreTrainedAllLayersSubsetOfLLDsGlobalNorm(250, dim1, eGeMAPS_feat_idxs, **norm_kwargs)

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth


  0%|          | 0.00/360M [00:00<?, ?B/s]

In [15]:
# Cross-entropy loss with default settings; shared by training and evaluation.
loss_fn = nn.CrossEntropyLoss()

In [16]:
# Learning rate for Adam.
lr = 1e-3

# Adam over everything returned by model.parameters().
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [17]:
num_epochs = 30
# Training stops once more than `patience` consecutive epochs fail to improve the validation loss.
patience = 4

# Baseline numbers for the untrained model.
print("Before Training, on val set:")
_ = test_epoch(val_dataloader, num_classes, model, loss_fn)
print("Before Training, on test set:")
_ = test_epoch(test_dataloader, num_classes, model, loss_fn)

print("Starting training")
best_loss = float('inf')
epochs_without_improvement = 0
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}")
    print("--------------------------------")
    train_epoch(train_dataloader, model, loss_fn, optimizer, 4)
    loss = test_epoch(val_dataloader, num_classes, model, loss_fn)
    if loss < best_loss:
        # New best validation loss: checkpoint the model and optimizer, reset the counter.
        best_loss = loss
        torch.save({
            "epochs_done": epoch+1,
            "model_state_dict": model.state_dict(),
            "opt_state_dict": optimizer.state_dict(),
            "val_loss": loss
            }, "./saved_models/fusion/best_pt_all_eGeMAPS_freq_feats_global2.pt")
        epochs_without_improvement = 0
    else:
        # Any epoch that is not a strict improvement counts, including a tied
        # or NaN loss (NaN compares False against everything, so a `>` test
        # would never count it and early stopping could not trigger).
        epochs_without_improvement += 1
    if epochs_without_improvement > patience:
        print(f"Early stopping after {epoch+1} epochs")
        break

Before Training, on val set:
Test Error:
    loss: 1.92837, acc: 15.87 %
    avg recall: 13.84 %
Before Training, on test set:
Test Error:
    loss: 1.92223, acc: 18.27 %
    avg recall: 15.03 %
Starting training
Epoch 1
--------------------------------
loss: 1.79006    [ 256/2036]
loss: 1.59371    [ 512/2036]
loss: 1.38663    [ 768/2036]
loss: 1.26165    [1024/2036]
loss: 1.21136    [1280/2036]
loss: 1.14376    [1536/2036]
loss: 1.34015    [1792/2036]
loss: 1.04234    [2036/2036]
Test Error:
    loss: 1.01158, acc: 58.17 %
    avg recall: 54.02 %
Epoch 2
--------------------------------
loss: 0.79599    [ 256/2036]
loss: 0.81345    [ 512/2036]
loss: 0.85036    [ 768/2036]
loss: 0.78218    [1024/2036]
loss: 0.95514    [1280/2036]
loss: 0.98405    [1536/2036]
loss: 0.70929    [1792/2036]
loss: 0.86803    [2036/2036]
Test Error:
    loss: 0.73517, acc: 71.63 %
    avg recall: 71.28 %
Epoch 3
--------------------------------
loss: 0.64744    [ 256/2036]
loss: 0.63999    [ 512/2036]
loss: 

## Now load the saved model, and check its performance

In [18]:
# Build a fresh model with the same constructor arguments as the trained one,
# so the saved state dict can be loaded into it.
norm_kwargs = {
    "norm_means_w2v2": w2v2_mean,
    "norm_std_devs_w2v2": w2v2_std_dev,
    "norm_means_eGeMAPS": eGeMAPS_mean,
    "norm_std_devs_eGeMAPS": eGeMAPS_std_dev,
}

loaded_model = PreTrainedAllLayersSubsetOfLLDsGlobalNorm(250, dim1, eGeMAPS_feat_idxs, **norm_kwargs)

In [19]:
# Read the checkpoint written by the training loop; map_location remaps the
# stored tensors onto `device`.
loaded_dict = torch.load(
    "./saved_models/fusion/best_pt_all_eGeMAPS_freq_feats_global2.pt",
    map_location=device
)

In [20]:
# Number of epochs that had been completed when this checkpoint was saved.
loaded_dict["epochs_done"]

4

In [21]:
# Copy the checkpointed weights into the freshly built model.
loaded_model.load_state_dict(loaded_dict["model_state_dict"])

<All keys matched successfully>

In [None]:
# Evaluate the loaded model on the train set (the trailing `;` hides the returned loss).
test_epoch(train_dataloader, num_classes, loaded_model, loss_fn);

In [22]:
# Evaluate the loaded model on the validation set (the trailing `;` hides the returned loss).
test_epoch(val_dataloader, num_classes, loaded_model, loss_fn);

Test Error:
    loss: 0.52205, acc: 78.85 %
    avg recall: 76.34 %


In [None]:
# Evaluate the loaded model on the test set (the trailing `;` hides the returned loss).
test_epoch(test_dataloader, num_classes, loaded_model, loss_fn);