In [1]:
import torch
from torch import nn

import copy
import math
import librosa
from pathlib import Path
import torch.utils.data as data
import torch.optim as optim
from torch.optim import lr_scheduler
import os
import random
import glob
from tqdm import tqdm
import numpy as np

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Running on device: {}'.format(device))
# Only ask for the GPU name when CUDA is actually in use: querying device 0 on a
# CPU-only machine raises, which would defeat the CPU fallback chosen above.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))

Running on device: cuda
NVIDIA GeForce RTX 2080 Ti


In [2]:
from transformers import AutoModelForCTC

# Hub identifier of the pretrained wav2vec2 CTC checkpoint used as the backbone.
PRETRAINED_ID = "ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt"

# Fetch the pretrained weights, then move the network to the selected device.
# The trailing expression is kept so the notebook still displays the module tree.
model = AutoModelForCTC.from_pretrained(PRETRAINED_ID)
model.to(device)


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureExtractor(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (4): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,)

In [3]:
# --- Paths and label set -------------------------------------------------
# One sub-directory per entry of `emotions` is read from DATASET_PATH; a label
# is the position of the emotion in this list.
DATASET_PATH = '../dataset/chinese_dataset/'
OUTPUT = '../models/chinese/data_aug/'
emotions = ['Angry', 'Happy', 'Neutral', 'Sad']

# --- Training hyper-parameters -------------------------------------------
BATCHSIZE = 10
EPOCHS = 30

# Adam with library-default settings over the parameters of the pretrained
# `model` only (modules created later are not part of this optimizer), and a
# StepLR schedule with a period of 10 scheduler steps.
optimizer = optim.Adam(params=model.parameters())
scheduler = lr_scheduler.StepLR(optimizer, step_size=10)

## Load dataset


In [4]:
# Load every clip of every emotion class as a (waveform tensor, label) pair.
# The label is the index of the emotion in `emotions`.
all_dataset = []
for i, emotion in enumerate(emotions):
    filepath = Path(DATASET_PATH + emotion + '/')
    # sorted() makes the load order reproducible; Path.glob order depends on the
    # filesystem.
    for file in sorted(filepath.glob("*.wav")):
        # Decode as 16 kHz mono.
        wav, rate = librosa.load(file, sr=16000, mono=True, res_type="kaiser_fast")
        # Place the waveform on the notebook-wide `device` instead of a
        # hard-coded "cuda", so the CPU fallback keeps working.
        tensor = torch.tensor(wav, device=device)
        all_dataset.append((tensor, i))


In [5]:
# Data augmentation: for every emotion class, draw a few random clips and append
# noisy copies of them to `all_dataset`.
# NOTE: this cell appends in place, so running it again without re-running the
# load cell adds another round of augmented clips.
N_AUG_PER_CLASS = 10   # clips sampled per class
NOISE_SCALE = 0.009    # multiplier applied to unit-variance Gaussian noise

# Class directories listed in the same order as `emotions`, so that label `i`
# always means the same emotion as in the load cell (glob order is not
# guaranteed and may include unrelated entries).
root = [DATASET_PATH + emotion for emotion in emotions]
for i, filepath in enumerate(root):
    filepath = filepath.replace("\\", "/")
    # Only consider wav files, in a stable order.
    all_wav = sorted(name for name in os.listdir(filepath) if name.endswith(".wav"))
    # Never ask for more samples than exist; random.sample raises otherwise.
    wav_sample = random.sample(all_wav, min(N_AUG_PER_CLASS, len(all_wav)))

    for sample in tqdm(wav_sample):
        file = f"{filepath}/{sample}"
        wav, rate = librosa.load(file, sr=16000, mono=True, res_type="kaiser_fast")
        # add noise; np.random.normal yields float64, so cast back to the
        # waveform's dtype to keep every dataset tensor the same type
        wav_n = (wav + NOISE_SCALE * np.random.normal(0, 1, len(wav))).astype(wav.dtype)
        # Store the NOISY signal (the clean one is already in the dataset).
        tensor = torch.tensor(wav_n, device=device)
        all_dataset.append((tensor, i))

print(len(all_dataset[10][0]))
print(len(all_dataset))

160


In [6]:
def sort_data(data_set):
    """Reorder samples so that the longest first element comes first.

    Args:
        data_set: indexable collection of samples; ``len(sample[0])`` is the
            sort key.

    Returns:
        A tuple ``(reordered, order)`` where ``reordered`` is a list of the
        samples in descending key order and ``order`` lists the original
        positions in that same order. Ties keep their original relative order.
    """
    order = list(range(len(data_set)))
    order.sort(key=lambda pos: len(data_set[pos][0]), reverse=True)

    reordered = []
    for pos in order:
        reordered.append(data_set[pos])
    return reordered, order

In [7]:
def split_data(all_dataset):
    """Randomly split the dataset 80/20 and wrap both parts in DataLoaders.

    Each split is sorted by waveform length (longest first) so that the samples
    sharing a batch have similar lengths and need little padding.

    Args:
        all_dataset: sequence of ``(waveform, label)`` pairs, where ``waveform``
            is a 1-D tensor and ``label`` is a number.

    Returns:
        ``(dataloaders, dataset_sizes)``: two dicts keyed by ``"train"`` and
        ``"test"``. ``dataset_sizes`` holds the number of samples assigned to
        each split. The loaders use ``drop_last=True``, so they may yield fewer
        samples than that.
    """
    # Split dataset. The test size is the remainder rather than int(n * 0.2):
    # truncating both parts can leave their sum short of n, which random_split
    # rejects.
    data_size = len(all_dataset)
    train_size = int(data_size * 0.8)
    dataset_sizes = {
        "train": train_size,
        "test": data_size - train_size,
    }

    train_set, test_set = data.random_split(all_dataset, [dataset_sizes["train"], dataset_sizes["test"]])
    # sort (the index lists returned by sort_data are not needed here)
    sorted_train, _ = sort_data(train_set)
    sorted_test, _ = sort_data(test_set)

    def pad_tensor(vec, pad, dim):
        """Zero-pad ``vec`` along ``dim`` up to length ``pad``."""
        pad_size = list(vec.shape)
        pad_size[dim] = pad - vec.size(dim)
        # new_zeros follows vec's device and dtype, so this works for CPU and
        # GPU tensors alike instead of assuming CUDA.
        return torch.cat([vec, vec.new_zeros(pad_size)], dim=dim)

    def collate_fn(instances):
        """Pad every waveform to the batch maximum and stack the batch."""
        max_len = max(x.shape[0] for x, _ in instances)
        features = torch.stack(
            [pad_tensor(x, pad=max_len, dim=-1) for x, _ in instances], dim=0
        )
        # Labels are returned as a float tensor; the training loop casts them.
        labels = torch.Tensor([y for _, y in instances])
        return (features, labels)

    dataloaders = {
        "train": torch.utils.data.DataLoader(sorted_train, batch_size=BATCHSIZE,  num_workers=0, drop_last=True, collate_fn=collate_fn),
        "test": torch.utils.data.DataLoader(sorted_test, batch_size=BATCHSIZE,  num_workers=0, drop_last=True, collate_fn=collate_fn),
    }

    return dataloaders, dataset_sizes

In [8]:
dataloaders, dataset_sizes = split_data(all_dataset)
print(dataset_sizes["train"], dataset_sizes["test"])

128 32


In [9]:
class TranferModel(nn.Module):
  """Classification head on top of a pretrained wav2vec2 backbone.

  The backbone is run with ``output_hidden_states=True``; one of the returned
  hidden states is averaged over dimension 1, passed through dropout and mapped
  to ``n_classes`` logits by a linear layer.

  Args:
    wav2vec: backbone module. It is called with ``input_values``,
      ``output_hidden_states`` and ``attention_mask`` and must return an object
      with a ``hidden_states`` sequence.
    n_classes: number of output classes.
    hidden_layer: index into ``hidden_states`` of the representation to pool.
      The default ``0`` keeps the historical behaviour. Per the transformers
      documentation index 0 is the embedding output that precedes the
      transformer layers, while ``-1`` selects the last layer.
      NOTE(review): confirm that pooling the pre-transformer features is
      intended.
  """

  def __init__(self, wav2vec, n_classes, hidden_layer=0):
    super().__init__()
    self.wav2vec = wav2vec
    self.hidden_layer = hidden_layer
    self.dropout = nn.Dropout(0.2)
    # Size the head from the backbone's configuration when it exposes one, so
    # other model sizes work too; 1024 remains the fallback.
    hidden_size = getattr(getattr(wav2vec, "config", None), "hidden_size", 1024)
    self.linear = nn.Linear(hidden_size, n_classes)
    self.device = device
    self.to(self.device)

  def forward(self, input_values, attention_mask=None):
    """Return class logits for a batch of inputs.

    Args:
      input_values: batch passed straight to the backbone.
      attention_mask: optional mask forwarded to the backbone unchanged.

    Returns:
      Tensor of logits with ``n_classes`` entries per batch element.
    """
    x = self.wav2vec(input_values=input_values, output_hidden_states=True, attention_mask=attention_mask)
    x = x.hidden_states[self.hidden_layer]
    # Pool by averaging over dimension 1. The mean covers every position; the
    # attention mask is not used for pooling.
    x = x.mean(1)
    x = self.dropout(x)
    x = self.linear(x)
    return x

In [10]:
def train (model, dataloaders, dataset_sizes, num_epochs, optimizer, scheduler, checkpoint=None):
    """Train ``model`` and evaluate it after every epoch, keeping the best weights.

    Args:
        model: network mapping a batch of inputs to class logits.
        dataloaders: dict with ``"train"`` and ``"test"`` iterables yielding
            ``(inputs, labels)`` batches.
        dataset_sizes: dict with the nominal number of samples per phase. It is
            only used as the divisor when a loader yields no batches; otherwise
            metrics are averaged over the samples actually processed.
        num_epochs: number of epochs to run.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        checkpoint: optional dict with the keys ``model_state_dict``,
            ``optimizer_state_dict``, ``scheduler_state_dict``,
            ``best_test_loss`` and ``best_test_accuracy`` to resume from.

    Returns:
        ``(model, best_loss, best_acc, outputlist)``: the model loaded with the
        weights of the most accurate test epoch, the loss and accuracy of that
        epoch, and the per-epoch loss/accuracy history for both phases.
    """
    criterion = nn.CrossEntropyLoss()

    outputlist = {'train': {'loss':[], 'acc': []}, 'test': {'loss':[], 'acc': []}}

    if checkpoint is None:
        best_model_wts = copy.deepcopy(model.state_dict())
        best_loss = math.inf
        best_acc = 0.0
    else:
        print(
            f'Test loss: {checkpoint["best_test_loss"]}, Test accuracy: {checkpoint["best_test_accuracy"]}')
        model.load_state_dict(checkpoint['model_state_dict'])
        best_model_wts = copy.deepcopy(model.state_dict())
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        best_loss = checkpoint['best_test_loss']
        best_acc = checkpoint['best_test_accuracy']

    #start training
    for epoch in range(1, num_epochs+1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('--' * 10)

        for phase in ["train", "test"]:
            running_loss = 0.0
            # Kept as a tensor so the accuracy stays a tensor even when the
            # loader yields no batch.
            running_corrects = torch.zeros((), dtype=torch.long, device=device)
            samples_seen = 0

            if phase == "train":
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            for inputs, labels in dataloaders[phase]:

                inputs = inputs.to(device)
                # CrossEntropyLoss needs integer class indices.
                labels = torch.as_tensor(labels).long().to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward propagation and optimize in 'train' mode
                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                # statistics
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels)
                samples_seen += batch_size

            if phase == "train":
                scheduler.step()

            # Average over the samples that were really processed: loaders that
            # drop the last incomplete batch see fewer samples than
            # dataset_sizes reports, which would deflate both metrics.
            denominator = samples_seen if samples_seen else dataset_sizes[phase]
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects.double() / denominator
            #record the loss and accuracy for visualization
            outputlist[phase]['loss'].append(epoch_loss)
            outputlist[phase]['acc'].append(epoch_acc.cpu())

            print('{} Loss: {:.4f} Accuracy: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # output test result
            if phase == "test" and epoch_acc > best_acc:
                print('New best model found!')
                print(f'New record accuracy: {epoch_acc}, Previous record accuracy: {best_acc}')
                best_loss = epoch_loss
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                # Only persist checkpoints that clear the 0.40 accuracy bar.
                if epoch_acc > 0.40:
                    check_point_path = OUTPUT + "epoch" + str(epoch) + ".pth"
                    # Make sure the target directory exists before saving.
                    os.makedirs(os.path.dirname(check_point_path) or ".", exist_ok=True)

                    torch.save({'model_state_dict': model.state_dict(),
                                'optimizer_state_dict': optimizer.state_dict(),
                                'best_test_loss': best_loss,
                                'best_test_accuracy': best_acc,
                                'scheduler_state_dict': scheduler.state_dict(),
                                }, check_point_path)


    print('Best test Accuracy: {:.4f} Best test loss: {:.4f}'.format(best_acc, best_loss))

    # load and return the best model weights
    model.load_state_dict(best_model_wts)
    return model, best_loss, best_acc, outputlist
            

In [None]:
new_model = TranferModel(model, len(emotions))

# Build the optimizer from the full transfer model. An optimizer created from
# the bare pretrained `model` does not contain the new linear head, so the
# classifier would keep its random initial weights for the whole run.
optimizer = optim.Adam(new_model.parameters())
scheduler = lr_scheduler.StepLR(optimizer, 10)

best_model, best_test_loss, best_test_acc, outputlist = train(
    new_model, dataloaders, dataset_sizes, EPOCHS, optimizer, scheduler
)