In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

import sampler
import datasets
from earlystopping import EarlyStopping

from sklearn.metrics import accuracy_score, recall_score
from torch.autograd  import  Function

# Acoustic Branch

Inputs for acoustic branch will be N x 40 where N [1,33]  
Time step: (2, 10) (seconds?)  
N: relative duration after feature extraction

In [2]:
class AcousticNet(nn.Module):
    def __init__(self, num_conv_layers = 3, kernel_size = 2, conv_width = 32, num_gru_layers = 2):
        super(AcousticNet, self).__init__()
        self.num_conv_layers = num_conv_layers
        self.conv1 = nn.Conv1d(in_channels=40, out_channels=conv_width, kernel_size=kernel_size, padding = kernel_size - 1)
        self.conv2 = nn.Conv1d(in_channels=conv_width, out_channels=conv_width, kernel_size=kernel_size, padding = kernel_size - 1)
        self.conv3 = nn.Conv1d(in_channels=conv_width, out_channels=conv_width, kernel_size=kernel_size, padding = kernel_size - 1)
        self.conv4 = nn.Conv1d(in_channels=conv_width, out_channels=conv_width, kernel_size=kernel_size, padding = kernel_size - 1)
        self.convs = [self.conv1, self.conv2, self.conv3, self.conv4]
        self.max_pool = nn.MaxPool1d(kernel_size = 2)
        self.relu = nn.ReLU()
        
        self.gru = nn.GRU(input_size=conv_width,hidden_size=32,num_layers=num_gru_layers) # 19 is hardcoded
        self.mean_pool = nn.AvgPool1d(kernel_size=2)
        
    def forward(self, x):
        for i in range(self.num_conv_layers):
            x = self.relu(self.max_pool(self.convs[i](x)))
        x = torch.transpose(x, 1, 2) 
        x, _ = self.gru(x)
        x = torch.transpose(x, 1, 2)
        x = F.adaptive_avg_pool1d(x,1)[:, :, -1]
#         x = self.mean_pool(x)
        return x

In [3]:
# Test dummy input
net = AcousticNet(num_conv_layers = 3, kernel_size = 2, conv_width = 32, num_gru_layers = 2)
batch_size = 8
test_vec = torch.randn(batch_size, 40, 17) # samples x features (or channels) x N (relative duration)
output = net(test_vec)
print(f'Shape of output: {output.shape}')
# assert output.shape[-1] == 16

Shape of output: torch.Size([8, 32])


# Lexical Branch

In [4]:
# implement GRU (or transformer)
class LexicalNet(nn.Module):
    def __init__(self, num_gru_layers = 2):
        super(LexicalNet, self).__init__()
        # implement GRU (or transformer)
        self.gru = nn.GRU(input_size=300,hidden_size=32,num_layers=num_gru_layers)
        self.mean_pool = nn.AvgPool1d(kernel_size=2) 
        self.flatten = nn.Flatten()
        
    def forward(self, x):
        x, _ = self.gru(x)
#         x = self.mean_pool(x)
        x = self.flatten(x)
        print(x.shape)
        return x

In [5]:
# Test dummy input
net = LexicalNet(num_gru_layers = 2)
batch_size = 8
test_vec = torch.randn(batch_size, 1, 300)
output = net(test_vec)
# assert output.shape[-1] == 16

torch.Size([8, 32])


# Master branch

In [6]:
# class GRL(Function):
#     @staticmethod
#     def forward(self,x):
#         return x
#     @staticmethod
#     def backward(self,grad_output):
#         grad_input = grad_output.neg()
#         return grad_input

In [7]:
class GradientReversalFunction(Function):
    """
    Gradient Reversal Layer from:
    Unsupervised Domain Adaptation by Backpropagation (Ganin & Lempitsky, 2015)
    Forward pass is the identity function. In the backward pass,
    the upstream gradients are multiplied by -lambda (i.e. gradient is reversed)
    """

    @staticmethod
    def forward(ctx, x, lambda_):
        ctx.lambda_ = lambda_
        return x.clone()

    @staticmethod
    def backward(ctx, grads):
        lambda_ = ctx.lambda_
        lambda_ = grads.new_tensor(lambda_)
        dx = -lambda_ * grads
        return dx, None
    
class GradientReversal(torch.nn.Module):
    def __init__(self, lambda_=1):
        super(GradientReversal, self).__init__()
        self.lambda_ = lambda_

    def forward(self, x):
        return GradientReversalFunction.apply(x, self.lambda_)

In [8]:
class MasterNet(nn.Module):
    def __init__(self, acoustic_modality = True, lexical_modality = True, visual_modality = False,
                 num_conv_layers = 3, kernel_size = 2, conv_width = 32, num_gru_layers = 2,
                 num_dense_layers = 1, dense_layer_width = 32, grl_lambda = .3):
        super(MasterNet, self).__init__()
        
        self.acoustic_modality = acoustic_modality
        self.lexical_modality = lexical_modality
        self.visual_modality = visual_modality
        
        self.acoustic_model = AcousticNet(num_conv_layers = num_conv_layers, kernel_size = kernel_size, 
                                     conv_width = conv_width, num_gru_layers = num_gru_layers)
        self.lexical_model = LexicalNet(num_gru_layers = 2)
        
        # emotion classifier
#         self.dense1_emo = nn.Linear()
#         self.dense2_emo = nn.Linear()
        
        width = 0 # width of the FC layers
        if self.acoustic_modality:
            width += 32
        if self.visual_modality:
            width += 0 # to implement
        if self.lexical_modality:
            width += 32
            
        self.fc_1 = nn.Linear(width, dense_layer_width)
        self.fc_2 = nn.Linear(dense_layer_width, 3)
        self.softmax = nn.Softmax(dim=1)

        self.relu = nn.ReLU()
#         # To implement   
#         if num_dense_layers == 2:
#             self.fc = nn.Sequential()
#             self.linear_1 = nn.Linear(width, dense_layer_width)
#         else:
#             self.fc = 
        
        # confound classifier -- to implement
        
        self.grl = GradientReversal(lambda_ = grl_lambda)
        self.dense_con = nn.Linear(width, 3)
#         self.dense2_con = None
        
        
    def forward_a(self, x_a):
        x = x_a
        x = self.acoustic_model(x)
        return x
    
    def forward_l(self, x_l):
        x = x_l
        x = self.lexical_model(x)
        return x
    
    def forward_v(self, x_v):
        x = x_v
        return x
    
    def encoder(self, x_v, x_a, x_l):
        print('x_a before encoding', x_a.shape)
        print('x_l before encoding', x_l.shape)
        if self.visual_modality:
            x_v = self.forward_v(x_v)
        if self.acoustic_modality:
            x_a = self.forward_a(x_a)
        if self.lexical_modality:
            x_l = self.forward_l(x_l)
        print('x_a after encoding', x_a.shape)
        print('x_l after encoding', x_l.shape)
        
        if self.visual_modality:
            if self.acoustic_modality:
                if self.lexical_modality:
                    x = torch.cat((x_v, x_a, x_l), 1)
                else:
                    x = torch.cat((x_v, x_a), 1)
            else:
                if self.lexical_modality:
                    x = torch.cat((x_v, x_l), 1)
                else:
                    x = x_v
        else:
            if self.acoustic_modality:
                if self.lexical_modality:
                    x = torch.cat((x_a, x_l), 1)
                else:
                    x = x_a
            else:
                x = x_l
        print('x after concat', x.shape)
        return x

    def stress_model(self, x):
#         x = self.grl.apply(x)
        x = self.grl(x)
        x = self.dense_con(x)
        x = self.softmax(x)
        return x
    
    def recognizer(self, x):
        print(x.shape)
        x = self.relu(self.fc_1(x))
        x = self.fc_2(x)
        x = self.softmax(x)
        return x
    
    def forward(self, x_v, x_a, x_l):
        x = self.encoder(x_v, x_a, x_l)
        emotion_output = self.recognizer(x)
        stress_output = self.stress_model(x)
        
        return emotion_output, stress_output

In [9]:
# Test dummy input
net = MasterNet()
batch_size = 8
acoustic_features = torch.randn(batch_size, 40, 17) # samples x features (or channels) x N (relative duration)
lexical_features = torch.randn(batch_size, 1, 300)
visual_features = None
emotion_output, stress_output = net(visual_features, acoustic_features, lexical_features)
print(f'Shape of emotion output: {emotion_output.shape}')
print(f'Shape of stress output: {stress_output.shape}')
print(emotion_output)
print(stress_output)
# assert output.shape[-1] == 16

x_a before encoding torch.Size([8, 40, 17])
x_l before encoding torch.Size([8, 1, 300])
torch.Size([8, 32])
x_a after encoding torch.Size([8, 32])
x_l after encoding torch.Size([8, 32])
x after concat torch.Size([8, 64])
torch.Size([8, 64])
Shape of emotion output: torch.Size([8, 3])
Shape of stress output: torch.Size([8, 3])
tensor([[0.2938, 0.3491, 0.3572],
        [0.2988, 0.3450, 0.3561],
        [0.2984, 0.3560, 0.3457],
        [0.2959, 0.3576, 0.3464],
        [0.3010, 0.3521, 0.3470],
        [0.2989, 0.3543, 0.3468],
        [0.3045, 0.3503, 0.3452],
        [0.3025, 0.3500, 0.3475]], grad_fn=<SoftmaxBackward>)
tensor([[0.3173, 0.3699, 0.3128],
        [0.3246, 0.3688, 0.3066],
        [0.3263, 0.3645, 0.3092],
        [0.3327, 0.3443, 0.3230],
        [0.3489, 0.3213, 0.3298],
        [0.3732, 0.3070, 0.3199],
        [0.3783, 0.2888, 0.3329],
        [0.3796, 0.2909, 0.3295]], grad_fn=<SoftmaxBackward>)


In [10]:
# Use specific GPU
def get_device():
    if torch.cuda.is_available():  
        dev = "cuda:0" 
    else:  
        dev = "cpu"  
    return torch.device(dev)
device = get_device()

In [11]:
def train_one_folder(folder = 0):
    # Use specific GPU
    device = get_device()

    # Dataloaders
    train_dataset_file_path = os.path.join('../dataset', opt.source_domain, str(opt.folder), 'train.csv')
    train_loader = get_dataloader(train_dataset_file_path, 'train', opt)

    test_dataset_file_path = os.path.join('../dataset', opt.source_domain, str(opt.folder), 'test.csv')
    print(test_dataset_file_path)
    test_loader = get_dataloader(test_dataset_file_path, 'test', opt)

    # Model, optimizer and loss function
    emotion_recognizer = models.Model(opt)
    models.init_weights(emotion_recognizer)
    for param in emotion_recognizer.parameters():
        param.requires_grad = True
    emotion_recognizer.to(device)

    optimizer = torch.optim.Adam(emotion_recognizer.parameters(), lr=opt.learning_rate)
    lr_schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1)

    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.
    best_uar = 0.
    es = EarlyStopping(patience=opt.patience)

    # Train and validate
    for epoch in range(opt.epochs_num):
        if opt.verbose:
            print('epoch: {}/{}'.format(epoch + 1, opt.epochs_num))

        train_loss, train_acc = train(	train_loader, emotion_recognizer,
                                        optimizer, criterion, device, opt)
        test_loss, test_acc, test_uar = test(	test_loader, emotion_recognizer,
                                                criterion, device, opt)

        if opt.verbose:
            print(	'train_loss: {0:.5f}'.format(train_loss),
                    'train_acc: {0:.3f}'.format(train_acc),
                    'test_loss: {0:.5f}'.format(test_loss),
                    'test_acc: {0:.3f}'.format(test_acc),
                    'test_uar: {0:.3f}'.format(test_uar))

        lr_schedule.step(test_loss)

        os.makedirs(os.path.join(opt.logger_path, opt.source_domain), exist_ok=True)

        model_file_name = os.path.join(opt.logger_path, opt.source_domain, 'checkpoint.pth.tar')
        state = {'epoch': epoch+1, 'emotion_recognizer': emotion_recognizer.state_dict(), 'opt': opt}
        torch.save(state, model_file_name)

        if test_acc > best_acc:
            model_file_name = os.path.join(opt.logger_path, opt.source_domain, 'model.pth.tar')
            torch.save(state, model_file_name)

            best_acc = test_acc

        if test_uar > best_uar:
            best_uar = test_uar

        if es.step(test_loss):
            break

    return best_acc, best_uar

In [12]:
def train(train_loader, model, optimizer, criterion, device, verbose = False):
    model.train()

    running_loss = 0.
    running_acc = 0.

    groundtruth = []
    prediction = []

    for i, train_data in enumerate(train_loader):
        visual_features, _, acoustic_features, _, lexical_features, _, _, a_labels, _, _ = train_data # UPDATE

        visual_features = visual_features.to(device)
        acoustic_features = acoustic_features.to(device)
        lexical_features = lexical_features.to(device)

        labels = a_labels.to(device)

        optimizer.zero_grad()
        
        emotion_output, stress_output = model(visual_features, acoustic_features, lexical_features)

        emotion_loss = criterion(emotion_output, emotion_labels)
        stress_loss = criterion(stress_output, stress_labels)

        emotion_loss.backward()
        stress_loss.backward()
        
        optimizer.step() # do we need two optimizers?
        
        running_loss += loss.item()

        groundtruth.append(labels.tolist())
        predictions = predictions.argmax(dim=1, keepdim=True)
        prediction.append(predictions.view_as(labels).tolist())

        if verbose and i > 0 and int(len(train_loader) / 10) > 0 and i % (int(len(train_loader) / 10)) == 0:
            print('.', flush=True, end='')
            
    train_loss = running_loss / len(train_loader)

    groundtruth = list(itertools.chain.from_iterable(groundtruth))
    prediction = list(itertools.chain.from_iterable(prediction))

    train_acc = accuracy_score(prediction, groundtruth)

    return train_loss, train_acc

In [13]:
def test(test_loader, model, criterion, device):
    model.eval()

    running_loss = 0.
    running_acc = 0.

    with torch.no_grad():
        groundtruth = []
        prediction = []

        for i, test_data in enumerate(test_loader):
            visual_features, _, acoustic_features, _, lexical_features, _, _, a_labels, _, _ = test_data # UPDATE

            visual_features = visual_features.to(device)
            acoustic_features = acoustic_features.to(device)
            lexical_features = lexical_features.to(device)

            labels = a_labels.to(device)

            emotion_predictions, _ = model(visual_features, acoustic_features, lexical_features)
            loss = criterion(emotion_predictions, labels)

            running_loss += loss.item()

            groundtruth.append(labels.tolist())
            emotion_predictions = emotion_predictions.argmax(dim=1, keepdim=True)
            prediction.append(emotion_predictions.view_as(labels).tolist())

        test_loss = running_loss / len(test_loader)

        groundtruth = list(itertools.chain.from_iterable(groundtruth))
        prediction = list(itertools.chain.from_iterable(prediction))

        test_acc = accuracy_score(prediction, groundtruth)
        test_uar = recall_score(prediction, groundtruth, average='macro')

        return test_loss, test_acc, test_uar

In [14]:
def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.01)

In [15]:
# emotion_recognizer = net.Model(opt)
emotion_recognizer = MasterNet()
init_weights(emotion_recognizer)
for param in emotion_recognizer.parameters():
    param.requires_grad = True
emotion_recognizer.to(device)

learning_rate = 1e-4
optimizer = torch.optim.Adam(emotion_recognizer.parameters(), lr=learning_rate)
lr_schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1)
criterion = torch.nn.CrossEntropyLoss()

train_dataset_file_path = '../dataset/IEMOCAP/0/train.csv'
train_loader = datasets.get_dataloader(train_dataset_file_path, 'train')
test_dataset_file_path = '../dataset/IEMOCAP/0/test.csv'
test_loader = datasets.get_dataloader(test_dataset_file_path, 'test')

train_loss, train_acc = train(train_loader, emotion_recognizer, optimizer, criterion, device)
test_loss, test_acc, test_uar = test(test_loader, emotion_recognizer, criterion, device)

x_a before encoding torch.Size([8, 750, 40])
x_l before encoding torch.Size([8, 768])


RuntimeError: Given groups=1, weight of size [32, 40, 2], expected input[8, 750, 40] to have 40 channels, but got 750 channels instead