In [8]:
"""
Code from
" GRATIS: Deep Learning Graph Representation
 with Task-specific Topology and
 Multi-dimensional Edge Features"
Siyang Song, Yuxin Song, Cheng Luo, Zhiyuan Song, Selim Kuzucu, Xi Jia, Zhijiang Guo, Weicheng Xie,
 Linlin Shen, and Hatice Gunes
Please see https://github.com/SSYSteve/GRATIS
"""
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import math
import pandas as pd
from tqdm import tqdm
from audtorch.metrics import ConcordanceCC
import os
from torch.autograd import Variable

# Expose only GPU index 0 to this process through the CUDA_VISIBLE_DEVICES
# environment variable (unconditionally overwrites any value already set).
os.environ['CUDA_VISIBLE_DEVICES']='0'
# Module-wide device handle used by the model and training code below:
# CUDA when torch reports it as available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def bn_init(bn):
    """Reset a batch-norm layer's affine parameters in place: weight -> 1, bias -> 0."""
    scale, shift = bn.weight, bn.bias
    scale.data.fill_(1)
    shift.data.zero_()
class CrossAttn(nn.Module):
    """Single-head cross attention: queries are projected from ``y``, keys and values from ``x``.

    Queries and keys are projected to ``in_channels // 2`` features, values keep
    ``in_channels`` features, and the dot products are scaled by
    ``(in_channels // 2) ** -0.5`` before the softmax over the key axis.
    """

    def __init__(self, in_channels):
        super().__init__()
        half_channels = in_channels // 2
        self.in_channels = in_channels
        self.linear_q = nn.Linear(in_channels, half_channels)
        self.linear_k = nn.Linear(in_channels, half_channels)
        self.linear_v = nn.Linear(in_channels, in_channels)
        self.scale = half_channels ** -0.5
        self.attend = nn.Softmax(dim=-1)

        # Weights are drawn from N(0, sqrt(2 / out_features)); biases keep the
        # nn.Linear defaults.
        narrow_std = math.sqrt(2. / half_channels)
        self.linear_k.weight.data.normal_(0, narrow_std)
        self.linear_q.weight.data.normal_(0, narrow_std)
        self.linear_v.weight.data.normal_(0, math.sqrt(2. / in_channels))

    def forward(self, y, x):
        """Attend from ``y`` (queries) over ``x`` (keys/values); the last dim of both is ``in_channels``."""
        q = self.linear_q(y)
        k = self.linear_k(x)
        v = self.linear_v(x)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        weights = self.attend(scores)
        return torch.matmul(weights, v)


class GEM(nn.Module):
    """Produces one feature map per ordered pair of class nodes via two cross-attention stages.

    ``FAM`` attends from each class feature to the (tiled) global feature; ``ARM``
    then attends between every ordered pair of the resulting node features. The
    pair features are linearly projected and batch-normalised with one
    BatchNorm2d channel per pair (``num_classes * num_classes`` channels).
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.FAM = CrossAttn(in_channels)
        self.ARM = CrossAttn(in_channels)
        self.edge_proj = nn.Linear(in_channels, in_channels)
        self.bn = nn.BatchNorm2d(num_classes * num_classes)

        # Projection weights ~ N(0, sqrt(2 / in_channels)); BN starts as identity.
        self.edge_proj.weight.data.normal_(0, math.sqrt(2. / in_channels))
        self.bn.weight.data.fill_(1)
        self.bn.bias.data.zero_()

    def forward(self, class_feature, global_feature):
        """Return edge features of shape (B, N*N, D, C) for ``class_feature`` of shape (B, N, D, C).

        ``global_feature`` is tiled N times along dim 1 and viewed as (B, N, D, C)
        -- presumably it arrives as (B, D, C); confirm against the caller.
        """
        B, N, D, C = class_feature.shape
        tiled_global = global_feature.repeat(1, N, 1).view(B, N, D, C)
        node_feat = self.FAM(class_feature, tiled_global)
        # Enumerate all N*N ordered pairs: `pair_end` repeats each node N times
        # in a row, `pair_start` repeats the whole node list N times.
        pair_end = node_feat.repeat(1, 1, N, 1).view(B, -1, D, C)
        pair_start = node_feat.repeat(1, N, 1, 1).view(B, -1, D, C)
        pair_feat = self.ARM(pair_start, pair_end)
        return self.bn(self.edge_proj(pair_feat))

class LinearBlock(nn.Module):
    """Dropout -> Linear -> BatchNorm1d -> ReLU over the last dim of a 3-D input.

    Args:
        in_features: size of the input's last dimension.
        out_features: size of the output's last dimension; falls back to
            ``in_features`` when omitted (or otherwise falsy).
        drop: dropout probability applied to the input.
    """

    def __init__(self, in_features,out_features=None,drop=0.0):
        super().__init__()
        if not out_features:
            out_features = in_features
        self.fc = nn.Linear(in_features, out_features)
        self.bn = nn.BatchNorm1d(out_features)
        self.relu = nn.ReLU(inplace=True)
        self.drop = nn.Dropout(drop)
        # Linear weights ~ N(0, sqrt(2 / out_features)); BN starts as identity.
        self.fc.weight.data.normal_(0, math.sqrt(2. / out_features))
        self.bn.weight.data.fill_(1)
        self.bn.bias.data.zero_()

    def forward(self, x):
        """Map ``x`` of shape (batch, length, in_features) to (batch, length, out_features)."""
        projected = self.fc(self.drop(x))
        # BatchNorm1d normalises dim 1, so the feature axis is moved there and
        # moved back once the activation has been applied.
        normed = self.bn(projected.permute(0, 2, 1))
        return self.relu(normed).permute(0, 2, 1)
#Used in stage 1 (ANFL)
def normalize_digraph(A):
    """Symmetrically normalise a batch of adjacency matrices: D^{-1/2} A D^{-1/2}.

    Args:
        A: tensor of shape (b, n, n). Row sums (taken on a detached copy, so no
            gradient flows through the degrees) are used as node degrees.

    Returns:
        Tensor with the shape, dtype and device of ``A`` whose entry (i, j) is
        ``A[i, j] / sqrt(deg_i * deg_j)``. Nodes whose degree is not strictly
        positive get a scaling factor of 0, so their rows and columns are
        zeroed instead of turning into inf/NaN.
    """
    node_degrees = A.detach().sum(dim=-1)
    # deg ** -0.5 is inf for deg == 0 and NaN for deg < 0; leave those at 0.
    degs_inv_sqrt = torch.zeros_like(node_degrees)
    positive = node_degrees > 0
    degs_inv_sqrt[positive] = node_degrees[positive] ** -0.5
    # Scaling rows then columns is the same product as multiplying by the
    # diagonal matrix on both sides, and it inherits A's dtype and device.
    return degs_inv_sqrt.unsqueeze(-1) * A * degs_inv_sqrt.unsqueeze(-2)


#Used in stage 2 (MEFL)
def create_e_matrix(n):
    """Build the one-hot edge-to-node selection matrices of a fully connected n-node graph.

    Edges are indexed ``k = i * n + j`` for ``i, j`` in ``range(n)``.

    Returns:
        (start, end): two float tensors of shape (n*n, n) where
        ``start[k, j] == 1`` and ``end[k, i] == 1`` and every other entry is 0.
    """
    identity = torch.eye(n)
    # Each identity row repeated n consecutive times -> row k selects node k // n.
    end = identity.repeat_interleave(n, dim=0)
    # The whole identity stacked n times -> row k selects node k % n.
    start = identity.repeat(n, 1)
    return start,end
class MEFG(nn.Module):
    """Edge feature generator: one feature map per ordered pair of class nodes.

    ``VCR`` cross-attends each class feature to the tiled global feature;
    ``VVR`` cross-attends between every ordered pair of the resulting node
    features. The result is projected by ``edge_proj`` and normalised by a
    BatchNorm2d with ``num_classes * num_classes`` channels (one per pair).
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.VCR = CrossAttn(in_channels)
        self.VVR = CrossAttn(in_channels)
        self.edge_proj = nn.Linear(in_channels, in_channels)
        self.bn = nn.BatchNorm2d(num_classes * num_classes)

        # Projection weights ~ N(0, sqrt(2 / in_channels)); BN starts as identity.
        self.edge_proj.weight.data.normal_(0, math.sqrt(2. / in_channels))
        self.bn.weight.data.fill_(1)
        self.bn.bias.data.zero_()

    def forward(self, class_feature, global_feature):
        """Return edge features of shape (B, N*N, D, C) for ``class_feature`` of shape (B, N, D, C).

        ``global_feature`` is tiled N times along dim 1 and viewed as (B, N, D, C)
        -- presumably it arrives as (B, D, C); confirm against the caller.
        """
        B, N, D, C = class_feature.shape
        tiled_global = global_feature.repeat(1, N, 1).view(B, N, D, C)
        node_feat = self.VCR(class_feature, tiled_global)
        # All N*N ordered pairs: `pair_end` holds each node N times in a row,
        # `pair_start` holds the full node list repeated N times.
        pair_end = node_feat.repeat(1, 1, N, 1).view(B, -1, D, C)
        pair_start = node_feat.repeat(1, N, 1, 1).view(B, -1, D, C)
        pair_feat = self.VVR(pair_start, pair_end)
        return self.bn(self.edge_proj(pair_feat))
class GRUModel(nn.Module):
    """Bidirectional single-layer GRU followed by a linear projection applied at every time step."""

    def __init__(self, input_num, hidden_num, output_num):
        super().__init__()
        self.hidden_size = hidden_num
        # batch_first=True is set here, so inputs should be arranged as
        # inputs.view(inputs.shape[0], -1, inputs.shape[1]).
        # For a time-series prediction problem this amounts to setting the
        # number of time steps (seq_len) to 1.
        self.GRU_layer = nn.GRU(
            input_size=input_num,
            hidden_size=hidden_num,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward outputs are concatenated, hence 2 * hidden_num inputs.
        self.output_linear = nn.Linear(hidden_num*2, output_num)
        self.hidden = None

    def forward(self, x):
        """Return ``(projected_outputs, h_n)`` and cache ``h_n`` on ``self.hidden``."""
        # h_n has shape (num_layers * num_directions, batch, hidden_size).
        # The hidden state self.hidden does not need to be passed in explicitly here.
        sequence_out, last_hidden = self.GRU_layer(x)
        self.hidden = last_hidden
        return self.output_linear(sequence_out), last_hidden

class GNN(nn.Module):
    """Two stacked residual gated graph-conv layers over a fully connected graph.

    Both node features ``x`` of shape (b, num_classes, in_channels) and edge
    features ``edge`` of shape (b, num_classes**2, in_channels) are updated by
    each layer; input and output widths are equal so residual sums are valid.
    """

    def __init__(self, in_channels, num_classes):
        super(GNN, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        # GNN Matrix: E x N
        # Start Matrix Item:  define the source node of one edge
        # End Matrix Item:  define the target node of one edge
        # Algorithm details in Residual Gated Graph Convnets: arXiv preprint arXiv:1711.07553
        # or Benchmarking Graph Neural Networks: arXiv preprint arXiv:2003.00982v3

        # Constant selection matrices, stored as plain tensor attributes (not
        # parameters or buffers) so the module's state_dict keys stay as they were.
        start, end = create_e_matrix(self.num_classes)
        self.start = start
        self.end = end

        dim_in = self.in_channels
        dim_out = self.in_channels

        self.U1 = nn.Linear(dim_in, dim_out, bias=False)
        self.V1 = nn.Linear(dim_in, dim_out, bias=False)
        self.A1 = nn.Linear(dim_in, dim_out, bias=False)
        self.B1 = nn.Linear(dim_in, dim_out, bias=False)
        self.E1 = nn.Linear(dim_in, dim_out, bias=False)

        self.U2 = nn.Linear(dim_in, dim_out, bias=False)
        self.V2 = nn.Linear(dim_in, dim_out, bias=False)
        self.A2 = nn.Linear(dim_in, dim_out, bias=False)
        self.B2 = nn.Linear(dim_in, dim_out, bias=False)
        self.E2 = nn.Linear(dim_in, dim_out, bias=False)

        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(2)
        self.bnv1 = nn.BatchNorm1d(num_classes)
        self.bne1 = nn.BatchNorm1d(num_classes*num_classes)

        self.bnv2 = nn.BatchNorm1d(num_classes)
        self.bne2 = nn.BatchNorm1d(num_classes * num_classes)

        self.act = nn.ReLU()

        self.init_weights_linear(dim_in, 1)

    def init_weights_linear(self, dim_in, gain):
        """Draw every linear weight from N(0, gain * sqrt(2 / dim_in)) and reset the batch norms."""
        scale = gain * np.sqrt(2.0 / dim_in)
        for layer in (self.U1, self.V1, self.A1, self.B1, self.E1,
                      self.U2, self.V2, self.A2, self.B2, self.E2):
            layer.weight.data.normal_(0, scale)

        for bn in (self.bnv1, self.bne1, self.bnv2, self.bne2):
            bn_init(bn)

    def _gated_layer(self, x, edge, start, end, A, B, E, U, V, bne, bnv):
        """Apply one residual gated graph-conv layer and return the updated ``(x, edge)``."""
        # Edge update: combine both endpoint projections with the projected edge.
        end_term = torch.einsum('ev, bvc -> bec', end, A(x))  # E x d_out
        start_term = torch.einsum('ev, bvc -> bec', start, B(x))  # E x d_out
        edge = edge + self.act(bne(end_term + start_term + E(edge)))  # E x d_out

        # Gates: sigmoid of the edge features, then softmax over dim 2 of the
        # (b, N, N, c) view of the edges.
        gate = self.sigmoid(edge)
        b, _, c = gate.shape
        gate = self.softmax(gate.view(b, self.num_classes, self.num_classes, c)).view(b, -1, c)

        # Node update: gated messages gathered per edge, summed back onto nodes
        # and averaged over num_classes.
        messages = torch.einsum('ev, bvc -> bec', start, V(x))  # E x H_out
        aggregated = torch.einsum('ve, bec -> bvc', end.t(), gate * messages)  # V x H_out
        out = U(x) + aggregated / self.num_classes
        return self.act(x + bnv(out)), edge

    def forward(self, x, edge):
        """Run both layers.

        Args:
            x: node features, (b, num_classes, in_channels).
            edge: edge features, (b, num_classes**2, in_channels).

        Returns:
            (x, edge) with the same shapes as the inputs.
        """
        # Always bring the selection matrices to the input's device and dtype.
        # Binding them only inside a CUDA-only branch left them undefined on CPU.
        start = self.start.to(device=x.device, dtype=x.dtype)
        end = self.end.to(device=x.device, dtype=x.dtype)

        # GNN Layer 1:
        x, edge = self._gated_layer(x, edge, start, end,
                                    self.A1, self.B1, self.E1, self.U1, self.V1,
                                    self.bne1, self.bnv1)
        # GNN Layer 2:
        x, edge = self._gated_layer(x, edge, start, end,
                                    self.A2, self.B2, self.E2, self.U2, self.V2,
                                    self.bne2, self.bnv2)
        return x, edge

class GNNMASK(nn.Module):
    """One graph layer over a dynamically built top-K neighbour graph.

    For every sample a binary adjacency ``adj`` is built from the detached node
    features, then the nodes are updated as
        X' = ReLU(X + BN(adj @ V(X) + U(X)))
    """

    def __init__(self, in_channels, num_classes=10, neighbor_num=1, metric='dots'):
        super(GNNMASK, self).__init__()
        # in_channels: dim of node feature
        # num_classes: num of nodes
        # neighbor_num: K in paper and we select the top-K nearest neighbors for each node feature.
        # metric: metric for assessing node similarity ('dots', 'cosine' or 'l1').
        #         Used to build a dynamical graph.

        self.in_channels = in_channels
        self.num_classes = num_classes
        self.relu = nn.ReLU()
        self.metric = metric
        self.neighbor_num = neighbor_num

        # network
        self.U = nn.Linear(self.in_channels,self.in_channels)
        self.V = nn.Linear(self.in_channels,self.in_channels)
        self.bnv = nn.BatchNorm1d(num_classes)

        # init
        self.U.weight.data.normal_(0, math.sqrt(2. / self.in_channels))
        self.V.weight.data.normal_(0, math.sqrt(2. / self.in_channels))
        self.bnv.weight.data.fill_(1)
        self.bnv.bias.data.zero_()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Update node features ``x`` of shape (b, n, c); returns a tensor of the same shape.

        Raises:
            ValueError: if ``self.metric`` is not 'dots', 'cosine' or 'l1'.
        """
        b, n, c = x.shape

        # Build the dynamical graph from a detached copy so that no gradient
        # flows through the neighbour selection.
        si = x.detach()
        if self.metric in ('dots', 'cosine'):
            if self.metric == 'cosine':
                si = F.normalize(si, p=2, dim=-1)
            si = torch.einsum('b i j , b j k -> b i k', si, si.transpose(1, 2))
            # K-th largest similarity per row; everything at or above it is a neighbour.
            threshold = si.topk(k=self.neighbor_num, dim=-1, largest=True)[0][:, :, -1].view(b, n, 1)
            adj = (si >= threshold).to(x.dtype)
        elif self.metric == 'l1':
            si = si.repeat(1, n, 1).view(b, n, n, c)
            si = torch.abs(si.transpose(1, 2) - si).sum(dim=-1)
            # K-th smallest distance per row; everything at or below it is a neighbour.
            threshold = si.topk(k=self.neighbor_num, dim=-1, largest=False)[0][:, :, -1].view(b, n, 1)
            adj = (si <= threshold).to(x.dtype)
        else:
            raise ValueError("Error: wrong metric: ", self.metric)

        # GNN process.
        # NOTE(review): aggregation uses the raw 0/1 adjacency. A degree-normalised
        # adjacency used to be computed next to this line but its result was never
        # used, so that dead computation was dropped; confirm that the
        # un-normalised aggregation is the intended behaviour.
        aggregate = torch.einsum('b i j, b j k->b i k', adj, self.V(x))
        x = self.relu(x + self.bnv(aggregate + self.U(x)))
        return x
class Head(nn.Module):
    """Network head: per-class node features, top-K graph update, cosine scoring.

    Pipeline implemented by ``forward``:
      1. one ``LinearBlock`` per class extracts a class-specific feature map;
      2. each map is averaged over dim -2 to give one node vector per class;
      3. ``mask`` (GNNMASK) updates the nodes over a dynamically built graph;
      4. ``sc``: each node is scored by its cosine similarity to the ReLU of a
         trainable per-class vector.

    ``edge_extractor`` (MEFG), ``gnn`` (GNN) and ``edge_fc`` are constructed and
    registered but are not called by ``forward``.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.class_linears = nn.ModuleList(
            [LinearBlock(in_channels, in_channels) for _ in range(num_classes)]
        )
        self.edge_extractor = MEFG(in_channels, num_classes)
        self.gnn = GNN(in_channels, num_classes)
        self.sc = nn.Parameter(torch.FloatTensor(torch.zeros(num_classes, in_channels)))
        self.edge_fc = nn.Linear(in_channels, 4)
        self.relu = nn.ReLU()
        self.mask = GNNMASK(in_channels, num_classes)
        nn.init.xavier_uniform_(self.edge_fc.weight)
        nn.init.xavier_uniform_(self.sc)

    def forward(self, x):
        """Return per-class scores ``cl`` of shape (b, num_classes).

        ``x`` is fed unchanged to every class branch -- presumably shaped
        (b, d, in_channels); confirm against the caller.
        """
        # Class-specific feature maps stacked along a new dim 1.
        per_class = [branch(x).unsqueeze(1) for branch in self.class_linears]
        f_u = torch.cat(per_class, dim=1)
        # Collapse dim -2 so that each class becomes a single node vector.
        f_v = f_u.mean(dim=-2)

        # Node update on the dynamically built top-K graph.
        f_v = self.mask(f_v)

        # Cosine similarity between every node and its (ReLU-ed) class vector.
        _, n, c = f_v.shape
        anchors = F.normalize(self.relu(self.sc), p=2, dim=-1)
        directions = F.normalize(f_v, p=2, dim=-1)
        return (directions * anchors.view(1, n, c)).sum(dim=-1)
class MEFARG(nn.Module):
    """Sequence model: BiGRU -> LinearBlock -> graph Head -> BiGRU -> linear read-out.

    Args:
        num_classes: number of graph nodes produced by the head.
        neighbor_num: K used by the head's top-K graph construction.
        metric: similarity metric used by the head's graph construction
            ('dots', 'cosine' or 'l1').

    ``bigru2`` and ``linear2`` are constructed and registered but are not used
    by ``forward``.
    """

    def __init__(self, num_classes=10, neighbor_num=1,metric='dots'):
        super(MEFARG, self).__init__()
        self.bigru = GRUModel(num_classes,num_classes,num_classes).to(device)
        self.bigru2 = GRUModel(num_classes,num_classes,num_classes).to(device)
        self.bigru1 = GRUModel(32,32,32).to(device)
        self.in_channels = 32
        self.out_channels = self.in_channels // 4
        self.global_linear = LinearBlock(self.in_channels, self.out_channels)
        self.head = Head(self.out_channels, num_classes)
        # Head builds its GNNMASK with that class's own defaults, so
        # `neighbor_num` and `metric` used to be silently ignored. Rebuild the
        # mask only when the caller asked for something different, which keeps
        # the default construction path exactly as before.
        built = self.head.mask
        if (built.neighbor_num, built.metric) != (neighbor_num, metric):
            self.head.mask = GNNMASK(self.out_channels, num_classes,
                                     neighbor_num=neighbor_num, metric=metric)
        self.linear = nn.Linear(num_classes, 1).to(device)
        self.linear2 = nn.Linear(160, 1).to(device)

    def forward(self, x):
        """Return ``(cl, feat)``.

        ``x`` is a 3-D tensor whose last dim is 32; dims 0 and 1 are swapped
        around the first BiGRU, so dim 1 is that GRU's batch axis and dim 0 its
        sequence axis.

        Returns:
            cl: the per-step scalar read-out, flattened and given a leading
                dim of 1.
            feat: the output of the second BiGRU, i.e. the features ``cl`` is
                read out from.
        """
        # x: b d c
        recurrent, _ = self.bigru1(torch.transpose(x, 0, 1))
        x = torch.transpose(recurrent, 0, 1)
        x = self.global_linear(x)

        scores = self.head(x)
        # The whole score matrix is treated as one sequence (batch of size 1).
        feat, _ = self.bigru(torch.unsqueeze(scores, dim=0))
        cl = torch.unsqueeze(self.linear(feat).flatten(), dim=0)
        return cl,feat

In [9]:
import time
def _load_avec_sample(filename, label, max_length):
    """Load one recording: five feature CSVs concatenated column-wise plus its label column.

    The feature matrix is cut to the shortest of the five feature sets, then
    truncated or zero-padded to exactly ``max_length`` rows. The label vector
    is returned at its original length.
    """
    stem = filename.split('.')[0]
    feature_root = './AVEC2019-master/Baseline_systems/CES/output/'
    feature_sets = ('deepspectrum', 'egemaps_xbow', 'egemaps_functionals',
                    'mfcc_xbow', 'mfcc_functionals')
    blocks = [pd.read_csv(feature_root + name + '/' + stem + '.csv').values
              for name in feature_sets]
    min_shape = min(block.shape[0] for block in blocks)
    all_feat = np.concatenate([block[:min_shape] for block in blocks], axis=1)
    if all_feat.shape[0] >= max_length:
        all_feat = all_feat[:max_length, :]
    else:
        padding = np.zeros((max_length - all_feat.shape[0], all_feat.shape[1]))
        all_feat = np.concatenate([all_feat, padding], axis=0)
    all_feat = torch.tensor(all_feat, dtype=torch.float32)
    labels = pd.read_csv('./AVEC2019_CES_traindevel/labels/' + stem + '.csv', sep=';')
    y = torch.tensor(labels[label].values, dtype=torch.float32)
    return all_feat, y


def _to_model_input(all_feat):
    """Reshape a (T, 160) feature matrix to (T, 5, 32) and move it to ``device``."""
    reshaped = torch.reshape(all_feat, [all_feat.shape[0], 32, 5])
    return torch.transpose(reshaped, 1, 2).to(device)


def _pad_target(y, max_length):
    """Return ``y`` as a (1, max_length) tensor on ``device``, zero-padded or truncated."""
    y_true = torch.unsqueeze(y, dim=0).to(device)
    if y_true.shape[1] < max_length:
        pad = torch.zeros((1, max_length - y_true.shape[1])).to(device)
        return torch.cat([y_true, pad], dim=1)
    return y_true[:, :max_length]


def _evaluate_ccc(model, loss_fn, file_list, y_list, max_length, collect=False):
    """Mean ``loss_fn`` value over the recordings (call under ``torch.no_grad()``).

    When ``collect`` is true, also returns the un-padded targets and the
    model's second output for the same frames, both capped at ``max_length``
    frames per recording so the two always have equal length.
    """
    total = 0.0
    targets, feats = [], []
    for all_feat, y in zip(file_list, y_list):
        y_pred, frame_feat = model(_to_model_input(all_feat))
        y_true = _pad_target(y, max_length)
        total += loss_fn(torch.unsqueeze(y_pred.flatten(), dim=0), y_true).item()
        if collect:
            kept = min(y.shape[0], max_length)
            targets += y[:kept].data.cpu().numpy().tolist()
            feats.append(frame_feat[0, :kept, :].data.cpu().numpy())
    return total / len(file_list), targets, feats


def train_AMEF_with_result(Culture,label,learning_rate,Test_CH=False,Test_BR=False):
    """Train a MEFARG model and log per-epoch timings and CCC values.

    Args:
        Culture: 'DE+HU' to use every file in the audio directory, otherwise a
            substring that a file name must contain to be used.
        label: name of the column of the label CSVs to regress.
        learning_rate: learning rate passed to RMSprop.
        Test_CH, Test_BR: accepted for call compatibility; not used here.

    Files whose name contains 'Train' form the training split, all other
    selected files form the development split. Whenever the development CCC
    improves, the model weights and the development features are written to
    ./saved_model_TTP and ./saved_feat_TTP; the per-epoch log is written to
    ./saved_result_TTP at the end.

    Raises:
        ValueError: if either split ends up empty.
    """
    EPOCHS = 100
    max_length = 1768
    train_file_list, train_y_list = [], []
    dev_file_list, dev_y_list = [], []
    for filename in os.listdir('./AVEC2019_CES_traindevel/audio/'):
        if Culture != 'DE+HU' and Culture not in filename:
            continue
        all_feat, y = _load_avec_sample(filename, label, max_length)
        if 'Train' in filename:
            train_file_list.append(all_feat)
            train_y_list.append(y)
        else:
            dev_file_list.append(all_feat)
            dev_y_list.append(y)
    if not train_file_list or not dev_file_list:
        raise ValueError(
            f'no usable data for Culture={Culture!r}: '
            f'{len(train_file_list)} train / {len(dev_file_list)} dev recordings')

    # The save calls below do not create missing directories themselves.
    for out_dir in ('./saved_model_TTP', './saved_feat_TTP', './saved_result_TTP'):
        os.makedirs(out_dir, exist_ok=True)

    emotion_model = MEFARG().to(device)
    loss_fn = ConcordanceCC()
    optimizer = torch.optim.RMSprop(emotion_model.parameters(),lr = learning_rate)
    max_ccc = 0
    train_time = []
    infer_time = []
    train_loss = []
    infer_loss = []
    for epoch in range(EPOCHS):
        # ---- one optimisation pass, one recording per step ----
        emotion_model.train()
        begin_one_epoch_train = time.time()
        for all_feat, y in zip(train_file_list, train_y_list):
            optimizer.zero_grad()
            y_pred, _ = emotion_model(_to_model_input(all_feat))
            y_true = _pad_target(y, max_length)
            # Minimise 2 - CCC, i.e. maximise the concordance.
            cccloss = 2 - loss_fn(torch.unsqueeze(y_pred.flatten(), dim=0), y_true)
            cccloss.backward()
            optimizer.step()  # parameter update
        train_time.append(time.time() - begin_one_epoch_train)

        # ---- development-set evaluation and checkpointing ----
        begin_one_epoch_infer = time.time()
        emotion_model.eval()
        with torch.no_grad():
            ccc_loss, y_l, feat_y_l = _evaluate_ccc(
                emotion_model, loss_fn, dev_file_list, dev_y_list, max_length, collect=True)
            print(f'dev_CCC_loss:{ccc_loss}')
            if ccc_loss>max_ccc:
                max_ccc = ccc_loss
                torch.save(emotion_model.state_dict(), f'./saved_model_TTP/{Culture}_{label}_{ccc_loss}_model.bin')
                feat_y = np.concatenate(feat_y_l, axis=0)
                feat_df = pd.DataFrame(
                    feat_y, columns=['feat_' + str(i) for i in range(feat_y.shape[1])])
                feat_df.insert(0, label, y_l)
                feat_df.to_csv(f'./saved_feat_TTP/{Culture}_{label}_{ccc_loss}.csv',index=False)
        infer_time.append(time.time() - begin_one_epoch_infer)
        infer_loss.append(ccc_loss)

        # ---- training-set CCC (model is still in eval mode) ----
        with torch.no_grad():
            train_ccc, _, _ = _evaluate_ccc(
                emotion_model, loss_fn, train_file_list, train_y_list, max_length)
        train_loss.append(train_ccc)

    result = pd.DataFrame({
        'train_time': train_time,
        'infer_time': infer_time,
        'train_loss': train_loss,
        'infer_loss': infer_loss,
    })
    result.to_csv(f'./saved_result_TTP/{Culture}_{label}_{max_ccc}.csv',index=False)
    print(f'{Culture}__{label}__ccc_loss:{max_ccc}')

In [5]:
#Cross_validation
from sklearn.model_selection import StratifiedKFold
import time
def _cv_load_padded_features(file_stem, max_length):
    """Load the five baseline audio feature CSVs of one recording as a single tensor.

    Each feature set is read from
    ``./AVEC2019-master/Baseline_systems/CES/output/<set>/<file_stem>.csv``. The
    sets are cut to the shortest row count among them, concatenated column-wise,
    and then truncated or zero-padded to exactly ``max_length`` rows.

    Args:
        file_stem: File name without extension, shared by all feature CSVs.
        max_length: Number of rows of the returned tensor.

    Returns:
        A ``torch.float32`` tensor with ``max_length`` rows.
    """
    feature_root = './AVEC2019-master/Baseline_systems/CES/output/'
    # Order matters: it fixes the column layout of the concatenated features.
    feature_sets = ['deepspectrum', 'egemaps_xbow', 'egemaps_functionals', 'mfcc_xbow', 'mfcc_functionals']
    blocks = [pd.read_csv(feature_root + set_name + '/' + file_stem + '.csv').values for set_name in feature_sets]

    # The feature sets may differ in length; keep only the rows they all share.
    min_shape = min(block.shape[0] for block in blocks)
    all_feat = np.concatenate([block[:min_shape] for block in blocks], axis=1)

    if all_feat.shape[0] >= max_length:
        all_feat = all_feat[:max_length, :]
    else:
        padding = np.zeros((max_length - all_feat.shape[0], all_feat.shape[1]))
        all_feat = np.concatenate([all_feat, padding], axis=0)
    return torch.tensor(all_feat, dtype=torch.float32)


def _cv_load_labels(file_stem, label):
    """Read column ``label`` of the ';'-separated label CSV of one recording as a float32 tensor."""
    label_frame = pd.read_csv('./AVEC2019_CES_traindevel/labels/' + file_stem + '.csv', sep=';')
    return torch.tensor(label_frame[label].values, dtype=torch.float32)


def _cv_forward(model, all_feat):
    """Reshape one feature tensor to (rows, 5, 32) and run it through ``model`` on ``device``."""
    # The reshape requires exactly 32 * 5 = 160 feature columns per row.
    # NOTE(review): reshape-to-(32, 5)-then-transpose interleaves the concatenated
    # columns rather than splitting them into five contiguous groups — confirm
    # this is the intended layout.
    frames = torch.transpose(torch.reshape(all_feat, [all_feat.shape[0], 32, 5]), 1, 2)
    return model(frames.to(device))


def _cv_fixed_length_target(y, max_length):
    """Return ``y`` as a (1, max_length) tensor on ``device``, zero-padded or truncated."""
    y_true = torch.unsqueeze(y, axis=0).to(device)
    if y_true.shape[1] < max_length:
        pad = torch.zeros((1, max_length - y_true.shape[1])).to(device)
        return torch.cat([y_true, pad], axis=1)
    return y_true[:, :max_length]


def train_AMEF_Cross_validation(Culture,label,learning_rate,fold=12,Test_CH=False,Test_BR=False):
    """Train MEFARG over K-fold subsets of the training recordings and log per-epoch CCC.

    Every file in ``./AVEC2019_CES_traindevel/audio/`` is considered. Files whose
    name contains ``'Train'`` go to the training set, all others to the dev set.
    For each split the model is trained for 50 epochs on the split's training
    indices; after every epoch the mean value of ``loss_fn`` (``ConcordanceCC``)
    is computed on the dev files and on all training files.

    Side effects:
        * Whenever the dev value exceeds the best seen so far, the model state
          dict is written to ``./saved_model_TTP/`` and the dev features to
          ``./saved_feat_TTP/``.
        * The per-epoch log (fold, epoch, train value, dev value) is written to
          ``./saved_cv_result/``.
        * Timing information and dev values are printed.

    Args:
        Culture: ``'DE+HU'`` uses every file; any other value keeps only files
            whose name contains it.
        label: Name of the column to read from the label CSVs.
        learning_rate: Learning rate passed to ``torch.optim.RMSprop``.
        fold: Number of splits for ``StratifiedKFold``.
        Test_CH: Unused; kept for backward compatibility.
        Test_BR: Unused; kept for backward compatibility.

    Returns:
        None.
    """
    EPOCHS = 50
    max_length = 1768

    # Create the output directories up front so a missing directory cannot
    # discard the results at the end of a long run.
    for out_dir in ('./saved_model_TTP', './saved_feat_TTP', './saved_cv_result'):
        os.makedirs(out_dir, exist_ok=True)

    train_file_list = []
    dev_file_list = []
    train_y_list = []
    dev_y_list = []
    data_process_begin = time.time()

    for filename in os.listdir('./AVEC2019_CES_traindevel/audio/'):
        # 'DE+HU' means "use every recording"; otherwise filter by culture tag.
        if Culture != 'DE+HU' and Culture not in filename:
            continue
        file_stem = filename.split('.')[0]
        all_feat = _cv_load_padded_features(file_stem, max_length)
        y = _cv_load_labels(file_stem, label)
        if 'Train' in filename:
            train_file_list.append(all_feat)
            train_y_list.append(y)
        else:
            dev_file_list.append(all_feat)
            dev_y_list.append(y)
    data_process_end = time.time()
    print(f'data_process_time:{str(data_process_end-data_process_begin)}')

    # NOTE(review): model and optimizer are created once, so weights carry over
    # from one fold to the next — confirm this is intended.
    emotion_model = MEFARG().to(device)
    loss_fn = ConcordanceCC()
    optimizer = torch.optim.RMSprop(emotion_model.parameters(),lr = learning_rate)
    max_ccc = 0
    fold_id = []
    epoch_id = []
    train_loss = []
    infer_loss = []
    K_Fold = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
    CV_begin = time.time()
    # All samples get the same dummy class, so stratification has nothing to balance.
    dummy_classes = [0 for _ in range(len(train_file_list))]
    # NOTE(review): the held-out indices of each split are never used; evaluation
    # runs on the dev files instead.
    for fold_idx, (train_index, _val_index) in enumerate(K_Fold.split(train_file_list, dummy_classes)):
        print(f'fold {str(fold_idx)} begin')
        CV_begin_one_fold = time.time()
        # Select this fold's training pairs once, keeping the original file order.
        fold_members = set(train_index.tolist())
        fold_train_pairs = [
            (feat, target)
            for idx, (feat, target) in enumerate(zip(train_file_list, train_y_list))
            if idx in fold_members
        ]
        for epoch in range(EPOCHS):
            fold_id.append(fold_idx)
            epoch_id.append(epoch)
            CV_begin_one_epoch_train = time.time()
            emotion_model.train()
            for all_feat, y in fold_train_pairs:
                optimizer.zero_grad()
                y_pred, _ = _cv_forward(emotion_model, all_feat)
                y_true = _cv_fixed_length_target(y, max_length)
                # Minimise 2 - loss_fn(...), i.e. maximise the value returned by loss_fn.
                cccloss = 2-loss_fn(torch.unsqueeze(y_pred.flatten(),axis=0),y_true)
                cccloss.backward()
                optimizer.step()  # parameter update
            CV_end_one_epoch_train = time.time()
            print(f'CV_one_epoch_train_time:{str(CV_end_one_epoch_train-CV_begin_one_epoch_train)}')
            CV_begin_one_epoch_infer = time.time()
            emotion_model.eval()
            with torch.no_grad():
                dev_cc_loss = 0.0
                y_l = []
                feat_y_l = []
                for all_feat, y in zip(dev_file_list, dev_y_list):
                    y_l += y.tolist()
                    shape_y = y.shape[0]
                    y_pred, y_pred_5 = _cv_forward(emotion_model, all_feat)
                    y_true = _cv_fixed_length_target(y, max_length)
                    cccloss = loss_fn(torch.unsqueeze(y_pred.flatten(),axis=0),y_true)
                    dev_cc_loss += cccloss.item()
                    # Keep only the rows that correspond to real (unpadded) labels.
                    feat_y_l.append(y_pred_5[0,:shape_y,:].detach().cpu().numpy())
                feat_y = np.concatenate(feat_y_l,axis=0)
                feat_df = pd.DataFrame()
                feat_df[label] = y_l
                for i in range(feat_y.shape[1]):
                    feat_df['feat_'+str(i)] = feat_y[:,i]
                ccc_loss = dev_cc_loss/len(dev_file_list)
                print(f'dev_CCC_loss:{ccc_loss}')
                # Higher is better: checkpoint whenever the dev value improves.
                if ccc_loss>max_ccc:
                    max_ccc = ccc_loss
                    torch.save(emotion_model.state_dict(), f'./saved_model_TTP/{Culture}_{label}_{ccc_loss}_model.bin')
                    feat_df.to_csv(f'./saved_feat_TTP/{Culture}_{label}_{ccc_loss}.csv',index=0)
            infer_loss.append(ccc_loss)
            with torch.no_grad():
                # Evaluated on ALL training files, not only this fold's subset.
                train_cc_loss = 0.0
                for all_feat, y in zip(train_file_list, train_y_list):
                    y_pred, _ = _cv_forward(emotion_model, all_feat)
                    y_true = _cv_fixed_length_target(y, max_length)
                    cccloss = loss_fn(torch.unsqueeze(y_pred.flatten(),axis=0),y_true)
                    train_cc_loss += cccloss.item()
                ccc_loss = train_cc_loss/len(train_file_list)
            train_loss.append(ccc_loss)
            CV_end_one_epoch_infer = time.time()
            print(f'CV_one_epoch_infer_time:{str(CV_end_one_epoch_infer-CV_begin_one_epoch_infer)}')
        CV_end_one_fold = time.time()
        print(f'CV_one_fold_time:{str(CV_end_one_fold-CV_begin_one_fold)}')
    cv_result = pd.DataFrame({
        'fold_id': fold_id,
        'epoch_id': epoch_id,
        'train_loss': train_loss,
        'infer_loss': infer_loss,
    })
    # Fixed: the original wrote an undefined `result` frame here (NameError).
    # The file name carries the last epoch's training value, as in the original.
    cv_result.to_csv(f'./saved_cv_result/{Culture}_{label}_{ccc_loss}.csv',index=0)
    CV_end = time.time()
    print(f'CV_time:{str(CV_end-CV_begin)}')
    print(f'{Culture}__{label}__ccc_loss:{max_ccc}')

In [None]:
# Launch one train_AMEF run: Culture 'DE', label 'valence', learning rate 0.005.
train_AMEF(
    Culture='DE',
    label='valence',
    learning_rate=0.005,
)