In [2]:
import librosa
import os
import pandas as pd
import numpy as np

from transformers import AutoProcessor, Data2VecAudioModel
from transformers import AutoTokenizer, Data2VecTextModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset, Audio
import datasets
import glob
import collections
from tqdm import tqdm

import matplotlib.pyplot as plt

In [3]:
def data_path(session_num, modal):
# modal : wav, txt 둘 중 하나 입력
# 이렇게 불러온 데이터는 시간 순은 아님을 유의
    if session_num <= 9:
        dir_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/Session0' + str(session_num)
    else:
        dir_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/Session' + str(session_num)
    data_path = glob.glob(dir_path + '/*.' + modal)
    data_path = sorted(data_path)
    return data_path

def make_audio_datasets(num_sessions):
# num_sessions : from 1 to 40
# type(return) = dict{1:audio_dataset,...,40:audio_dataset}
    audio_datasets = {}
    
    for i in tqdm(range(1, num_sessions + 1)):
        # audio_datasets[i] = datasets.Dataset.from_dict({'audio': data_path(i, 'wav')})
        audio_datasets[i] = datasets.Dataset.from_dict({'audio': data_path(i, 'wav')}).cast_column("audio", Audio())
    return audio_datasets

def make_text_datasets(num_sessions):
# num_sessions : from 1 to 40
# type(return) = dict{1:[paths, corpus],...,40:[paths, corpus]}
    text_datasets = {}
    
    for i in tqdm(range(1, num_sessions + 1)):
        
        paths = []
        corpus = []
        
        for path in data_path(i, 'txt'):
            paths.append(path)
            with open(path, "r") as f:
                corpus.append(f.read())
        
        text_datasets[i] = [paths, corpus]

    return text_datasets  

def audio_embedding(audio_datasets, batch_size=32):
# audio_datasets = {1:audio_dataset, ... , 40:audio_dataset}
    batch_size = batch_size
    
    audio_processed_dict = {}
    for session, audio_dataset in audio_datasets.items():
        
        processed_audio_by_session = []
        for i in range(0, audio_dataset.num_rows, batch_size):
            
            audio_arrayes = []
            for j in audio_dataset[i:i+batch_size]['audio']:
                audio_arrayes.append(j['array'])
        
            inputs = processor(audio_arrayes, sampling_rate = sampling_rate,  padding=True, max_length=16000, truncation = True,return_attention_mask=True, return_tensors="pt")
            processed_audio_by_session.append(inputs)
        
        audio_processed_dict[session] = processed_audio_by_session

    
# audio_processed_dict : {1 : [[batch 당 processing][batch 당 processing][batch 당 processing]],2,3,}    
    audio_embedded_dict = {}        
    for session, val_dic_list in tqdm(audio_processed_dict.items()):
        
        val_dic_by_session = []
        for val_dic in val_dic_list:
            
            with torch.no_grad():
                outputs = audio_d2v(**val_dic)
            val_dic_by_session.append(outputs)    
            
        audio_embedded_dict[session] = val_dic_by_session
# audio_embedded_dict = {1:embedded_data_by_session, ..., 40:embedded_data_by_session}    
    
    return audio_embedded_dict

def text_embedding(text_datasets, batch_size=32):
# text_dataset = {1:[path, corpus], ..., 40:[path, corpus]}    
    
    text_embedded_dict = {}
    
    for session, ( _ , corpus) in tqdm(text_datasets.items()):
        
        embedded_data_by_batch = []
        print(len(corpus))
        for i in range(0, len(corpus), batch_size):
            
            corpus_by_batch = corpus[i:i+batch_size]
            inputs = tokenizer(corpus_by_batch, padding= True, max_length = 20, truncation = True, return_tensors="pt")
            outputs = text_d2v(**inputs)
            embedded_data_by_batch.append(outputs)
            
        text_embedded_dict[session] = embedded_data_by_batch
# text_embedded_dict = {1:embedded_data_by_session, ..., 40:embedded_data_by_session}         
    return text_embedded_dict

def make_target_dataframe(session_num):
# session_num : from 1 to 40

    if session_num <= 9:
        target_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/annotation/Sess0' + str(session_num) + '_eval.csv'
    
    else:
        target_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/annotation/Sess' + str(session_num) + '_eval.csv'

    train = pd.read_csv(target_path)
    train = train[['Segment ID', 'Total Evaluation',' .1',' .2']]
    train.columns = ['segment_id','emotion','valence','arousal']
    train = train.drop([0], axis = 0)
    train = train.sort_values('segment_id', ascending=True)
    train = train.reset_index(drop=True)
    
    return train

def make_target_dict(num_sessions):
# num_sessions : from 1 to 40

    target_dict = {}
    for i in tqdm(range(1, num_sessions + 1)):
        target_data = {}
        target_dataframe = make_target_dataframe(i)
        columns = target_dataframe.columns
        
        for j in columns:
            target_data[j] = target_dataframe[j]

        target_dict[i] = target_data
        
# target_dict = {1:{segment_id:_,emotion:_,valence:_, arousal:_}, ..., 40: {segment_id:_,emotion:_,valence:_, arousal:_}}        
    return target_dict

In [4]:
processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")
audio_d2v = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")

tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_d2v = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")

Some weights of the model checkpoint at facebook/data2vec-audio-base-960h were not used when initializing Data2VecAudioModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Data2VecAudioModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecAudioModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with anothe

In [5]:
audio_datasets = make_audio_datasets(1)
sampling_rate = audio_datasets[1][0]['audio']['sampling_rate']
text_datasets = make_text_datasets(1)

100%|██████████| 1/1 [00:00<00:00, 98.65it/s]
100%|██████████| 1/1 [00:00<00:00, 124.56it/s]


In [6]:
audio_embedded_dict = audio_embedding(audio_datasets,32)
text_embedded_dict = text_embedding(text_datasets,32)

2023-03-30 12:52:31.556861: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 12:52:32.398040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11/include:/usr/local/cuda-11/lib64::/usr/local/cuda/extras/CUPTI/lib64
2023-03-30 12:52:32.398115: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11/include:/usr/local/cuda

311


100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


In [7]:
input_audio_shape = audio_embedded_dict[1][0]['last_hidden_state'].shape
a_b, a_l, a_e = input_audio_shape
input_text_shape = text_embedded_dict[1][0]['last_hidden_state'].shape
t_b, t_l, t_e = input_text_shape
iteration = len(audio_embedded_dict[1])
batch_size = len(audio_embedded_dict[1][0]['last_hidden_state'])

In [8]:
class Audio_Mlp_For_Tensor_Fusion(nn.Module):

    def __init__(self):
        super(Audio_Mlp_For_Tensor_Fusion, self).__init__()
        
        self.fc1 = nn.Linear(a_l * a_e, 768)
        self.fc2 = nn.Linear(768, 32)

    def forward(self, x):
                          
        x = torch.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

audio_mlp_for_tensor_fusion = Audio_Mlp_For_Tensor_Fusion()

In [9]:
class Text_Mlp_For_Tensor_Fusion(nn.Module):

    def __init__(self):
        super(Text_Mlp_For_Tensor_Fusion, self).__init__()
        
        self.fc1 = nn.Linear(t_l * t_e, 768)
        self.fc2 = nn.Linear(768, 32)

    def forward(self, x):
                          
        x = torch.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

text_mlp_for_tensor_fusion = Text_Mlp_For_Tensor_Fusion()

In [10]:
tensor_fusion_by_iteration = {}
for i in range(iteration):
    n, _, _ = audio_embedded_dict[1][i]['last_hidden_state'].shape
    
    temp_list = []
    for j in range(n):
        a_f_tf = audio_mlp_for_tensor_fusion(audio_embedded_dict[1][i]['last_hidden_state'][j])

        t_f_tf = text_mlp_for_tensor_fusion(text_embedded_dict[1][i]['last_hidden_state'][j])
        temp_tensor = torch.outer(a_f_tf, t_f_tf)
        a, t = temp_tensor.shape
        temp_tensor = temp_tensor.view(1, a, t)
        temp_list.append(temp_tensor)

    tensor_fusion_by_iteration[i+1] = torch.tensor(torch.concat(temp_list),requires_grad = True)

  tensor_fusion_by_iteration[i+1] = torch.tensor(torch.concat(temp_list),requires_grad = True)


In [11]:
print('iteration 수 : ', len(tensor_fusion_by_iteration[1]))
print('iteration 내부 텐서 shape : ', tensor_fusion_by_iteration[1].shape)

iteration 수 :  32
iteration 내부 텐서 shape :  torch.Size([32, 32, 32])


In [12]:
class Mlp_For_Regression(nn.Module):

    def __init__(self):
        super(Mlp_For_Regression, self).__init__()
        
        self.fc2 = nn.Linear(32 * 32, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
                          
        x = torch.flatten(x)
        x = F.gelu(self.fc2(x))
        x = self.fc3(x)
        return x

mlp_for_regression = Mlp_For_Regression()

In [13]:
class Mlp_For_Classification(nn.Module):

    def __init__(self):
        super(Mlp_For_Classification, self).__init__()
        
        self.fc2 = nn.Linear(32 * 32, 32)
        self.fc3 = nn.Linear(32, 7)

    def forward(self, x):
                          
        x = torch.flatten(x)
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x

mlp_for_classification = Mlp_For_Classification()

In [14]:
class CCC(nn.Module):
    def __init__(self):
        super(CCC, self).__init__()
        self.mean = torch.mean
        self.var = torch.var
        self.sum = torch.sum
        self.sqrt = torch.sqrt
        self.std = torch.std
        
    def forward(self, pred, target):
        mean_gt = self.mean (target, 0)
        mean_pred = self.mean (pred, 0)
        var_gt = self.var (target, 0)
        var_pred = self.var (pred, 0)
        v_pred = pred - mean_pred
        v_gt = target - mean_gt
        cor = self.sum (v_pred * v_gt) / (self.sqrt(self.sum(v_pred ** 2)) * self.sqrt(self.sum(v_gt ** 2)))
        sd_gt = self.std(target)
        sd_pred = self.std(pred)
        numerator = 2 * cor * sd_gt * sd_pred
        denominator = var_gt + var_pred + (mean_gt-mean_pred) ** 2
        ccc = numerator / denominator
        return ccc

In [15]:
target_dict = make_target_dict(40)

100%|██████████| 40/40 [00:00<00:00, 167.00it/s]


In [16]:
# valence arousal

valence = torch.tensor(target_dict[1]['valence'].astype('float32'))
valence = valence.view(1, valence.size(0))
arousal = torch.tensor(target_dict[1]['arousal'].astype('float32'))
arousal = arousal.view(1, arousal.size(0))
target = torch.concat([valence, arousal])
target = target.T

In [17]:
## arousal과 valence

x_train = list(tensor_fusion_by_iteration.values())
y_train = target

criterion = nn.MSELoss()
optimizer = optim.SGD(mlp_for_regression.parameters(), lr=0.01)
ccc = CCC()

batch_size, _, _ = tensor_fusion_by_iteration[1].shape
num_epochs = 10

num_samples = 0
for i in range(iteration):
    num, _, _ = tensor_fusion_by_iteration[i+1].shape
    num_samples += num

In [18]:
## arousal과 valence / loss_2개

epoch_loss_v = 0
epoch_loss_a = 0
ccc_difference = 0

for epoch in range(num_epochs):
    running_loss_v = 0.0
    running_loss_a = 0.0
    running_ccc_difference = 0.0
    
    for i in tqdm(range(iteration)):
        
        input = x_train[i]
        num, _, _ = input.shape
        targets = y_train[i * batch_size :i * batch_size + num]
        targets.requires_grad = True
        
        outputs = []        
        for j in range(num):
            
            output = mlp_for_regression(input[j])
            output = output.view(1, output.size(0))
            outputs.append(output)
            
        outputs = torch.concat(outputs)
        
        loss_v = criterion(outputs[:,0], targets[:,0])
        loss_a = criterion(outputs[:,1], targets[:,1])
        
        optimizer.zero_grad()
        loss_v.backward(retain_graph =True)
        loss_a.backward()
        optimizer.step()

        pred_ccc = ccc(outputs[:,0], outputs[:,1])
        target_ccc = ccc(targets[:,0], targets[:,1])
        difference = target_ccc - pred_ccc
        running_ccc_difference += difference
        
        ccc_difference += ccc_difference  * num
        running_loss_v += loss_v.item() * num
        running_loss_a += loss_a.item() * num

    epoch_loss_v = running_loss_v / num_samples
    epoch_loss_a = running_loss_a / num_samples
    ccc_difference = running_ccc_difference / num_samples
    print("Epoch {}, Loss_Valence: {}".format(epoch+1, epoch_loss_v))
    print("Epoch {}, Loss_Arousal: {}".format(epoch+1, epoch_loss_a))
    print('CCC Difference : {}'.format(ccc_difference))
    print('-'*30)


100%|██████████| 10/10 [00:00<00:00, 19.47it/s]


Epoch 1, Loss_Valence: 9.412535546293595
Epoch 1, Loss_Arousal: 7.74254533792232
CCC Difference : 0.012114864774048328
------------------------------


100%|██████████| 10/10 [00:00<00:00, 66.55it/s]


Epoch 2, Loss_Valence: 6.063390145347816
Epoch 2, Loss_Arousal: 4.948316414639881
CCC Difference : 0.012114851735532284
------------------------------


100%|██████████| 10/10 [00:00<00:00, 60.71it/s]


Epoch 3, Loss_Valence: 3.802438705297145
Epoch 3, Loss_Arousal: 3.085053010768829
CCC Difference : 0.01211480237543583
------------------------------


100%|██████████| 10/10 [00:00<00:00, 63.14it/s]


Epoch 4, Loss_Valence: 2.28096360907294
Epoch 4, Loss_Arousal: 1.8421253529392254
CCC Difference : 0.012114626355469227
------------------------------


100%|██████████| 10/10 [00:00<00:00, 61.51it/s]


Epoch 5, Loss_Valence: 1.2964051877570688
Epoch 5, Loss_Arousal: 1.0403137412101893
CCC Difference : 0.012113993987441063
------------------------------


100%|██████████| 10/10 [00:00<00:00, 64.21it/s]


Epoch 6, Loss_Valence: 0.712212885643125
Epoch 6, Loss_Arousal: 0.5622005958074159
CCC Difference : 0.012111721560359001
------------------------------


100%|██████████| 10/10 [00:00<00:00, 64.25it/s]


Epoch 7, Loss_Valence: 0.4085698378095673
Epoch 7, Loss_Arousal: 0.31040899256705473
CCC Difference : 0.01210432592779398
------------------------------


100%|██████████| 10/10 [00:00<00:00, 62.08it/s]


Epoch 8, Loss_Valence: 0.27288672759218613
Epoch 8, Loss_Arousal: 0.19581423668711898
CCC Difference : 0.012085750699043274
------------------------------


100%|██████████| 10/10 [00:00<00:00, 65.17it/s]


Epoch 9, Loss_Valence: 0.21971068131195387
Epoch 9, Loss_Arousal: 0.1501296289576595
CCC Difference : 0.012055540457367897
------------------------------


100%|██████████| 10/10 [00:00<00:00, 65.14it/s]

Epoch 10, Loss_Valence: 0.20051142338579492
Epoch 10, Loss_Arousal: 0.13355249391683044
CCC Difference : 0.012029086239635944
------------------------------





In [19]:
## arousal과 valence / loss 1개

epoch_loss = 0
ccc_difference = 0

for epoch in range(num_epochs):
    running_loss = 0.0
    running_ccc_difference = 0.0
    
    for i in tqdm(range(iteration)):
        
        input = x_train[i]
        num, _, _ = input.shape
        targets = y_train[i * batch_size :i * batch_size + num]
        targets.requires_grad = True
        
        outputs = []        
        for j in range(num):
            
            output = mlp_for_regression(input[j])
            output = output.view(1, output.size(0))
            outputs.append(output)
            
        outputs = torch.concat(outputs)
        
        loss = criterion(outputs, targets)

        pred_ccc = ccc(outputs[:,0], outputs[:,1])
        target_ccc = ccc(targets[:,0], targets[:,1])
        difference = target_ccc - pred_ccc
        running_ccc_difference += difference
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        ccc_difference += ccc_difference * num
        running_loss += loss.item() * num

        
    epoch_loss = running_loss / num_samples
    ccc_difference = running_ccc_difference / num_samples
    print("Epoch {}, Loss: {}".format(epoch+1, epoch_loss))
    print('CCC Difference : {}'.format(ccc_difference))
    print('-'*30)


100%|██████████| 10/10 [00:00<00:00, 95.65it/s]


Epoch 1, Loss: 0.16094960023160915
CCC Difference : 0.012060631066560745
------------------------------


100%|██████████| 10/10 [00:00<00:00, 102.52it/s]


Epoch 2, Loss: 0.15908928792863797
CCC Difference : 0.012056855484843254
------------------------------


100%|██████████| 10/10 [00:00<00:00, 104.16it/s]


Epoch 3, Loss: 0.15799454247453204
CCC Difference : 0.012054767459630966
------------------------------


100%|██████████| 10/10 [00:00<00:00, 106.03it/s]


Epoch 4, Loss: 0.15734340173830175
CCC Difference : 0.012053962796926498
------------------------------


100%|██████████| 10/10 [00:00<00:00, 105.12it/s]


Epoch 5, Loss: 0.15695037886835755
CCC Difference : 0.012054027989506721
------------------------------


100%|██████████| 10/10 [00:00<00:00, 96.97it/s]


Epoch 6, Loss: 0.15670887445521892
CCC Difference : 0.012054613791406155
------------------------------


100%|██████████| 10/10 [00:00<00:00, 103.18it/s]


Epoch 7, Loss: 0.15655745843790736
CCC Difference : 0.012055478990077972
------------------------------


100%|██████████| 10/10 [00:00<00:00, 105.51it/s]


Epoch 8, Loss: 0.15646047248142708
CCC Difference : 0.012056452222168446
------------------------------


100%|██████████| 10/10 [00:00<00:00, 102.87it/s]


Epoch 9, Loss: 0.15639707192155708
CCC Difference : 0.012057436630129814
------------------------------


100%|██████████| 10/10 [00:00<00:00, 101.91it/s]

Epoch 10, Loss: 0.15635480681416306
CCC Difference : 0.012058368884027004
------------------------------



