In [3]:
import librosa
import os
import pandas as pd
import numpy as np

from transformers import AutoProcessor, Data2VecAudioModel
from transformers import AutoTokenizer, Data2VecTextModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset, Audio
import datasets
import glob
import collections
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [78]:
def data_path(session_num, modal):
# modal : wav, txt 둘 중 하나 입력
# 이렇게 불러온 데이터는 시간 순은 아님을 유의
    if session_num <= 9:
        dir_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/Session0' + str(session_num)
    else:
        dir_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/Session' + str(session_num)
    data_path = glob.glob(dir_path + '/*.' + modal)
    data_path = sorted(data_path)
    return data_path

def make_audio_datasets(num_sessions):
# num_sessions : from 1 to 40
# type(return) = dict{1:audio_dataset,...,40:audio_dataset}
    audio_datasets = {}
    
    for i in tqdm(range(1, num_sessions + 1)):
        # audio_datasets[i] = datasets.Dataset.from_dict({'audio': data_path(i, 'wav')})
        audio_datasets[i] = datasets.Dataset.from_dict({'audio': data_path(i, 'wav')}).cast_column("audio", Audio())
    return audio_datasets

def make_text_datasets(num_sessions):
# num_sessions : from 1 to 40
# type(return) = dict{1:[paths, corpus],...,40:[paths, corpus]}
    text_datasets = {}
    
    for i in tqdm(range(1, num_sessions + 1)):
        
        paths = []
        corpus = []
        
        for path in data_path(i, 'txt'):
            paths.append(path)
            with open(path, "r") as f:
                corpus.append(f.read())
        
        text_datasets[i] = [paths, corpus]

    return text_datasets  

def audio_embedding(audio_datasets, batch_size=32):
# audio_datasets = {1:audio_dataset, ... , 40:audio_dataset}
    batch_size = batch_size
    
    audio_processed_dict = {}
    for session, audio_dataset in audio_datasets.items():
        
        processed_audio_by_session = []
        for i in range(0, audio_dataset.num_rows, batch_size):
            
            audio_arrayes = []
            for j in audio_dataset[i:i+batch_size]['audio']:
                audio_arrayes.append(j['array'])
            
            inputs = processor(audio_arrayes, sampling_rate=sampling_rate, padding=True, max_length=400, return_attention_mask=True, return_tensors="pt")
            processed_audio_by_session.append(inputs)
        
        audio_processed_dict[session] = processed_audio_by_session

    
# audio_processed_dict : {1 : [[batch 당 processing][batch 당 processing][batch 당 processing]],2,3,}    
    audio_embedded_dict = {}        
    for session, val_dic_list in tqdm(audio_processed_dict.items()):
        
        val_dic_by_session = []
        for val_dic in val_dic_list:
            
            with torch.no_grad():
                outputs = audio_d2v(**val_dic)
            val_dic_by_session.append(outputs)    
            
        audio_embedded_dict[session] = val_dic_by_session
# audio_embedded_dict = {1:embedded_data_by_session, ..., 40:embedded_data_by_session}    
    
    return audio_embedded_dict

def text_embedding(text_datasets, batch_size=32):
# text_dataset = {1:[path, corpus], ..., 40:[path, corpus]}    
    
    text_embedded_dict = {}
    
    for session, ( _ , corpus) in tqdm(text_datasets.items()):
        
        embedded_data_by_batch = []
        print(len(corpus))
        for i in range(0, len(corpus), batch_size):
            
            corpus_by_batch = corpus[i:i+batch_size]
            inputs = tokenizer(corpus_by_batch, padding= True, max_length = 35, return_tensors="pt")
            outputs = text_d2v(**inputs)
            embedded_data_by_batch.append(outputs)
            
        text_embedded_dict[session] = embedded_data_by_batch
# text_embedded_dict = {1:embedded_data_by_session, ..., 40:embedded_data_by_session}         
    return text_embedded_dict

def make_target_dataframe(session_num):
# session_num : from 1 to 40

    if session_num <= 9:
        target_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/annotation/Sess0' + str(session_num) + '_eval.csv'
    
    else:
        target_path = '/home/arplab/project/paradeigma/multi_modal/org_KEMDy20/annotation/Sess' + str(session_num) + '_eval.csv'

    train = pd.read_csv(target_path)
    train = train[['Segment ID', 'Total Evaluation',' .1',' .2']]
    train.columns = ['segment_id','emotion','valence','arousal']
    train = train.drop([0], axis = 0)
    train = train.sort_values('segment_id', ascending=True)
    train = train.reset_index(drop=True)
    
    return train

def make_target_dict(num_sessions):
# num_sessions : from 1 to 40

    target_dict = {}
    for i in tqdm(range(1, num_sessions + 1)):
        target_data = {}
        target_dataframe = make_target_dataframe(i)
        columns = target_dataframe.columns
        
        for j in columns:
            target_data[j] = target_dataframe[j]

        target_dict[i] = target_data
        
# target_dict = {1:{segment_id:_,emotion:_,valence:_, arousal:_}, ..., 40: {segment_id:_,emotion:_,valence:_, arousal:_}}        
    return target_dict
            
def tensor_fusion(num):

    fusion = torch.outer(audio_outputs[num][0,:,0], text_outputs[num][0,:,0])
    a,b = fusion.shape
    fusion = fusion.view([a, b, -1])
    
    for i in tqdm(range(1, 768)):
        fusion_1 = torch.outer(audio_outputs[num][0,:,i], text_outputs[num][0,:,i])
        a,b = fusion_1.shape
        fusion_1 = fusion_1.view([a, b, -1])
        fusion = torch.concat((fusion, fusion_1), dim=2)
        
    return fusion

In [79]:
processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")
audio_d2v = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")

tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_d2v = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")

Some weights of the model checkpoint at facebook/data2vec-audio-base-960h were not used when initializing Data2VecAudioModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Data2VecAudioModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecAudioModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with anothe

In [18]:
for session, (_, corpus) in text_datasets.items():
    print(session)
    print(corpus)
    print(corpus[0:32], len(corpus[0:32]))
    break

1
['n/ 아 친구들도? l/\n', 'l/ 나는 생일?\n', '생일날이면은 b/ 내가 고기를 되게 좋아하니까 엄마도 그걸 아니까 미역국도 같이 해주시는데 꼭 고기를 같이 구워주시거나 아니면 여행을 가서 고기를 구워 먹는다거나 꼭 고깃집에 가서 어 해먹어, 꼭.\n', 'b/ 선물이라 이 보통 돈으로 주시지. l/\n', 'l/ 용돈으로 주시고.\n', '옛날에는 n/ b/ 내가 원하는 게 뭔지 일단 물어보시곤 했던 거 같애.\n', 'c/ 옷이라든가 갖고 싶은 거 있나라든가 n/ 근데 이제 b/ 나이를 먹을수록 현금으로 챙겨주시더라구.l/\n', '응 나도 되게 b/ 여행 가는 걸 좋아하긴 하는데 가족 다 같이 가는 것보다 되게 엄마랑 여행 가는 게 많았던 거 같아. \n', '어 단둘이.\n', '그 되게 아빠나 오빠가 시간이 안 나거나 꼭 b/ 어디 약속이 있다거나 그래가지고 엄마가 주말마다 좀 같이 둘이 어디 데려다가 줬어, 그냥.\n', '여행 여행을.\n', 'b/ 최근 최근에?\n', 'b/ 굳이 가자면 나는 부산에 가면 꼭 엄마가 생각나.\n', '어 부산을 좀 가기도 많이 갔는데 부산에 엄마가 좀 n/ 그 친척들이 u/ 가지고.\n', '아 n/ 되게 되게 처음이라 엉망진창이었던 거 같긴 한데.\n', '나는 어 어머니, 엄마가 이제 생일 때마다 이제 미역국도 많이 해서 미역국을 되게 좋아하거든.\n', '전에도 말했지만 그래서 미역국이랑 볶음김치 막 해주셔서 맨날 그렇게 차려주시는데 어 옛날에는 이제 집에서 밥 먹는 거 보다 그 캔모아 빙수 있는데 있잖아.\n', 'c/ 어 응 거기를 되게 엄 집 앞에 그 조그만한 가게로 있었는데 거기를 엄청 생일 때마다 매번 같이 데려갔었거든.\n', '거기서 엄마랑 둘이 생일 파티 그냥 간단하게 하는 경우도 있었고 b/ 보 음 보통 그렇게 아니면 친구들 이제 거기 빙수 가게로 불러서 같이 빙수 먹으면서 케익도 불고 막 이랬었었어.\n', '응응 그래서 생일 때 그렇게 했던 경험이 있는데 그때가 이제 초

In [65]:
audio_datasets = make_audio_datasets(1)
sampling_rate = audio_datasets[1][0]['audio']['sampling_rate']
text_datasets = make_text_datasets(1)


100%|██████████| 1/1 [00:00<00:00, 119.56it/s]
100%|██████████| 1/1 [00:00<00:00, 104.88it/s]


In [69]:
# batch 4짜리

inputs = []
for i in range(4):
    inputs.append(audio_datasets[1][i]['audio']['array'])
    
processed_inputs = processor(inputs, sampling_rate=sampling_rate, padding=True, max_length=400, return_attention_mask=True, return_tensors="pt")
audio_outputs = audio_d2v(**processed_inputs)

In [7]:
# batch 4짜리

inputs = []
for i in range(4):
    inputs.append(text_datasets[1][1][i])
    
processed_inputs = tokenizer(inputs, padding=True, max_length=20, return_attention_mask=True, return_tensors="pt")
text_outputs = text_d2v(**processed_inputs)



In [None]:
audio_datasets[1]['audio']['array']

In [80]:
# cpu만 사용하면 8분걸림
# cuda를 사용하면 OOM

audio_embedded_dict = audio_embedding(audio_datasets)

100%|██████████| 1/1 [08:22<00:00, 502.70s/it]


In [81]:
# cpu만 사용하다가 커널이 죽었음
# cuda를 사용하면 OOM

text_embedded_dict = text_embedding(text_datasets)



311


: 

: 

In [10]:
target_dict = make_target_dict(40)

100%|██████████| 40/40 [00:00<00:00, 93.94it/s]


In [11]:
target = target_dict[1]['emotion'][0:4]
target[0:3] = 0
target[3] = 1
target = target.astype('float32')
target = target.values
target = torch.tensor(target)

In [12]:
class MLP_1(nn.Module):
    
    def __init__(self):
        super(MLP_1, self).__init__()
        
        self.fc1 = nn.Linear(1235 * 768, 768)
        self.fc2 = nn.Linear(768, 1)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
        
mlp_1 = MLP_1()

In [64]:
# 차원 축소를 무조건 수행해야 할 것. 

class MLP_2(nn.Module):
    
    def __init__(self):
        super(MLP_2, self).__init__()
        
        self.fc1 = nn.Linear(988 * 247 * 768, 768)
        self.fc2 = nn.Linear(768, 1)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
        
mlp_2 = MLP_2()

RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 575753158656 bytes. Error code 12 (Cannot allocate memory)

In [132]:
shape = (4, 1235 , 768)
x_train = torch.rand(shape)
y_train = target

criterion = nn.MSELoss()
optimizer = optim.SGD(mlp_1.parameters(), lr=0.01)

num_epochs = 2
batch_size = 1
num_samples = x_train.shape[0]

In [None]:
x_train = torch.tensor(torch.cat((audio_outputs['last_hidden_state'], text_outputs['last_hidden_state']), dim=1))
y_train = target

criterion = nn.MSELoss()
optimizer = optim.SGD(mlp_1.parameters(), lr=0.01)

num_epochs = 2
batch_size = 1
num_samples = x_train.shape[0]

In [61]:
target[0]

tensor(0.)

In [62]:
x_train = a_1
y_train = target[0]

criterion = nn.MSELoss()
optimizer = optim.SGD(mlp_1.parameters(), lr=0.01)

num_epochs = 2
batch_size = 1
num_samples = x_train.shape[0]

In [16]:
epoch_loss = 0
for epoch in range(num_epochs):
    
    running_loss = 0.0
    for i in range(0, num_samples, batch_size):
        print(i)
        inputs = x_train[i:i+batch_size]
        targets = y_train[i:i+batch_size]
        
        outputs = mlp_1(inputs)

        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * batch_size
        
    epoch_loss = running_loss / num_samples
    print("Epoch {}, Loss: {}".format(epoch+1, epoch_loss))
        

0
1
2
3
Epoch 1, Loss: 1931420318564352.0
0
1
2
3
Epoch 2, Loss: 1643181204570112.0


In [63]:
x_train = torch.randint(0, 1, (1,120))
y_train = torch.randint(0, 1, (1,10))

criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

num_epochs = 5
batch_size = 1
num_samples = x_train.shape[0]

NameError: name 'net' is not defined

In [66]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x


net = Net()
print(net)

Net(
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [67]:
net(x_train)

RuntimeError: mat1 and mat2 must have the same dtype

In [68]:
# 모델 정의
class SimpleModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 데이터셋 생성
x_train = torch.randn((30, 10))
y_train = torch.randn((30, 5))

# 하이퍼파라미터 설정
input_dim = x_train.shape[1]
hidden_dim = 20
output_dim = y_train.shape[1]
lr = 0.01
num_epochs = 3
batch_size = 5

# 모델 초기화
model = SimpleModel(input_dim, hidden_dim, output_dim)

# 손실함수 정의
criterion = nn.MSELoss()

# 옵티마이저 정의
optimizer = optim.SGD(model.parameters(), lr=lr)

# 훈련
for epoch in range(num_epochs):
    running_loss = 0.0
    for i in range(0, num_samples, batch_size):
        # 배치 데이터 로드
        inputs = x_train[i:i+batch_size]
        targets = y_train[i:i+batch_size]
        
        # 순전파
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # 역전파
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # 손실값 누적
        running_loss += loss.item() * batch_size
    
    # 에폭별 손실값 출력
    print('Epoch {} Loss: {:.4f}'.format(epoch+1, running_loss/num_samples))

Epoch 1 Loss: 0.8839
Epoch 2 Loss: 0.8758
Epoch 3 Loss: 0.8678


In [71]:
x_train

tensor([[[-1.1453e+00, -1.3806e-01, -6.6733e-02,  ...,  6.6787e-03,
          -7.2099e-02,  2.8462e-01],
         [-1.1453e+00, -1.3806e-01, -6.6732e-02,  ...,  6.6784e-03,
          -7.2098e-02,  2.8462e-01],
         [-1.1453e+00, -1.3806e-01, -6.6733e-02,  ...,  6.6792e-03,
          -7.2098e-02,  2.8462e-01],
         ...,
         [-1.7483e-02, -5.4983e-03, -6.0314e-02,  ...,  1.3965e-02,
          -1.4430e-01,  1.0852e-01],
         [-1.7483e-02, -5.4983e-03, -6.0314e-02,  ...,  1.3965e-02,
          -1.4430e-01,  1.0852e-01],
         [-1.7483e-02, -5.4984e-03, -6.0314e-02,  ...,  1.3965e-02,
          -1.4430e-01,  1.0852e-01]],

        [[-9.5902e-01, -6.7264e-02, -2.1643e-01,  ...,  9.4446e-02,
          -7.7794e-04,  4.8156e-01],
         [-9.5902e-01, -6.7264e-02, -2.1643e-01,  ...,  9.4447e-02,
          -7.7803e-04,  4.8156e-01],
         [-9.5902e-01, -6.7264e-02, -2.1643e-01,  ...,  9.4446e-02,
          -7.7759e-04,  4.8156e-01],
         ...,
         [ 7.2260e-02, -2

In [69]:
x_train

tensor([[ 0.7589, -0.2451,  0.3043, -1.1257, -0.7017, -0.5993,  1.4000,  0.9494,
         -0.0170, -0.0424],
        [-0.8227,  0.0786, -0.7696,  0.7970,  2.1975,  0.3471,  0.3051, -1.5030,
          1.1239,  0.3276],
        [ 1.1657, -0.7739,  1.4156, -1.7199,  1.0934,  0.5445, -0.7570, -1.4942,
         -1.0362,  0.5432],
        [-0.2782, -1.2190,  1.0431,  0.0808, -0.1665,  0.7406, -1.2880,  0.7423,
          0.8216,  1.6441],
        [ 0.3207,  0.8375,  0.5336,  1.1942,  0.5790,  0.2433, -1.0244, -0.9470,
         -0.0452, -0.8878],
        [ 0.8417,  0.2664,  0.5194,  0.7736, -1.3530, -1.0820,  0.6434, -0.9518,
         -0.4851, -0.5505],
        [ 0.1473,  2.3938, -0.8641,  0.2462,  0.0651,  0.2549, -0.9982, -0.8814,
         -0.0098,  1.5151],
        [ 0.5930, -0.8168, -0.8172, -0.1530,  0.0569,  0.6663, -0.3458, -1.2687,
         -0.3133, -0.4569],
        [-0.9944, -0.7339, -0.1201, -0.6393,  0.7754, -1.3064, -0.4372, -0.0418,
         -0.4973, -0.1493],
        [-0.4678, -