# working model for tensorfusion

In [66]:
import pickle
import torch
from torchmetrics import F1Score
import pandas as pd
print(torch.__version__)

1.13.1+cu117


In [67]:
# data load
with open('./data/lou_dataset_1_3.pkl', 'rb') as f:
    raw_dataset = pickle.load(f)
session = 'Session01'
raw_dataset[session].keys()

dict_keys(['file_names', 'text_embeddings', 'wav_embeddings', 'Emotion', 'Arousal', 'Valence'])

In [68]:
from glob import glob
annot_lst = glob('../org_KEMDy20/annotation/*.csv')
emotion_list = []
for annot_file in annot_lst:
    annot = pd.read_csv(annot_file, skiprows=1)
    emotion_list.append(annot['Emotion'])
emotion_list = list(pd.Series([j for i in emotion_list for j in i]).unique())
len(emotion_list)


24

In [69]:
### 
emotion_lst = []
sessions = ['Session01', 'Session02', 'Session03']
for session in sessions:
    emotion_lst.append(raw_dataset[session]['Emotion'].unique())
emotion_lst = [j for i in emotion_lst for j in i]
emotion_lst = list(pd.Series(emotion_lst).unique())
len(emotion_lst)


12

In [70]:
# encoding Emotion for whole data
encode_dict = {b:i for i, b in enumerate(emotion_list)}
decode_dict = {i:b for i, b in enumerate(emotion_list)}
encode_dict, decode_dict

({'happy': 0,
  'neutral': 1,
  'surprise': 2,
  'surprise;neutral': 3,
  'sad': 4,
  'neutral;sad': 5,
  'happy;neutral': 6,
  'angry;neutral': 7,
  'neutral;disqust': 8,
  'angry': 9,
  'fear': 10,
  'neutral;fear': 11,
  'disqust': 12,
  'happy;surprise': 13,
  'happy;angry;neutral': 14,
  'angry;disqust': 15,
  'happy;surprise;neutral': 16,
  'happy;fear': 17,
  'happy;neutral;fear': 18,
  'angry;neutral;disqust': 19,
  'neutral;disqust;sad': 20,
  'angry;neutral;disqust;fear;sad': 21,
  'happy;sad': 22,
  'happy;neutral;disqust': 23},
 {0: 'happy',
  1: 'neutral',
  2: 'surprise',
  3: 'surprise;neutral',
  4: 'sad',
  5: 'neutral;sad',
  6: 'happy;neutral',
  7: 'angry;neutral',
  8: 'neutral;disqust',
  9: 'angry',
  10: 'fear',
  11: 'neutral;fear',
  12: 'disqust',
  13: 'happy;surprise',
  14: 'happy;angry;neutral',
  15: 'angry;disqust',
  16: 'happy;surprise;neutral',
  17: 'happy;fear',
  18: 'happy;neutral;fear',
  19: 'angry;neutral;disqust',
  20: 'neutral;disqust;sad',

In [71]:
merged_dataset = {}
for session_key in raw_dataset.keys():
    raw_dataset[session_key]['Emotion'] = raw_dataset[session_key]['Emotion'].map(encode_dict)
    for data_name in raw_dataset[session_key].keys():
        if data_name in merged_dataset.keys():
            for data in raw_dataset[session_key][data_name]:
                merged_dataset[data_name].append(data)
        else:
            merged_dataset[data_name] = []
            for data in raw_dataset[session_key][data_name]:
                merged_dataset[data_name].append(data)
    
for data_name in merged_dataset.keys():
    if data_name == 'text_embeddings' or data_name == 'wav_embeddings':
        merged_dataset[data_name] = torch.stack(merged_dataset[data_name])

merged_dataset.keys()

dict_keys(['file_names', 'text_embeddings', 'wav_embeddings', 'Emotion', 'Arousal', 'Valence'])

In [72]:
from collections import Counter
sorted(Counter(merged_dataset['Emotion']))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13]

In [73]:
# check the length of each data column
for i in merged_dataset.keys():
    print(f"{i}: ", len(merged_dataset[i]))

file_names:  917
text_embeddings:  917
wav_embeddings:  917
Emotion:  917
Arousal:  917
Valence:  917


# torch dataset 만들기
- 참고: https://tutorials.pytorch.kr/beginner/basics/data_tutorial.html

In [74]:
import os
import pandas as pd
from datasets import Dataset
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split

In [75]:
class EtriDataset(Dataset):
    def __init__(self, file_names, text_embeddings, wav_embeddings, Emotion, Arousal, Valence):
        self.file_names = file_names
        self.text_embeddings = text_embeddings
        self.wav_embeddings = wav_embeddings
        self.label_emotion = Emotion
        self.label_arousal = Arousal
        self.label_valence = Valence
        
    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        text_embeddings = self.text_embeddings[idx]
        wav_embeddings = self.wav_embeddings[idx]
        label_emotion = self.label_emotion[idx]
        label_arousal = self.label_arousal[idx]
        label_valence = self.label_valence[idx]
        return text_embeddings, wav_embeddings, label_emotion, label_arousal, label_valence

In [76]:
# data load 및 나누기: https://076923.github.io/posts/Python-pytorch-11/

# 개별 session 데이터 셋을 만들었을 때 
# dataset = EtriDataset(raw_dataset[session]['file_names'],
#                       raw_dataset[session]['text_embeddings'],
#                       raw_dataset[session]['wav_embeddings'],
#                       raw_dataset[session]['Emotion'],
#                       raw_dataset[session]['Arousal'],
#                       raw_dataset[session]['Valence'])

# session을 통합시킨 데이터 셋을 만들었을 때 
dataset = EtriDataset(merged_dataset['file_names'],
                      merged_dataset['text_embeddings'],
                      merged_dataset['wav_embeddings'],
                      merged_dataset['Emotion'],
                      merged_dataset['Arousal'],
                      merged_dataset['Valence'])


In [77]:
dataset_size = len(dataset)
train_size = int(dataset_size * 0.7)
validation_size = int(dataset_size * 0.15)
test_size = dataset_size - train_size - validation_size

train_dataset, validation_dataset, test_dataset = random_split(dataset, [train_size, validation_size, test_size])

print(train_size, test_size, validation_size)
print(f"Training Data Size : {len(train_dataset)}")
print(f"Validation Data Size : {len(validation_dataset)}")
print(f"Testing Data Size : {len(test_dataset)}")

641 139 137
Training Data Size : 641
Validation Data Size : 137
Testing Data Size : 139


In [78]:
# data size
merged_dataset['wav_embeddings'][0].shape 
# raw_dataset[session]['wav_embeddings'][0].shape


torch.Size([1, 49, 768])

In [79]:
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, drop_last=True)

# NetWork 만들기

In [80]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [81]:
class MLPNetwork_pre(nn.Module):
    def __init__(self, input_length, input_width):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_length*input_width, 768)
        self.gelu1 = nn.GELU()
        self.bn1 = nn.BatchNorm1d(768)
        self.fc2 = nn.Linear(768, 512)
        self.gelu2 = nn.GELU()
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 32)
        self.gelu3 = nn.GELU()
        
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.gelu1(x)
        x = self.bn1(x)
        x = self.fc2(x)
        x = self.gelu2(x)
        x = self.bn2(x)
        x = self.fc3(x)
        output = self.gelu3(x)
        return output

class MLPNetwork_final(nn.Module):
    def __init__(self, input_length, input_width):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_length*input_width, 256)
        self.gelu1 = nn.GELU()
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 64)
        self.gelu2 = nn.GELU()
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 14)
        
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.gelu1(x)
        x = self.bn1(x)
        x = self.fc2(x)
        x = self.gelu2(x)
        x = self.bn2(x)
        output = self.fc3(x)
        
        return output


In [82]:
class TensorFusionMixer(nn.Module):
    def __init__(self, ModelA, ModelB):
        super().__init__()
        self.ModelA = ModelA
        self.ModelB = ModelB
        self.Model_mlp_final = MLPNetwork_final(32,32).to(device)
        self.softmax = nn.Softmax(dim=1)
        
    def tensor_fusion(self, batch_arr1, batch_arr2):
        fusion_matrix_lst = []
        for i, (arr1, arr2) in enumerate(zip(batch_arr1, batch_arr2)):
            outer_matrix = torch.outer(arr1, arr2)
            l, w = outer_matrix.shape
            outer_matrix = outer_matrix.view(1, l, w)
            fusion_matrix_lst.append(outer_matrix)
        fusion_matrix = torch.concat(fusion_matrix_lst)
        # print(fusion_matrix.shape)
        return fusion_matrix
        
    def forward(self, x1, x2):
        x1 = self.ModelA(x1)
        x2 = self.ModelB(x2)
        fusion_matrix = self.tensor_fusion(x1, x2) 
        x = self.Model_mlp_final(fusion_matrix)
        output = self.softmax(x)
        return output     


# txt_input_length, txt_input_width = raw_dataset[session]['text_embeddings'][0].shape | 마지막엔 지울 것
# _, wav_input_length, wav_input_width = raw_dataset[session]['wav_embeddings'][0].shape
txt_input_length, txt_input_width = merged_dataset['text_embeddings'][0].shape
_, wav_input_length, wav_input_width = merged_dataset['wav_embeddings'][0].shape

# tf_mixer에 들어갈 wav mlp, txt mlp 선언
model_mlp_txt = MLPNetwork_pre(txt_input_length,txt_input_width).to(device)
model_mlp_wav = MLPNetwork_pre(wav_input_length,wav_input_width).to(device)

# 최종 모델 선언
model_tf_mixer = TensorFusionMixer(ModelA = model_mlp_txt, ModelB = model_mlp_wav).to(device)

# model 병렬 학습 처리
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model_mlp_txt = nn.DataParallel(model_mlp_txt).to(device)
    model_mlp_wav = nn.DataParallel(model_mlp_wav).to(device)
    model_tf_mixer = nn.DataParallel(model_tf_mixer).to(device)
print(model_tf_mixer)

Let's use 4 GPUs!
DataParallel(
  (module): TensorFusionMixer(
    (ModelA): MLPNetwork_pre(
      (flatten): Flatten(start_dim=1, end_dim=-1)
      (fc1): Linear(in_features=61440, out_features=768, bias=True)
      (gelu1): GELU(approximate='none')
      (bn1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc2): Linear(in_features=768, out_features=512, bias=True)
      (gelu2): GELU(approximate='none')
      (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc3): Linear(in_features=512, out_features=32, bias=True)
      (gelu3): GELU(approximate='none')
    )
    (ModelB): MLPNetwork_pre(
      (flatten): Flatten(start_dim=1, end_dim=-1)
      (fc1): Linear(in_features=37632, out_features=768, bias=True)
      (gelu1): GELU(approximate='none')
      (bn1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc2): Linear(in_features=768, out_features=512, bias=Tr

# 학습을 위한 train, test method 만들기

In [83]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X_txt, X_wav, y, _, _) in enumerate(dataloader): # data 순서: file_names, text_embeddings, wav_embeddings, label_emotion, label_arousal, label_valence
        
        # 예측 오류 계산
        X_txt, X_wav, y = X_txt.to(device), X_wav.to(device),y.type(torch.LongTensor).to(device)
        pred = model(X_txt, X_wav)
        loss = loss_fn(pred, y)

        # 역전파
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X_txt)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [84]:
def test(dataloader, model, loss_fn, mode = 'test'):
    
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    
    f1 = F1Score(task= 'multiclass', num_classes=14).to(device)   
    preds = []
    targets = []
    with torch.no_grad():
        for batch, (X_txt, X_wav, y, _, _) in enumerate(dataloader): # text_embeddings, wav_embeddings, label_emotion, label_arousal, label_valence
            # 예측 오류 계산
            X_txt, X_wav, y = X_txt.to(device), X_wav.to(device),y.type(torch.LongTensor).to(device)
            pred = model(X_txt, X_wav)
            preds.append(pred)
            targets.append(y)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
            
            
    test_loss /= num_batches
    correct /= size
    if mode == 'test':
        print(torch.cat(preds), torch.cat(preds).shape)
        print("f1 score: ", f1(torch.cat(preds).to(device), torch.cat(targets).to(device)))
        print(f"Test Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}\n")
    elif mode == 'val':
        print(f"Validation Error: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# 학습시키기

In [85]:
# Set the Training Parameters
lr = 1e-3
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model_tf_mixer.parameters(), lr=lr)

epochs = 10
for epoch in range(epochs):
    print(f"---------------Epoch {epoch+1}----------------")
    train(train_dataloader, model_tf_mixer, loss_fn, optimizer)
    test(validation_dataloader, model_tf_mixer, loss_fn, mode = 'val')
print("Done!")

---------------Epoch 1----------------
loss: 2.645760  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640900 

---------------Epoch 2----------------
loss: 2.640558  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640751 

---------------Epoch 3----------------
loss: 2.640400  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640806 

---------------Epoch 4----------------
loss: 2.640406  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640808 

---------------Epoch 5----------------
loss: 2.640128  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640778 

---------------Epoch 6----------------
loss: 2.640275  [    0/  641]
Validation Error: Accuracy: 0.7%, Avg loss: 2.640784 

---------------Epoch 7----------------
loss: 2.640244  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640738 

---------------Epoch 8----------------
loss: 2.640614  [    0/  641]
Validation Error: Accuracy: 1.5%, Avg loss: 2.640616 

--------

## 검증

In [86]:
test(test_dataloader, model_tf_mixer, loss_fn, mode = 'test')

tensor([[0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721],
        [0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721],
        [0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721],
        ...,
        [0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721],
        [0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721],
        [0.0790, 0.0697, 0.0661,  ..., 0.0743, 0.0656, 0.0721]],
       device='cuda:0') torch.Size([128, 14])
f1 score:  tensor(0.0078, device='cuda:0')
Test Error: Accuracy: 0.7%, Avg loss: 2.640086



In [None]:
 for batch, (txt,wav,target_E, _, _) in list(enumerate(test_dataloader))[:3]:
     print(txt,wav,target_E, _, _)

In [None]:
probs = model_tf_mixer(txt.to(device), wav.to(device))
print(probs)
for i in torch.argmax(probs, dim=1):
    print(decode_dict[int(i)])