# working model for tensorfusion

1. 해당 Note는 EDA와 TEMP 데이터를 제외한 Wav와 Text 데이터만을 사용했습니다.

2. Tensorfusion은 kronecker product로 하였고 이후 Conv1D를 달았습니다.

In [2]:
import pickle
import torch
from torchmetrics import F1Score
from torchmetrics.classification import MulticlassPrecision
import pandas as pd
import numpy as np
from glob import glob
from collections import Counter
import torch.nn.functional as F
from tqdm import tqdm
import os
from datasets import Dataset
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import random
from torchsummary import summary as summary

print(torch.__version__)

1.13.1+cu117


## wav, text데이터 불러오기

In [3]:
# wav and text data load
# kemdy19, kemdy20의 데이터 정리 + 임베딩 데이터 불러오기 
dataset_file_lst = ['../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_annotation_nonmissing.pkl',
                    '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_embedding_for_dataset.pkl',
                    '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_annotation_nonmissing.pkl',
                    '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_embedding_for_dataset.pkl']
dataset_file_lst = sorted(dataset_file_lst)
dataset_file_lst

['../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_annotation_nonmissing.pkl',
 '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_embedding_for_dataset.pkl',
 '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_annotation_nonmissing.pkl',
 '../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_embedding_for_dataset.pkl']

In [4]:
var_names = ['kemdy19_annot', 'kemdy19_emb', 'kemdy20_annot', 'kemdy20_emb']
for file, var_name in zip(dataset_file_lst, var_names):
    print(file, var_name)
    with open(file, 'rb') as f:
        globals()[var_name] = pickle.load(f)

../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_annotation_nonmissing.pkl kemdy19_annot
../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY19_embedding_for_dataset.pkl kemdy19_emb
../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_annotation_nonmissing.pkl kemdy20_annot
../../../paradeigma/multi_modal/model/data/paradeigma_KEMDY20_embedding_for_dataset.pkl kemdy20_emb


<2020년>
'angry;disqust',  'angry;disqust;fear;neutral;sad',  'angry;disqust;neutral', 'angry;happy;neutral', 'angry;neutral', 'disqust;happy;neutral', 'disqust;neutral', 'disqust;neutral;sad', 'fear;happy', 'fear;happy;neutral', 'fear;neutral', 'happy;neutral', 'happy;neutral;surprise', 'happy;sad', 'happy;surprise', 'neutral;sad', 'neutral;surprise'

<2019년>
'angry;disgust;fear;neutral;surprise', 'angry;fear', 'angry;fear;neutral', 'angry;fear;surprise', 'angry;happy', 'angry;neutral;surprise', 'angry;sad', 'angry;surprise', 'disgust;fear', 'disgust;happy','disgust;neutral;surprise', 'disgust;sad', 'disgust;surprise', 'fear;neutral;surprise', 'fear;sad', 'fear;surprise', 'happy;neutral;sad', 'neutral;sad;surprise', 'sad;surprise'}


In [7]:
encode_dict = {'angry':0, 'disgust':1, 'fear':2,'happy':3,'neutral':4, 'sad':5, 'surprise':6,  
               'neutral;surprise': 450, 'neutral;sad': 460, 'happy;neutral': 340, 
               'angry;neutral': 40, 'disgust;neutral': 140, 'fear;neutral': 240, 
               'happy;surprise': 350, 'angry;happy;neutral': 7340, 'angry;disgust': 10, 
               'happy;neutral;surprise': 3450, 'fear;happy': 230,'fear;happy;neutral': 2340,
               'angry;disgust;neutral': 7140, 'disgust;neutral;sad': 1460, 
               'happy;sad': 360, 'disgust;happy;neutral': 3410, 'angry;fear': 20, 'angry;fear;neutral':7240,
               'angry;fear;surprise': 7250, 'angry;happy': 730, 'angry;neutral;surprise':7450, 
               'angry;sad': 60, 'angry;surprise': 50, 'disgust;fear':120, 'disgust;happy': 130,
               'disgust;neutral;surprise':1450, 'disgust;sad': 160, 'disgust;surprise':150, 
               'fear;neutral;surprise':2450, 'fear;sad': 260, 'fear;surprise':250, 'happy;neutral;sad':3460,
               'neutral;sad;surprise':4560, 'sad;surprise': 560,
               'angry;disgust;fear;neutral;sad': 10000,'angry;disgust;fear;neutral;surprise':20000}

In [8]:
encode_dict = {k: v for k, v in sorted(encode_dict.items(), key=lambda item: item[1])}
decode_dict = {b:i for i, b in encode_dict.items()}
print(encode_dict, '\n', decode_dict)

{'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6, 'angry;disgust': 10, 'angry;fear': 20, 'angry;neutral': 40, 'angry;surprise': 50, 'angry;sad': 60, 'disgust;fear': 120, 'disgust;happy': 130, 'disgust;neutral': 140, 'disgust;surprise': 150, 'disgust;sad': 160, 'fear;happy': 230, 'fear;neutral': 240, 'fear;surprise': 250, 'fear;sad': 260, 'happy;neutral': 340, 'happy;surprise': 350, 'happy;sad': 360, 'neutral;surprise': 450, 'neutral;sad': 460, 'sad;surprise': 560, 'angry;happy': 730, 'disgust;neutral;surprise': 1450, 'disgust;neutral;sad': 1460, 'fear;happy;neutral': 2340, 'fear;neutral;surprise': 2450, 'disgust;happy;neutral': 3410, 'happy;neutral;surprise': 3450, 'happy;neutral;sad': 3460, 'neutral;sad;surprise': 4560, 'angry;disgust;neutral': 7140, 'angry;fear;neutral': 7240, 'angry;fear;surprise': 7250, 'angry;happy;neutral': 7340, 'angry;neutral;surprise': 7450, 'angry;disgust;fear;neutral;sad': 10000, 'angry;disgust;fear;neutral;surprise': 2

## Annotation Encoding

In [10]:
kemdy19_annot.Emotion = list(kemdy19_annot.Emotion.map(encode_dict))
kemdy20_annot.Emotion = list(kemdy20_annot.Emotion.map(encode_dict))
kemdy19_annot.Emotion[:3], kemdy20_annot.Emotion[:3]


(0    6
 1    2
 2    0
 Name: Emotion, dtype: int64,
 0    4
 1    4
 2    4
 Name: Emotion, dtype: int64)

# EDA, TEMP Padding

In [11]:
import math
def add_padding(pd_series, length = 50):
    if isinstance(pd_series, float):
        if math.isnan(pd_series):
            return np.zeros(10)
    if len(pd_series) < length:
        pd_series = np.concatenate([pd_series, np.zeros(length - len(pd_series))])
        return np.array(pd_series)
    elif len(pd_series) == length:
        return np.array(pd_series)
    elif len(pd_series) > length:
        pd_series = pd_series[:length]
        return np.array(pd_series)

In [13]:
kemdy19_annot['Scaled EDA'] = kemdy19_annot['Scaled EDA'].apply(add_padding)
kemdy20_annot['Scaled EDA'] = kemdy20_annot['Scaled EDA'].apply(add_padding)
kemdy19_annot['Scaled TEMP'] = kemdy19_annot['Scaled TEMP'].apply(add_padding)
kemdy20_annot['Scaled TEMP'] = kemdy20_annot['Scaled TEMP'].apply(add_padding)
# check
kemdy20_annot['Scaled EDA'][10], kemdy19_annot['Scaled EDA'][10], kemdy20_annot['Scaled TEMP'][3],kemdy19_annot['Scaled TEMP'][15]

(array([-1.37470392, -0.702349  , -0.57776951,  0.36337337,  0.15086029,
         0.23434567,  0.6442029 ,  0.8643034 ,  0.14326694,  0.33301774,
         0.55311825, -0.16032486, -0.10719511,  0.09773054,  0.10531797,
         0.12809209,  0.40890977,  0.23435159, -0.29694591, -0.36525051,
        -0.25140359, -0.46391666, -0.14514408,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]),
 array([-0.21123698, -0.18926761, -0.18665215, -0.1861293 , -0.1861293 ,
        -0.18874476, -0.20757542, -0.25517533, -0.29074444, -0.29179054,
        -0.30434459, -0.24105233, -0.24628284, -0.48480603, -0.52822112,
        -0.60860162, -0.77337071, -0.85676544, -0

In [14]:
length_wav = 0
length_txt = 0
for i,j in zip(kemdy19_emb[0], kemdy19_emb[1]):
    length_wav += len(kemdy19_emb[0][i])
    length_txt += len(kemdy19_emb[1][j])
    # lengt += len(i)
print(f'session 19 length: wav - {length_wav}, txt - {length_txt}')

length_wav = 0
length_txt = 0
for i,j in zip(kemdy20_emb[0], kemdy20_emb[1]):
    length_wav += len(kemdy20_emb[0][i])
    length_txt += len(kemdy20_emb[1][j])
    
print(f'session 20 length: wav - {length_wav}, txt - {length_txt}')

session 19 length: wav - 9008, txt - 9008
session 20 length: wav - 12715, txt - 12715


## Session pick 
- test(.2), validation(.2), train(.8)

In [15]:
def choice_and_remove_list(original_list, k = 8):
    removed_new_list = []
    chosen_list = random.sample(original_list, k = k)
    for session in original_list:
        if session in chosen_list:
            pass
        else:
            removed_new_list.append(session) 
    return sorted(removed_new_list), sorted(chosen_list)

In [16]:
# session을 train vs test&val로 나눠줌
session_20_lst = ['Sess0' + str(i+1) if i < 9 else 'Sess' + str(i+1) for i in range(40)]
session_20_lst.remove('Sess12')
session_20_lst.remove('Sess17')
sessions_20_train_lst, sessions_20_test_lst = choice_and_remove_list(session_20_lst, k = 8)
sessions_20_train_lst, sessions_20_val_lst = choice_and_remove_list(sessions_20_train_lst, k = 8)
print(sessions_20_train_lst, sessions_20_test_lst, sessions_20_val_lst, sep = '\n')

session_19_lst = ['Sess0' + str(i+1) if i < 9 else 'Sess' + str(i+1) for i in range(20)]
sessions_19_train_lst, sessions_19_test_lst = choice_and_remove_list(session_19_lst, k = 0)
sessions_19_train_lst, sessions_19_val_lst = choice_and_remove_list(sessions_19_train_lst, k = 0)
print('\n', sessions_19_train_lst,sessions_19_test_lst, sessions_19_val_lst, sep = '\n')

['Sess01', 'Sess03', 'Sess07', 'Sess08', 'Sess09', 'Sess10', 'Sess13', 'Sess15', 'Sess18', 'Sess19', 'Sess20', 'Sess23', 'Sess24', 'Sess26', 'Sess29', 'Sess32', 'Sess35', 'Sess36', 'Sess37', 'Sess38', 'Sess39', 'Sess40']
['Sess02', 'Sess04', 'Sess06', 'Sess14', 'Sess16', 'Sess21', 'Sess28', 'Sess30']
['Sess05', 'Sess11', 'Sess22', 'Sess25', 'Sess27', 'Sess31', 'Sess33', 'Sess34']


['Sess01', 'Sess02', 'Sess03', 'Sess04', 'Sess05', 'Sess06', 'Sess07', 'Sess08', 'Sess09', 'Sess10', 'Sess11', 'Sess12', 'Sess13', 'Sess14', 'Sess15', 'Sess16', 'Sess17', 'Sess18', 'Sess19', 'Sess20']
[]
[]


## data reconstruct
- dict - session - txt, wav, segment_id

In [17]:
kemdy20_emb_new = {}
for session in kemdy20_emb[0].keys():
    kemdy20_emb_new[session] = {}
    kemdy20_emb_new[session]['wav'] = kemdy20_emb[0][session]
    kemdy20_emb_new[session]['txt'] = kemdy20_emb[1][session]
    kemdy20_emb_new[session]['segment_id'] = kemdy20_annot['Segment ID'][kemdy20_annot['Segment ID'].str.startswith(session)]
    # print(session, len(kemdy20_emb_new[session]['wav']), len(kemdy20_emb_new[session]['txt']), len(kemdy20_emb_new[session]['segment_id']))

# print('\n')
kemdy19_emb_new = {}
for session in kemdy19_emb[0].keys():
    kemdy19_emb_new[session] = {}
    kemdy19_emb_new[session]['wav'] = kemdy19_emb[0][session]
    kemdy19_emb_new[session]['txt'] = kemdy19_emb[1][session]
    kemdy19_emb_new[session]['segment_id'] = kemdy19_annot['Segment ID'][kemdy19_annot['Segment ID'].str.startswith(session)]
    # print(session, len(kemdy19_emb_new[session]['wav']), len(kemdy19_emb_new[session]['txt']), len(kemdy19_emb_new[session]['segment_id']))


In [18]:
# 나눠준 세션을 emb(wav, text순), annot set에 적용
def get_data_by_session(data, session_lst):
    if isinstance(data, pd.DataFrame):
        print('dataframe')
        for idx, session in enumerate(session_lst):
            if idx == 0:
                dataframe = data[data['Segment ID'].str.startswith(session)]
            else:
                dataframe = pd.concat([dataframe, data[data['Segment ID'].str.startswith(session)]])
        return dataframe
    
    elif isinstance(data, dict):
        print('dict')
        emb_data = {}
        emb_data['wav'] = []
        emb_data['txt'] = []
        emb_data['segment_id'] = []
        for session in session_lst:
            emb_data['wav'].extend(data[session]['wav'])
            emb_data['txt'].extend(data[session]['txt'])
            emb_data['segment_id'].extend(data[session]['segment_id'])            
        return emb_data

In [19]:
kemdy19_annot_train = get_data_by_session(kemdy19_annot, sessions_19_train_lst)

kemdy20_annot_train = get_data_by_session(kemdy20_annot, sessions_20_train_lst)
kemdy20_annot_test = get_data_by_session(kemdy20_annot, sessions_20_test_lst)
kemdy20_annot_val = get_data_by_session(kemdy20_annot, sessions_20_val_lst)

dataframe
dataframe
dataframe
dataframe


In [20]:
# session 정보가 없어집니다. 
kemdy19_emb_train = get_data_by_session(kemdy19_emb_new, sessions_19_train_lst)

kemdy20_emb_train = get_data_by_session(kemdy20_emb_new, sessions_20_train_lst)
kemdy20_emb_test = get_data_by_session(kemdy20_emb_new, sessions_20_test_lst)
kemdy20_emb_val = get_data_by_session(kemdy20_emb_new, sessions_20_val_lst)

dict
dict
dict
dict


In [22]:
# kemdy 20 - 12715가 정상
print(len(kemdy20_emb_train['wav']), len(kemdy20_emb_train['txt']), len(kemdy20_emb_train['segment_id']))
print(len(kemdy20_emb_test['wav']), len(kemdy20_emb_test['txt']), len(kemdy20_emb_test['segment_id']))
print(len(kemdy20_emb_val['wav']), len(kemdy20_emb_val['txt']), len(kemdy20_emb_val['segment_id']))
print(len(kemdy20_emb_train['wav']) + len(kemdy20_emb_test['wav'])+ len(kemdy20_emb_val['wav']))

7427 7427 7427
2570 2570 2570
2718 2718 2718
12715


## Train Data neutral pick하기

In [25]:
# 각 데이터 셋에서 몇개 뽑아야 되는지 계산 neutral: 4

target_neutral_num = Counter(kemdy20_annot_train['Emotion'])[4]

target_neutral_num_19 = 0
target_neutral_num_20 = target_neutral_num - target_neutral_num_19
print(f'kemdy19에서 {target_neutral_num_19}개, kemdy20에서 {target_neutral_num_20}개 추출')

kemdy19에서 0개, kemdy20에서 6134개 추출


In [26]:
# 각 train dataset에서 뽑아야 되는 갯수만큼 랜덤으로 뽑아옴
kemdy19_annot_train_not_neut = kemdy19_annot_train[kemdy19_annot_train['Emotion'] != 4]
kemdy20_annot_train_not_neut = kemdy20_annot_train[kemdy20_annot_train['Emotion'] != 4]

kemdy19_annot_train_neut = kemdy19_annot_train[kemdy19_annot_train['Emotion'] == 4].sample(target_neutral_num_19)
kemdy20_annot_train_neut = kemdy20_annot_train[kemdy20_annot_train['Emotion'] == 4].sample(target_neutral_num_20)
len(kemdy19_annot_train_not_neut) + len(kemdy19_annot_train_neut) + len(kemdy20_annot_train_not_neut) + len(kemdy20_annot_train_neut)

12964

## embedding, Test and validation dataset 합치기

In [27]:
emb_test_final = {}
emb_test_final['wav'] = []
emb_test_final['txt'] = []
emb_test_final['segment_id'] = []

# emb_test_final['wav'] = kemdy19_emb_test['wav']
emb_test_final['wav'].extend(kemdy20_emb_test['wav'])

# emb_test_final['txt'] = kemdy19_emb_test['txt']
emb_test_final['txt'].extend(kemdy20_emb_test['txt'])

# emb_test_final['segment_id'] = kemdy19_emb_test['segment_id']
emb_test_final['segment_id'].extend(kemdy20_emb_test['segment_id'])


print(len(emb_test_final['wav']), len(emb_test_final['txt']), len(emb_test_final['segment_id']))

2570 2570 2570


In [28]:
emb_val_final = {}
emb_val_final['wav'] = []
emb_val_final['txt'] = []
emb_val_final['segment_id'] = []

# emb_val_final['wav'] = kemdy19_emb_val['wav']
emb_val_final['wav'].extend(kemdy20_emb_val['wav'])

# emb_val_final['txt'] = kemdy19_emb_val['txt']
emb_val_final['txt'].extend(kemdy20_emb_val['txt'])

# emb_val_final['segment_id'] = kemdy19_emb_val['segment_id']
emb_val_final['segment_id'].extend(kemdy20_emb_val['segment_id'])

print(len(emb_val_final['wav']), len(emb_val_final['txt']), len(emb_val_final['segment_id']))

2718 2718 2718


In [29]:
annot_train_final = pd.concat([kemdy19_annot_train_neut, kemdy20_annot_train_neut, kemdy19_annot_train_not_neut, kemdy20_annot_train_not_neut])

annot_test_final = pd.concat([kemdy20_annot_test])
annot_val_final = pd.concat([kemdy20_annot_val])

annot_train_final.reset_index(drop=True, inplace=True)
annot_test_final.reset_index(drop=True, inplace=True)
annot_val_final.reset_index(drop=True, inplace=True)

print(len(annot_train_final), len(annot_test_final), len(annot_val_final))

12964 2570 2718


In [30]:
emb_train_final = {}
emb_train_final['wav'] = []
emb_train_final['txt'] = []
emb_train_final['segment_id'] = []
for segment_annot_id in kemdy19_annot_train_neut['Segment ID']:
    for wav, txt, segment_emb_id in zip(kemdy19_emb_train['wav'], kemdy19_emb_train['txt'], kemdy19_emb_train['segment_id']):
        if segment_annot_id == segment_emb_id:
            emb_train_final['wav'].append(wav)
            emb_train_final['txt'].append(txt)
            emb_train_final['segment_id'].append(segment_emb_id)
            
for segment_annot_id in kemdy19_annot_train_not_neut['Segment ID']:
    for wav, txt, segment_emb_id in zip(kemdy19_emb_train['wav'], kemdy19_emb_train['txt'], kemdy19_emb_train['segment_id']):
        if segment_annot_id == segment_emb_id:
            emb_train_final['wav'].append(wav)
            emb_train_final['txt'].append(txt)
            emb_train_final['segment_id'].append(segment_emb_id)
        

for segment_annot_id in kemdy20_annot_train_neut['Segment ID']:
    for wav, txt, segment_emb_id in zip(kemdy20_emb_train['wav'], kemdy20_emb_train['txt'], kemdy20_emb_train['segment_id']):
        if segment_emb_id == segment_annot_id:
            emb_train_final['wav'].append(wav)
            emb_train_final['txt'].append(txt)
            emb_train_final['segment_id'].append(segment_emb_id)
        
for segment_annot_id in kemdy20_annot_train_not_neut['Segment ID']:
    for wav, txt, segment_emb_id in zip(kemdy20_emb_train['wav'], kemdy20_emb_train['txt'], kemdy20_emb_train['segment_id']):
        if segment_annot_id == segment_emb_id:
            emb_train_final['wav'].append(wav)
            emb_train_final['txt'].append(txt)
            emb_train_final['segment_id'].append(segment_emb_id)
            
print(len(emb_train_final['wav']), len(emb_train_final['txt']), len(emb_train_final['segment_id']))

12964 12964 12964


# torch dataset 만들기
- 참고: https://tutorials.pytorch.kr/beginner/basics/data_tutorial.html

In [31]:
torch.set_default_dtype(torch.float32)
print(torch.get_default_dtype())

torch.float32


In [32]:
class EtriDataset(Dataset):
    def __init__(self, file_names, 
                 text_embeddings, 
                 wav_embeddings, 
                 Temp,
                 EDA,
                 Emotion,
                 Emotion_vec, 
                 Arousal, 
                 Valence):
        self.file_names = file_names
        self.text_embeddings = text_embeddings
        self.wav_embeddings = wav_embeddings
        self.temp = Temp
        self.eda = EDA
        self.label_emotion = Emotion
        self.label_emotion_vec = Emotion_vec
        self.label_arousal = Arousal
        self.label_valence = Valence
        
    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        text_embeddings = self.text_embeddings[idx]
        wav_embeddings = self.wav_embeddings[idx]
        temp = self.temp[idx]
        eda = self.eda[idx]
        label_emotion = self.label_emotion[idx]
        label_emotion_ext = self.label_emotion_vec[idx]
        label_arousal = self.label_arousal[idx]
        label_valence = self.label_valence[idx]
        return text_embeddings, wav_embeddings, temp, eda, label_emotion, label_emotion_ext, label_arousal, label_valence

In [35]:
# session을 통합시킨 데이터 셋을 만들었을 때
dataset_train = EtriDataset(file_names = annot_train_final['Segment ID'],
                      text_embeddings = torch.stack(emb_train_final['txt']),
                      wav_embeddings = torch.stack(emb_train_final['wav']),
                      Emotion = annot_train_final['Emotion'],
                      Arousal = annot_train_final['Arousal'],
                      Valence = annot_train_final['Valence'],
                      EDA = torch.Tensor(annot_train_final['Scaled EDA']), 
                      Temp = torch.Tensor(annot_train_final['Scaled TEMP']), 
                      Emotion_vec = torch.Tensor(annot_train_final['emotion_vector'])) 


dataset_test = EtriDataset(file_names = annot_test_final['Segment ID'],
                      text_embeddings = torch.stack(emb_test_final['txt']),
                      wav_embeddings = torch.stack(emb_test_final['wav']),
                      Emotion = annot_test_final['Emotion'],
                      Arousal = annot_test_final['Arousal'],
                      Valence = annot_test_final['Valence'],
                      EDA = torch.Tensor(annot_test_final['Scaled EDA']), 
                      Temp = torch.Tensor(annot_test_final['Scaled TEMP']), 
                      Emotion_vec = torch.Tensor(annot_test_final['emotion_vector']))

dataset_val = EtriDataset(file_names = annot_val_final['Segment ID'],
                      text_embeddings = torch.stack(emb_val_final['txt']),
                      wav_embeddings = torch.stack(emb_val_final['wav']),
                      Emotion = annot_val_final['Emotion'],
                      Arousal = annot_val_final['Arousal'],
                      Valence = annot_val_final['Valence'],
                      EDA = torch.Tensor(annot_val_final['Scaled EDA']), 
                      Temp = torch.Tensor(annot_val_final['Scaled TEMP']), 
                      Emotion_vec = torch.Tensor(annot_val_final['emotion_vector'])) 


  EDA = torch.Tensor(annot_train_final['Scaled EDA']),


In [36]:
print(f"Training Data Size : {len(dataset_train)}")
print(f"Validation Data Size : {len(dataset_val)}")
print(f"Testing Data Size : {len(dataset_test)}")

Training Data Size : 12964
Validation Data Size : 2718
Testing Data Size : 2570


In [37]:
train_dataloader = DataLoader(dataset_train, batch_size=256, shuffle=True, drop_last=True)
validation_dataloader = DataLoader(dataset_val, batch_size=128, shuffle=True, drop_last=True)
test_dataloader = DataLoader(dataset_test, batch_size=128, shuffle=True, drop_last=True)

# NetWork 만들기

In [38]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [39]:
class MLPNetwork_pre(nn.Module):
    def __init__(self, input_length, input_width):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_length*input_width, 768)
        self.gelu1 = nn.GELU()
        self.bn1 = nn.BatchNorm1d(768)
        self.fc2 = nn.Linear(768, 512)
        self.gelu2 = nn.GELU()
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 32)
        self.gelu3 = nn.GELU()
        
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.gelu1(x)
        x = self.bn1(x)
        x = self.fc2(x)
        x = self.gelu2(x)
        x = self.bn2(x)
        x = self.fc3(x)
        output = self.gelu3(x)
        return output

class ConvNetwork_final(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv2d_1 = nn.Conv1d(in_channels = 32, out_channels = 16, kernel_size=8)
        self.leakyrelu_1 = nn.LeakyReLU()
        self.conv2d_2 = nn.Conv1d(in_channels = 16, out_channels = 8, kernel_size=8)
        self.leakyrelu_2 = nn.LeakyReLU()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(144, 64)
        self.leakyrelu_3 = nn.LeakyReLU()
        self.bn1 = nn.BatchNorm1d(64)
        self.drop1 = nn.Dropout(p=0.25)
        self.fc2 = nn.Linear(64, 7)

    def forward(self, x):
        x = self.conv2d_1(x)
        x = self.leakyrelu_1(x)
        x = self.conv2d_2(x)
        x = self.leakyrelu_2(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.leakyrelu_3(x)
        x = self.bn1(x)
        x = self.drop1(x)
        output = self.fc2(x)
        return output

In [40]:
class TensorFusionMixer(nn.Module):
    def __init__(self, ModelA, ModelB, ModelC):
        super().__init__()
        self.ModelA = ModelA
        self.ModelB = ModelB
        self.Model_cnn_final = ModelC
        self.softmax = nn.Softmax(dim=1)

    def tensor_fusion(self, batch_arr1, batch_arr2):
        fusion_matrix_lst = []
        for i, (arr1, arr2) in enumerate(zip(batch_arr1, batch_arr2)):
            
            arr1 = arr1.unsqueeze(-1)
            arr2 = arr2.unsqueeze(0)


            kron_matrix = torch.kron(arr2, arr1)
            l, w = kron_matrix.shape

            kron_matrix = kron_matrix.view(-1, l, w)
            fusion_matrix_lst.append(kron_matrix)

        fusion_matrix = torch.concat(fusion_matrix_lst)

        return fusion_matrix
    
    def forward(self, x1, x2):
            x1 = self.ModelA(x1)
            x2 = self.ModelB(x2)
            fusion_matrix = self.tensor_fusion(x1, x2)
            x = self.Model_cnn_final(fusion_matrix) 
            output = self.softmax(x)
            return output

In [41]:
txt_input_length, txt_input_width = torch.Tensor(emb_train_final['txt'][0]).shape
wav_input_length, wav_input_width = torch.Tensor(emb_train_final['wav'][0]).shape

# tf_mixer에 들어갈 wav mlp, txt mlp 선언
model_mlp_txt = MLPNetwork_pre(txt_input_length,txt_input_width).to(device)
model_mlp_wav = MLPNetwork_pre(wav_input_length,wav_input_width).to(device)

model_cnn_final = ConvNetwork_final().to(device)

# 최종 모델 선언
model_tf_cnn_mixer = TensorFusionMixer(ModelA = model_mlp_txt, 
                                   ModelB = model_mlp_wav,
                                   ModelC = model_cnn_final).to(device)

# model 병렬 학습 처리
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model_mlp_txt = nn.DataParallel(model_mlp_txt).to(device)
    model_mlp_wav = nn.DataParallel(model_mlp_wav).to(device)
    model_tf_cnn_mixer = nn.DataParallel(model_tf_cnn_mixer).to(device)

print(model_tf_cnn_mixer)

80 768 149 1024
Let's use 4 GPUs!
DataParallel(
  (module): TensorFusionMixer(
    (ModelA): MLPNetwork_pre(
      (flatten): Flatten(start_dim=1, end_dim=-1)
      (fc1): Linear(in_features=61440, out_features=768, bias=True)
      (gelu1): GELU(approximate='none')
      (bn1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc2): Linear(in_features=768, out_features=512, bias=True)
      (gelu2): GELU(approximate='none')
      (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc3): Linear(in_features=512, out_features=32, bias=True)
      (gelu3): GELU(approximate='none')
    )
    (ModelB): MLPNetwork_pre(
      (flatten): Flatten(start_dim=1, end_dim=-1)
      (fc1): Linear(in_features=152576, out_features=768, bias=True)
      (gelu1): GELU(approximate='none')
      (bn1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc2): Linear(in_features=768, out_feat

# 학습을 위한 train, test method 만들기

In [42]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)    
    # data 순서: text_embeddings, wav_embeddings, temp, eda, label_emotion, label_emotion_ext, label_arousal, label_valence
    for batch, (X_txt, X_wav, _, _, 
                    label_emotion, label_emotion_vec, label_arousal, label_valence) in enumerate(dataloader): 
        y = label_emotion_vec # 라벨을 변경하고자 하면 이 변수만 바꿔주면 나머지는 y로 적용
        # 예측 오류 계산 
        X_txt, X_wav, y= X_txt.to(device), X_wav.to(device), y.type(torch.float32).to(device)
        
        
        pred = model(X_txt, X_wav)
        y = F.softmax(y, dim = 1)
        
        loss = loss_fn(pred, y)

        # 역전파
        optimizer.zero_grad()
        loss.mean().backward() 
        optimizer.step()
        
        if batch % 100 == 0:
            loss, current = loss.mean().mean().item(), batch * len(X_txt) 
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        

In [43]:
multi_label_threshold = 0.080 # 두 개의 softmax 차이값 어디까지를 multilabel로 볼 것인가.
def cal_multiple_class(probs, threshold = multi_label_threshold):
    values = probs.topk(2)
    pred_list = []
    # print(values)
    diffs = abs(torch.diff(values.values))
    for idx,diff in zip(values.indices, diffs):
        if diff <= threshold:
            sorted_label, idx = torch.sort(idx)
            if sorted_label[0] == 0:
                pred_list.append(100*7 + 10*sorted_label[1].item())
            else:
                pred_list.append(100*sorted_label[0].item() + 10*sorted_label[1].item())
        else:
            pred_list.append(idx[0].item())
    return torch.Tensor(pred_list).to(device), probs.argmax(1)

In [52]:
from copy import deepcopy
from sklearn.metrics import f1_score as f1_skearn

def test(dataloader, model, loss_fn, mode = 'test'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    
    f1 = F1Score(task= 'multiclass', num_classes=37).to(device)   
    precision = MulticlassPrecision(num_classes=37)
    preds = []
    targets = []
    
    with torch.no_grad():
        # data 순서: text_embeddings, wav_embeddings, temp, eda, label_emotion, label_emotion_ext, label_arousal, label_valence
        for batch, (X_txt, X_wav, X_temp, X_eda, 
                        label_emotion, label_emotion_vec, label_arousal, label_valence) in enumerate(dataloader): 
            y = label_emotion_vec # 라벨을 변경하고자 하면 이 변수만 바꿔주면 나머지는 y로 적용
            
            # 예측 오류 계산
            X_txt, X_wav, y= X_txt.to(device), X_wav.to(device),y.type(torch.float32).to(device)
            
            
            pred = model(X_txt, X_wav)
            pred_for_acc, _ = cal_multiple_class(pred)
            
            preds.append(pred_for_acc)
            y = F.softmax(y, dim = 1)

            targets.append(label_emotion) 

            test_loss += loss_fn(pred, y).mean().item()
            
            correct += (pred_for_acc == label_emotion.to(device)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    f1_score = f1(torch.cat(preds).to(device), torch.cat(targets).to(device))
    f1_score_weighted = f1_skearn(torch.cat(preds).detach().cpu().numpy(), torch.cat(targets).detach().cpu().numpy(), average= 'weighted')
    
    accuracy = (100*correct)
    
    if mode == 'test':
        print(torch.cat(preds), torch.cat(preds).shape)
        print("f1 score: ", f1_score)
        print(f"Test Error: Accuracy: {(accuracy):>0.1f}%, Avg loss: {test_loss:>8f}\n")
    elif mode == 'val':
        print(f"Validation Error: Accuracy: {(accuracy):>0.1f}%, Avg val loss: {test_loss:>8f} \n")
    
    return f1_score, f1_score_weighted, accuracy, test_loss

# 학습시키기

In [45]:
single_emotion = [0,1,2,3,4,5,6]
total_obs = 0
for i in single_emotion:
    total_obs += Counter(annot_train_final['Emotion'])[i]

print('total (single) obs: ', total_obs)
print(Counter(annot_train_final['Emotion']))

weight_for_class = []
for key, value in sorted(Counter(annot_train_final['Emotion']).items()):
    if key in single_emotion:
        print(f'{key} is in single emotion, {value}')
        weight_for_class.append(1 - (value/total_obs))
            
weight_for_class = torch.Tensor(weight_for_class).type(torch.float16)
weight_for_class

total (single) obs:  12057
Counter({4: 6134, 3: 1800, 0: 1620, 6: 1000, 5: 729, 1: 392, 2: 382, 340: 316, 40: 129, 450: 95, 460: 86, 140: 78, 10: 44, 350: 29, 250: 23, 240: 22, 50: 13, 60: 13, 20: 9, 560: 6, 7140: 5, 7340: 3, 260: 3, 150: 3, 730: 3, 7240: 3, 120: 3, 1450: 2, 230: 2, 4560: 2, 3450: 2, 1460: 2, 360: 2, 7250: 1, 7450: 1, 2450: 1, 3460: 1, 2340: 1, 130: 1, 20000: 1, 160: 1, 10000: 1})
0 is in single emotion, 1620
1 is in single emotion, 392
2 is in single emotion, 382
3 is in single emotion, 1800
4 is in single emotion, 6134
5 is in single emotion, 729
6 is in single emotion, 1000


tensor([0.8657, 0.9673, 0.9683, 0.8506, 0.4912, 0.9395, 0.9170],
       dtype=torch.float16)

In [46]:
class weighted_MSELoss(nn.Module):
    def __init__(self, weight):
        super().__init__()
        self.weight = weight.to(device)
    def forward(self,inputs,targets):
        return ((inputs - targets)**2) * self.weight

In [48]:
lr = 1e-4
loss_fn = weighted_MSELoss(weight = weight_for_class).to(device) 
optimizer = optim.Adam(model_tf_cnn_mixer.parameters(), lr=lr, weight_decay= 0.0012)

## start training mlp fusion mixer

In [50]:
#wandb init
epochs = 15

wandb.init(
    # set the wandb project where this run will be logged
    project="ETRI-kyungho-multiclassification_NOTS",
    name = f'Experiment Name(kyungho)',
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "CNN Tensor Fusion Mixer",
    "dataset": "ETRI Kemdy20",
    "epochs": epochs,
    "Optimizer": optimizer.__class__.__name__,
    "Loss": loss_fn.__class__.__name__,
    "multi label threshold": multi_label_threshold,
    }
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkhk172216[0m ([33mtoez[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [54]:
# Set the Training Parameters
from copy import deepcopy
loss_list = []
acc_list = []
best_acc = 0
best_f1 = 0
best_f1_weighted = 0
best_acc_model = None 
best_f1_model = None

for epoch in range(epochs):
    print(f"---------------Epoch {epoch+1}----------------")
    train(train_dataloader, model_tf_cnn_mixer, loss_fn, optimizer)
    f1_score, f1_score_weighted, accuracy, loss = test(validation_dataloader, model_tf_cnn_mixer, loss_fn, mode = 'val')
    if accuracy > best_acc:
        best_acc = accuracy
        best_acc_model = deepcopy(model_tf_cnn_mixer)
        print('best_acc:', best_acc)
    if f1_score > best_f1:
        best_f1 = f1_score
        best_f1_model = deepcopy(model_tf_cnn_mixer)
        print('best_f1:', best_f1)
        
    if f1_score_weighted > best_f1_weighted:
        best_f1_weighted = f1_score_weighted
        best_f1_weighted_model = deepcopy(model_tf_cnn_mixer)
        print('best_f1:', best_f1_weighted)
        
    loss_list.append(loss)
    acc_list.append(accuracy)
    wandb.log({'accuracy': accuracy, 'loss': loss, 'f1 score': f1_score})
wandb.finish()
print("Done!", f'best f1_score: {best_f1}, f1_weighted {best_f1_weighted} | best accuracy: {best_acc}')

---------------Epoch 1----------------
loss: 0.060106  [    0/12964]
Validation Error: Accuracy: 52.8%, Avg val loss: 0.042178 

best_acc: 52.75938189845475
best_f1: tensor(0.5335, device='cuda:0')
best_f1: 0.4692226352321508
---------------Epoch 2----------------
loss: 0.054037  [    0/12964]
Validation Error: Accuracy: 61.7%, Avg val loss: 0.035416 

best_acc: 61.73657100809419
best_f1: tensor(0.6243, device='cuda:0')
best_f1: 0.5932512154031657
---------------Epoch 3----------------
loss: 0.055136  [    0/12964]
Validation Error: Accuracy: 32.3%, Avg val loss: 0.049552 

---------------Epoch 4----------------
loss: 0.052756  [    0/12964]
Validation Error: Accuracy: 72.7%, Avg val loss: 0.031164 

best_acc: 72.70051508462105
best_f1: tensor(0.7351, device='cuda:0')
best_f1: 0.7567662664952928
---------------Epoch 5----------------
loss: 0.055373  [    0/12964]
Validation Error: Accuracy: 36.3%, Avg val loss: 0.047419 

---------------Epoch 6----------------
loss: 0.051534  [    0/12

0,1
accuracy,▅▇▃█▃▂▄▅▄▁▃▂▁▂▄
f1 score,▅▇▃█▃▂▄▅▄▁▃▂▁▂▄
loss,▄▂▅▁▅▆▄▃▄█▅█▇▇▅

0,1
accuracy,42.89919
f1 score,0.43378
loss,0.04731


Done! best f1_score: 0.7351190447807312, f1_weighted 0.7567662664952928 | best accuracy: 72.70051508462105


In [55]:
test(test_dataloader, model_tf_cnn_mixer, loss_fn, mode = 'test')

tensor([  4.,   0.,   4.,  ...,   4.,   4., 460.], device='cuda:0') torch.Size([2560])
f1 score:  tensor(0.4059, device='cuda:0')
Test Error: Accuracy: 40.4%, Avg loss: 0.051221



(tensor(0.4059, device='cuda:0'),
 0.3043544289798236,
 40.42801556420233,
 0.0512214444577694)