### Demo for the MMML for Human Behavior Understanding
Author: Xinyu Li

ID: 2278619

Date: July/2022

Project Name: Multimodal Machine Learning for Human Behavior Understanding

This repository is used to show the programming implementation of the MSc Individual Project (EESE MSc 2022 Summer) at the School of Engineering, University of Birmingham.

Before running: install all required packages with !pip, or manually attach the ipykernel to a specific Python environment (venv).
The full install table is given in requirement.txt.

In [1]:
'''import section'''
import os
import json
# !pip install pymrmr
import pymrmr
import numpy as np
import pandas as pd
from smart_open import smart_open
from keras import metrics
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Concatenate
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.utils import plot_model

# !pip install tensorflow
# !pip install tensorflow-estimator
# from autoencoder import AutoEncoder

##### Loading the config file

In [2]:
# Read the dataset address table. The `with` block closes the file handle
# as soon as the JSON has been parsed (the bare `open(...)` never closed it).
with open('./config/address.json', 'r') as address_file:
    address_config = json.load(address_file)

# Paths of the pre-aligned, pre-processed feature files.
# NOTE: the key is spelled 'prepocessed_feature' in the config; keep it in sync with the JSON.
out_address = address_config['prepocessed_feature']['pre-aligned']

# Quick sanity check that the two training feature files are present on disk.
print(os.path.isfile(out_address['train_Audio']))
print(os.path.isfile(out_address['train_Visual']))
out_address['train_Visual']

True
True


'./dataset/prepocessed_feature/V_data_train.csv'

##### Loading the preprocessed data
For audio data:          
MFCC(39) + MFCC Velocity(39) + MFCC Acceleration(39) + 1 = 118

For visual data:              
Facial Landmarks_x(68) + Facial Landmarks_y(68) + eye gaze(6) + head pose(6) + AU_r(17) + AU_c(18) + 1 = 184 

In [3]:
# Every key of `out_address` that this cell reads. All files are checked up front so a
# missing split is reported clearly instead of failing half-way through the loading.
required_keys = [
    'train_Audio', 'dev_Audio', 'test_Audio',
    'train_Visual', 'dev_Visual', 'test_Visual',
    'train_label', 'train_instance', 'dev_label', 'dev_instance',
]
missing_files = [out_address[key] for key in required_keys
                 if not os.path.isfile(out_address[key])]
if missing_files:
    # The original `else` branch printed the *success* message when the files were absent,
    # and the following cells then failed with a NameError. Fail here, with the real cause.
    raise FileNotFoundError(
        "Pre-processed feature files not found: " + ", ".join(missing_files))

print("\nPre-processed data detected, start loading the features ...")
# The CSV files carry no header row, hence header=None.
train_Audio = pd.read_csv(out_address['train_Audio'], header=None)
dev_Audio = pd.read_csv(out_address['dev_Audio'], header=None)
test_Audio = pd.read_csv(out_address['test_Audio'], header=None)
# low_memory=False makes pandas read each visual file in one pass (no chunked dtype inference).
train_Visual = pd.read_csv(out_address['train_Visual'], header=None, low_memory=False)
dev_Visual = pd.read_csv(out_address['dev_Visual'], header=None, low_memory=False)
test_Visual = pd.read_csv(out_address['test_Visual'], header=None, low_memory=False)
train_label = pd.read_csv(out_address['train_label'], header=None)
train_instance = pd.read_csv(out_address['train_instance'], header=None)
dev_label = pd.read_csv(out_address['dev_label'], header=None)
dev_instance = pd.read_csv(out_address['dev_instance'], header=None)

print("===" * 15)
print("Size of train_Audio", train_Audio.shape)
print("Size of train_Visual", train_Visual.shape)
print("Size of dev_Audio", dev_Audio.shape)
print("Size of dev_Visual", dev_Visual.shape)
print("Size of test_Audio", test_Audio.shape)
print("Size of test_Visual", test_Visual.shape)
print("---" * 15)
# Labels and instance ids are reported transposed.
print("Size of train_label", train_label.T.shape)
print("Size of dev_label", dev_label.T.shape)
print("Size of train_instance", train_instance.T.shape)
print("Size of dev_instance", dev_instance.T.shape)
print("===" * 15)
print("Successfully loading all pre-preproposed features.")


Pre-processed data detected, start loading the features ...
Size of train_Audio (759575, 118)
Size of train_Visual (759575, 184)
Size of dev_Audio (317104, 118)
Size of dev_Visual (317104, 184)
Size of test_Audio (372734, 118)
Size of test_Visual (372734, 184)
---------------------------------------------
Size of train_label (759576, 1)
Size of dev_label (317105, 1)
Size of train_instance (759576, 1)
Size of dev_instance (317105, 1)


In [4]:
# Strip column 0 (the time-frame index) from every feature table, keeping all rows.
train_Audio, dev_Audio, test_Audio = (
    frame.iloc[:, 1:] for frame in (train_Audio, dev_Audio, test_Audio)
)
train_Visual, dev_Visual, test_Visual = (
    frame.iloc[:, 1:] for frame in (train_Visual, dev_Visual, test_Visual)
)

##### Loading the model config

In [5]:
# from autoencoder:
# Hyper-parameters of the DDAE, read from the 'DDAE' section of the model config.
# The `with` block closes the file handle once the JSON has been parsed.
with open('./config/model.json', 'r') as model_file:
    model_config = json.load(model_file)['DDAE']

dest_path = model_config['dest_path']          # root directory for per-run model folders
hidden_ratio = model_config['hidden_ratio']    # multiplier applied to the input widths to size hidden layers
learning_rate = model_config['learning_rate']
batch_size = model_config['batch_size']
epochs = model_config['epoch_number']
noise = model_config['noise_rate']
# NOTE(review): `p` and `beta` are loaded but not used in the visible cells.
p = model_config['p']
beta = model_config['beta']

print("\nSuccessfully loading the DDAE configuration.")


Successfully loading the DDAE configuration.


In [6]:
# Initializing the Multi-DDAE model

# Run identifier built from the hyper-parameters; later cells join it onto `dest_path`
# to obtain the folder of this run.
run_fields = ('multimodal_aligned_mfcc', hidden_ratio, batch_size, epochs, noise)
name = '%s_hidden%.2f_batch%d_epoch%d_noise%s' % run_fields

# Input widths of each feature group (the time-frame index column is not counted).
dim_ori_Audio = 117  # 39 * 3
dim_ori_Visual_1, dim_ori_Visual_2, dim_ori_Visual_3, dim_ori_Visual_4 = 136, 6, 6, 35
dim_ori_Visual = sum(
    (dim_ori_Visual_1, dim_ori_Visual_2, dim_ori_Visual_3, dim_ori_Visual_4)
)  # = 183

# Placeholders for the stand-alone decoders.
# NOTE(review): later cells bind `decoder_Video_*`, so the `decoder_Visual_*`
# placeholders below are never replaced in the visible code - confirm the intended name.
decoder_Audio = None
decoder_Visual_1 = decoder_Visual_2 = decoder_Visual_3 = decoder_Visual_4 = None

##### Constructing the input layer
Function called from: Keras Engine Input layer.

Please install Keras==2.2.4 Keras-Applications==1.0.7 keras-metrics==1.1.0 Keras-Preprocessing==1.0.9 

(Source: https://github.com/keras-team/keras).

In [7]:
# Building the Multi-DDAE model

# Folder that holds everything belonging to this run.
model_dir = os.path.join(dest_path, name)

# A run counts as fitted only when its weight file exists. Checking for the folder alone
# is not enough: the folder is created below before any training has happened, so an
# interrupted run would otherwise be mistaken for a trained one.
already_fitted = os.path.isfile(os.path.join(model_dir, 'DDAE.h5'))

# makedirs also creates `dest_path` itself when missing and is a no-op if the folder exists.
os.makedirs(model_dir, exist_ok=True)

# Hidden widths are the input widths scaled by `hidden_ratio`, truncated to int.
# They are computed unconditionally: the previous `hidden_ratio != 1.0` guard left every
# `dim*` name undefined when the ratio was exactly 1.0, and int(x * 1.0) == x anyway.
dim_Audio = int(dim_ori_Audio * hidden_ratio)
dim_V1 = int(dim_ori_Visual_1 * hidden_ratio)
dim_V2 = int(dim_ori_Visual_2 * hidden_ratio)
dim_V3 = int(dim_ori_Visual_3 * hidden_ratio)
dim_V4 = int(dim_ori_Visual_4 * hidden_ratio)
# Width of the shared (fused) representation.
dim = int((dim_ori_Audio + dim_ori_Visual) * hidden_ratio / 4)

# One input tensor per feature group, plus an input for feeding an already-encoded
# representation into the stand-alone decoders.
input_Audio = Input(shape=(dim_ori_Audio, ), name='audio_MFCCs')
input_Video_1 = Input(shape=(dim_ori_Visual_1, ), name='video_landmark')
input_Video_2 = Input(shape=(dim_ori_Visual_2, ), name='video_eyegaze')
input_Video_3 = Input(shape=(dim_ori_Visual_3, ), name='video_headpose')
input_Video_4 = Input(shape=(dim_ori_Visual_4, ), name='video_AU')
encoded_input = Input(shape=(dim, ))

##### Constructing the deep autoencoder
Dense function called from: Core Keras Layers.

Concatenate function called from: Merge Layers. 

Model function called from: Training-related part of the Keras engine.

Please install Keras==2.2.4 Keras-Applications==1.0.7 keras-metrics==1.1.0 Keras-Preprocessing==1.0.9 

(Source: https://github.com/keras-team/keras).

In [8]:
# Keyword arguments shared by all five modality-specific encoder layers.
encoder_kwargs = dict(activation='relu', kernel_initializer='he_uniform')

# --- Encoders: one ReLU layer per input group -------------------------------------------
encode_layer_Audio = Dense(dim_Audio, name='audio_MFCCs_encoded', **encoder_kwargs)(input_Audio)
encode_layer_Video_1 = Dense(dim_V1, name='video_landmark_encoded', **encoder_kwargs)(input_Video_1)
encode_layer_Video_2 = Dense(dim_V2, name='video_eyegaze_encoded', **encoder_kwargs)(input_Video_2)
encode_layer_Video_3 = Dense(dim_V3, name='video_headpose_encoded', **encoder_kwargs)(input_Video_3)
encode_layer_Video_4 = Dense(dim_V4, name='video_AU_encoded', **encoder_kwargs)(input_Video_4)

# --- Fusion: concatenate the five encodings and compress them into one shared code ------
shared_layer = Concatenate(axis=1, name='concat')([
    encode_layer_Audio,
    encode_layer_Video_1,
    encode_layer_Video_2,
    encode_layer_Video_3,
    encode_layer_Video_4,
])
encoded_layer = Dense(dim, activation='relu', name='shared_repres')(shared_layer)

# --- Decoders, hidden stage: every branch starts from the shared code -------------------
hidden_Audio = Dense(dim_Audio, activation='relu', name='audio_MFCCs_decoded')(encoded_layer)
hidden_Video_1 = Dense(dim_V1, activation='relu', name='video_landmark_decoded')(encoded_layer)
hidden_Video_2 = Dense(dim_V2, activation='relu', name='video_eyegaze_decoded')(encoded_layer)
hidden_Video_3 = Dense(dim_V3, activation='relu', name='video_headpose_decoded')(encoded_layer)
hidden_Video_4 = Dense(dim_V4, activation='relu', name='video_AU_decoded')(encoded_layer)

# --- Decoders, output stage: linear layers back at the original input widths ------------
decode_layer_Audio = Dense(dim_ori_Audio, activation='linear',
                           name='audio_MFCCs_reconstructed')(hidden_Audio)
decode_layer_Video_1 = Dense(dim_ori_Visual_1, activation='linear',
                             name='video_landmark_reconstructed')(hidden_Video_1)
decode_layer_Video_2 = Dense(dim_ori_Visual_2, activation='linear',
                             name='video_eyegaze_reconstructed')(hidden_Video_2)
decode_layer_Video_3 = Dense(dim_ori_Visual_3, activation='linear',
                             name='video_headpose_reconstructed')(hidden_Video_3)
decode_layer_Video_4 = Dense(dim_ori_Visual_4, activation='linear',
                             name='video_AU_reconstructed')(hidden_Video_4)

In [9]:
# The encoder / fusion / decoder layers are built once, in the cell above. This cell used
# to re-create every one of them with identical arguments, which orphaned the first set;
# it now only wraps the existing tensors into models.

# Training model of the deep autoencoder
model_inputs = [input_Audio,
                input_Video_1, input_Video_2, input_Video_3, input_Video_4]
deep_autoencoder = Model(inputs=model_inputs,
                         outputs=[decode_layer_Audio,
                                  decode_layer_Video_1, decode_layer_Video_2,
                                  decode_layer_Video_3, decode_layer_Video_4])

# Maps the five raw inputs to the shared representation.
encoder = Model(inputs=model_inputs, outputs=encoded_layer)


def _build_decoder(branch):
    """Return a model that maps `encoded_input` to the reconstruction of one branch.

    `branch` is the layer-name prefix (e.g. 'audio_MFCCs'); the trained
    '<branch>_decoded' and '<branch>_reconstructed' layers of `deep_autoencoder`
    are re-applied to `encoded_input`, so the weights are shared with it.
    """
    hidden = deep_autoencoder.get_layer(branch + '_decoded')(encoded_input)
    output = deep_autoencoder.get_layer(branch + '_reconstructed')(hidden)
    return Model(inputs=encoded_input, outputs=output)


decoder_Audio = _build_decoder('audio_MFCCs')
decoder_Video_1 = _build_decoder('video_landmark')
decoder_Video_2 = _build_decoder('video_eyegaze')
decoder_Video_3 = _build_decoder('video_headpose')
decoder_Video_4 = _build_decoder('video_AU')

# configure model
# two combo ['adam' + 'mse] ['adadelta', 'binary_crossentropy']
# A flat `metrics` list is applied to *every* output, so a single entry already yields one
# MSE per output; listing it five times registered the same metric 25 times.
deep_autoencoder.compile(optimizer='adam',
                         loss='mse',
                         metrics=[metrics.mse],
                         loss_weights=[0.35, 0.35, 0.1, 0.1, 0.1])

# `summary()` prints the table itself and returns None, so it must not be wrapped in
# print() (that is what produced the stray "None" line in the output).
print("===" * 33)
print("Summary of the deep autoencoder: ")
deep_autoencoder.summary()
print("---" * 33)
print("Summary of the encoder: ")
encoder.summary()
print("---" * 33)
print("Summary of the audio decoder: ")
decoder_Audio.summary()
print("===" * 33)

# display the model structure by graphviz:
plot_model(deep_autoencoder, show_shapes=True, to_file=os.path.join(dest_path, name, 'multimodal_DDAE.png'))

Summary of the deep autoencoder: 
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 audio_MFCCs (InputLayer)       [(None, 117)]        0           []                               
                                                                                                  
 video_landmark (InputLayer)    [(None, 136)]        0           []                               
                                                                                                  
 video_eyegaze (InputLayer)     [(None, 6)]          0           []                               
                                                                                                  
 video_headpose (InputLayer)    [(None, 6)]          0           []                               
                                                            

 concat (Concatenate)           (None, 149)          0           ['audio_MFCCs_encoded[0][0]',    
                                                                  'video_landmark_encoded[0][0]', 
                                                                  'video_eyegaze_encoded[0][0]',  
                                                                  'video_headpose_encoded[0][0]', 
                                                                  'video_AU_encoded[0][0]']       
                                                                                                  
 shared_repres (Dense)          (None, 37)           5550        ['concat[0][0]']                 
                                                                                                  
Total params: 22,364
Trainable params: 22,364
Non-trainable params: 0
__________________________________________________________________________________________________
None
----------------------------------

##### Training Stage 

Fit function called from: Training-related part of the Keras engine.

Please install Keras==2.2.4 Keras-Applications==1.0.7 keras-metrics==1.1.0 Keras-Preprocessing==1.0.9 

(Source: https://github.com/keras-team/keras).

In [10]:
def separate_V(visual):
    """Split a visual feature table into its four column groups.

    The groups are taken left to right with widths `dim_ori_Visual_1..4`
    (landmarks, eye gaze, head pose, action units).
    NOTE(review): assumes the CSV columns are stored in exactly that order - confirm
    against the pre-processing script.

    Returns a tuple of four 2-D numpy arrays. Raises ValueError when the number of
    columns does not equal the sum of the four widths.
    """
    values = np.asarray(visual)
    bounds = np.cumsum([0, dim_ori_Visual_1, dim_ori_Visual_2,
                        dim_ori_Visual_3, dim_ori_Visual_4])
    if values.ndim != 2 or values.shape[1] != bounds[-1]:
        raise ValueError("expected a 2-D table with %d visual columns, got shape %s"
                         % (bounds[-1], values.shape))
    return tuple(values[:, lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:]))


def add_noise(data, noise_rate):
    """Return a copy of `data` with additive zero-mean, unit-variance Gaussian noise
    scaled by `noise_rate`.

    NOTE(review): the original helper is not defined in this notebook; additive Gaussian
    noise is an assumption - confirm against the AutoEncoder implementation it came from.
    """
    data = np.asarray(data, dtype=float)
    return data + noise_rate * np.random.normal(loc=0.0, scale=1.0, size=data.shape)


# Weight file of this run: read by the "already fitted" branch, written after training.
weights_path = os.path.join(dest_path, name, 'DDAE.h5')

if already_fitted:
    print("\nExisting model has already fitted. \nOpen model:", name)
    deep_autoencoder.load_weights(weights_path)
    print("\nCompleted loading the weights of fitted model. \nProcessing model:", name)
else:
    # The undefined names used here before (`train_Video`, `dev_Video`, `X_dev_A`) are
    # replaced by the tables that the loading cell actually defines.
    train_Video_1, train_Video_2, train_Video_3, train_Video_4 = separate_V(train_Visual)
    dev_Video_1, dev_Video_2, dev_Video_3, dev_Video_4 = separate_V(dev_Visual)

    # The autoencoder is fitted on train + dev stacked row-wise. New names are used so the
    # loaded tables are not overwritten and re-running the cell cannot stack dev twice.
    fit_Audio = np.vstack((train_Audio, dev_Audio))
    fit_Video_1 = np.vstack((train_Video_1, dev_Video_1))
    fit_Video_2 = np.vstack((train_Video_2, dev_Video_2))
    fit_Video_3 = np.vstack((train_Video_3, dev_Video_3))
    fit_Video_4 = np.vstack((train_Video_4, dev_Video_4))

    # NOTE(review): `noisy` was never defined; here any non-zero `noise` rate switches the
    # denoising inputs on - confirm this matches the intended configuration.
    noisy = bool(noise)
    if noisy:
        fit_Audio_noisy = add_noise(fit_Audio, noise)
        fit_Video_1_noisy = add_noise(fit_Video_1, noise)
        fit_Video_2_noisy = add_noise(fit_Video_2, noise)
        fit_Video_3_noisy = add_noise(fit_Video_3, noise)
        fit_Video_4_noisy = add_noise(fit_Video_4, noise)
    else:
        fit_Audio_noisy = fit_Audio
        fit_Video_1_noisy = fit_Video_1
        fit_Video_2_noisy = fit_Video_2
        fit_Video_3_noisy = fit_Video_3
        fit_Video_4_noisy = fit_Video_4

    # Corrupted inputs must keep the shape of their clean targets.
    assert fit_Audio_noisy.shape == fit_Audio.shape
    assert fit_Video_1_noisy.shape == fit_Video_1.shape
    assert fit_Video_2_noisy.shape == fit_Video_2.shape
    assert fit_Video_3_noisy.shape == fit_Video_3.shape
    assert fit_Video_4_noisy.shape == fit_Video_4.shape

    # save the training procedure to the local logger (logger.csv)
    # These callbacks were commented out while `fit` still referenced `callbacks_list`.
    csv_logger = CSVLogger(os.path.join(dest_path, name, "logger.csv"))
    checkpoint = ModelCheckpoint(
        os.path.join(dest_path, name, "weights-improvement-{epoch:02d}-{loss:.2f}.hdf5"),
        monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [csv_logger, checkpoint]

    # Noisy inputs, clean targets. `epochs` is the name bound by the config cell
    # (`epoch_number` was never defined).
    deep_autoencoder.fit([fit_Audio_noisy,
                          fit_Video_1_noisy, fit_Video_2_noisy,
                          fit_Video_3_noisy, fit_Video_4_noisy],
                         [fit_Audio, fit_Video_1, fit_Video_2,
                          fit_Video_3, fit_Video_4],
                         epochs=epochs,
                         batch_size=batch_size,
                         shuffle=True,
                         callbacks=callbacks_list)

    # Persist the final weights where the "already fitted" branch looks for them.
    deep_autoencoder.save_weights(weights_path)
    print("\nThe training stage is finished, save to: ", name)


Existing model has already fitted. 
Open model: multimodal_aligned_mfcc_hidden0.50_batch1024_epoch100_noise0.1

Completed loading the weights of fitted model. 
Processing model: multimodal_aligned_mfcc_hidden0.50_batch1024_epoch100_noise0.1


##### Check the training results
For the training log-book --- open file: ./pre-trained/DDAE/"model_name"/logger.csv

For the auto-generated test result --- open file: ./pre-trained/DDAE/"model_name"/"model_name"_result.txt

For the graphviz generated network structure --- open file: ./pre-trained/DDAE/"model_name"/multimodal_DDAE.png

Meanwhile, all trained weights are given in the *.npy files.