# Set parameters

In [None]:
# Settings shared by the preprocessing and smoothing cells.
# 'timewindowsize' / 'smoothing_windowsize' are the window lengths handed to
# add_timesteps and smoother; 'sparsing_coefficient' is recomputed per modality
# before it is used, so the value given here is only an initial placeholder.
parameters = dict(
    contextual=True,
    timewindowsize=10,
    smoothing_windowsize=10,
    sparsing_coefficient=24,
    labels_forward_shift=0,
)

# One row per modality: (sub-folder name, number of feature columns, frame rate).
_MODALITY_TABLE = [
    ('audio', 36, 100),
    ('eyes', 6, 50),
    ('face_nn', 100, 50),
    ('kinect', 27, 15),
]
modalities_list = [row[0] for row in _MODALITY_TABLE]
features_num_list = [row[1] for row in _MODALITY_TABLE]
frame_rate_list = [row[2] for row in _MODALITY_TABLE]

# Placeholder slots, filled with one feature DataFrame per modality when the data is read.
X_raw = [None for _ in modalities_list]

# Amount of context in seconds
context = 6

###############################################
###  THIS IS WHAT YOU MIGHT WANT TO CHANGE  ###
###############################################

# Folder holding the test data (one sub-folder per modality plus 'prediction')
folder = '../'
# Folder the generated prediction files are written to.
# The prediction files will NOT replace the original ones (with all zeros)
save_prediction_folder = folder + 'submission_FedotovD/prediction/'

print('Parameters are loaded!')

# Read test data

In [None]:
import os
import pandas as pd
import numpy as np

num_sets = len(modalities_list)

# Features: build one DataFrame per modality whose rows are labelled
# '<file name without its 4-character extension>_<rounded index value>'.
for mod in range(num_sets):
    modality = modalities_list[mod]
    features_num = features_num_list[mod]

    # os.listdir returns entries in arbitrary order; sort so every run reads
    # (and later concatenates) the files in the same order.
    filenames_features = sorted(os.listdir(folder + modality))

    names_list = ['F_' + str(elem) for elem in range(features_num)]

    # Collect the per-file frames and concatenate once: appending inside the
    # loop re-copies all previous rows each time, and DataFrame.append /
    # DataFrame.as_matrix no longer exist in current pandas.
    feature_frames = []
    for filename in filenames_features:
        # NOTE(review): the row labels are taken from temp.index, so the files
        # presumably carry one leading (time) column in addition to the
        # `features_num` named ones -- confirm against the data files.
        temp = pd.read_csv(folder + modality + '/' + filename, names=names_list, header=0)
        feature_frames.append(
            pd.DataFrame(temp.values,
                         index=[filename[:-4] + '_' + str(elem) for elem in np.round(temp.index.values, decimals=2)],
                         columns=temp.columns.values))

    # pd.concat raises on an empty list, so an empty modality folder fails
    # loudly instead of silently reusing the previous modality's frame.
    features = pd.concat(feature_frames)
    X_raw[mod] = features

# Labels: columns 1..6 of every file in 'prediction', labelled the same way
# as the features but using the rounded 'Time' column.
filenames_labels = sorted(os.listdir(folder + 'prediction'))

label_frames = []
for filename in filenames_labels:
    temp = pd.read_csv(folder + 'prediction/' + filename)
    label_frames.append(
        pd.DataFrame(temp.iloc[:, 1:7].values,
                     index=[filename[:-4] + '_' + str(elem) for elem in round(temp.Time, 2)],
                     columns=temp.columns.values[1:7]))
labels = pd.concat(label_frames)

Y_raw = labels

print('modality: (examples, features)\n')
for sets in range(num_sets):
    print(modalities_list[sets] + ': ' + str(X_raw[sets].shape))
print('labels: ' + str(Y_raw.shape))

print('Data is loaded!')

# Preprocessing functions

In [None]:
from sklearn import preprocessing
import os
import sys
import copy

# For every modality, keep the feature row labels that also occur in the label
# frame, in the order they appear in the feature frame.
index_cut = [
    [row_label for row_label in X_raw[modality_idx].index if row_label in Y_raw.index]
    for modality_idx in range(num_sets)
]
    
    
def extract_partition_from_index (indices):
    """Return characters 2..9 of a row label parsed as a hexadecimal integer.

    The result is used as an identifier to decide whether two row labels
    belong to the same partition.
    """
    hex_field = indices[slice(2, 10)]
    return int(hex_field, base=16)

# This code performs the standard data preprocessing
def data_normalizer(dataX, scalerX, imputer=True, features_norm=True, verbose=True):
    """Return an imputed and/or scaled copy of a feature frame.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Feature frame; it is never modified.
    scalerX : object with a ``transform(array)`` method
        Only used when ``features_norm`` is true.
    imputer : bool
        Replace NaN / infinite entries via ``numpy.nan_to_num``.
    features_norm : bool
        Pass the (possibly imputed) values through ``scalerX.transform``.
    verbose : bool
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``dataX``.
    """
    if not (imputer or features_norm):
        # Nothing to compute: hand back an independent copy, as before.
        return copy.deepcopy(dataX)

    # Work on a private copy of the values so the input frame stays untouched
    # even if the scaler transforms in place.
    values = dataX.values.copy()

    # Impute missing values (if any)
    if imputer:
        values = np.nan_to_num(values)

    # Normalize features
    if features_norm:
        values = scalerX.transform(values)

    # Rebuild the frame positionally. The former DataFrame.set_value /
    # as_matrix calls no longer exist in current pandas, and positional
    # construction does not depend on the row labels being unique.
    return pd.DataFrame(values, index=dataX.index, columns=dataX.columns)

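# The code below transforms the standard data representation (Samples x Features) into a
# time-continuous one (Samples x Timesteps x Features), taking the sparsing coefficient into
# account (see add_timesteps), and smooths the windowed predictions back to one value per sample.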

def smoother (prediction, index, index_cont, smoothing_windowsize, sc, labels_forward_shift):
    """Average all windowed predictions that refer to the same row label.

    Parameters
    ----------
    prediction : 2-D numpy array
        Predicted values, addressed as ``prediction[row, position]``.
    index : sequence
        Row labels, one per output value.
    index_cont : 2-D numpy array
        Same layout as ``prediction``; each cell holds the row label the
        corresponding prediction refers to (or ``None``).
    smoothing_windowsize, sc, labels_forward_shift : int
        Their product (with ``labels_forward_shift`` clamped to at least 1)
        is the number of rows searched on either side of row ``i``.

    Returns
    -------
    1-D numpy array of length ``len(index)``. Entry ``i`` is the mean of every
    prediction cell whose label equals ``index[i]`` inside the search range,
    or NaN when no cell matches.
    """
    prediction_smooth = np.zeros((len(index)))
    # The search radius does not depend on i, so compute it once.
    reach = smoothing_windowsize*sc * max(1,labels_forward_shift)

    for i in range(0,len(index)):
        lower_bound = max(0, i - reach)
        upper_bound = i + reach
        # np.where returns *paired* (row, column) coordinates of the matches.
        rows, cols = np.where(index_cont[lower_bound:upper_bound] == index[i])
        if rows.size:
            # Use each matching cell exactly once. Looping over every
            # row/column combination (as was done before) also averaged in
            # cells that belong to other labels.
            prediction_smooth[i] = prediction[lower_bound + rows, cols].mean()
        else:
            # No prediction covers this label: mark it explicitly as missing.
            prediction_smooth[i] = np.nan

    return prediction_smooth

# Filling arrays with time-continuous data
def add_timesteps (X, Y, index, timewindowX, timewindowY, sc, labels_forward_shift):
    """Build strided feature windows and the matching window of row labels.

    Parameters
    ----------
    X : 2-D numpy array (samples x features)
    Y : array; only its number of rows (``Y.shape[0]``) is used
    index : sequence of row labels, parallel to the rows of ``X``
    timewindowX, timewindowY : int, window lengths for features / labels
    sc : int, sparsing coefficient (stride between consecutive window steps)
    labels_forward_shift : shift applied to the label window

    Returns
    -------
    X_cont : array of shape (samples, timewindowX, features); step ``j`` of
        sample ``i`` is row ``i - sc*(timewindowX - 1 - j)`` of ``X``, left at
        zero when that row does not exist or lies in another partition.
    Y_cont_index : object array (``Y.shape[0]`` x timewindowY) holding the row
        label each label-window step refers to, or ``None``.
    """
    c = 1 - labels_forward_shift
    n_samples, n_features = X.shape[0], X.shape[1]
    n_label_rows = Y.shape[0]

    # Containers for the time-continuous data
    X_cont = np.zeros((n_samples, timewindowX, n_features))
    Y_cont_index = np.array([([None] * timewindowY)] * n_label_rows)

    for i in range(n_samples):
        # Feature window: ends at sample i and steps back in strides of sc.
        for j in range(timewindowX):
            src = i - sc*timewindowX + sc*j + sc
            if src >= 0 and extract_partition_from_index(index[i]) == extract_partition_from_index(index[src]):
                X_cont[i, j, :] = X[src, :]

        # Label window: 0 ** (1 - c) equals 0 ** labels_forward_shift, i.e. it
        # adds one extra stride only when the shift is zero.
        for j in range(timewindowY):
            pos = i - sc*int(np.floor(timewindowY*c)) + sc*j + sc*int(0 ** (1 - c))
            if not (0 <= pos < n_label_rows):
                continue
            if extract_partition_from_index(index[i]) == extract_partition_from_index(index[pos]):
                Y_cont_index[i, j] = index[pos]

    return X_cont, Y_cont_index

def preprocess_data (X_raw, Y_raw, scalerX_list, index_cut, parameters):
    """Normalize every modality and convert it to windowed form.

    Parameters
    ----------
    X_raw : list of pandas.DataFrame, one feature frame per modality
    Y_raw : pandas.DataFrame of labels
    scalerX_list : list of fitted scalers, one per modality
    index_cut : list of row-label lists, one per modality
    parameters : dict; reads 'timewindowsize', 'smoothing_windowsize' and
        'labels_forward_shift'. As a side effect 'sparsing_coefficient' is
        overwritten for each modality with
        ``int(context * frame_rate / timewindowsize)`` and is left at the
        last modality's value when the function returns.

    Relies on the module-level ``num_sets``, ``context`` and
    ``frame_rate_list``.

    Returns
    -------
    (X, Y_cont_index) : two lists with one entry per modality, as produced by
        ``add_timesteps``.
    """
    X = [None] * num_sets
    Y_cont_index = [None] * num_sets

    print('Preprocessing...')

    for sets in range(num_sets):
        print('Set: ' + str(sets))
        # Only the current modality's normalized frame is kept alive; it is not
        # needed once the windows are built.
        X_norm = data_normalizer(X_raw[sets], scalerX_list[sets])

        parameters['sparsing_coefficient'] = int(context * frame_rate_list[sets] / parameters['timewindowsize'])

        # `.values` replaces DataFrame.as_matrix(), which no longer exists in
        # current pandas.
        X[sets], Y_cont_index[sets] = add_timesteps(X_norm.loc[index_cut[sets]].values,
                                                    Y_raw.loc[index_cut[sets]].values,
                                                    index_cut[sets], parameters['timewindowsize'],
                                                    parameters['smoothing_windowsize'],
                                                    parameters['sparsing_coefficient'],
                                                    parameters['labels_forward_shift'])

    print('Preprocessing is completed!')
    return X, Y_cont_index

print('Preprocessing functions are defined!')

# Preprocess

In [None]:
# sklearn.externals.joblib was removed from newer scikit-learn releases; keep
# using it where it exists and fall back to the standalone joblib package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load scaler files that come from a trusted source.
scalerX_list = [None] * len(modalities_list)
for sets in range(0, len(modalities_list)):
    scalerX_list[sets] = joblib.load('scalerX_' + modalities_list[sets] + '.pkl')

[X, Y_cont_index] = preprocess_data (X_raw, Y_raw, scalerX_list, index_cut, parameters)

# Predict

In [None]:
from keras.models import Sequential, load_model

print('Making predictions with pre-trained models...')
prediction = [None for _ in range(num_sets)]

# One pre-trained network per modality, stored as 'model_<modality>.h5'.
for sets in range(num_sets):
    model_path = 'model_' + modalities_list[sets] + '.h5'
    print('\nSet: ' + str(sets))
    model = load_model(model_path)
    print('Model loaded!\nPredicting...')
    modality_input = X[sets]
    prediction[sets] = model.predict(modality_input, verbose=1)

print('\nPredictions are made!')

In [None]:
prediction_smooth = [None] * num_sets
prediction_DF = [None] * num_sets

print('Smoothing prediction...\nThere will be 4 sets, 6 parts each\nIt takes time, be patient :)')

for sets in range(num_sets):
    # One smoothed value per sample (rows) and per output of the model (columns).
    prediction_smooth[sets] = np.zeros((prediction[sets].shape[0], prediction[sets].shape[2]))
    print('Set: ' + str(sets))
    # The sparsing coefficient depends only on the modality, so set it once
    # per set rather than once per output column.
    parameters['sparsing_coefficient'] = int(context * frame_rate_list[sets] / parameters['timewindowsize'])
    for i in range(0,6):
        print(str(i + 1) + '/6')
        prediction_smooth[sets][:,i] = smoother(prediction[sets][:,:,i], index_cut[sets], Y_cont_index[sets], parameters['smoothing_windowsize'], parameters['sparsing_coefficient'], parameters['labels_forward_shift'])

    prediction_DF[sets] = pd.DataFrame(prediction_smooth[sets], index=index_cut[sets])

for sets in range(num_sets):
    # Count the rows that contain at least one NaN. The null test has to be
    # applied to the values first; testing the result of .any() for nulls (as
    # was done before) always reported 0.
    nan_rows = prediction_DF[sets].isnull().any(axis=1).sum()
    print('Number of NaNs in prediction for ' + modalities_list[sets] + ': ' + str(nan_rows))
print('Predictions are smoothed!')

# Multimodal prediction

In [None]:
print('Fusing predictions...')

# Interpolate: place each modality's predictions on the full label index and
# fill the rows it does not cover by interpolation.
pred_inter = [None] * len(modalities_list)
for mod in range(0,len(modalities_list)):
    pred_inter[mod] = pd.DataFrame(np.full((Y_raw.shape[0], Y_raw.shape[1]), np.nan),
                                   index=Y_raw.index, columns=prediction_DF[mod].columns)
    # .loc / .values replace DataFrame.set_value / as_matrix, which no longer
    # exist in current pandas.
    pred_inter[mod].loc[prediction_DF[mod].index, prediction_DF[mod].columns] = prediction_DF[mod].values
    pred_inter[mod] = pred_inter[mod].interpolate()

# Clear NaNs if any: interpolation cannot fill rows that precede the first
# valid one, so those leading all-NaN rows receive fixed default values.
default_row = [0.18, 0.15, 0.08, 0.20, 0.20, 0.19]
for mod in range(0,len(modalities_list)):
    row_is_empty = pred_inter[mod].isnull().all(axis=1).values
    # Number of consecutive all-NaN rows at the top of the frame.
    if row_is_empty.all():
        leading = len(row_is_empty)
    else:
        leading = int(np.argmax(~row_is_empty))
    if leading:
        pred_inter[mod].iloc[:leading, :] = np.tile(default_row, (leading, 1))
    # Report the current modality (the report previously always inspected modality 0).
    print('Number of NaNs in ' + modalities_list[mod] + ': ' + str(np.sum(np.isnan(pred_inter[mod].values))/6))
    
# Adding data frames
multimodal_results = pred_inter[0].add(pred_inter[1])
multimodal_results2 = pred_inter[2].add(pred_inter[3])
multimodal_results = multimodal_results.add(multimodal_results2)
print('Predictions are fused!')

In [None]:
# Redo submission files for one label/tw:
# within every window of `tw` consecutive rows of a file, find the column that
# most often has the highest fused score and write a one-hot row for it.

print('Modifying predictions...')
multimodal_results_new = copy.deepcopy(multimodal_results) 
from scipy import stats  # no longer used in this cell; import kept so `stats` stays available

tw = 10000  # window length in rows

for num in range(0,len(filenames_labels)):
    # Rows whose label contains the first 10 characters of the file name
    pred = multimodal_results.filter(like=filenames_labels[num][:10], axis=0)
    temp = pred.values

    new_labels = np.zeros((temp.shape[0], temp.shape[1]))
    # Stepping with range() covers the final partial window without relying
    # on true division.
    for start in range(0, temp.shape[0], tw):
        stop = min(start + tw, temp.shape[0])
        winners = np.argmax(temp[start:stop, :], axis=1)
        # Most frequent winner; ties resolve to the smallest column index,
        # matching scipy.stats.mode. bincount is used because the
        # `stats.mode(...)[0][0]` indexing breaks on current SciPy, where the
        # mode of a 1-D input is returned as a scalar.
        majority = np.bincount(winners).argmax()
        new_labels[start:stop, majority] = 1

    multimodal_results_new.loc[pred.index,:] = new_labels
    
multimodal_results_new.columns = ['Anger','Sad', 'Disgust', 'Happy', 'Scared', 'Neutral']
# The former `multimodal_results_new.rename_axis('Time', axis=1)` call was
# dropped: its result was discarded, so it had no effect.
print('Predictions are modified!')

In [None]:
print('Saving predictions...')
for label_file in filenames_labels:
    multimodal_results_new.columns = ['Anger','Sad', 'Disgust', 'Happy', 'Scared', 'Neutral']

    # Rows that belong to this file: their label contains the first 10
    # characters of the file name.
    file_key = label_file[:10]
    matching_rows = [s for s in multimodal_results_new.index.values if file_key in s]
    sub_file = multimodal_results_new.loc[matching_rows]

    # Re-use the 'Time' column of the original file as the row index.
    temp = pd.read_csv(folder + 'prediction/' + label_file, engine='python')
    sub_file.index = temp.Time
    sub_file = sub_file.rename_axis('Time', axis=1)

    # Create the output folder on first use.
    if not os.path.exists(save_prediction_folder):
        os.makedirs(save_prediction_folder)
    target_path = save_prediction_folder + label_file
    sub_file.to_csv(target_path)

print('Predictions are saved!\nThis was the last step!\nThank you!')