In [41]:
import glob
import os
import csv
import numpy as np
from scipy.signal import medfilt
from scipy.signal import wiener
from scipy.signal import savgol_filter

genders = ['male', 'female']
labels = ['Happy', 'Surprise', 'Disgust', 'Angry', 'Fear', 'Sad', 'Neutral'] # 0-6, 7 classes

def _load_eda(eda_path):
    """Read one EDA.csv and return ``(time_stamps, samples)`` as numpy arrays.

    Row 0 supplies the recording start time and row 1 the sampling frequency.
    Row 2 is not used as a sample; every later row contributes one sample whose
    time stamp is ``start_time + (row_index - 2) / freq``. The samples are
    z-scored over the whole recording before being returned.

    Raises:
        ValueError: if the file contains no sample rows.
    """
    samples = []
    time_stamps = []
    with open(eda_path) as csv_file:
        for line_count, row in enumerate(csv.reader(csv_file)):
            if line_count == 0:
                start_time = float(row[0])
            elif line_count == 1:
                freq = float(row[0])
            elif line_count > 2:
                samples.append(float(row[0]))
                time_stamps.append(start_time + float((line_count - 2) / freq))

    if not samples:
        raise ValueError('no EDA samples found in {}'.format(eda_path))

    samples = np.asarray(samples, dtype=float)
    # Per-person standardisation. Other preprocessing (min-max scaling, medfilt,
    # wiener, savgol_filter) can be substituted here.
    samples = (samples - np.average(samples)) / np.std(samples)
    return np.asarray(time_stamps, dtype=float), samples


def preprocess_data(dataPath, train):
    """Extract per-video EDA features for every person folder of one split.

    Reads ``<dataPath>/train/*`` when ``train`` is truthy, otherwise
    ``<dataPath>/val/*``. Each person folder must contain ``EDA.csv`` and
    ``log.txt``. One CSV row is written per video line of ``log.txt`` whose
    [start, end] window contains at least one EDA sample.

    The output file (``<dataPath>/train.csv`` or ``<dataPath>/val.csv``) is
    rewritten from scratch on every call, so re-running does not duplicate the
    header or the rows.

    Args:
        dataPath: directory that holds the ``train`` / ``val`` folders and
            receives the output CSV.
        train: truthy for the training split, falsy for the validation split.

    Raises:
        ValueError: if an EDA file has no samples, or a gender / emotion label
            parsed from ``log.txt`` is not in ``genders`` / ``labels``.
    """
    split = 'train' if train else 'val'
    path = os.path.join(dataPath, split)
    output_path = os.path.join(dataPath, split + '.csv')
    # Only directories can be person folders; stray files next to them are ignored.
    folders = sorted(p for p in glob.glob(os.path.join(path, '*')) if os.path.isdir(p))

    fieldnames = ['User', 'Max', 'Min', 'Mean', 'Var', 'Mean Abs Diff', 'Mean Abs Second Diff', 'Emotion', 'Gender', 'Age'] # The features extracted

    # 'w' + newline='' is what the csv module expects; both files are closed by
    # their context managers even if parsing fails part-way through.
    with open(output_path, 'w', newline='') as csv_output_file:
        writer = csv.DictWriter(csv_output_file, fieldnames=fieldnames)
        writer.writeheader()

        for person_dir in folders:
            time_stamp, data = _load_eda(os.path.join(person_dir, 'EDA.csv'))

            with open(os.path.join(person_dir, 'log.txt'), 'r') as log:
                for log_count, line in enumerate(log):
                    fields = line.split(';')
                    if log_count == 0:
                        # First line carries "key:value" pairs for user, age and gender.
                        user = fields[0].split(':')[-1]
                        age = fields[1].split(':')[-1]
                        gender = genders.index(fields[2].split(':')[-1].strip().lower())
                        continue
                    if log_count == 1 or not line.strip():
                        # Second line is skipped; blank lines carry no video entry.
                        continue

                    st = float(fields[1]) # start time of each video
                    et = float(fields[3]) # end time of each video
                    video_name = fields[2]
                    # The label is the file stem minus a fixed-length suffix:
                    # 10 characters when the name contains "_", otherwise 9.
                    suffix_len = 10 if "_" in video_name else 9
                    emotion_label = labels.index(video_name.split('.')[0][:-suffix_len])

                    in_window = (time_stamp >= st) & (time_stamp <= et)
                    data_list = data[in_window]
                    if len(data_list) == 0:
                        # No samples for this video: skip it but keep processing
                        # the remaining videos of the same person.
                        continue

                    abs_diff_list = np.abs(np.diff(data_list))
                    abs_second_diff_list = np.abs(np.diff(data_list, n=2))
                    writer.writerow({
                        'User': user,
                        'Max': data_list.max(),
                        'Min': data_list.min(),
                        'Mean': np.mean(data_list),
                        'Var': np.var(data_list),
                        'Mean Abs Diff': np.mean(abs_diff_list),
                        'Mean Abs Second Diff': np.mean(abs_second_diff_list),
                        'Emotion': emotion_label,
                        'Gender': gender,
                        'Age': age,
                    })

In [42]:
# Drop a stale export if one exists; on a fresh checkout there is nothing to
# remove and an unguarded os.remove would raise FileNotFoundError.
if os.path.exists('train.csv'):
    os.remove('train.csv')
preprocess_data('', train=1)

In [43]:
# Drop a stale export if one exists; on a fresh checkout there is nothing to
# remove and an unguarded os.remove would raise FileNotFoundError.
if os.path.exists('val.csv'):
    os.remove('val.csv')
preprocess_data('', train=0)

In [44]:
import pandas as pd
from sklearn.utils import shuffle

# Load the extracted training features and shuffle the rows with a fixed seed
# so that the row order (and everything trained on it) is reproducible.
Data_train = pd.read_csv("train.csv", sep = ",")
Data_train = shuffle(Data_train, random_state = 42)
# Peek at the rows of a single participant.
Data_train[Data_train.User == "Person_0"].head(50)

Unnamed: 0,User,Max,Min,Mean,Var,Mean Abs Diff,Mean Abs Second Diff,Emotion,Gender,Age
61,Person_0,-0.520489,-0.542106,-0.529405,3.7e-05,0.004416,0.004821,4,0,28
44,Person_0,-0.442954,-0.485496,-0.471871,0.00013,0.005481,0.00622,0,0,28
8,Person_0,0.614217,0.507834,0.55874,0.000937,0.003178,0.003463,5,0,28
56,Person_0,-0.511215,-0.532292,-0.519594,2.7e-05,0.003027,0.003915,4,0,28
83,Person_0,-0.504624,-0.522588,-0.515194,1.5e-05,0.003052,0.003992,3,0,28
1,Person_0,2.273096,2.098047,2.183685,0.002522,0.006794,0.004437,5,0,28
58,Person_0,-0.513287,-0.529233,-0.521472,1.5e-05,0.002654,0.00348,4,0,28
37,Person_0,-0.429913,-0.448014,-0.439103,2.4e-05,0.002823,0.003688,2,0,28
59,Person_0,-0.510839,-0.529026,-0.520874,1.6e-05,0.002573,0.002702,4,0,28
68,Person_0,-0.526523,-0.541121,-0.534539,1e-05,0.002851,0.004375,1,0,28


In [45]:
# Load the extracted validation features and shuffle the rows with a fixed seed
# so that re-running the notebook yields the same row order.
Data_val = pd.read_csv("val.csv")
Data_val = shuffle(Data_val, random_state = 42)
Data_val.tail()

Unnamed: 0,User,Max,Min,Mean,Var,Mean Abs Diff,Mean Abs Second Diff,Emotion,Gender,Age
676,Person_25,-0.298912,-0.41174,-0.350427,0.000699,0.017288,0.01967,2,0,23
244,Person_17,0.649883,0.443783,0.54193,0.001719,0.027221,0.027067,6,1,29
1407,Person_37,1.120447,0.816078,0.908536,0.004488,0.014959,0.007501,3,0,21
1040,Person_3,0.889679,0.733044,0.798659,0.003195,0.01107,0.007167,1,0,22
37,Person_10,0.93343,0.84781,0.899279,0.000442,0.016184,0.018249,4,0,26


# Visualisation

In [46]:
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns


# plt.figure(figsize=(16, 10))
# plt.plot(np.arange(0, 1000, 1), Data_train.Mean.iloc[:1000], scaley = 100)
# plt.title("Mean variations")
# plt.legend(["y = mean common variation"])

# plt.show()

In [47]:
# #Max	Min	Mean	Var	Mean Abs Diff	Mean Abs Second Diff	Emotion

# sns.set(rc = {'figure.figsize':(16, 10)})
# sns.set_theme(style="darkgrid")
# sns.scatterplot(data = Data_train, x = "Mean", y = Data_train.index, hue = "Emotion", palette = "tab10", x_bins= 150)
# #sns.lineplot(data = Data_train.iloc[:1500], x = Data_train.Mean.iloc[:1500], y = np.arange(0, 1500, 1), hue = "Emotion", palette = "tab10")

In [48]:
# Count missing values per column of the training frame.
Data_train.isnull().sum()

User                    0
Max                     0
Min                     0
Mean                    0
Var                     0
Mean Abs Diff           0
Mean Abs Second Diff    0
Emotion                 0
Gender                  0
Age                     0
dtype: int64

In [49]:
# Number of training rows per emotion code.
Data_train['Emotion'].value_counts()

0    699
6    676
3    475
5    472
2    436
4    434
1    362
Name: Emotion, dtype: int64

#### Data is distributed normally. No NaN values. Happy (0) and Neutral (6) have more samples than the other emotions -> might have to equalize value counts

### Modeling

In [50]:
import tensorflow as tf


def initModel(shape, outputUnits, outputActivation) -> tf.keras.Model:
    """Build an uncompiled GRU classifier over a flat feature vector.

    The Keras session is cleared first. The input is expanded with a trailing
    axis so the GRU (256 units, ``return_sequences=True``) receives each feature
    as one step of a length-``shape`` sequence of scalars; the GRU output is
    flattened and fed to a single dense output layer. The model summary is
    printed as a side effect.

    Args:
        shape: number of input features (an int), or an explicit shape tuple.
        outputUnits: number of units in the output layer.
        outputActivation: activation of the output layer (e.g. ``'softmax'``).

    Returns:
        The constructed ``tf.keras.Model``.
    """
    tf.keras.backend.clear_session()

    # Keras expects a shape tuple; accept a bare feature count as well.
    input_shape = (int(shape),) if np.ndim(shape) == 0 else tuple(shape)
    inputs = tf.keras.Input(shape = input_shape)
    expand = tf.expand_dims(inputs, axis = 2)

    gru = tf.keras.layers.GRU(256, return_sequences = True)(expand)
    flatten = tf.keras.layers.Flatten()(gru)

    outputs = tf.keras.layers.Dense(outputUnits, activation = outputActivation)(flatten)
    model = tf.keras.Model(inputs = inputs, outputs = outputs)
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [51]:
# Features are every column except the target (Emotion) and the subject id (User).
X = Data_train.drop(columns = ['Emotion', 'User'])
y = Data_train['Emotion']

X_val = Data_val.drop(columns = ['Emotion', 'User'])
y_val = Data_val['Emotion']

# Trying adam optimizer
model = initModel(X.shape[1], 7, 'softmax')
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Train against the held-out validation frame; stop once val_loss has not
# improved for 10 epochs and roll back to the best weights seen.
history = model.fit(
    X, y,
    validation_data = (X_val, y_val),
    batch_size = 64,
    epochs = 50,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True),
    ],
)

# Result of adam optimizer: index 1 of evaluate() is the accuracy metric.
model_acc = model.evaluate(X_val, y_val, verbose = 0)[1]
print("Validation Accuracy: {:.3f}%".format(model_acc * 100))

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 8)]               0         
                                                                 
 tf.expand_dims (TFOpLambda)  (None, 8, 1)             0         
                                                                 
 gru (GRU)                   (None, 8, 256)            198912    
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 7)                 14343     
                                                                 
Total params: 213,255
Trainable params: 213,255
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 

In [52]:
# # Plot the accuracy curve for training
# plt.plot(history.history['accuracy'], color='r', label="Accuracy")

# plt.title("Accuracy")
# plt.xlabel("Number of Epochs")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

In [53]:
# Predicted output of the emotion model for every training row
# (one row per sample, one column per output unit).
model.predict(X)

array([[0.16870311, 0.07657745, 0.10712576, ..., 0.10079721, 0.16205086,
        0.2170536 ],
       [0.17407425, 0.10037328, 0.14372985, ..., 0.1219712 , 0.11192211,
        0.1996162 ],
       [0.10674707, 0.16207322, 0.16804105, ..., 0.1292181 , 0.13845596,
        0.1594871 ],
       ...,
       [0.09274311, 0.1725051 , 0.16431177, ..., 0.12184849, 0.17576635,
        0.11873264],
       [0.06886141, 0.22534494, 0.19483313, ..., 0.11931857, 0.1287762 ,
        0.11487933],
       [0.05693928, 0.22031088, 0.15792762, ..., 0.10903908, 0.16173814,
        0.0992789 ]], dtype=float32)

### Creating valence column
#### Valence is 1 for Surprise, Neutral and Happy; otherwise valence is 0 (Disgust, Angry, Fear, Sad)

In [54]:
#labels = ['Happy', 'Surprise', 'Disgust', 'Angry', 'Fear', 'Sad', 'Neutral']
def getValence(row) -> int:
    """Map a row's emotion code to a binary valence flag.

    Returns 1 when ``row['Emotion']`` is 0, 1 or 6 and 0 for any other value.
    """
    positive_codes = (0, 1, 6)
    return 1 if row['Emotion'] in positive_codes else 0
    
# Attach the valence flag to both frames. It is computed from the Emotion
# column, and because only Emotion and User are dropped below, it stays in X
# as an input feature for this experiment.
Data_train['Valence'] = Data_train.apply(getValence, axis = 1)
Data_val['Valence'] = Data_val.apply(getValence, axis = 1)

X = Data_train.drop(columns = ['Emotion', 'User'])
y = Data_train['Emotion']

X_val = Data_val.drop(columns = ['Emotion', 'User'])
y_val = Data_val['Emotion']

In [55]:
# Same architecture as before, rebuilt for the current width of X.
model = initModel(X.shape[1], 7, 'softmax')
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Longer schedule than the first run: up to 100 epochs, batches of 32, and
# early stopping after 20 epochs without a val_loss improvement.
history = model.fit(
    X, y,
    validation_data = (X_val, y_val),
    batch_size = 32,
    epochs = 100,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 20, restore_best_weights = True),
    ],
)

# Result of adam optimizer: index 1 of evaluate() is the accuracy metric.
model_acc = model.evaluate(X_val, y_val, verbose = 0)[1]
print("Validation Accuracy: {:.3f}%".format(model_acc * 100))

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 9)]               0         
                                                                 
 tf.expand_dims (TFOpLambda)  (None, 9, 1)             0         
                                                                 
 gru (GRU)                   (None, 9, 256)            198912    
                                                                 
 flatten (Flatten)           (None, 2304)              0         
                                                                 
 dense (Dense)               (None, 7)                 16135     
                                                                 
Total params: 215,047
Trainable params: 215,047
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Ep

### With valence given as an input feature, accuracy rose significantly (to 30.2%). Next, valence will be predicted from the initial data

In [56]:
# (Re)compute the valence flag so this cell does not depend on the earlier one.
Data_train['Valence'] = Data_train.apply(getValence, axis = 1)
Data_val['Valence'] = Data_val.apply(getValence, axis = 1)

# Inputs now exclude Valence; it becomes a float32 target for the binary model.
X = Data_train.drop(columns = ['Valence', 'Emotion', 'User'])
y = Data_train['Emotion']
y_valence = Data_train['Valence'].to_numpy(dtype = 'float32')

X_val = Data_val.drop(columns = ['Valence', 'Emotion', 'User'])
y_val = Data_val['Emotion']
y_val_valence = Data_val['Valence'].to_numpy(dtype = 'float32')

y_valence

array([0., 1., 1., ..., 0., 1., 0.], dtype=float32)

In [57]:
from tensorflow.keras.layers import Dense
# Small fully-connected binary classifier that predicts valence from the
# extracted features: 8 -> 4 -> 1 units with batch normalisation in between.
tf.keras.backend.clear_session()
valenceModel = tf.keras.models.Sequential()

valenceModel.add(Dense(8, input_dim = X.shape[1], activation = 'relu'))
valenceModel.add(tf.keras.layers.BatchNormalization())
valenceModel.add(Dense(4, activation = 'relu'))
valenceModel.add(tf.keras.layers.BatchNormalization())
valenceModel.add(Dense(1, activation = 'sigmoid'))

valenceModel.summary()


valenceModel.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Early stopping watches val_loss on the validation frame and restores the
# best weights after 20 epochs without improvement.
history = valenceModel.fit(
    X,
    y_valence,
    validation_data = (X_val, y_val_valence),
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True
        )
    ]
)

# Result of rmsprop optimizer. The score is measured on the same validation
# data used for early stopping, so it is reported as validation accuracy,
# matching the emotion-model cells.
model_acc = valenceModel.evaluate(X_val, y_val_valence, verbose=0)[1]
print("Validation Accuracy: {:.3f}%".format(model_acc * 100))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 72        
                                                                 
 batch_normalization (BatchN  (None, 8)                32        
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 batch_normalization_1 (Batc  (None, 4)                16        
 hNormalization)                                                 
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 161
Trainable params: 137
Non-trainable pa

from sklearn.metrics import accuracy_score

# Score the valence classifier (not the 7-class emotion model) on the
# validation features: threshold its sigmoid outputs at 0.5 and compare them
# with the ground-truth valence labels.
values = valenceModel.predict(X_val)
values = (np.asarray(values).ravel() >= 0.5).astype(int)
print('accuracy: ', accuracy_score(y_val_valence, values))

In [58]:
# Predicted valence for the training and validation features.
# NOTE(review): these assignments overwrite y_valence / y_val_valence, which
# until now held the ground-truth labels used to fit valenceModel. Re-running
# the training cell after this one would fit on predictions instead; consider
# separate names if nothing downstream depends on these.
y_valence = valenceModel.predict(X)
y_val_valence = valenceModel.predict(X_val)
y_valence

array([[0.51483446],
       [0.49076444],
       [0.47748485],
       ...,
       [0.28791887],
       [0.40569466],
       [0.2453596 ]], dtype=float32)