In [1]:
#imports

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#remove warnings
import warnings
#ignore warnings
warnings.filterwarnings('ignore')

In [2]:
#open the csv with labels

fold_set = pd.read_csv('../dataset/fold_split.csv')
fold_set.head()

Unnamed: 0,id,fold-subject-independent
0,10_train,1
1,24_train,1
2,38_train,1
3,51_train,1
4,11_train,1


# Train

In [3]:
#quick check: open the files for every feature folder, and check if the columns are the same and check for empty columns

#now, open the sessions on train_val

#open train_val folder

label_path = '../../data/train_val/labels'
data_path = '../../data/train_val/'

#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]

print('Number of sessions:', len(files))


#open each feature folder, get the csvs into a single dataframe, with the session number as a column
feature_folders = ['openface', 'openpose', 'opensmile']

for folder in feature_folders:
    print('Processing', folder)
    #list files in path
    files = os.listdir(data_path + folder)
    #remove hidden files
    files = [file for file in files if not file.startswith('.')]
    columns = [] 
    for file in files:
        #open the first file to get the column names
        df = pd.read_csv(data_path + folder + '/' + file)
        #add column with session number
        session= file.split('.')[0]
        if len(columns) == 0:
            columns = df.columns
            data = df
        else:
            cols = df.columns
            if not all(elem in columns for elem in cols):
                print('Columns do not match')
                print('Columns in', folder, 'not in data:', [elem for elem in cols if elem not in columns])
                print(session)
        #now, check for completely empty columns
        empty_cols = df.columns[df.isnull().all()]
        if len(empty_cols) > 0:
            print('Empty columns in', folder, ':', empty_cols)
            print(session)
            

Number of sessions: 71
Processing openface
Processing openpose
Columns do not match
Columns in openpose not in data: ['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y', 'vel_dist_7_0', 'vel_dist_4_0']
36_train
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8',
       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
       'vel_dist_7_0', 'vel_dist_4_0'],
      dtype='object')
36_train
Columns do not match
Columns in openpose not in data: ['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y', 'vel_dist_7_0', 'vel_dist_4_0']
50_train
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8',
       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
       'vel_dist_7_0', 'vel_di

In [10]:
#now, open the sessions on train_val

#open train_val folder

label_path = '../../data/train_val/labels'
data_path = '../../data/train_val/'

#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]

print('Number of sessions:', len(files))


#open each feature folder, get the csvs into a single dataframe, with the session number as a column
feature_folders = ['openface', 'openpose', 'opensmile']

#initialize the dataframe
#check if the file exists
if os.path.exists('../../data/train_val.csv'):
    train_val = pd.read_csv('../../data/train_val.csv')
    print('train_val:', train_val.shape)
    sessions_already = train_val['session'].unique()
else:
    train_val = pd.DataFrame()
    sessions_already = []

#save session names when there is a difference for time and frames
diff_session = dict()
    
#get session names
files_folders = os.listdir(data_path + feature_folders[0])
sessions = [file.split('.')[0] for file in files_folders if not file.startswith('.')]
for session in sessions:
    print('session:', session)
    if session in sessions_already:
        print('session already in train_val')
        continue
    for folder in feature_folders:
        
        #if folder openface
        if folder == 'openface':
            #get the csv
            session_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            #if empty, skip this session
            if session_csv.empty:
                print('empty openface')
                continue

            #remove rows with nan values
            print('openface prenan:', session_csv.shape)
            session_csv.dropna(inplace=True)
            print('openface postnan:', session_csv.shape)
            #add session column as the first column

            #decrease the frame by one
            session_csv['frame'] = session_csv['frame'] - 1
            session_csv['session'] = session
            #change session from last to first column
            cols = session_csv.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            session_csv = session_csv[cols]
            #change [' timestamp'] to timestamp
            session_csv.rename(columns={' timestamp': 'timestamp'}, inplace=True)
            print('openface:', session_csv.shape)
            #print(session_csv.columns)
            #print(session_csv.head())




        #if folder openpose
        if folder == 'openpose':
            #get the csv
            open_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            #if empty, skip this session
            if open_csv.empty:
                print('empty openpose')
                continue
            #reduce one in frame_id
            open_csv['frame_id'] = open_csv['frame_id']
            #remove columns ['person_id', 'week_id', 'robot_group'] if existing
            if 'person_id' in open_csv.columns:
                open_csv.drop(columns=['person_id', 'week_id', 'robot_group'], inplace=True)

            #remove columns ['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y','vel_dist_7_0', 'vel_dist_4_0']
            if 'vel_1_x' in open_csv.columns:
                open_csv.drop(columns=['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 
                                       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
                                       'vel_dist_7_0', 'vel_dist_4_0'], inplace=True)
            #see difference in index numbers
            #index_session = session_csv['frame'].values
            #index_open = open_csv['frame_id'].values
            #print('openpose:', index_session, index_open)
            #see if they are the same, if not print
            #if not np.array_equal(index_session, index_open):
            #    print('Different indexes')
            #    #which are different
            #    diff = np.where(index_session != index_open)
            #    print(diff)

            #remove rows with nan values
            print('openpose prenan:', open_csv.shape)
            open_csv.dropna(inplace=True)
            print('openpose postnan:', open_csv.shape)

            #merge horizontally with the session_csv, through column "frame_id" and "frame" in open_csv and session_csv, respectively
            session_csv = pd.merge(session_csv, open_csv, how='inner', left_on='frame', right_on='frame_id')
            #drop the frame_id column
            session_csv.drop(columns='frame_id', inplace=True)

            print('openpose prenan:', session_csv.shape)
            session_csv.dropna(inplace=True)
            print('openpose postnan:', session_csv.shape)

            print('openpose:', session_csv.shape)
            #print(session_csv.columns)
            #print(session_csv.head())

        #if folder opensmile
        if folder == 'opensmile':
            #get the csv
            smile_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            if smile_csv.empty:
                print('empty opensmile')
                continue

            #now, open the corresponding speaker_diarization file
            sd_path = '../../data/train_val/speaker_diarization/'
            sd_csv = pd.read_csv(sd_path + session + '.csv')
            #if empty, skip this session
            if sd_csv.empty:
                print('empty speaker diarization')
                print('***************************************************************************')
                continue

            print('opensmile prenan:', smile_csv.shape)
            smile_csv.dropna(inplace=True)
            print('opensmile postnan:', smile_csv.shape)

            #drop column "file"
            if 'Unnamed: 0' in smile_csv.columns:
                smile_csv.drop(columns='Unnamed: 0', inplace=True)
                


            #time is as "0 days 00:00:02.510000"
            #turn this into only seconds - 2.51
            #first, turn into time instead of string
            smile_csv['start'] = pd.to_timedelta(smile_csv['start'])
            #print(smile_csv[['start']].head())
            smile_csv['time'] = smile_csv['start'].apply(lambda x: x.total_seconds())
            



            #print(smile_csv['time']) 
            #print(session_csv.columns)
            subset_smile = pd.DataFrame()
            #go row by row in session_csv, and look at timestamp. use the timestamp column as a reference to get the opensmile features, and get the average of the features in opensmile within the interval
            prev_time = 0   
            for ind, row in session_csv.iterrows():
                #get the timestamp
                timestamp = row['frame']/30 #in seconds, for 30 fps
                #if timestamp is 0, then avg_features is the first row of smile_csv
                if timestamp == 0:
                    avg_features = smile_csv.iloc[0]
                    avg_features['time'] = timestamp
                    #drop start and end columns
                    avg_features.drop(['start', 'end'], inplace=True)
                    subset_smile = pd.concat([subset_smile, avg_features], axis=1)
                    prev_time = 0
                    continue


                #get the opensmile features that are in the interval
                interval = smile_csv[(smile_csv['time'] > prev_time) & (smile_csv['time'] <= timestamp)]

                #now, check who was speaking. Column "speaker" in sd_csv is robot, person or pause. time is in seconds, and there are two columns, start_turn and end_turn
                #if the timestamp is within the interval of a speaker, then keep the interval, otherwise, zero out the features
                speaker = sd_csv[(sd_csv['start_turn'] <= timestamp) & (sd_csv['end_turn'] > timestamp)]['speaker']
                if speaker.empty:
                    speaker = pd.DataFrame(['pause'])
                #    print(timestamp, 'empty speaker')


                

                #if empty, print warning
                if interval.empty:
                    if timestamp > smile_csv['time'].max():
                        print('timestamp is bigger than max time')
                        print('timestamp max:', session_csv['frame'].max()/30, 'max time opensmile:', smile_csv['time'].max())
                        diff_session[session] = (session_csv['frame'].max()/30, smile_csv['time'].max())
                    else:
                        print('empty interval')
                    #skip rest of the loop
                    break


                #print(interval)
                interval['time'] = timestamp
                interval['frame'] = row['frame']
                #remove start and end columns
                interval.drop(columns=['start', 'end'], inplace=True)
                #get the average of the features
                if speaker.values[0] == 'participant':
                    avg_features = interval.mean()
                else:
                    avg_features = interval.mean()
                    avg_features[:] = 0
            
                avg_features['time'] = timestamp
                avg_features['frame'] = row['frame']
                #avg_features['speaker'] = speaker.values[0]
                #print(speaker.values[0])
                #print(avg_features)
                #append the features to the subset_smile
                subset_smile = pd.concat([subset_smile, avg_features], axis=1)
                #print(subset_smile.shape)
                #update the prev_time
                prev_time = timestamp

            


            print('done')
            #transpose the subset_smile 
            subset_smile = subset_smile.T
            #reindex
            subset_smile.reset_index(drop=True, inplace=True)
            print(subset_smile.shape)
            
            #print(subset_smile.columns)
            #print(subset_smile)
            #merge horizontally with the session_csv
            session_csv = pd.merge(session_csv, subset_smile, how='inner', left_on='frame', right_on='frame')


            print('opensmile:', session_csv.shape)
            #print(session_csv.head())
            
        
    #add a column with fold_id to the session_csv
    fold_id = fold_set[fold_set['id'] == session]['fold-subject-independent'].values[0]
    session_csv['fold_id'] = fold_id
    print('fold_id:', fold_id)
    
    #append the session_csv to the train_val
    train_val = pd.concat([train_val, session_csv], axis=0)
    print('train_val:', train_val.shape)
    print('train_val columns:', train_val.columns)
    train_val.reset_index(drop=True, inplace=True)
    #save the train_val
    train_val.to_csv('../../data/train_val.csv', index=False)

    print('DIFF SESSION:', diff_session)    


train_val.reset_index(drop=True, inplace=True)
print(train_val.shape)
#save the train_val
train_val.to_csv('../../data/train_val.csv', index=False)
train_val



Number of sessions: 71
session: 12_train
openface prenan: (10960, 37)
openface postnan: (10960, 37)
openface: (10960, 38)
openpose prenan: (10960, 31)
openpose postnan: (10959, 31)
openpose prenan: (10959, 68)
openpose postnan: (10959, 68)
openpose: (10959, 68)
opensmile prenan: (36597, 28)
opensmile postnan: (36597, 28)
done
(10959, 27)
opensmile: (10958, 94)
fold_id: 1
train_val: (10958, 95)
train_val columns: Index(['session', 'frame', 'timestamp', ' AU01_r', ' AU01_c', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c',
       ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c',
       ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c',
       ' AU28_c', ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2',
       'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5', 'dist_4_1', 'vel_dist_4_1'

Unnamed: 0,session,frame,timestamp,AU01_r,AU01_c,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz,time,fold_id
0,12_train,1,0.033,0.32,1.0,0.0,0.00,0.00,1.79,1.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,1
1,12_train,2,0.067,0.25,0.0,0.0,0.00,0.00,1.75,1.10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,1
2,12_train,3,0.100,0.16,1.0,0.0,0.00,0.00,1.67,0.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,1
3,12_train,4,0.133,0.15,0.0,0.0,0.00,0.00,1.62,0.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,1
4,12_train,5,0.167,0.12,0.0,0.0,0.00,0.00,1.58,0.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998513,11_val,9903,330.100,1.02,1.0,0.0,0.00,1.41,0.63,1.19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.1,4
998514,11_val,9904,330.133,1.09,1.0,0.0,0.00,1.47,0.70,1.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.133333,4
998515,11_val,9905,330.167,1.15,1.0,0.0,0.03,1.49,0.77,1.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.166667,4
998516,11_val,9906,330.200,1.27,1.0,0.0,0.03,1.17,0.86,1.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.2,4


In [3]:
#open train_val folder
train_val = pd.read_csv('../../data/train_val.csv')

In [11]:
#remove time column
train_val.drop(columns='timestamp', inplace=True)
#make fold_id the first column
cols = train_val.columns.tolist()   
cols = cols[-2:] + cols[:-2]
train_val = train_val[cols]
print(train_val.columns)


Index(['time', 'fold_id', 'session', 'frame', ' AU01_r', ' AU01_c', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c',
       ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c',
       ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c',
       ' AU28_c', ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2',
       'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5', 'dist_4_1', 'vel_dist_4_1',
       'dist_4_17', 'vel_dist_4_17', 'dist_4_15', 'vel_dist_4_15', 'dist_4_18',
       'vel_dist_4_18', 'dist_4_16', 'vel_dist_4_16', 'dist_7_2',
       'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5', 'dist_7_1', 'vel_dist_7_1',
       'dist_7_17', 'vel_dist_7_17', 'dist_7_15', 'vel_dist_7_15', 'dist_7_18',
       'vel_dist_7_18', 'dist_7_16', 'vel_dist_7_16', 'Loudness_sma3',
       'alphaRatio_sma3', 'hamma

In [4]:
train_val.columns 

Index(['time', 'fold_id', 'session', 'frame', 'UserAwkwardness',
       'RobotMistake', 'InteractionRupture', ' AU01_r', ' AU01_c', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c',
       ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c',
       ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c',
       ' AU28_c', ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2',
       'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5', 'dist_4_1', 'vel_dist_4_1',
       'dist_4_17', 'vel_dist_4_17', 'dist_4_15', 'vel_dist_4_15', 'dist_4_18',
       'vel_dist_4_18', 'dist_4_16', 'vel_dist_4_16', 'dist_7_2',
       'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5', 'dist_7_1', 'vel_dist_7_1',
       'dist_7_17', 'vel_dist_7_17', 'dist_7_15', 'vel_dist_7_15', 'dist_7_18',
       'vel_dist_7_18', 'dist_7_16', 'v

In [None]:
#do NOT USE

#get labels for each session, add to the train_val

#label folder
label_path = '../../data/train_val/labels/'
#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]
print('Number of sessions:', len(files))


for file in files:
    #open the csv
    label_df = pd.read_csv(label_path + file)
    #get the session number
    session = file.split('.')[0]
    if session in train_val['session'].values:
        #get the label
        train_val_session = train_val[train_val['session'] == session]
    else:
        print('Session NOT in train_val:', session)
        continue

    #for each row in train_val_session, get the time, and match it to the time in the label dataset
    #if the time is within the interval, get the label
    for ind,row in train_val_session.iterrows():
        time = row['time']
        #get the label
        label_row = label_df[(label_df['Begin Time - ss.msec'] <= time) & (label_df['End Time - ss.msec'] >= time)]
        if label_row.empty:
            print('empty label')
            print('time:', time)
            print('session:', session)
            label_column_0.append('NaN')
            label_column_1.append('NaN')
            label_column_2.append('NaN')
        else:
            label_column_0.append(label_row['UserAwkwardness'].values[0])
            label_column_1.append(label_row['RobotMistake'].values[0])
            label_column_2.append(label_row['InteractionRupture'].values[0])
    
#add the labels to the train_val, as columns 3-5
train_val['UserAwkwardness'] = label_column_0
train_val['RobotMistake'] = label_column_1
train_val['InteractionRupture'] = label_column_2
cols_df = train_val.columns.tolist()
cols_df_new = cols_df[:4] + cols_df[-3:] + cols_df[4:-3]

train_val = train_val[cols_df_new]
print(train_val.columns)
print(train_val.shape)
train_val.to_csv('../../data/train_val.csv', index=False)
train_val



In [10]:
#get labels for each session, add to the train_val

#get train_val
train_val = pd.read_csv('../../data/train_val.csv')
#make the 3 label columns zeros
train_val['UserAwkwardness'] = 0
train_val['RobotMistake'] = 0
train_val['InteractionRupture'] = 0

#label folder
label_path = '../../data/train_val/labels/'
#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]
print('Number of sessions:', len(files))

label_column_0 = []
label_column_1 = []
label_column_2 = []


for file in files:
    #open the csv
    label_df = pd.read_csv(label_path + file)
    #get the session number
    session = file.split('.')[0]
    if session in train_val['session'].values:
        #get the label
        train_val_session = train_val[train_val['session'] == session]
    else:
        print('Session NOT in train_val:', session)
        continue

    #for each row in train_val_session, get the time, and match it to the time in the label dataset
    #if the time is within the interval, get the label
    for ind,row in label_df.iterrows():
        time_min = row['Begin Time - ss.msec']
        time_max = row['End Time - ss.msec']
        #get labels for each
        lab_uawk = row['UserAwkwardness']
        lab_rmist = row['RobotMistake']
        lab_irupt = row['InteractionRupture']
        #get the rows in train_val_session that are within the interval
        train_val_interval = train_val_session[(train_val_session['time'] >= time_min) & (train_val_session['time'] <= time_max)]
        #get the indexes
        index_interval = train_val_interval.index
        print('lenght of train_val_interval:', len(train_val_interval))
        print('index_interval:', index_interval)
        if len(train_val_interval) == 0:
            print('empty interval')
            print('time_min:', time_min)
            print('time_max:', time_max)
            print('session:', session)
        else:
            if lab_uawk == 1:
                train_val.loc[index_interval, 'UserAwkwardness'] = lab_uawk
            if lab_rmist == 1:
                train_val.loc[index_interval, 'RobotMistake'] = lab_rmist
            if lab_irupt == 1:
                train_val.loc[index_interval, 'InteractionRupture'] = lab_irupt



train_val.to_csv('../../data/train_val.csv', index=False)
train_val



Number of sessions: 71
lenght of train_val_interval: 2345
index_interval: Index([191643, 191644, 191645, 191646, 191647, 191648, 191649, 191650, 191651,
       191652,
       ...
       193978, 193979, 193980, 193981, 193982, 193983, 193984, 193985, 193986,
       193987],
      dtype='int64', length=2345)
lenght of train_val_interval: 852
index_interval: Index([193988, 193989, 193990, 193991, 193992, 193993, 193994, 193995, 193996,
       193997,
       ...
       194830, 194831, 194832, 194833, 194834, 194835, 194836, 194837, 194838,
       194839],
      dtype='int64', length=852)
lenght of train_val_interval: 1655
index_interval: Index([194840, 194841, 194842, 194843, 194844, 194845, 194846, 194847, 194848,
       194849,
       ...
       196485, 196486, 196487, 196488, 196489, 196490, 196491, 196492, 196493,
       196494],
      dtype='int64', length=1655)
lenght of train_val_interval: 120
index_interval: Index([196495, 196496, 196497, 196498, 196499, 196500, 196501, 196502, 196

Unnamed: 0,time,fold_id,session,frame,UserAwkwardness,RobotMistake,InteractionRupture,AU01_r,AU01_c,AU02_r,...,logRelF0-H1-A3_sma3nz,F1frequency_sma3nz,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz
0,0.033333,1,12_train,1,0,0,0,0.32,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.066667,1,12_train,2,0,0,0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.100000,1,12_train,3,0,0,0,0.16,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.133333,1,12_train,4,0,0,0,0.15,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.166667,1,12_train,5,0,0,0,0.12,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998513,330.100000,4,11_val,9903,0,0,0,1.02,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998514,330.133333,4,11_val,9904,0,0,0,1.09,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998515,330.166667,4,11_val,9905,0,0,0,1.15,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998516,330.200000,4,11_val,9906,0,0,0,1.27,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#now normalize features
import sklearn
from sklearn.preprocessing import StandardScaler


#copy dataframe
train_val_norm = train_val.copy()
#normalize features
scaler = StandardScaler()
features = train_val_norm.columns[7:]
print(features)
train_val_norm[features] = scaler.fit_transform(train_val_norm[features])

#remove nan and inf
train_val_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
train_val_norm.dropna(inplace=True)

train_val_norm.to_csv('../../data/train_val_norm.csv', index=False)

Index([' AU01_r', ' AU01_c', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r',
       ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r',
       ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r',
       ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c',
       ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c',
       ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'dist_4_7',
       'vel_dist_4_7', 'dist_4_2', 'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5',
       'dist_4_1', 'vel_dist_4_1', 'dist_4_17', 'vel_dist_4_17', 'dist_4_15',
       'vel_dist_4_15', 'dist_4_18', 'vel_dist_4_18', 'dist_4_16',
       'vel_dist_4_16', 'dist_7_2', 'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5',
       'dist_7_1', 'vel_dist_7_1', 'dist_7_17', 'vel_dist_7_17', 'dist_7_15',
       'vel_dist_7_15', 'dist_7_18', 'vel_dist_7_18', 'dist_7_16',
       'vel_dist_7_16', 'Loudness_sma3', 'alphaRatio_sma3',
       'hammarbergIndex_sma3', 'slope0-500_sma3', 's

In [12]:
#save the scaler
import joblib
joblib.dump(scaler, '../../data/scaler.pkl')


['../../data/scaler.pkl']

# TEST

In [3]:
#imports

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#remove warnings
import warnings
#ignore warnings
warnings.filterwarnings('ignore')

In [4]:
#quick check: open the files for every feature folder, and check if the columns are the same and check for empty columns

#now, open the sessions on train_val

#open train_val folder

label_path = '../../data/test/labels'
data_path = '../../data/test/'

#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]

print('Number of sessions:', len(files))


#open each feature folder, get the csvs into a single dataframe, with the session number as a column
feature_folders = ['openface', 'openpose', 'opensmile']

for folder in feature_folders:
    print('Processing', folder)
    #list files in path
    files = os.listdir(data_path + folder)
    #remove hidden files
    files = [file for file in files if not file.startswith('.')]
    columns = [] 
    for file in files:
        #open the first file to get the column names
        df = pd.read_csv(data_path + folder + '/' + file)
        #add column with session number
        session= file.split('.')[0]
        if len(columns) == 0:
            columns = df.columns
            data = df
        else:
            cols = df.columns
            if not all(elem in columns for elem in cols):
                print('Columns do not match')
                print('Columns in', folder, 'not in data:', [elem for elem in cols if elem not in columns])
                print(session)
        #now, check for completely empty columns
        empty_cols = df.columns[df.isnull().all()]
        if len(empty_cols) > 0:
            print('Empty columns in', folder, ':', empty_cols)
            print(session)
            

Number of sessions: 18
Processing openface
Processing openpose
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8',
       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
       'vel_dist_7_0', 'vel_dist_4_0'],
      dtype='object')
10_test
Columns do not match
Columns in openpose not in data: ['person_id', 'week_id', 'robot_group']
13_test
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8',
       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
       'vel_dist_7_0', 'vel_dist_4_0'],
      dtype='object')
7_test
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8',
       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
       'vel_dist_7_0', 'vel_dist_4_0'],
      dtype='object')
2_test
Empty columns in openpose : Index(['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'd

In [8]:
#now, open the sessions on train_val

#open train_val folder

label_path = '../../data/test/labels'
data_path = '../../data/test/'

#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]

print('Number of sessions:', len(files))


#open each feature folder, get the csvs into a single dataframe, with the session number as a column
feature_folders = ['openface', 'openpose', 'opensmile']

#initialize the dataframe
#check if the file exists
if os.path.exists('../../data/test.csv'):
    train_val = pd.read_csv('../../data/test.csv')
    print('train_val:', train_val.shape)
    sessions_already = train_val['session'].unique()
else:
    train_val = pd.DataFrame()
    sessions_already = []

#save session names when there is a difference for time and frames
diff_session = dict()
    
#get session names
files_folders = os.listdir(data_path + feature_folders[0])
sessions = [file.split('.')[0] for file in files_folders if not file.startswith('.')]
for session in sessions:
    print('session:', session)
    if session in sessions_already:
        print('session already in train_val')
        continue
    for folder in feature_folders:
        
        #if folder openface
        if folder == 'openface':
            #get the csv
            session_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            #if empty, skip this session
            if session_csv.empty:
                print('empty openface')
                continue

            #remove rows with nan values
            print('openface prenan:', session_csv.shape)
            session_csv.dropna(inplace=True)
            print('openface postnan:', session_csv.shape)
            #add session column as the first column

            #decrease the frame by one
            session_csv['frame'] = session_csv['frame'] - 1
            session_csv['session'] = session
            #change session from last to first column
            cols = session_csv.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            session_csv = session_csv[cols]
            #change [' timestamp'] to timestamp
            session_csv.rename(columns={' timestamp': 'timestamp'}, inplace=True)
            print('openface:', session_csv.shape)
            #print(session_csv.columns)
            #print(session_csv.head())




        #if folder openpose
        if folder == 'openpose':
            #get the csv
            open_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            #if empty, skip this session
            if open_csv.empty:
                print('empty openpose')
                continue
            #reduce one in frame_id
            open_csv['frame_id'] = open_csv['frame_id']
            #remove columns ['person_id', 'week_id', 'robot_group'] if existing
            if 'person_id' in open_csv.columns:
                open_csv.drop(columns=['person_id', 'week_id', 'robot_group'], inplace=True)

            #remove columns ['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y','vel_dist_7_0', 'vel_dist_4_0']
            if 'vel_1_x' in open_csv.columns:
                open_csv.drop(columns=['vel_1_x', 'vel_1_y', 'vel_8_x', 'vel_8_y', 'dist_1_8', 'vel_dist_1_8', 
                                       'dist_7_0', 'dist_4_0', 'vel_7_x', 'vel_7_y', 'vel_4_x', 'vel_4_y',
                                       'vel_dist_7_0', 'vel_dist_4_0'], inplace=True)


            #remove rows with nan values
            print('openpose prenan:', open_csv.shape)
            open_csv.dropna(inplace=True)
            print('openpose postnan:', open_csv.shape)

            #merge horizontally with the session_csv, through column "frame_id" and "frame" in open_csv and session_csv, respectively
            session_csv = pd.merge(session_csv, open_csv, how='inner', left_on='frame', right_on='frame_id')
            #drop the frame_id column
            session_csv.drop(columns='frame_id', inplace=True)

            print('openpose prenan:', session_csv.shape)
            session_csv.dropna(inplace=True)
            print('openpose postnan:', session_csv.shape)

            print('openpose:', session_csv.shape)
            #print(session_csv.columns)
            #print(session_csv.head())

        #if folder opensmile
        if folder == 'opensmile':
            #get the csv
            smile_csv = pd.read_csv(data_path + folder + '/' + session + '.csv')
            if smile_csv.empty:
                print('empty opensmile')
                continue

            #now, open the corresponding speaker_diarization file
            sd_path = '../../data/test/speaker_diarization/'
            sd_csv = pd.read_csv(sd_path + session + '.csv')
            #if empty, skip this session
            if sd_csv.empty:
                print('empty speaker diarization')
                print('***************************************************************************')
                continue

            print('opensmile prenan:', smile_csv.shape)
            smile_csv.dropna(inplace=True)
            print('opensmile postnan:', smile_csv.shape)

            #drop column "file"
            if 'Unnamed: 0' in smile_csv.columns:
                smile_csv.drop(columns='Unnamed: 0', inplace=True)
                


            #time is as "0 days 00:00:02.510000"
            #turn this into only seconds - 2.51
            #first, turn into time instead of string
            smile_csv['start'] = pd.to_timedelta(smile_csv['start'])
            #print(smile_csv[['start']].head())
            smile_csv['time'] = smile_csv['start'].apply(lambda x: x.total_seconds())
            



            #print(smile_csv['time']) 
            #print(session_csv.columns)
            subset_smile = pd.DataFrame()
            #go row by row in session_csv, and look at timestamp. use the timestamp column as a reference to get the opensmile features, and get the average of the features in opensmile within the interval
            prev_time = 0   
            for ind, row in session_csv.iterrows():
                #get the timestamp
                timestamp = row['frame']/30 #in seconds, for 30 fps
                #if timestamp is 0, then avg_features is the first row of smile_csv
                if timestamp == 0:
                    avg_features = smile_csv.iloc[0]
                    avg_features['time'] = timestamp
                    #drop start and end columns
                    avg_features.drop(['start', 'end'], inplace=True)
                    subset_smile = pd.concat([subset_smile, avg_features], axis=1)
                    prev_time = 0
                    continue


                #get the opensmile features that are in the interval
                interval = smile_csv[(smile_csv['time'] > prev_time) & (smile_csv['time'] <= timestamp)]

                #now, check who was speaking. Column "speaker" in sd_csv is robot, person or pause. time is in seconds, and there are two columns, start_turn and end_turn
                #if the timestamp is within the interval of a speaker, then keep the interval, otherwise, zero out the features
                speaker = sd_csv[(sd_csv['start_turn'] <= timestamp) & (sd_csv['end_turn'] > timestamp)]['speaker']
                if speaker.empty:
                    speaker = pd.DataFrame(['pause'])
                #    print(timestamp, 'empty speaker')


                

                #if empty, print warning
                if interval.empty:
                    if timestamp > smile_csv['time'].max():
                        print('timestamp is bigger than max time')
                        print('timestamp max:', session_csv['frame'].max()/30, 'max time opensmile:', smile_csv['time'].max())
                        diff_session[session] = (session_csv['frame'].max()/30, smile_csv['time'].max())
                    else:
                        print('empty interval')
                    #skip rest of the loop
                    break


                #print(interval)
                interval['time'] = timestamp
                interval['frame'] = row['frame']
                #remove start and end columns
                interval.drop(columns=['start', 'end'], inplace=True)
                #get the average of the features
                if speaker.values[0] == 'participant':
                    avg_features = interval.mean()
                else:
                    avg_features = interval.mean()
                    avg_features[:] = 0
            
                avg_features['time'] = timestamp
                avg_features['frame'] = row['frame']
                #avg_features['speaker'] = speaker.values[0]
                #print(speaker.values[0])
                #print(avg_features)
                #append the features to the subset_smile
                subset_smile = pd.concat([subset_smile, avg_features], axis=1)
                #print(subset_smile.shape)
                #update the prev_time
                prev_time = timestamp

            


            print('done')
            #transpose the subset_smile 
            subset_smile = subset_smile.T
            #reindex
            subset_smile.reset_index(drop=True, inplace=True)
            print(subset_smile.shape)
            
            #print(subset_smile.columns)
            #print(subset_smile)
            #merge horizontally with the session_csv
            session_csv = pd.merge(session_csv, subset_smile, how='inner', left_on='frame', right_on='frame')


            print('opensmile:', session_csv.shape)
            #print(session_csv.head())
            
        

    #append the session_csv to the train_val
    train_val = pd.concat([train_val, session_csv], axis=0)
    print('train_val:', train_val.shape)
    print('train_val columns:', train_val.columns)
    train_val.reset_index(drop=True, inplace=True)
    #save the train_val
    train_val.to_csv('../../data/test.csv', index=False)

    print('DIFF SESSION:', diff_session)    


train_val.reset_index(drop=True, inplace=True)
print(train_val.shape)

#save the train_val
train_val.to_csv('../../data/test.csv', index=False)
train_val



Number of sessions: 18
session: 4_test
openface prenan: (15191, 37)
openface postnan: (15191, 37)
openface: (15191, 38)
openpose prenan: (15191, 31)
openpose postnan: (15190, 31)
openpose prenan: (15190, 68)
openpose postnan: (15190, 68)
openpose: (15190, 68)
opensmile prenan: (50596, 28)
opensmile postnan: (50596, 28)
timestamp is bigger than max time
timestamp max: 506.3 max time opensmile: 505.95
done
(15180, 27)
opensmile: (15179, 94)
train_val: (15179, 94)
train_val columns: Index(['session', 'frame', 'timestamp', ' AU01_r', ' AU01_c', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c',
       ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c',
       ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c',
       ' AU28_c', ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2',
       '

Unnamed: 0,session,frame,timestamp,AU01_r,AU01_c,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,F1frequency_sma3nz,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz,time
0,4_test,1,0.033,0.00,0.0,0.00,0.0,0.0,0.73,1.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
1,4_test,2,0.067,0.00,0.0,0.00,0.0,0.0,0.74,1.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
2,4_test,3,0.100,0.00,0.0,0.00,0.0,0.0,0.73,1.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
3,4_test,4,0.133,0.00,0.0,0.00,0.0,0.0,0.77,1.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
4,4_test,5,0.167,0.00,0.0,0.00,0.0,0.0,0.83,1.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241324,3_test,10233,341.100,4.98,1.0,1.69,0.0,0.0,0.00,1.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,341.1
241325,3_test,10234,341.133,5.00,1.0,1.76,0.0,0.0,0.00,1.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,341.133333
241326,3_test,10235,341.167,5.00,1.0,1.69,0.0,0.0,0.00,1.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,341.166667
241327,3_test,10236,341.200,5.00,1.0,1.60,0.0,0.0,0.00,1.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,341.2


In [9]:
#remove time column
train_val.drop(columns='timestamp', inplace=True)
#make fold_id the first column
cols = train_val.columns.tolist()   
cols = cols[-1:] + cols[:-1]
train_val = train_val[cols]
print(train_val.columns)
#save
train_val.to_csv('../../data/test.csv', index=False)


Index(['F3amplitudeLogRelF0_sma3nz', 'time', 'session', 'frame', ' AU01_r',
       ' AU01_c', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r',
       ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r',
       ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r', ' AU02_c',
       ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c',
       ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c',
       ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'dist_4_7', 'vel_dist_4_7',
       'dist_4_2', 'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5', 'dist_4_1',
       'vel_dist_4_1', 'dist_4_17', 'vel_dist_4_17', 'dist_4_15',
       'vel_dist_4_15', 'dist_4_18', 'vel_dist_4_18', 'dist_4_16',
       'vel_dist_4_16', 'dist_7_2', 'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5',
       'dist_7_1', 'vel_dist_7_1', 'dist_7_17', 'vel_dist_7_17', 'dist_7_15',
       'vel_dist_7_15', 'dist_7_18', 'vel_dist_7_18', 'dist_7_16',
       'vel_dist_7_16', 'Loudness_sma3', 'alpha

In [10]:
#move first column to last
cols = train_val.columns.tolist()
cols = cols[1:] + cols[:1]
train_val = train_val[cols]
print(train_val.columns)
#save
train_val.to_csv('../../data/test.csv', index=False)

Index(['time', 'session', 'frame', ' AU01_r', ' AU01_c', ' AU02_r', ' AU04_r',
       ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r',
       ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r',
       ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c',
       ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c',
       ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c',
       ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2', 'vel_dist_4_2',
       'dist_4_5', 'vel_dist_4_5', 'dist_4_1', 'vel_dist_4_1', 'dist_4_17',
       'vel_dist_4_17', 'dist_4_15', 'vel_dist_4_15', 'dist_4_18',
       'vel_dist_4_18', 'dist_4_16', 'vel_dist_4_16', 'dist_7_2',
       'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5', 'dist_7_1', 'vel_dist_7_1',
       'dist_7_17', 'vel_dist_7_17', 'dist_7_15', 'vel_dist_7_15', 'dist_7_18',
       'vel_dist_7_18', 'dist_7_16', 'vel_dist_7_16', 'Loudness_sma3',
       'alphaRatio_sma3', 'hammarbergIndex_

In [11]:
#get labels for each session, add to the train_val

#get train_val
train_val = pd.read_csv('../../data/test.csv')
#make the 3 label columns zeros
train_val['UserAwkwardness'] = 0
train_val['RobotMistake'] = 0
train_val['InteractionRupture'] = 0

#label folder
label_path = '../../data/test/labels/'
#list files in path
files = os.listdir(label_path)
#remove hidden files
files = [file for file in files if not file.startswith('.')]
print('Number of sessions:', len(files))


for file in files:
    #open the csv
    label_df = pd.read_csv(label_path + file)
    #get the session number
    session = file.split('.')[0]
    if session in train_val['session'].values:
        #get the label
        train_val_session = train_val[train_val['session'] == session]
    else:
        print('Session NOT in train_val:', session)
        continue

    #for each row in train_val_session, get the time, and match it to the time in the label dataset
    #if the time is within the interval, get the label
    for ind,row in label_df.iterrows():
        time_min = row['Begin Time - ss.msec']
        time_max = row['End Time - ss.msec']
        #get labels for each
        lab_uawk = row['UserAwkwardness']
        lab_rmist = row['RobotMistake']
        lab_irupt = row['InteractionRupture']
        #get the rows in train_val_session that are within the interval
        train_val_interval = train_val_session[(train_val_session['time'] >= time_min) & (train_val_session['time'] <= time_max)]
        #get the indexes
        index_interval = train_val_interval.index
        print('lenght of train_val_interval:', len(train_val_interval))
        print('index_interval:', index_interval)
        if len(train_val_interval) == 0:
            print('empty interval')
            print('time_min:', time_min)
            print('time_max:', time_max)
            print('session:', session)
        else:
            if lab_uawk == 1:
                train_val.loc[index_interval, 'UserAwkwardness'] = lab_uawk
            if lab_rmist == 1:
                train_val.loc[index_interval, 'RobotMistake'] = lab_rmist
            if lab_irupt == 1:
                train_val.loc[index_interval, 'InteractionRupture'] = lab_irupt



train_val.to_csv('../../data/test.csv', index=False)
train_val



Number of sessions: 18
lenght of train_val_interval: 2399
index_interval: Index([187144, 187145, 187146, 187147, 187148, 187149, 187150, 187151, 187152,
       187153,
       ...
       189533, 189534, 189535, 189536, 189537, 189538, 189539, 189540, 189541,
       189542],
      dtype='int64', length=2399)
lenght of train_val_interval: 1816
index_interval: Index([189543, 189544, 189545, 189546, 189547, 189548, 189549, 189550, 189551,
       189552,
       ...
       191349, 191350, 191351, 191352, 191353, 191354, 191355, 191356, 191357,
       191358],
      dtype='int64', length=1816)
lenght of train_val_interval: 2835
index_interval: Index([191359, 191360, 191361, 191362, 191363, 191364, 191365, 191366, 191367,
       191368,
       ...
       194184, 194185, 194186, 194187, 194188, 194189, 194190, 194191, 194192,
       194193],
      dtype='int64', length=2835)
lenght of train_val_interval: 778
index_interval: Index([194194, 194195, 194196, 194197, 194198, 194199, 194200, 194201, 1

Unnamed: 0,time,session,frame,AU01_r,AU01_c,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz,UserAwkwardness,RobotMistake,InteractionRupture
0,0.033333,4_test,1,0.00,0.0,0.00,0.0,0.0,0.73,1.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,0.066667,4_test,2,0.00,0.0,0.00,0.0,0.0,0.74,1.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,0.100000,4_test,3,0.00,0.0,0.00,0.0,0.0,0.73,1.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,0.133333,4_test,4,0.00,0.0,0.00,0.0,0.0,0.77,1.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,0.166667,4_test,5,0.00,0.0,0.00,0.0,0.0,0.83,1.94,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241324,341.100000,3_test,10233,4.98,1.0,1.69,0.0,0.0,0.00,1.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
241325,341.133333,3_test,10234,5.00,1.0,1.76,0.0,0.0,0.00,1.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
241326,341.166667,3_test,10235,5.00,1.0,1.69,0.0,0.0,0.00,1.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
241327,341.200000,3_test,10236,5.00,1.0,1.60,0.0,0.0,0.00,1.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [12]:
#move label columns to positions 3-5
cols = train_val.columns.tolist()
cols = cols[:3] + cols[-3:] + cols[3:-3]
train_val = train_val[cols]
print(train_val.columns)
print(train_val.shape)
train_val.to_csv('../../data/test.csv', index=False)
train_val

Index(['time', 'session', 'frame', 'UserAwkwardness', 'RobotMistake',
       'InteractionRupture', ' AU01_r', ' AU01_c', ' AU02_r', ' AU04_r',
       ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r',
       ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r',
       ' AU26_r', ' AU45_r', ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c',
       ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c',
       ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c',
       ' AU45_c', 'dist_4_7', 'vel_dist_4_7', 'dist_4_2', 'vel_dist_4_2',
       'dist_4_5', 'vel_dist_4_5', 'dist_4_1', 'vel_dist_4_1', 'dist_4_17',
       'vel_dist_4_17', 'dist_4_15', 'vel_dist_4_15', 'dist_4_18',
       'vel_dist_4_18', 'dist_4_16', 'vel_dist_4_16', 'dist_7_2',
       'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5', 'dist_7_1', 'vel_dist_7_1',
       'dist_7_17', 'vel_dist_7_17', 'dist_7_15', 'vel_dist_7_15', 'dist_7_18',
       'vel_dist_7_18', 'dist_7_16', 'vel_dist_7_1

Unnamed: 0,time,session,frame,UserAwkwardness,RobotMistake,InteractionRupture,AU01_r,AU01_c,AU02_r,AU04_r,...,logRelF0-H1-A3_sma3nz,F1frequency_sma3nz,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz
0,0.033333,4_test,1,0,0,0,0.00,0.0,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.066667,4_test,2,0,0,0,0.00,0.0,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.100000,4_test,3,0,0,0,0.00,0.0,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.133333,4_test,4,0,0,0,0.00,0.0,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.166667,4_test,5,0,0,0,0.00,0.0,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241324,341.100000,3_test,10233,0,0,0,4.98,1.0,1.69,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241325,341.133333,3_test,10234,0,0,0,5.00,1.0,1.76,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241326,341.166667,3_test,10235,0,0,0,5.00,1.0,1.69,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241327,341.200000,3_test,10236,0,0,0,5.00,1.0,1.60,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#now normalize features
import sklearn
from sklearn.preprocessing import StandardScaler
import joblib

#get scaler 
scaler = joblib.load('../../data/scaler.pkl')

#copy dataframe
train_val_norm = train_val.copy()
#normalize features
features = train_val_norm.columns[6:]
print(features)
train_val_norm[features] = scaler.fit_transform(train_val_norm[features])

#remove nan and inf
train_val_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
train_val_norm.dropna(inplace=True)


train_val_norm.to_csv('../../data/test_norm.csv', index=False)

Index([' AU01_r', ' AU01_c', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r',
       ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r',
       ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r',
       ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c',
       ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c',
       ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'dist_4_7',
       'vel_dist_4_7', 'dist_4_2', 'vel_dist_4_2', 'dist_4_5', 'vel_dist_4_5',
       'dist_4_1', 'vel_dist_4_1', 'dist_4_17', 'vel_dist_4_17', 'dist_4_15',
       'vel_dist_4_15', 'dist_4_18', 'vel_dist_4_18', 'dist_4_16',
       'vel_dist_4_16', 'dist_7_2', 'vel_dist_7_2', 'dist_7_5', 'vel_dist_7_5',
       'dist_7_1', 'vel_dist_7_1', 'dist_7_17', 'vel_dist_7_17', 'dist_7_15',
       'vel_dist_7_15', 'dist_7_18', 'vel_dist_7_18', 'dist_7_16',
       'vel_dist_7_16', 'Loudness_sma3', 'alphaRatio_sma3',
       'hammarbergIndex_sma3', 'slope0-500_sma3', 's