# Pre-process raw data into .npz files

In [1]:
from __future__ import division, print_function
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial import distance
import csv
%matplotlib inline
from numpy import arange, sin, pi, cos
from scipy.fftpack import fft, dct

### Load pre-trained GloVe data (Twitter 100d) into a dictionary

In [None]:
# The GloVe file is parsed twice: column 0 holds the token, columns 1-100
# hold the components of its embedding.
twitter_words = np.loadtxt("GloVeTwitter27B/100d.txt", dtype="str", usecols=range(0, 1))
twitter_vectors = np.loadtxt("GloVeTwitter27B/100d.txt", dtype="float", usecols=range(1, 101))

# Token -> embedding row; if a token occurs twice the later row wins.
twitter_table = dict(zip(twitter_words, twitter_vectors))

In [None]:
# Embedding width = number of columns in the Twitter vector matrix.
# pre_process_data reads this global when it builds zero vectors.
vector_dim = twitter_vectors.shape[1]
# Bare expression: displayed as the cell output in the notebook.
vector_dim

### Load pre-trained GloVe data (Wiki&Gigaword 100d) into a dictionary

In [None]:
# The GloVe file is parsed twice: column 0 holds the token, columns 1-100
# hold the components of its embedding.
wiki_words = np.loadtxt("GloVeWiki6B/100d.txt", dtype="str", usecols=range(0, 1))
wiki_vectors = np.loadtxt("GloVeWiki6B/100d.txt", dtype="float", usecols=range(1, 101))

# Report the vocabulary size.
print(wiki_words.size)

# Token -> embedding row; if a token occurs twice the later row wins.
wiki_table = dict(zip(wiki_words, wiki_vectors))

In [None]:
# Embedding width = number of columns in the Wiki vector matrix.
# This overwrites the value set from the Twitter vectors in the earlier cell.
vector_dim = wiki_vectors.shape[1]
# Bare expression: displayed as the cell output in the notebook.
vector_dim

### Load recorded raw word transcriptions + head motion data

In [None]:
def _vector_or_zeros(token, check_table, dim):
    """Return the embedding of ``token``, or a ``dim``-long zero vector if unknown."""
    if token in check_table:
        return check_table[token]
    return np.zeros(dim)


def _append_frames(inputs, vector, start, end):
    """Append ``vector`` to ``inputs`` once for every frame in ``[start, end)``."""
    for _ in range(start, end):
        inputs.append(vector)


def pre_process_data(inputs, targets, words, motion_data, time_intervals, check_table):
    """Expand word-level transcriptions into per-frame (embedding, motion) pairs.

    For every word, one row of ``motion_data`` and one embedding vector are
    appended per frame of the word's half-open interval ``[start, end)``, so
    ``inputs`` and ``targets`` grow by the same number of entries.

    Words missing from ``check_table`` are split on the apostrophe: the part
    before it covers the first half of the interval and the apostrophe plus
    the second part (e.g. ``"'t"``) covers the second half.  Any piece that is
    still unknown, and any unknown word without an apostrophe, is represented
    by a zero vector.  Only the first two apostrophe-separated pieces are used.

    Args:
        inputs: list extended in place with one embedding per frame.
        targets: list extended in place with one ``motion_data`` row per frame.
        words: iterable of word strings, aligned with ``time_intervals``.
        motion_data: sequence indexed by frame number.
        time_intervals: per-word ``(start, end)`` frame indices.
        check_table: mapping from token to embedding vector.

    Returns:
        None; ``inputs`` and ``targets`` are mutated.
    """
    # Take the zero-vector length from the table actually passed in rather
    # than from the notebook-global ``vector_dim``, whose value depends on
    # which cell ran last.  The global is only a fallback for an empty table.
    if len(check_table) > 0:
        dim = len(check_table[next(iter(check_table))])
    else:
        dim = vector_dim

    for index, word in enumerate(words):
        start_time = time_intervals[index][0]
        end_time = time_intervals[index][1]

        for t in range(start_time, end_time):
            targets.append(motion_data[t])

        pieces = word.split('\'')
        if word in check_table or len(pieces) <= 1:
            # Known word, or an unknown word that cannot be split any further.
            _append_frames(inputs, _vector_or_zeros(word, check_table, dim),
                           start_time, end_time)
            continue

        # Unknown contraction: give each half of the interval to one piece.
        mid = int((start_time + end_time) / 2)
        _append_frames(inputs, _vector_or_zeros(pieces[0], check_table, dim),
                       start_time, mid)
        _append_frames(inputs, _vector_or_zeros('\'' + pieces[1], check_table, dim),
                       mid, end_time)

In [None]:
# Load Extrovert/Introvert 1-6 into test data
# Load Extrovert/Introvert 7-12 into validation data
# Load Extrovert/Introvert 13-46 into train data

# Pool extrovert recordings 7-12 into a single validation set, using the
# Twitter GloVe table for the word embeddings.
inputs = []
targets = []

for i in range(7, 13):
    # Motion file: skip the 17 header lines and keep the first six columns.
    motion_data = np.loadtxt("ExtrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    # Words file: column 3 is the word, columns 4-5 its start/end frame index.
    time_intervals = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    # "str" is the dtype the GloVe tokens were loaded with, so the words
    # compare equal to the table keys they are looked up against.
    words = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="str")

    # The table is looked up with lower-cased words.
    words = [word.lower() for word in words]

    pre_process_data(inputs, targets, words, motion_data, time_intervals, twitter_table)

In [None]:
# Stack the per-frame lists into arrays (one row per frame).
inputs_array, targets_array = np.array(inputs), np.array(targets)

In [None]:
# Write the pooled validation arrays to one .npz archive under the keys
# 'inputs' and 'targets'.
np.savez('data/Twitter/validation_extro.npz', inputs=inputs_array, targets=targets_array)

In [None]:
# Reload the archive just written and print both shapes as a sanity check.
# The context manager closes the underlying file once the shapes are read.
with np.load('data/Twitter/validation_extro.npz') as trail_d:
    print(trail_d['targets'].shape)
    print(trail_d['inputs'].shape)

In [None]:
# Make 7-12 individual validation data (extrovert recordings, Twitter GloVe)
for i in range(7, 13):
    inputs = []
    targets = []

    # Motion file: skip the 17 header lines and keep the first six columns.
    motion_data = np.loadtxt("ExtrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    # Words file: column 3 is the word, columns 4-5 its start/end frame index.
    time_intervals = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    # "str" is the dtype the GloVe tokens were loaded with, so the words
    # compare equal to the table keys they are looked up against.
    words = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="str")

    print("Test case {0}: input dim: {1}, target dim: {2}".format(i, time_intervals.shape, motion_data.shape))

    # The table is looked up with lower-cased words.
    words = [word.lower() for word in words]

    pre_process_data(inputs, targets, words, motion_data, time_intervals, twitter_table)
    inputs_array = np.array(inputs)
    targets_array = np.array(targets)
    np.savez('data/Twitter/validation{0}_extro.npz'.format(i), inputs=inputs_array, targets=targets_array)

    # Reload what was written and report its shapes; the context manager
    # closes the archive so no file handle is left open per recording.
    with np.load('data/Twitter/validation{0}_extro.npz'.format(i)) as trail_d:
        print("Input dim: ", trail_d['inputs'].shape, "Target dim: ", trail_d['targets'].shape)

In [None]:
# Make 7-12 individual validation data (introvert recordings, Wiki GloVe)
for i in range(7, 13):
    inputs = []
    targets = []

    # Motion file: skip the 17 header lines and keep the first six columns.
    motion_data = np.loadtxt("IntrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    # Words file: column 3 is the word, columns 4-5 its start/end frame index.
    time_intervals = np.loadtxt("IntrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    # "str" is the dtype the GloVe tokens were loaded with, so the words
    # compare equal to the table keys they are looked up against.
    words = np.loadtxt("IntrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="str")

    print("Test case {0}: input dim: {1}, target dim: {2}".format(i, time_intervals.shape, motion_data.shape))

    # The table is looked up with lower-cased words.
    words = [word.lower() for word in words]

    pre_process_data(inputs, targets, words, motion_data, time_intervals, wiki_table)
    inputs_array = np.array(inputs)
    targets_array = np.array(targets)
    np.savez('data/Wiki/validation{0}_intro.npz'.format(i), inputs=inputs_array, targets=targets_array)

    # Reload what was written and report its shapes; the context manager
    # closes the archive so no file handle is left open per recording.
    with np.load('data/Wiki/validation{0}_intro.npz'.format(i)) as trail_d:
        print("Input dim: ", trail_d['inputs'].shape, "Target dim: ", trail_d['targets'].shape)

In [None]:
# Make 1-6 individual test data (extrovert recordings, Twitter GloVe)
for i in range(1, 7):
    inputs = []
    targets = []

    # Motion file: skip the 17 header lines and keep the first six columns.
    motion_data = np.loadtxt("ExtrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    # Words file: column 3 is the word, columns 4-5 its start/end frame index.
    time_intervals = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    # "str" is the dtype the GloVe tokens were loaded with, so the words
    # compare equal to the table keys they are looked up against.
    words = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="str")

    print("Test case {0}: input dim: {1}, target dim: {2}".format(i, time_intervals.shape, motion_data.shape))

    # The table is looked up with lower-cased words.
    words = [word.lower() for word in words]

    pre_process_data(inputs, targets, words, motion_data, time_intervals, twitter_table)
    inputs_array = np.array(inputs)
    targets_array = np.array(targets)
    np.savez('data/Twitter/test{0}_extro.npz'.format(i), inputs=inputs_array, targets=targets_array)

    # Reload what was written and report its shapes; the context manager
    # closes the archive so no file handle is left open per recording.
    with np.load('data/Twitter/test{0}_extro.npz'.format(i)) as trail_d:
        print("Input dim: ", trail_d['inputs'].shape, "Target dim: ", trail_d['targets'].shape)

## Create 300d inputs, using [x_{i-10}, x_i, x_{i+10}]

In [None]:
from __future__ import division, print_function
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial import distance
import csv
%matplotlib inline
from numpy import arange, sin, pi, cos
from scipy.fftpack import fft, dct

In [None]:
# Extrovert training set: widen every 100-d frame to 300-d by concatenating
# the frame 10 steps earlier, the frame itself and the frame 10 steps later.
# Each array is read exactly once (every NpzFile lookup re-reads it from
# disk) and the archive is closed when the `with` block ends.
with np.load('data/Wiki/train_extro.npz') as train_100d:
    inputs_100d = train_100d['inputs']
    targets_100d = train_100d['targets']

length_of_inputs = inputs_100d.shape[0]

# 10 zero frames at each end keep the shifted windows inside the array.
inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
inputs_with_padding[10:-10, :] = inputs_100d

inputs_300d = np.zeros((length_of_inputs, 300))
inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

np.savez('data/Wiki/train_300d_skip{}_extro.npz'.format(10), inputs=inputs_300d, targets=targets_100d)

In [None]:
# Extrovert validation set: widen every 100-d frame to 300-d by concatenating
# the frame 10 steps earlier, the frame itself and the frame 10 steps later.
# Each array is read exactly once (every NpzFile lookup re-reads it from
# disk) and the archive is closed when the `with` block ends.
with np.load('data/Wiki/validation_extro.npz') as validation_100d:
    inputs_100d = validation_100d['inputs']
    targets_100d = validation_100d['targets']

length_of_inputs = inputs_100d.shape[0]

# 10 zero frames at each end keep the shifted windows inside the array.
inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
inputs_with_padding[10:-10, :] = inputs_100d

inputs_300d = np.zeros((length_of_inputs, 300))
inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

np.savez('data/Wiki/validation_300d_skip{}_extro.npz'.format(10), inputs=inputs_300d, targets=targets_100d)

In [None]:
# Extrovert test recordings 1-6: widen every 100-d frame to 300-d by
# concatenating the frame 10 steps earlier, the frame itself and the frame
# 10 steps later.
for i in range(1,7):
    # Read each array once (every NpzFile lookup re-reads it from disk) and
    # close the archive before moving on to the next recording.
    with np.load('data/Wiki/test{}_extro.npz'.format(i)) as test_100d:
        inputs_100d = test_100d['inputs']
        targets_100d = test_100d['targets']

    length_of_inputs = inputs_100d.shape[0]

    # 10 zero frames at each end keep the shifted windows inside the array.
    inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
    inputs_with_padding[10:-10, :] = inputs_100d

    inputs_300d = np.zeros((length_of_inputs, 300))
    inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
    inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
    inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

    np.savez('data/Wiki/test{0}_300d_skip{1}_extro.npz'.format(i, 10), inputs=inputs_300d, targets=targets_100d)

In [None]:
# Extrovert validation recordings 7-12: widen every 100-d frame to 300-d by
# concatenating the frame 10 steps earlier, the frame itself and the frame
# 10 steps later.
for i in range(7,13):
    # Read each array once (every NpzFile lookup re-reads it from disk) and
    # close the archive before moving on to the next recording.
    with np.load('data/Wiki/validation{}_extro.npz'.format(i)) as validation_100d:
        inputs_100d = validation_100d['inputs']
        targets_100d = validation_100d['targets']

    length_of_inputs = inputs_100d.shape[0]

    # 10 zero frames at each end keep the shifted windows inside the array.
    inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
    inputs_with_padding[10:-10, :] = inputs_100d

    inputs_300d = np.zeros((length_of_inputs, 300))
    inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
    inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
    inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

    np.savez('data/Wiki/validation{0}_300d_skip{1}_extro.npz'.format(i, 10), inputs=inputs_300d, targets=targets_100d)

In [None]:
# intro data:

# Introvert training set: widen every 100-d frame to 300-d by concatenating
# the frame 10 steps earlier, the frame itself and the frame 10 steps later.
# Each array is read exactly once (every NpzFile lookup re-reads it from
# disk) and the archive is closed when the `with` block ends.
with np.load('data/Wiki/train_intro.npz') as train_100d:
    inputs_100d = train_100d['inputs']
    targets_100d = train_100d['targets']

length_of_inputs = inputs_100d.shape[0]

# 10 zero frames at each end keep the shifted windows inside the array.
inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
inputs_with_padding[10:-10, :] = inputs_100d

inputs_300d = np.zeros((length_of_inputs, 300))
inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

np.savez('data/Wiki/train_300d_skip{}_intro.npz'.format(10), inputs=inputs_300d, targets=targets_100d)


# Introvert validation set: widen every 100-d frame to 300-d by concatenating
# the frame 10 steps earlier, the frame itself and the frame 10 steps later.
# Each array is read exactly once (every NpzFile lookup re-reads it from
# disk) and the archive is closed when the `with` block ends.
with np.load('data/Wiki/validation_intro.npz') as validation_100d:
    inputs_100d = validation_100d['inputs']
    targets_100d = validation_100d['targets']

length_of_inputs = inputs_100d.shape[0]

# 10 zero frames at each end keep the shifted windows inside the array.
inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
inputs_with_padding[10:-10, :] = inputs_100d

inputs_300d = np.zeros((length_of_inputs, 300))
inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

np.savez('data/Wiki/validation_300d_skip{}_intro.npz'.format(10), inputs=inputs_300d, targets=targets_100d)

# Introvert test recordings 1-6: widen every 100-d frame to 300-d by
# concatenating the frame 10 steps earlier, the frame itself and the frame
# 10 steps later.
for i in range(1,7):
    # Read each array once (every NpzFile lookup re-reads it from disk) and
    # close the archive before moving on to the next recording.
    with np.load('data/Wiki/test{}_intro.npz'.format(i)) as test_100d:
        inputs_100d = test_100d['inputs']
        targets_100d = test_100d['targets']

    length_of_inputs = inputs_100d.shape[0]

    # 10 zero frames at each end keep the shifted windows inside the array.
    inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
    inputs_with_padding[10:-10, :] = inputs_100d

    inputs_300d = np.zeros((length_of_inputs, 300))
    inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
    inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
    inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

    np.savez('data/Wiki/test{0}_300d_skip{1}_intro.npz'.format(i, 10), inputs=inputs_300d, targets=targets_100d)

In [2]:
# Introvert validation recordings 7-12: widen every 100-d frame to 300-d by
# concatenating the frame 10 steps earlier, the frame itself and the frame
# 10 steps later.
for i in range(7,13):
    # Read each array once (every NpzFile lookup re-reads it from disk) and
    # close the archive before moving on to the next recording.
    with np.load('data/Wiki/validation{}_intro.npz'.format(i)) as validation_100d:
        inputs_100d = validation_100d['inputs']
        targets_100d = validation_100d['targets']

    length_of_inputs = inputs_100d.shape[0]

    # 10 zero frames at each end keep the shifted windows inside the array.
    inputs_with_padding = np.zeros((length_of_inputs + 20, inputs_100d.shape[1]))
    inputs_with_padding[10:-10, :] = inputs_100d

    inputs_300d = np.zeros((length_of_inputs, 300))
    inputs_300d[:, 0:100] = inputs_with_padding[:-20, :]      # x_{i-10}
    inputs_300d[:, 100:200] = inputs_with_padding[10:-10, :]  # x_i
    inputs_300d[:, 200:300] = inputs_with_padding[20:, :]     # x_{i+10}

    np.savez('data/Wiki/validation{0}_300d_skip{1}_intro.npz'.format(i, 10), inputs=inputs_300d, targets=targets_100d)