# Pre-process raw data into .npz files

In [1]:
from __future__ import division, print_function # Makes division and printing work like python 3 (we're using 2)
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial import distance
import csv
%matplotlib inline
from numpy import arange, sin, pi, cos
from scipy.fftpack import fft, dct

### Load pre-trained GloVe data (Twitter 100d) into a dictionary

In [2]:
twitter_words = np.loadtxt("GloVeTwitter27B/100d.txt", usecols=range(0, 1), dtype = "string")

twitter_vectors = np.loadtxt("GloVeTwitter27B/100d.txt", usecols=range(1, 101), dtype = "float")

twitter_table = {}
for i in range(twitter_words.size):
    twitter_table[twitter_words[i]] = twitter_vectors[i]

In [3]:
vector_dim = twitter_vectors.shape[1]
vector_dim

100

### Load pre-trained GloVe data (Wiki&Gigaword 100d) into a dictionary

In [None]:
# wiki_words = np.loadtxt("GloVeWiki6B/100d.txt", usecols=range(0, 1), dtype = "string")

# wiki_vectors = np.loadtxt("GloVeWiki6B/100d.txt", converters = {(1,101): lambda s: float(s.strip() or 0)}, 
#                           usecols=range(1, 101), dtype = "float")

# wiki_table = {}
# print(wiki_words.size)
# for i in range(wiki_words.size):
#     wiki_table[wiki_words[i]] = wiki_vectors[i]

### Load recorded raw word transcriptions + head motion data

In [5]:
def pre_process_data(inputs, targets, words, motion_data, time_intervals, check_table):

    for index, word in enumerate(words):
        start_time = time_intervals[index][0]
        end_time = time_intervals[index][1]

        for t in range(start_time, end_time):
            targets.append(motion_data[t])

        if word in check_table:
            for t in range(start_time, end_time):
                inputs.append(check_table[word])
        else:
            #print(word)
            word_split = word.split('\'')
            if len(word_split) <= 1:
                for t in range(start_time, end_time):
                    # treat unknown words as 0 vectors
                    inputs.append(np.zeros(vector_dim))
            else:
                word_split[1] = '\'' + word_split[1]
                mid = int((start_time + end_time) / 2)
                if word_split[0] in check_table:
                    for t in range(start_time, mid):
                        inputs.append(check_table[word_split[0]])
                    if word_split[1] in check_table:
                        for t in range(mid, end_time):
                            inputs.append(check_table[word_split[1]])
                    else:
                        for t in range(mid, end_time):
                            inputs.append(np.zeros(vector_dim))
                else:
                    for t in range(start_time, mid):
                        inputs.append(np.zeros(vector_dim))
                    if word_split[1] in check_table:
                        for t in range(mid, end_time):
                            inputs.append(check_table[word_split[1]])
                    else:
                        for t in range(mid, end_time):
                            inputs.append(np.zeros(vector_dim))

In [14]:
# Load Extrovert/Introvert 1-6 into test data
# Load Extrovert/Introvert 7-12 into validation data
# Load Extrovert/Introvert 13-46 into train data

inputs = []
targets = []

for i in range(1, 6):
    motion_data = np.loadtxt("ExtrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    time_intervals = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    words = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="string")

    for index, word in enumerate(words):
        words[index] = word.lower()
        
    pre_process_data(inputs, targets, words, motion_data, time_intervals, twitter_table)

In [15]:
inputs_array = np.array(inputs)
targets_array = np.array(targets)

In [16]:
np.savez('data/Twitter/test_extro.npz', inputs=inputs_array, targets=targets_array)

In [17]:
trail_d = np.load('data/Twitter/test_extro.npz')
print(trail_d['targets'].shape)
print(trail_d['inputs'].shape)

(15462, 6)
(15462, 100)


In [23]:
# Make 1-6 individual test data
for i in range(1, 7):
    inputs = []
    targets = []
    
    motion_data = np.loadtxt("ExtrovertRawData/Motion/{0}.rov".format(i), skiprows=17, usecols=range(0, 6))
    time_intervals = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(4, 6), dtype="int")
    words = np.loadtxt("ExtrovertRawData/Words/{0}".format(i), usecols=range(3, 4), dtype="string")

    print("Test case {0}: input dim: {1}, target dim: {2}".format(i, time_intervals.shape, motion_data.shape))
    
    for index, word in enumerate(words):
        words[index] = word.lower()
        
    pre_process_data(inputs, targets, words, motion_data, time_intervals, twitter_table)
    inputs_array = np.array(inputs)
    targets_array = np.array(targets)
    np.savez('data/Twitter/test{0}_extro.npz'.format(i), inputs=inputs_array, targets=targets_array)
    trail_d = np.load('data/Twitter/test{0}_extro.npz'.format(i))
    print("Input dim: ", trail_d['inputs'].shape, "Target dim: ", trail_d['targets'].shape)

Test case 1: input dim: (562, 2), target dim: (31467, 6)
Input dim:  (13462, 100) Target dim:  (13462, 6)
Test case 2: input dim: (719, 2), target dim: (31983, 6)
Input dim:  (15557, 100) Target dim:  (15557, 6)
Test case 3: input dim: (607, 2), target dim: (35331, 6)
Input dim:  (15462, 100) Target dim:  (15462, 6)
Test case 4: input dim: (806, 2), target dim: (30155, 6)
Input dim:  (17944, 100) Target dim:  (17944, 6)
Test case 5: input dim: (578, 2), target dim: (31798, 6)
Input dim:  (14660, 100) Target dim:  (14660, 6)
Test case 6: input dim: (938, 2), target dim: (34518, 6)
Input dim:  (19162, 100) Target dim:  (19162, 6)
