# Pre-process raw data into .npz files

In [1]:
from __future__ import division, print_function # Makes division and printing work like python 3 (we're using 2)
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial import distance
import csv
%matplotlib inline
from numpy import arange, sin, pi, cos
from scipy.fftpack import fft, dct

### Load pre-trained GloVe data (Twitter 25d) into a dictionary

In [2]:
# Embedding matrix: columns 1-25 of every row of the GloVe file, parsed as
# floats.
check_vectors = np.loadtxt(
    "GloVeTwitter27B/25d.txt",
    dtype="float",
    usecols=list(range(1, 26)),
)

# Vocabulary: column 0 of every row of the same file, kept as strings, so
# check_words[i] is the token for row i of check_vectors.
check_words = np.loadtxt(
    "GloVeTwitter27B/25d.txt",
    dtype='string',
    usecols=[0],
)

In [5]:
# Map every GloVe token to its embedding row. If a token occurs more than
# once in check_words, the last occurrence wins.
check_table = {}
for row, token in enumerate(check_words):
    check_table[token] = check_vectors[row]

### Load recorded raw word transcriptions + head motion data

In [9]:
# Head-motion recording: skip the first 16 lines of the .rov file and keep
# columns 0-5 of every remaining row.
motion_data = np.loadtxt(
    "RawMotionData/Adam_06_n.rov",
    skiprows=16,
    usecols=list(range(0, 6)),
)

# Word table: column 3 holds the transcribed word, columns 4 and 5 hold an
# integer pair that is used further down as the (start, end) index range
# into motion_data.
words = np.loadtxt(
    "RawWordTables/Adam_06_n",
    dtype="string",
    usecols=[3],
)
time_intervals = np.loadtxt(
    "RawWordTables/Adam_06_n",
    dtype="int",
    usecols=list(range(4, 6)),
)

In [10]:
# Lower-case every transcribed word, writing the result back into the
# existing `words` array (the later table lookups use these lower-cased
# strings).
words[:] = np.char.lower(words)

In [76]:
def _vector_or_zeros(token, table, dim):
    """Return ``table[token]`` if present, else a zero vector of length ``dim``."""
    if token in table:
        return table[token]
    # treat unknown words as 0 vectors
    return np.zeros(dim)


def _word_segments(word, start_time, end_time, table, dim):
    """Describe which vector covers which part of a word's frame interval.

    Returns a list of ``(first, last, vector)`` tuples; ``vector`` is meant to
    be emitted once for every ``t`` in ``range(first, last)``.

    * A word found in ``table`` gets its vector for the whole interval.
    * An unknown word without an apostrophe gets a zero vector for the whole
      interval.
    * An unknown word containing an apostrophe is cut at the first apostrophe
      into a head (``"don"``) and a tail that keeps the apostrophe (``"'t"``).
      The head covers the first half of the interval and the tail the second
      half; each half independently falls back to zeros when its piece is
      not in ``table``. Anything after a second apostrophe is ignored.
    """
    if word in table:
        return [(start_time, end_time, table[word])]

    pieces = word.split('\'')
    if len(pieces) <= 1:
        return [(start_time, end_time, np.zeros(dim))]

    head = pieces[0]
    tail = '\'' + pieces[1]
    mid = int((start_time + end_time) / 2)
    return [
        (start_time, mid, _vector_or_zeros(head, table, dim)),
        (mid, end_time, _vector_or_zeros(tail, table, dim)),
    ]


# Build frame-aligned training pairs: for every frame index t covered by a
# word, `outputs` receives motion_data[t] and `inputs` receives the word's
# embedding (or a zero vector), so both lists grow by the same amount.
inputs = []
outputs = []
embedding_dim = check_vectors.shape[1]

for index, word in enumerate(words):
    start_time, end_time = time_intervals[index]

    outputs.extend(motion_data[t] for t in range(start_time, end_time))

    segments = _word_segments(word, start_time, end_time,
                              check_table, embedding_dim)
    for first, last, vector in segments:
        # Repeat via a generator rather than ``[vector] * n``: the bounds
        # may be NumPy integers, and ``list * np.int64`` is turned into an
        # element-wise array multiplication instead of list repetition.
        inputs.extend(vector for _ in range(first, last))

In [82]:
# Convert the collected per-frame lists into NumPy arrays (one row per
# collected frame).
inputs_array, outputs_array = np.array(inputs), np.array(outputs)

In [88]:
# Write both arrays into a single .npz archive under the keys 'inputs' and
# 'targets'.
np.savez(
    'data/train.npz',
    inputs=inputs_array,
    targets=outputs_array,
)

In [91]:
# Reload the archive that was just written and display the shape of the
# stored 'targets' array as a quick sanity check.
trail_d = np.load('data/train.npz')
np.shape(trail_d['targets'])

(12235, 6)