In [1]:
import numpy as np
import pandas as pd
import os
import json
import gc
import tensorflow as tf
from tensorflow import keras

2023-07-10 13:10:16.737541: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-10 13:10:16.780699: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def split_parquet(path, train, drop_rows=False, columns=None, max_size=1560):
    """
    split_parquet takes a path to a parquet file and splits it into multiple numpy array files (.npy)
    It splits it by sequence_id, writing one float32 array per sequence.

    NaN values of each sequence are filled through interpolation (the NaN values that remain
    become 0.0) and the sequence is zero-padded at the end up to max_size rows. Sequences that
    already have max_size rows or more are saved without padding and are not truncated.

    Parameters:
        path (string)    : Path to parquet file
        train (bool)     : Indicates whether the parquet files are for training (True) or are supplemental (False).
                           Selects the output directory: train_split_files or suppl_split_files
        drop_rows (bool) : Whether to drop NA rows (rows that are NaN in every column but the first)
                           before the NaN values are filled. Default: False
        columns (list)   : List of columns to read from the parquet file. Default: None (read every column)
        max_size (int)   : Number of rows each saved sequence is padded up to. Default: 1560

    Returns:
        Max frame count (after padding) of the sequences saved from this parquet file, 0 if there are none
    """
    df = pd.read_parquet(path, columns=columns)

    # Create the directory for training or supplemental numpy array files
    out_dir = "train_split_files" if train else "suppl_split_files"
    if not os.path.isdir(out_dir):
        print(f"{out_dir} does not exists. Creating directory")
        os.makedirs(out_dir, exist_ok=True)
    else:
        print(f"{out_dir} already exists. Skipping directory creation")

    max_length = 0  # The max number of rows found in all the saved sequences

    for name, subset_df in df.groupby(['sequence_id'], dropna=False):
        # Grouping by a one-element list yields 1-tuples on recent pandas and scalars on older versions
        seq_id = name[0] if isinstance(name, tuple) else name

        if drop_rows:
            # Must happen before the NaN filling below, otherwise no NaN row is left to drop.
            # The first column is left out of the all-NaN check
            # (presumably it may hold a non-landmark value such as a frame number -- TODO confirm).
            checked = subset_df.columns[1:] if subset_df.shape[1] > 1 else subset_df.columns
            subset_df = subset_df.dropna(axis=0, subset=checked, how='all')

        # Fill in the NaN values through interpolation, then fill the remaining NaN values with 0.0.
        # Non-inplace calls avoid mutating a slice of the parent dataframe.
        subset_df = subset_df.interpolate().fillna(0.0)

        frames = subset_df.to_numpy(dtype=np.float32)
        num_rows = frames.shape[0]

        if num_rows < max_size:
            # Append rows of zeros so that the sequence has max_size rows.
            # The width comes from the data itself, so columns=None is supported.
            padding = np.zeros((max_size - num_rows, frames.shape[1]), dtype=np.float32)
            frames = np.vstack((frames, padding))

        np.save(f"{out_dir}/{seq_id}", frames)

        max_length = max(max_length, frames.shape[0])

    return max_length

In [3]:
def lev_dist(a, b):
    """
    lev_dist returns the Levenshtein Distance between two strings

    The distance is the minimum number of single-character insertions, deletions
    and substitutions needed to turn a into b.

    Parameters:
        a (string) : First string 
        b (string) : Second string

    Returns:
        Levenshtein Distance (int) 
    """
    rows, cols = len(a) + 1, len(b) + 1

    # int64 instead of int8: an int8 matrix overflows as soon as a distance exceeds 127
    distance_matrix = np.zeros((rows, cols), dtype=np.int64)

    # Distance from/to the empty prefix is the prefix length
    distance_matrix[:, 0] = np.arange(rows)
    distance_matrix[0, :] = np.arange(cols)

    for i in range(1, rows):
        for j in range(1, cols):
            if a[i - 1] == b[j - 1]:
                # Matching characters cost nothing
                distance_matrix[i, j] = distance_matrix[i - 1, j - 1]
            else:
                # Cheapest of insertion, deletion and substitution, plus one edit
                distance_matrix[i, j] = 1 + min(
                    distance_matrix[i, j - 1],
                    distance_matrix[i - 1, j],
                    distance_matrix[i - 1, j - 1],
                )

    return int(distance_matrix[-1, -1])

In [None]:
# Selected landmark columns: the x and y coordinates of points 0..20 of the
# right hand, followed by the same for the left hand (84 column names).
SEL_FEATURES = [
    f"{axis}_{hand}_hand_{point}"
    for hand in ("right", "left")
    for point in range(21)
    for axis in ("x", "y")
]

gc.enable()

# Split every training parquet file and keep the largest frame count reported
files = os.listdir("parquet_files/train_landmarks")
max_size_train = 0
for file_name in files:
    temp_size = split_parquet(
        path=f"parquet_files/train_landmarks/{file_name}",
        train=True,
        columns=SEL_FEATURES,
    )
    max_size_train = max(max_size_train, temp_size)

# Same for the supplemental parquet files
files = os.listdir("parquet_files/supplemental_landmarks")
max_size_suppl = 0
for file_name in files:
    temp_size = split_parquet(
        path=f"parquet_files/supplemental_landmarks/{file_name}",
        train=False,
        columns=SEL_FEATURES,
    )
    max_size_suppl = max(max_size_suppl, temp_size)

In [5]:
# Report the largest frame counts returned while splitting the parquet files
print(f"Max length of train files: {max_size_train}")
print(f"Max length of supplemental files: {max_size_suppl}")

Max length of train files: 1560
Max length of supplemental files: 1560


In [6]:
# Load one saved sequence and display the shape of the input data
sample_path = "train_split_files/71095.npy"
the_data = np.load(sample_path, allow_pickle=False)
the_data.shape

(1560, 84)

In [2]:
def one_hot_coder(phrase=None, max_seq_len=None, lookup_dict=None, dict_len=None):
    """
    one_hot_coder encodes a phrase as a flat float32 vector of length dict_len * max_seq_len

    The vector is made of max_seq_len consecutive slots of dict_len entries. For the
    character at position p, the entry lookup_dict[character] of slot p is set to
    1 / max_seq_len; every other entry stays 0.

    Parameters:
        phrase (string)    : Phrase to encode
        max_seq_len (int)  : Number of character slots in the vector
        lookup_dict (dict) : Maps each character to its index inside a slot
        dict_len (int)     : Number of entries per slot

    Returns:
        Encoded phrase (1D numpy array, float32)
    """
    encoded = np.zeros(dict_len * max_seq_len, dtype=np.float32)
    weight = 1 / max_seq_len  # value stored for every character that is present

    for position, char in enumerate(phrase):
        slot_start = position * dict_len
        encoded[slot_start + lookup_dict[char]] = weight

    return encoded


In [7]:
def one_hot_decoder(vec=None, max_seq_len=None, lookup_dict=None, dict_len=None):
    """
    one_hot_decoder turns a flat vector of length dict_len * max_seq_len back into a string

    The vector is viewed as max_seq_len slots of dict_len entries. The index of the
    largest entry of every slot is mapped back to a character through the inverse of
    lookup_dict, and trailing whitespace is stripped from the joined result.

    Parameters:
        vec (array-like)   : Flat vector to decode
        max_seq_len (int)  : Number of character slots in the vector
        lookup_dict (dict) : Maps each character to its index inside a slot
        dict_len (int)     : Number of entries per slot

    Returns:
        Decoded phrase (string)
    """
    index_to_char = {index: char for char, index in lookup_dict.items()}

    slots = np.asarray(vec).reshape(max_seq_len, dict_len)
    best_per_slot = slots.argmax(axis=1)

    decoded = ""
    for index in best_per_slot:
        decoded += index_to_char[index]

    return decoded.rstrip()


In [None]:
# ONE HOT CODER FUNCTION TESTING
#one_hot_decoder(one_hot_coder("hello my name is aaron"))

In [4]:
# Train.csv preprocessing: attach a flat one-hot encoding to every phrase
with open('character_to_prediction_index.json') as f:
    char2pred = json.load(f)  # dictionary used as the character lookup

train_csv = pd.read_csv('train.csv')
phrases = train_csv["phrase"].values

# The longest phrase sets the number of character slots of every encoding
max_len = max(len(phrase) for phrase in phrases)

train_csv["one_hot"] = [
    one_hot_coder(
        phrase=phrase,
        max_seq_len=max_len,
        lookup_dict=char2pred,
        dict_len=len(char2pred),
    )
    for phrase in phrases
]



In [3]:
# Load every saved sequence array found in testing_data into one batch.
# sorted() makes the sample order reproducible: os.listdir returns names in arbitrary order.
# NOTE(review): the labels are built from testing.csv in row order -- confirm that
# the rows of testing.csv follow the same order as the sorted file names.
files = sorted(os.listdir("testing_data"))
a = np.asarray([
    np.load(f"testing_data/{file_name}", allow_pickle=False)
    for file_name in files
])

# Add a trailing channel axis: (samples, 1560, 84) -> (samples, 1560, 84, 1).
# -1 lets numpy infer the sample count instead of hard-coding 10 files.
training_images = a.reshape(-1, 1560, 84, 1)

In [4]:
with open('character_to_prediction_index.json') as f:
    char2pred = json.load(f) # Loads in a dictionary

training_csv = pd.read_csv('testing.csv')
phrases = training_csv["phrase"].values
max_len = max([len(x) for x in phrases])

# Number of character slots per label. The previously hard-coded label width of 1829
# corresponds to 59 characters * 31 slots.
MAX_SEQ_LEN = 31
if max_len > MAX_SEQ_LEN:
    # one_hot_coder would otherwise fail with an IndexError in the middle of the loop
    raise ValueError(f"Longest phrase has {max_len} characters, more than the {MAX_SEQ_LEN} slots available")

# One label row per phrase; the shape is derived from the data instead of hard-coding (10, 1829)
array_labels = np.zeros((len(phrases), len(char2pred) * MAX_SEQ_LEN), dtype=np.float32)
for i, x in enumerate(phrases):
    array_labels[i] = one_hot_coder(phrase=x, max_seq_len=MAX_SEQ_LEN, lookup_dict=char2pred, dict_len=len(char2pred))

In [25]:
# Minimal fully-connected baseline: flatten every sample, one small hidden layer,
# then one output unit per entry of the flat label vector (1829 entries).
model = keras.Sequential([
    # The declared input shape includes the trailing channel axis so that it matches
    # training_images, which is reshaped to (samples, 1560, 84, 1).
    keras.layers.Flatten(input_shape=(1560, 84, 1)),
    keras.layers.Dense(10, activation=tf.nn.relu),
    keras.layers.Dense(1829, activation=tf.nn.softmax)
])

model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

model.fit(training_images, array_labels, epochs=1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x7f0aa84e9540>

In [None]:
# Show the first four entries of the first phrase's encoding.
# train_csv is deliberately NOT re-read from train.csv here: a fresh read has no
# "one_hot" column (it is only added in memory), so the lookup would raise a KeyError.
for i in range(4):
    print(train_csv["one_hot"][0][i])