In [17]:
import numpy as np
import pandas as pd
import os

In [18]:
def split_parquet(path, train, drop_rows=False, columns=None):
    """
    split_paraquet takes a path to a parquet file and splits it into multiple numpy array files (.npy)
    It splits it by sequence_id

    Parameters:
        path (string)    : Path to parquet file 
        train (bool)     : Indicates whether the parquet files are for training (True) or are supplemental (False)
        drop_rows (bool) : Whether to drop NA rows or not. Default: False
        columns (list)   : List of columns t

    Returns:
        Max frame count for this parquet file
    """
    max_length = 0
    df = pd.read_parquet(path,columns=columns)
    # Get rid of face and pose data
    #df.drop(df.columns[1:469], axis=1, inplace=True)
    # df.drop(df.columns[22:55], axis=1, inplace=True)
    # df.drop(df.columns[43:511], axis=1, inplace=True)
    # df.drop(df.columns[64:97], axis=1, inplace=True)
    # df.drop(df.columns[85:553], axis=1, inplace=True)
    # df.drop(df.columns[106:139], axis=1, inplace=True)
    # df.drop(df.columns[85:127], axis=1, inplace=True)

    grouped_df = df.groupby(['sequence_id'], dropna=False)

    # try:
    #     print("split_files does not exists. Creating directory")
    #     os.mkdir("split_files")
    # except:
    #     print("split_files already exists. Skipping directory creation")
    # finally:
    #     print("Splitting parquet file and saving in split_files")

    if(train):
        if (not os.path.isdir("train_split_files")):
            print("train_split_files does not exists. Creating directory")
            os.mkdir("train_split_files")
        else:
           print("train_split_files already exists. Skipping directory creation") 
    else:
        if (not os.path.isdir("suppl_split_files")):
            print("suppl_split_files does not exists. Creating directory")
            os.mkdir("suppl_split_files")
        else:
           print("suppl_split_files already exists. Skipping directory creation") 

    for name, subset_df in grouped_df:
        subset_df.dropna(axis=1, how='all', inplace=True)
        subset_df.interpolate(inplace=True)

        if(subset_df.count(axis=0)[0] >= max_length):
            max_length = subset_df.count(axis=0).iloc[0]
        

        while(subset_df.count(axis=0).iloc[0] < 784):
            # zeros_row = pd.Series(np.zeros(subset_df.count(axis=1, numeric_only=True).iloc[0]), name=name)
            # subset_df = subset_df.append(zeros_row)
            subset_df.loc[len(subset_df)] = 0
            last = subset_df.index[-1]
            subset_df = subset_df.rename(index={last: str(name)})

        # print(subset_df.count(axis=1, numeric_only=True).iloc[0])
        
        if(drop_rows):
            subset_df.dropna(axis=0, subset=subset_df.columns[1:], how='all', inplace=True)

        # print(subset_df)

        if (train):
            np.save(f"train_split_files/{name}",subset_df.to_numpy())
        else:
            np.save(f"suppl_split_files/{name}", subset_df.to_numpy())
        
    return max_length

In [19]:
def lev_dist(a, b):
    """
    lev_dist returns the Levenshtein Distance between two strings

    Parameters:
        a (string) : First string 
        b (string) : Second string

    Returns:
        Levenshtein Distance (int) 
    """

    distance_matrix = np.zeros((len(a)+1, len(b)+1), np.int8)
    
    for i in range(1, len(a)+1):
        distance_matrix[i][0] = i
    
    for i in range(1, len(b)+1):
        distance_matrix[0][i] = i

    for i in range(1, len(a)+1):
        for j in range(1, len(b) + 1):

            if(a[i - 1] == b[j-1]):
                distance_matrix[i][j] = distance_matrix[i-1][j-1]
            else:
                distance_matrix[i][j] = min(distance_matrix[i][j - 1], distance_matrix[i - 1][j], distance_matrix[i - 1][j - 1]) + 1


    return distance_matrix[-1][-1] 

In [20]:
SEL_FEATURES = ['x_right_hand_0','y_right_hand_0',
                'x_right_hand_1','y_right_hand_1',
                'x_right_hand_2','y_right_hand_2',
                'x_right_hand_3','y_right_hand_3',
                'x_right_hand_4','y_right_hand_4',
                'x_right_hand_5','y_right_hand_5',
                'x_right_hand_6','y_right_hand_6',
                'x_right_hand_7','y_right_hand_7',
                'x_right_hand_8','y_right_hand_8',
                'x_right_hand_9','y_right_hand_9',
                'x_right_hand_10','y_right_hand_10',
                'x_right_hand_11','y_right_hand_11',
                'x_right_hand_12','y_right_hand_12',
                'x_right_hand_13','y_right_hand_13',
                'x_right_hand_14','y_right_hand_14',
                'x_right_hand_15','y_right_hand_15',
                'x_right_hand_16','y_right_hand_16',
                'x_right_hand_17','y_right_hand_17',
                'x_right_hand_18','y_right_hand_18',
                'x_right_hand_19','y_right_hand_19',
                'x_right_hand_20','y_right_hand_20',
                'x_left_hand_0','y_left_hand_0',
                'x_left_hand_1','y_left_hand_1',
                'x_left_hand_2','y_left_hand_2',
                'x_left_hand_3','y_left_hand_3',
                'x_left_hand_4','y_left_hand_4',
                'x_left_hand_5','y_left_hand_5',
                'x_left_hand_6','y_left_hand_6',
                'x_left_hand_7','y_left_hand_7',
                'x_left_hand_8','y_left_hand_8',
                'x_left_hand_9','y_left_hand_9',
                'x_left_hand_10','y_left_hand_10',
                'x_left_hand_11','y_left_hand_11',
                'x_left_hand_12','y_left_hand_12',
                'x_left_hand_13','y_left_hand_13',
                'x_left_hand_14','y_left_hand_14',
                'x_left_hand_15','y_left_hand_15',
                'x_left_hand_16','y_left_hand_16',
                'x_left_hand_17','y_left_hand_17',
                'x_left_hand_18','y_left_hand_18',
                'x_left_hand_19','y_left_hand_19',
                'x_left_hand_20','y_left_hand_20']


files = os.listdir("parquet_files/train_landmarks")
max_size_train = 0
max_size_suppl = 0

for file_name in files:
     temp_size = split_parquet(path=f"parquet_files/train_landmarks/{file_name}", train = True, columns = SEL_FEATURES)
     
     if(temp_size >= max_size_train):
          max_size_train = temp_size

files = os.listdir("parquet_files/supplemental_landmarks")
for file_name in files:
     temp_size = split_parquet(path=f"parquet_files/supplemental_landmarks/{file_name}", train = False, columns = SEL_FEATURES)
     
     if(temp_size >= max_size_suppl):
          max_size_suppl = temp_size

print("Max length of train files: {}".format(max_size_train))
print("Max length of supplemental files: {}".format(max_size_suppl))

train_split_files already exists. Skipping directory creation


  for name, subset_df in grouped_df:


             x_right_hand_0  y_right_hand_0  x_right_hand_1  y_right_hand_1  \
sequence_id                                                                   
765523541          0.271300        0.823856        0.362784        0.808182   
765523541          0.253725        0.833073        0.352714        0.815525   
765523541          0.268294        0.810972        0.365343        0.805956   
765523541          0.286671        0.811448        0.373863        0.802811   
765523541          0.262001        0.818015        0.366714        0.799401   
...                     ...             ...             ...             ...   
765523541          0.000000        0.000000        0.000000        0.000000   
765523541          0.000000        0.000000        0.000000        0.000000   
765523541          0.000000        0.000000        0.000000        0.000000   
765523541          0.000000        0.000000        0.000000        0.000000   
765523541          0.000000        0.000000        0

KeyboardInterrupt: 

In [None]:
print("Max length of train files: {}".format(max_size_train))
print("Max length of supplemental files: {}".format(max_size_suppl))

Max length of train files: 784
Max length of supplemental files: 620
