In [30]:
import numpy as np
import pandas as pd
import os

In [35]:
def split_parquet(path, drop_rows=False, columns=None):
    """
    split_parquet takes a path to a parquet file and splits it into multiple numpy array files (.npy).
    It splits it by sequence_id and saves one array per sequence as split_files/<sequence_id>.npy,
    relative to the current working directory (the directory is created when missing).

    For every sequence, columns that are entirely NA are dropped and the remaining NA values
    are filled with DataFrame.interpolate() (pandas defaults) before the array is saved.

    Parameters:
        path (string)    : Path to parquet file
        drop_rows (bool) : Whether to drop rows that are NA in every column except the first. Default: False
        columns (list)   : List of columns to read from the parquet file. Default: None (read all columns)

    Returns:
        Max frame count (int) over the sequences of this parquet file, measured before
        any row is dropped. 0 when the file contains no rows.
    """
    max_length = 0
    df = pd.read_parquet(path, columns=columns)

    # Group on the scalar key rather than a one-element list: the group name is then the
    # bare sequence_id on every pandas version (a list key yields 1-tuples only on pandas >= 2.0).
    grouped_df = df.groupby('sequence_id', dropna=False)

    if os.path.isdir("split_files"):
        print("split_files already exists. Skipping directory creation")
    else:
        print("split_files does not exists. Creating directory")
        # Real failures (e.g. permissions) now surface here instead of being swallowed.
        os.makedirs("split_files", exist_ok=True)
    print("Splitting parquet file and saving in split_files")

    for name, subset_df in grouped_df:
        # Build new frames instead of mutating the group slice in place.
        subset_df = subset_df.dropna(axis=1, how='all').interpolate()

        # len() is the number of rows (frames); DataFrame.count is a method, not a number.
        max_length = max(max_length, len(subset_df))

        if drop_rows:
            # NOTE(review): the first remaining column is excluded from the NA check,
            # exactly as before - confirm this is still intended for the selected columns.
            subset_df = subset_df.dropna(axis=0, subset=subset_df.columns[1:], how='all')

        # np.save appends the .npy extension to the file name.
        np.save(f"split_files/{name}", subset_df.to_numpy())

    return max_length



In [32]:
def lev_dist(a, b):
    """
    lev_dist returns the Levenshtein Distance between two strings, i.e. the minimum
    number of single-character insertions, deletions and substitutions turning a into b.

    Parameters:
        a (string) : First string
        b (string) : Second string

    Returns:
        Levenshtein Distance (int)
    """

    # Only the previous row of the distance table is needed to compute the next one.
    # Plain Python ints are used so that long inputs cannot overflow a fixed-width
    # dtype (an int8 table breaks as soon as a distance exceeds 127).
    previous_row = list(range(len(b) + 1))

    for i, char_a in enumerate(a, start=1):
        # Column 0: turning the first i characters of a into "" takes i deletions.
        current_row = [i]

        for j, char_b in enumerate(b, start=1):
            if char_a == char_b:
                # Matching characters cost nothing: carry the diagonal value over.
                current_row.append(previous_row[j - 1])
            else:
                # Cheapest of insertion, deletion and substitution, plus one edit.
                current_row.append(
                    min(current_row[j - 1], previous_row[j], previous_row[j - 1]) + 1
                )

        previous_row = current_row

    return previous_row[-1]

In [34]:
# Column names to load: the x and y coordinate of landmarks 0-20, first for the right
# hand and then for the left hand, ordered x_<hand>_hand_<n>, y_<hand>_hand_<n>.
SEL_FEATURES = [
    f"{axis}_{hand}_hand_{landmark}"
    for hand in ("right", "left")
    for landmark in range(21)
    for axis in ("x", "y")
]




# Split every entry of parquet_files/ and keep the largest frame count that
# split_parquet reports; the leading 0 is the result for an empty directory.
files = os.listdir("parquet_files")

max_size = max(
    [0]
    + [
        split_parquet(path=f"parquet_files/{file_name}", columns=SEL_FEATURES)
        for file_name in files
    ]
)

print(max_size)


Unnamed: 0_level_0,x_right_hand_0,y_right_hand_0,x_right_hand_1,y_right_hand_1,x_right_hand_2,y_right_hand_2,x_right_hand_3,y_right_hand_3,x_right_hand_4,y_right_hand_4,...,x_left_hand_16,y_left_hand_16,x_left_hand_17,y_left_hand_17,x_left_hand_18,y_left_hand_18,x_left_hand_19,y_left_hand_19,x_left_hand_20,y_left_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
571283026,0.082548,0.852293,0.188519,0.852344,0.260271,0.836757,0.324864,0.833374,0.400065,0.820683,...,,,,,,,,,,
571283026,,,,,,,,,,,...,,,,,,,,,,
571283026,,,,,,,,,,,...,,,,,,,,,,
571283026,,,,,,,,,,,...,,,,,,,,,,
571283026,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610093577,0.136303,0.635539,0.195181,0.646167,0.283071,0.635409,0.340649,0.616554,0.360180,0.588900,...,,,,,,,,,,
610093577,0.128935,0.640800,0.194135,0.649716,0.283852,0.637947,0.340892,0.618103,0.365880,0.590945,...,,,,,,,,,,
610093577,0.123598,0.648160,0.192160,0.655673,0.279835,0.644255,0.339039,0.626247,0.366412,0.599580,...,,,,,,,,,,
610093577,0.124527,0.655196,0.199336,0.661683,0.286930,0.649429,0.346276,0.630742,0.375924,0.603721,...,,,,,,,,,,
