In [3]:
import numpy as np
import pandas as pd
import os

In [18]:
def split_parquet(path, drop_rows=False, output_dir="split_files"):
    """
    split_parquet takes a path to a parquet file and splits it into multiple numpy array files (.npy)
    It splits it by sequence_id: one file per group, saved as <output_dir>/<sequence_id>.npy

    Parameters:
        path (string)       : Path to parquet file
        drop_rows (bool)    : Whether to drop rows in which every column except the first
                              is still NA after interpolation. Default: False
        output_dir (string) : Directory the .npy files are written to. It is created
                              (including missing parents) if needed. Default: "split_files"

    Returns:
        Nothing
    """

    df = pd.read_parquet(path)

    # Get rid of face and pose data.
    # The slices are positional and are applied one after another, so each one
    # refers to the column layout left behind by the previous drop.
    # NOTE(review): the final slice removes 42 further columns beyond the
    # face/pose ranges -- presumably the remaining z coordinates; confirm
    # against the parquet schema.
    positional_drops = (
        slice(1, 469),
        slice(22, 55),
        slice(43, 511),
        slice(64, 97),
        slice(85, 553),
        slice(106, 139),
        slice(85, 127),
    )
    for column_slice in positional_drops:
        df = df.drop(columns=df.columns[column_slice])

    grouped_df = df.groupby(['sequence_id'], dropna=False)

    # Report what actually happens instead of announcing creation up front and
    # hiding every OS error behind a bare except.
    if os.path.isdir(output_dir):
        print(f"{output_dir} already exists. Skipping directory creation")
    else:
        print(f"{output_dir} does not exists. Creating directory")
        os.makedirs(output_dir, exist_ok=True)
    print(f"Splitting parquet file and saving in {output_dir}")

    for name, subset_df in grouped_df:
        # Grouping by a one-element list yields a 1-tuple key on recent pandas
        # and a scalar key on older releases; accept both.
        sequence_id = name[0] if isinstance(name, tuple) else name

        # Build new frames rather than mutating the group handed out by groupby.
        cleaned = subset_df.dropna(axis=1, how='all').interpolate()
        if drop_rows:
            cleaned = cleaned.dropna(axis=0, subset=cleaned.columns[1:], how='all')

        # np.save appends the ".npy" extension itself.
        np.save(f"{output_dir}/{sequence_id}", cleaned.to_numpy())


In [13]:
def lev_dist(a, b):
    """
    lev_dist returns the Levenshtein Distance between two strings

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn a into b.

    Parameters:
        a (string) : First string 
        b (string) : Second string

    Returns:
        Levenshtein Distance (int) 
    """

    # Plain Python integers are used on purpose: a fixed-width int8 table
    # cannot represent distances above 127.
    # Only two rows of the table are ever needed at once.
    previous_row = list(range(len(b) + 1))

    for i, char_a in enumerate(a, start=1):
        # Column 0: turning the first i characters of a into "" costs i deletions.
        current_row = [i]
        for j, char_b in enumerate(b, start=1):
            if char_a == char_b:
                # Matching characters cost nothing extra.
                current_row.append(previous_row[j - 1])
            else:
                # Cheapest of insertion, deletion and substitution, plus one edit.
                current_row.append(
                    min(current_row[j - 1], previous_row[j], previous_row[j - 1]) + 1
                )
        previous_row = current_row

    return previous_row[-1]

In [19]:
# Run the splitter once for every entry found in the parquet_files directory.
files = os.listdir("parquet_files")
for file_name in files:
    parquet_path = f"parquet_files/{file_name}"
    split_parquet(path=parquet_path)

split_files does not exists. Creating directory
Splitting parquet file and saving in split_files
