In [35]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking

In [25]:
words = os.listdir("words")

for word in words:
    print(f"{word}: {len(os.listdir(f'words/{word}'))}")


after: 347


In [26]:
file_names = os.listdir("words/after")


file_names

['1044729798.parquet',
 '1052878223.parquet',
 '1056438401.parquet',
 '106564566.parquet',
 '1078150780.parquet',
 '1094304937.parquet',
 '1097667387.parquet',
 '1098175348.parquet',
 '1104271845.parquet',
 '1114889620.parquet',
 '1117500362.parquet',
 '111963023.parquet',
 '1120375122.parquet',
 '1180922957.parquet',
 '119913707.parquet',
 '1204695936.parquet',
 '1242836183.parquet',
 '124296714.parquet',
 '1264815245.parquet',
 '1271728954.parquet',
 '1282295278.parquet',
 '1289995927.parquet',
 '1304860276.parquet',
 '1317336076.parquet',
 '1319436014.parquet',
 '1341475784.parquet',
 '1348400183.parquet',
 '1361395782.parquet',
 '1375340229.parquet',
 '137646346.parquet',
 '1381884726.parquet',
 '1388244940.parquet',
 '1392496150.parquet',
 '1395996631.parquet',
 '1401874542.parquet',
 '1403988076.parquet',
 '1416799858.parquet',
 '1432112256.parquet',
 '1434029916.parquet',
 '1444373558.parquet',
 '1444677243.parquet',
 '1453596488.parquet',
 '1468366744.parquet',
 '1478777756.par

In [29]:
dataframes = []
max_length = 0

hand_map = {"left_hand": 0, "right_hand": 1}

for word in words:
    file_names = os.listdir(f"words/{word}")[:100]
    index = words.index(word)

    # Limit to just 100 files
    for file_name in file_names:
        # Convert parquet to dataframe  
        df = pd.read_parquet(f"words/{word}/{file_name}")

        # Remove pose and face data, remove miscellaneous row_id column
        df = df[df["type"] != "pose"]
        df = df[df["type"] != "face"]
        df = df.drop(columns=["row_id"])

        # Convert NaN to -1
        df = df.fillna(-1)

        # Add sign column
        df["sign"] = index

        # Encode hand type column
        df["type"] = df["type"].map(hand_map)

        # Normalize and restrict frames
        df["frame"] = df["frame"] - int(df.iloc[0]["frame"])
        df = df[df["frame"] < 100]

        # Append to list
        dataframes.append(df)

dataframes[0]


Unnamed: 0,frame,type,landmark_index,x,y,z,sign
468,0,0,0,-1.000000,-1.000000,-1.000000,0
469,0,0,1,-1.000000,-1.000000,-1.000000,0
470,0,0,2,-1.000000,-1.000000,-1.000000,0
471,0,0,3,-1.000000,-1.000000,-1.000000,0
472,0,0,4,-1.000000,-1.000000,-1.000000,0
...,...,...,...,...,...,...,...
3253,5,1,16,0.390754,0.452435,-0.110298,0
3254,5,1,17,0.356809,0.569751,-0.066635,0
3255,5,1,18,0.378821,0.560500,-0.082378,0
3256,5,1,19,0.396190,0.548753,-0.090704,0


In [32]:
# left and right hand dataframe pad template

max_length = max(df.shape[0] for df in dataframes)
num_of_dataframes = len(dataframes)
print(f"Adjusted (max) length: {max_length}")

for i in range(len(dataframes)):
    # Get relevant data
    df = dataframes[i]
    
    if df.shape[0] < max_length:
        pad_df = pd.DataFrame([{"frame": -2, "type": -2, "landmark_index": -2, "x": -2, "y": -2, "z": -2, "sign": -2} for a in range(max_length - df.shape[0])])
        df = pd.concat([df, pad_df], axis=0)

    # Ensure the values are still in float32 after padding
    df = df.astype({col: 'float32' for col in df.select_dtypes(include=['float64']).columns})
    dataframes[i] = df

    print(f"{i+1}/{num_of_dataframes} have been padded.")

Adjusted (max) length: 4200
1/100 have been padded.
2/100 have been padded.
3/100 have been padded.
4/100 have been padded.
5/100 have been padded.
6/100 have been padded.
7/100 have been padded.
8/100 have been padded.
9/100 have been padded.
10/100 have been padded.
11/100 have been padded.
12/100 have been padded.
13/100 have been padded.
14/100 have been padded.
15/100 have been padded.
16/100 have been padded.
17/100 have been padded.
18/100 have been padded.
19/100 have been padded.
20/100 have been padded.
21/100 have been padded.
22/100 have been padded.
23/100 have been padded.
24/100 have been padded.
25/100 have been padded.
26/100 have been padded.
27/100 have been padded.
28/100 have been padded.
29/100 have been padded.
30/100 have been padded.
31/100 have been padded.
32/100 have been padded.
33/100 have been padded.
34/100 have been padded.
35/100 have been padded.
36/100 have been padded.
37/100 have been padded.
38/100 have been padded.
39/100 have been padded.
40/100

In [33]:
final_max_length = max([df.shape[0] for df in dataframes])
final_min_length = min([df.shape[0] for df in dataframes])

final_max_length, final_min_length

(4200, 4200)

In [34]:
encoded_signs = set([df["sign"].iloc[0] for df in dataframes])

encoded_signs


{0}

In [39]:
# Function to transform frame-by-frame data into feature vectors
def transform_to_features(df, num_landmarks=42):
    """
    Transforms a DataFrame with (x, y, z) coordinates into a time-series feature array.
    Each frame is converted into a vector of size num_landmarks * 3.
    """
    
    # Reshape each frame into a flattened feature vector
    features = df[['x', 'y', 'z']].values.reshape(-1, num_landmarks * 3)
    return features

features = transform_to_features(dataframes[0])
features.shape  # Shape: (num_frames, 126)

(100, 126)

In [40]:
X = np.array([transform_to_features(df) for df in dataframes])
y = np.array([df["sign"].iloc[0] for df in dataframes])

X.shape, y.shape


((100, 100, 126), (100,))