In [1]:
import pandas as pd
from pandas import DataFrame


import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import pickle

# Import Data

In [2]:
# Read the extracted hand-landmark table from its parquet file into `df`
df = pd.read_parquet(path="generated-data/data-extraction/all_hand_landmarks.parquet")

# Preview the first five rows as the cell's output
df.head(n=5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,z13,z14,z15,z16,z17,z18,z19,z20,label,path
0,0.464909,0.55522,0.611244,0.628575,0.657871,0.55937,0.580615,0.569546,0.557562,0.497237,...,-0.018208,-0.079061,-0.076559,-0.054041,-0.029359,-0.068963,-0.067767,-0.053324,A,data-real/images/A/001.jpg
1,0.488618,0.63394,0.752804,0.788872,0.839207,0.691999,0.717096,0.68747,0.665154,0.598479,...,-0.026427,-0.105693,-0.088758,-0.052505,-0.035092,-0.090098,-0.073405,-0.043022,A,data-real/images/A/002.jpg
2,0.379251,0.471244,0.547347,0.584386,0.597717,0.536127,0.538939,0.510218,0.489882,0.477515,...,-0.02868,-0.079185,-0.08184,-0.067349,-0.035101,-0.067516,-0.070111,-0.059815,A,data-real/images/A/003.jpg
3,0.363328,0.460624,0.525905,0.550559,0.587458,0.484731,0.498303,0.479902,0.467675,0.422905,...,-0.001071,-0.057173,-0.050915,-0.028034,-0.011431,-0.050666,-0.042651,-0.023892,A,data-real/images/A/004.jpg
4,0.449153,0.587392,0.698809,0.739597,0.790906,0.642463,0.676976,0.64251,0.613779,0.557109,...,-0.032327,-0.107583,-0.092093,-0.058714,-0.043752,-0.096343,-0.081553,-0.054045,A,data-real/images/A/005.jpg


# Create Train-Validation-Test Splits

In [3]:
# Model inputs are the numeric columns of df; the target is the 'label' column.
features = df.select_dtypes(include=['number'])
labels = df['label']

# Creating training set: 30% of the rows are held out, the rest is used for training
train_features, test_val_features, train_labels, test_val_labels = train_test_split(features, labels, test_size=0.3, random_state=42)

# Creating validation and test sets by splitting the held-out rows in half
val_features, test_features, val_labels, test_labels = train_test_split(test_val_features, test_val_labels, test_size=0.5, random_state=42)

# Showing split percentages.
# The ".1%" format spec multiplies the fraction by 100 and appends the percent
# sign, so a 0.70 share is reported as "70.0%" rather than "0.70%".
total = len(train_features) + len(val_features) + len(test_features)
print("Splits\t",f"Train: {len(train_features)/total:.1%} | Validation: {len(val_features)/total:.1%} | Test: {len(test_features)/total:.1%}\n")

# Output results: one preview per split (features first, then labels)
print("train_features: \n", train_features.head())
print("test_features: \n", test_features.head())
print("val_features: \n", val_features.head())
print("val_labels: \n", val_labels.head())
print("train_labels: \n", train_labels.head())
print("test_labels: \n", test_labels.head())

Splits	 Train:  0.70% | Validation:  0.15% | Test:  0.15%

train_features: 
             x0        x1        x2        x3        x4        x5        x6  \
3263  0.465939  0.481118  0.474265  0.458875  0.444494  0.427288  0.401853   
5099  0.390784  0.514721  0.572660  0.623739  0.665258  0.478599  0.518026   
2298  0.485830  0.583411  0.663286  0.702153  0.658183  0.608442  0.636100   
5310  0.304672  0.369409  0.402262  0.433927  0.442114  0.314359  0.366582   
1553  0.421728  0.478524  0.504248  0.531765  0.557650  0.409420  0.429318   

            x7        x8        x9  ...       z11       z12       z13  \
3263  0.384540  0.369204  0.412924  ... -0.012940 -0.017405 -0.001870   
5099  0.570527  0.598660  0.451630  ...  0.019741 -0.001874  0.000146   
2298  0.649788  0.648856  0.552017  ... -0.070993 -0.094280 -0.021244   
5310  0.409631  0.424804  0.303937  ...  0.002817  0.004225 -0.008745   
1553  0.438590  0.440538  0.393904  ...  0.019114  0.013835  0.000438   

           z14 

In [4]:
# Row counts of the train / validation / test feature sets, in that order
print(*(len(split) for split in (train_features, val_features, test_features)))

5786 1240 1240


# Create Pipelines

The custom transformers used by these pipelines (`FeatureScaler` and `LabelEncoderTransformer`) are defined in the local module `transformers.py`.

In [5]:
# Importing custom transformers
from transformers import FeatureScaler, LabelEncoderTransformer

In [6]:
# Single-step pipeline applied to the feature sets
feature_pipe = Pipeline(steps=[("Feature Scaler", FeatureScaler())])

# Single-step pipeline applied to the label sets
label_pipe = Pipeline(steps=[("Label Encoder", LabelEncoderTransformer())])

# Main Processing Script

In [7]:
# Fit the feature pipeline on the training features only
feature_pipe.fit(train_features)

# Apply the fitted pipeline to the train, validation and test features (in that
# order), rebinding each name to its transformed version
train_features, val_features, test_features = (
    feature_pipe.transform(split)
    for split in (train_features, val_features, test_features)
)

In [8]:
# Fit the label pipeline on the complete 'label' column of df (all splits)
label_pipe.fit(df['label'])

# Apply the fitted pipeline to the train, validation and test labels (in that
# order), rebinding each name to its transformed version
train_labels, val_labels, test_labels = (
    label_pipe.transform(split)
    for split in (train_labels, val_labels, test_labels)
)

In [9]:
# Output results: preview the first rows of every preprocessed split
# (features first, then labels) to sanity-check the pipeline output
print("train_features: \n", train_features.head())
print("test_features: \n", test_features.head())
print("val_features: \n", val_features.head())
print("val_labels: \n", val_labels.head())
print("train_labels: \n", train_labels.head())
print("test_labels: \n", test_labels.head())

train_features: 
          x0        x1        x2        x3        x4        x5        x6  \
0  0.434685  0.429033  0.412056  0.416066  0.395883  0.425452  0.384255   
1  0.359876  0.464111  0.523374  0.602705  0.620481  0.487745  0.512668   
2  0.454485  0.535816  0.625902  0.691476  0.613283  0.645376  0.643182   
3  0.274160  0.312420  0.330597  0.387822  0.393462  0.288355  0.345268   
4  0.390677  0.426325  0.445977  0.498583  0.511005  0.403761  0.414614   

         x7        x8        x9  ...       z11       z12       z13       z14  \
0  0.366576  0.379328  0.428028  ...  0.786917  0.811788  0.806893  0.793212   
1  0.565325  0.611464  0.475684  ...  0.832133  0.832647  0.810738  0.792895   
2  0.650025  0.662246  0.599285  ...  0.706600  0.708544  0.769945  0.747259   
3  0.393389  0.435578  0.293838  ...  0.808719  0.840839  0.793781  0.797308   
4  0.424334  0.451496  0.404609  ...  0.831265  0.853745  0.811296  0.806500   

        z15       z16       z17       z18       z1

# Save Preprocessed Data

In [10]:
# Write every preprocessed split to its own parquet file:
# the three feature sets first, then the three label sets
for frame, destination in (
    (train_features, "generated-data/preprocessed-data/train_features.parquet"),
    (val_features, "generated-data/preprocessed-data/val_features.parquet"),
    (test_features, "generated-data/preprocessed-data/test_features.parquet"),
    (train_labels, "generated-data/preprocessed-data/train_labels.parquet"),
    (val_labels, "generated-data/preprocessed-data/val_labels.parquet"),
    (test_labels, "generated-data/preprocessed-data/test_labels.parquet"),
):
    frame.to_parquet(destination)

In [11]:
# Saving pipeline objects to pickle files.
# Each fitted pipeline gets its own file: opening one shared path in 'wb' mode
# for both dumps would truncate it on the second open, leaving only the label
# pipeline on disk and losing the fitted feature pipeline.
with open('generated-data/preprocessed-data/feature_pipeline.pkl', 'wb') as f:
    pickle.dump(feature_pipe, f)

with open('generated-data/preprocessed-data/label_pipeline.pkl', 'wb') as f:
    pickle.dump(label_pipe, f)

# Legacy path: keep writing the label pipeline to pipeline.pkl so that any
# existing reader of that file continues to work.
with open('generated-data/preprocessed-data/pipeline.pkl', 'wb') as f:
    pickle.dump(label_pipe, f)