In [1]:
import pickle
from pathlib import Path

import pandas as pd
from pandas import DataFrame


import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Import Data

In [12]:
# Read the hand-landmark table from its parquet file into a DataFrame
landmarks_path = "generated-data/data-extraction/all_hand_landmarks.parquet"
df = pd.read_parquet(landmarks_path)

# Show the first rows as the cell output
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,z13,z14,z15,z16,z17,z18,z19,z20,label,path
0,0.464909,0.55522,0.611244,0.628575,0.657871,0.55937,0.580615,0.569546,0.557562,0.497237,...,-0.018208,-0.079061,-0.076559,-0.054041,-0.029359,-0.068963,-0.067767,-0.053324,A,data-real/images/A/001.jpg
1,0.488618,0.63394,0.752804,0.788872,0.839207,0.691999,0.717096,0.68747,0.665154,0.598479,...,-0.026427,-0.105693,-0.088758,-0.052505,-0.035092,-0.090098,-0.073405,-0.043022,A,data-real/images/A/002.jpg
2,0.379251,0.471244,0.547347,0.584386,0.597717,0.536127,0.538939,0.510218,0.489882,0.477515,...,-0.02868,-0.079185,-0.08184,-0.067349,-0.035101,-0.067516,-0.070111,-0.059815,A,data-real/images/A/003.jpg
3,0.363328,0.460624,0.525905,0.550559,0.587458,0.484731,0.498303,0.479902,0.467675,0.422905,...,-0.001071,-0.057173,-0.050915,-0.028034,-0.011431,-0.050666,-0.042651,-0.023892,A,data-real/images/A/004.jpg
4,0.449153,0.587392,0.698809,0.739597,0.790906,0.642463,0.676976,0.64251,0.613779,0.557109,...,-0.032327,-0.107583,-0.092093,-0.058714,-0.043752,-0.096343,-0.081553,-0.054045,A,data-real/images/A/005.jpg


# Create Train-Test Splits

In [3]:
# Target is the 'label' column; inputs are every numeric column of df
labels = df['label']
features = df.select_dtypes(include=['number'])

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Print the head of each split, in the order train/test features, train/test labels
for caption, split in (
    ("train_features: \n", train_features),
    ("test_features: \n", test_features),
    ("train_labels: \n", train_labels),
    ("test_labels: \n", test_labels),
):
    print(caption, split.head())

train_features: 
             x0        x1        x2        x3        x4        x5        x6  \
4019  0.254785  0.300629  0.327732  0.323033  0.307932  0.322777  0.347873   
6757  0.456055  0.525361  0.548439  0.489668  0.420353  0.514893  0.525930   
6752  0.455969  0.520430  0.544182  0.487071  0.416733  0.511564  0.523181   
2680  0.202275  0.227997  0.245465  0.255164  0.258654  0.228565  0.237285   
3347  0.728602  0.706712  0.620187  0.540182  0.494610  0.530835  0.436645   

            x7        x8        x9  ...       z11       z12       z13  \
4019  0.361351  0.373211  0.288756  ... -0.049412 -0.054782 -0.012321   
6757  0.527769  0.525027  0.458153  ... -0.105791 -0.114657 -0.045096   
6752  0.525216  0.522451  0.454840  ... -0.105551 -0.114022 -0.047172   
2680  0.239123  0.238333  0.207999  ... -0.081141 -0.093662 -0.040031   
3347  0.378930  0.325366  0.514639  ... -0.085563 -0.095366 -0.028455   

           z14       z15       z16       z17       z18       z19       z20

# Create Pipelines

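The custom transformer classes `LabelEncoder` and `FeatureScaler` are project-local; they have been moved out of this notebook into `transformers.py` and are imported from there below.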

In [4]:
# Importing custom transformers
from transformers import LabelEncoder, FeatureScaler

In [5]:
# Single-step pipeline applied to the feature sets
feature_pipe = Pipeline(steps=[("Feature Scaler", FeatureScaler())])

# Single-step pipeline applied to the label sets
label_pipe = Pipeline(steps=[("Label Encoder", LabelEncoder())])

# Main Processing Script

In [6]:
# Fit the feature pipeline on the training features only
feature_pipe.fit(train_features)

# Apply the fitted pipeline to the train split first, then the test split.
# Note: this rebinds train_features/test_features, so re-running this cell
# without re-running the split cell would fit on already-transformed data.
train_features, test_features = [
    feature_pipe.transform(split)
    for split in (train_features, test_features)
]

In [7]:
# Fit the label pipeline on the full label column (train and test rows together)
label_pipe.fit(df['label'])

# Apply the fitted pipeline to the train labels first, then the test labels.
# Note: this rebinds train_labels/test_labels, so re-running this cell without
# re-running the split cell would pass already-transformed labels to transform().
train_labels, test_labels = [
    label_pipe.transform(split)
    for split in (train_labels, test_labels)
]

In [8]:
# Print the head of each preprocessed split, features first and then labels
for caption, split in (
    ("train_features: \n", train_features),
    ("test_features: \n", test_features),
    ("train_labels: \n", train_labels),
    ("test_labels: \n", test_labels),
):
    print(caption, split.head())

train_features: 
          x0        x1        x2        x3        x4        x5        x6  \
0  0.224503  0.240621  0.246278  0.262282  0.256949  0.298575  0.324588   
1  0.424846  0.475219  0.495971  0.450926  0.371323  0.531806  0.521405   
2  0.424762  0.470071  0.491155  0.447985  0.367640  0.527765  0.518366   
3  0.172235  0.164800  0.153206  0.185449  0.206816  0.184200  0.202347   
4  0.696140  0.664531  0.577143  0.508112  0.446870  0.551160  0.422713   

         x7        x8        x9  ...       z11       z12       z13       z14  \
0  0.341795  0.383382  0.275147  ...  0.736458  0.761590  0.786961  0.743312   
1  0.519633  0.536971  0.483716  ...  0.658456  0.681178  0.724456  0.632073   
2  0.516905  0.534365  0.479637  ...  0.658788  0.682030  0.720496  0.631636   
3  0.211179  0.246930  0.175715  ...  0.692560  0.709374  0.734115  0.724241   
4  0.360580  0.334979  0.553263  ...  0.686442  0.707086  0.756193  0.674850   

        z15       z16       z17       z18       z1

# Save Preprocessed Data

In [9]:
# Saving preprocessed data
# Each of the four splits is written to its own parquet file inside
# generated-data/preprocessed-data/. The directory is created first (including
# any missing parents) so the first write does not fail when it is absent,
# e.g. on a fresh checkout; exist_ok=True makes the cell safe to re-run.
output_dir = Path("generated-data/preprocessed-data")
output_dir.mkdir(parents=True, exist_ok=True)

# File names are unchanged: <split name>.parquet
preprocessed_splits = {
    "train_features": train_features,
    "test_features": test_features,
    "train_labels": train_labels,
    "test_labels": test_labels,
}
for split_name, split_data in preprocessed_splits.items():
    split_data.to_parquet(output_dir / f"{split_name}.parquet")

In [10]:
# Write the feature pipeline object to disk with pickle.
# Only feature_pipe is saved by this cell; label_pipe is not.
pipeline_path = 'generated-data/preprocessed-data/pipeline.pkl'
with open(pipeline_path, 'wb') as pickle_file:
    pickle.dump(feature_pipe, pickle_file)