In [53]:
import pandas as pd
from pandas import DataFrame


import sklearn
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


# Import Data

In [54]:
# Read the hand landmark table from parquet and order its rows by index
df = pd.read_parquet("all_hand_landmarks.parquet").sort_index()

# Preview the first rows
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,z13,z14,z15,z16,z17,z18,z19,z20,label,path
0,0.464909,0.55522,0.611244,0.628575,0.657871,0.55937,0.580615,0.569546,0.557562,0.497237,...,-0.018208,-0.079061,-0.076559,-0.054041,-0.029359,-0.068963,-0.067767,-0.053324,A,data-real/images/A/001.jpg
1,0.488618,0.63394,0.752804,0.788872,0.839207,0.691999,0.717096,0.68747,0.665154,0.598479,...,-0.026427,-0.105693,-0.088758,-0.052505,-0.035092,-0.090098,-0.073405,-0.043022,A,data-real/images/A/002.jpg
2,0.379251,0.471244,0.547347,0.584386,0.597717,0.536127,0.538939,0.510218,0.489882,0.477515,...,-0.02868,-0.079185,-0.08184,-0.067349,-0.035101,-0.067516,-0.070111,-0.059815,A,data-real/images/A/003.jpg
3,0.363328,0.460624,0.525905,0.550559,0.587458,0.484731,0.498303,0.479902,0.467675,0.422905,...,-0.001071,-0.057173,-0.050915,-0.028034,-0.011431,-0.050666,-0.042651,-0.023892,A,data-real/images/A/004.jpg
4,0.449153,0.587392,0.698809,0.739597,0.790906,0.642463,0.676976,0.64251,0.613779,0.557109,...,-0.032327,-0.107583,-0.092093,-0.058714,-0.043752,-0.096343,-0.081553,-0.054045,A,data-real/images/A/005.jpg


# Create Pipeline

In [55]:
class DropPath(BaseEstimator, TransformerMixin):
    """
    Transformer that strips the 'path' column from a DataFrame when present.

    Frames without a 'path' column are handed back untouched (same object).
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from X.
        return self

    def transform(self, df):
        # Frames built from unlabeled images carry no 'path' column,
        # so only drop it when it is actually there.
        if 'path' in df:
            return df.drop(columns=['path'])

        return df

        
class FeatureScaler(BaseEstimator, TransformerMixin):
    """
    Applies Min-Max scaling to every numeric column of a DataFrame.

    The per-column ranges are learned in ``fit`` and reused in ``transform``,
    so data seen later (e.g. a single unlabeled sample) is scaled with the
    ranges of the fitting data instead of its own ranges. Non-numeric columns
    are passed through unchanged and placed before the scaled columns.

    Attributes set by ``fit``
    -------------------------
    numeric_columns_ : list
        Names of the numeric columns that were present at fit time.
    scaler_ : MinMaxScaler or None
        The fitted scaler, or None when the fitting data had no numeric columns.
    """

    def fit(self, X, y=None):
        """
        Learns the minimum and maximum of each numeric column of ``X``.

        Parameters
        ----------
        X : DataFrame
            Data whose numeric columns define the scaling ranges.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        numeric_cols = X.select_dtypes(include=['number'])

        self.numeric_columns_ = list(numeric_cols.columns)
        # MinMaxScaler cannot be fitted on zero features, so skip it in that case.
        self.scaler_ = MinMaxScaler().fit(numeric_cols) if self.numeric_columns_ else None

        return self

    def transform(self, df):
        """
        Scales the numeric columns of ``df`` and returns a new DataFrame.

        Parameters
        ----------
        df : DataFrame
            Data to scale. When the transformer has been fitted, ``df`` must
            contain every column listed in ``numeric_columns_``.

        Returns
        -------
        DataFrame
            Non-numeric columns of ``df`` followed by the scaled numeric
            columns, carrying the same index as ``df``.

        Raises
        ------
        KeyError
            If the transformer was fitted and ``df`` lacks one of the columns
            seen during ``fit``.
        """
        if hasattr(self, 'numeric_columns_'):
            columns, scaler = self.numeric_columns_, self.scaler_
        else:
            # Not fitted yet: keep the historical behaviour of scaling the
            # frame by its own ranges so direct transform() calls still work.
            numeric_cols = df.select_dtypes(include=['number'])
            columns = list(numeric_cols.columns)
            scaler = MinMaxScaler().fit(numeric_cols) if columns else None

        # Nothing to scale: return a copy so callers always get a fresh frame.
        if scaler is None:
            return df.copy()

        # Reuse df's index so rows stay aligned even when the index is not
        # a default 0..n-1 range (a default index here would misalign rows
        # and silently introduce NaN values when the frames are combined).
        scaled_numeric_df = DataFrame(
            scaler.transform(df[columns]),
            columns=columns,
            index=df.index,
        )

        # Non-numeric columns first, then the scaled numeric columns.
        return pd.concat([df.drop(columns=columns), scaled_numeric_df], axis=1)
    
class LabelEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encodes the 'label' column of a DataFrame.

    The set of categories is learned in ``fit`` so that every later
    ``transform`` call produces the same one-hot columns, regardless of which
    labels happen to be present in that call. Frames without a 'label' column
    are returned untouched. The input DataFrame is never modified.

    Attributes set by ``fit``
    -------------------------
    encoder_ : OneHotEncoder or None
        The fitted encoder, or None when the fitting data had no 'label' column.
    """

    @staticmethod
    def _new_encoder():
        # Output columns are named after the category alone (e.g. "A"),
        # not "label_A"; dense output so it can be wrapped in a DataFrame.
        return OneHotEncoder(feature_name_combiner=(lambda _, x: str(x)), sparse_output=False)

    def fit(self, X, y=None):
        """
        Learns the label categories from ``X['label']`` when that column exists.

        Parameters
        ----------
        X : DataFrame
            Data that may contain a 'label' column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        if 'label' in X:
            self.encoder_ = self._new_encoder().fit(X[['label']])
        else:
            # Reset so a previous fit does not leak into this one.
            self.encoder_ = None

        return self

    def transform(self, df):
        """
        Replaces the 'label' column with one indicator column per category.

        Parameters
        ----------
        df : DataFrame
            Data to encode.

        Returns
        -------
        DataFrame
            ``df`` itself when it has no 'label' column; otherwise a new frame
            holding the remaining columns of ``df`` followed by the one-hot
            columns, carrying the same index as ``df``.

        Raises
        ------
        ValueError
            If the encoder was fitted and ``df`` contains a label that was not
            seen during ``fit`` (OneHotEncoder's default handling of unknown
            categories).
        """
        # Used for unlabeled images
        if 'label' not in df:
            return df

        encoder = getattr(self, 'encoder_', None)
        if encoder is None:
            # Not fitted on labeled data: keep the historical behaviour of
            # deriving the categories from the frame being transformed.
            encoder = self._new_encoder().fit(df[['label']])

        # Reuse df's index so the indicator rows line up with their source
        # rows even when the index is not a default 0..n-1 range.
        encoded_data_df = DataFrame(
            encoder.transform(df[['label']]),
            columns=encoder.get_feature_names_out(),
            index=df.index,
        )

        # drop() returns a new frame, leaving the caller's DataFrame intact.
        return pd.concat([df.drop(columns=['label']), encoded_data_df], axis=1)



In [56]:
# Preprocessing pipeline; the steps run in the order listed
pipe = Pipeline(steps=[
    ("Dropper", DropPath()),
    ("Feature Scaler", FeatureScaler()),
    ("Label Encoder", LabelEncoder()),
])

# Main Processing Script

In [57]:
# Fit every pipeline step on the loaded landmark table and transform it in one pass
processed_df = pipe.fit_transform(df)
# Preview the first rows of the processed table
processed_df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,P,Q,R,S,T,U,V,W,X,Y
0,0.43366,0.506388,0.570567,0.611941,0.612966,0.589777,0.586113,0.568777,0.569886,0.53233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.45726,0.588564,0.729409,0.791667,0.797452,0.749245,0.735437,0.693492,0.678734,0.656852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.348396,0.418725,0.49887,0.562396,0.551767,0.56183,0.540516,0.506032,0.501416,0.508074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.332546,0.40764,0.47481,0.524469,0.54133,0.500034,0.496057,0.47397,0.478949,0.440907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.417976,0.539973,0.668822,0.73642,0.748312,0.689685,0.691542,0.645942,0.626759,0.60597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Preprocessed Data

In [58]:
# Write the processed table to a parquet file (path is relative to the working directory)
processed_df.to_parquet("processed_data.parquet")