In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from src import _Preprocessing_Data

In [3]:
# Project configuration returned by util.load_config(); the cells below index it
# for pickle paths and the label column name.
config_data = util.load_config()

## Collect Data Set

In [4]:
def load_dataset(config_data: dict, config: str) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Load the train, validation and test splits for one feature-selection method.

    Parameters
    ----------
    config_data : dict
        Configuration mapping. The keys ``"train_set_eda"``, ``"valid_set_eda"`` and
        ``"test_set_eda"`` must each hold a pair ``[predictor_path, label_path]``.
    config : str
        Key used to pick one entry out of every loaded object
        (the notebook uses ``'rf'``, ``'filter'`` and ``'lasso'``).

    Returns
    -------
    dataset_train, dataset_valid, dataset_test : pandas.DataFrame
        One frame per split, with the label concatenated column-wise after the predictors.
    """
    split_keys = ("train_set_eda", "valid_set_eda", "test_set_eda")

    # Load every pickle first (predictors, then labels, split by split) so that a
    # missing file is reported before any concatenation is attempted.
    loaded = [
        (util.pickle_load(config_data[key][0]), util.pickle_load(config_data[key][1]))
        for key in split_keys
    ]

    dataset_train, dataset_valid, dataset_test = [
        pd.concat([predictors[config], labels[config]], axis=1)
        for predictors, labels in loaded
    ]

    return dataset_train, dataset_valid, dataset_test

In [5]:
# Load the (train, valid, test) frames once per feature-selection method,
# in the order rf -> filter -> lasso.
_splits_by_method = {
    method: load_dataset(config_data, config=method)
    for method in ('rf', 'filter', 'lasso')
}

train_set_rf, valid_set_rf, test_set_rf = _splits_by_method['rf']
train_set_filter, valid_set_filter, test_set_filter = _splits_by_method['filter']
train_set_lasso, valid_set_lasso, test_set_lasso = _splits_by_method['lasso']

## Check Data

## Function Preprocessing data

- Split the dataset into numerical and categorical features. <br>
- Apply **SimpleImputer** to fill in missing values. <br>
- Encode the categorical features: **LabelEncoder** for ordinal data and **OHE** (one-hot encoding) for non-ordinal data. <br>
- Consider using **OrdinalEncoder** instead, with the ranked values of each feature defined explicitly. <br>
- Finally, apply **Standardization** using StandardScaler.

## Preprocessing Data

    We generate the dataset using the filter method of feature selection,
    not the other two (Lasso or Random Forest).

In [6]:
class _Preprocessing_Data:
    """
    Preprocess a raw dataset for modelling.

    The pipeline splits the predictors by dtype, imputes missing values, encodes the
    categorical predictors (label encoding, one-hot encoding, or both) and finally
    standardizes every predictor.

    Intermediate results are kept on the instance (``X``, ``y``, ``X_num``, ``X_cat``,
    ``data_imputed_num``, ``data_imputed_cat``, ``data_encoded``, ``encoder``,
    ``data_scaled``, ``scaler``); every call overwrites them.

    NOTE(review): this notebook-level definition shadows the ``_Preprocessing_Data``
    name imported from ``src`` at the top of the notebook.
    """
    def __init__(self):
        pass

    def _split_numcat(self, data: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Split a label-free dataset into numerical and categorical predictors.

        Parameters
        ----------
        data : pandas.DataFrame
            Predictors only (no label column).

        Returns
        -------
        X_num : pandas.DataFrame
            Columns of dtype ``float64``.
        X_cat : pandas.DataFrame
            Columns of dtype ``object``.

        Notes
        -----
        Columns of any other dtype (e.g. ``int64``) end up in neither frame.
        """
        numerical_col = data.select_dtypes('float64').columns.to_list()
        categorical_col = data.select_dtypes('object').columns.to_list()

        self.X_num = data[numerical_col]
        self.X_cat = data[categorical_col]

        return self.X_num, self.X_cat

    def _split_xy(self, data: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Separate the label from the predictors, then split the predictors by dtype.

        The label column name(s) come from ``config_data['label']``. The predictors
        and the label are stored on the instance as ``self.X`` and ``self.y``.

        Parameters
        ----------
        data : pandas.DataFrame
            Dataset that still contains the label column.

        Returns
        -------
        X_num : pandas.DataFrame
            Predictors of dtype ``float64``.
        X_cat : pandas.DataFrame
            Predictors of dtype ``object``.
        """
        label = config_data['label']

        self.X = data.drop(columns=label)
        self.y = data[label]

        # Same dtype-based split as for label-free data.
        return self._split_numcat(self.X)

    def _imputer_Num(self, data, imputer=None) -> pd.DataFrame:
        """
        Fill missing numerical values.

        When no imputer is supplied, a median ``SimpleImputer`` is fitted on ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Numerical predictors.
        imputer : SimpleImputer, optional
            Already fitted imputer to reuse instead of fitting a new one.

        Returns
        -------
        data_imputed_num : pandas.DataFrame
            Imputed data, cast to ``int64``.

        Notes
        -----
        NOTE(review): the columns are selected as ``float64`` upstream, so the
        ``int64`` cast discards any fractional part - confirm this is intended.
        """
        if imputer is None:
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='median')
            imputer.fit(data)

        imputed = pd.DataFrame(imputer.transform(data),
                               index=data.index,
                               columns=data.columns)

        self.data_imputed_num = imputed.astype('int64')

        return self.data_imputed_num

    def _imputer_Cat(self, data, imputer=None) -> pd.DataFrame:
        """
        Fill missing categorical values.

        When no imputer is supplied, a ``'most_frequent'`` ``SimpleImputer`` is
        fitted on ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Categorical (object) predictors.
        imputer : SimpleImputer, optional
            Already fitted imputer to reuse instead of fitting a new one.

        Returns
        -------
        data_imputed_cat : pandas.DataFrame
            Imputed categorical data.
        """
        if imputer is None:
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='most_frequent')
            imputer.fit(data)

        self.data_imputed_cat = pd.DataFrame(imputer.transform(data),
                                             index=data.index,
                                             columns=data.columns)

        return self.data_imputed_cat

    def _OHE_Cat(self, data, encoder_col=None, encoder=None) -> "tuple[pd.DataFrame, object]":
        """
        One-hot encode nominal (non-ordinal) categorical data.

        When no encoder is supplied, a ``OneHotEncoder(handle_unknown='ignore',
        drop='if_binary')`` is fitted on ``data`` and the output column names are
        taken from it. When an encoder is supplied without ``encoder_col``, the
        column names are derived from that encoder so the result never falls back
        to positional integer column labels.

        The encoder is pickled to ``config_data["ohe_path"]`` on every call.

        Parameters
        ----------
        data : pandas.DataFrame
            Nominal categorical data.
        encoder_col : sequence of str, optional
            Output column names; only used together with a supplied ``encoder``.
        encoder : OneHotEncoder, optional
            Already fitted encoder to reuse.

        Returns
        -------
        data_encoded : pandas.DataFrame
            One-hot encoded data.
        encoder : OneHotEncoder
            The encoder that produced ``data_encoded``.
        """
        if encoder is None:
            encoder = OneHotEncoder(handle_unknown='ignore',
                                    drop='if_binary')
            encoder.fit(data)
            encoder_col = encoder.get_feature_names_out(data.columns)
        elif encoder_col is None:
            # A fitted encoder was passed without names: ask the encoder for them.
            encoder_col = encoder.get_feature_names_out(data.columns)

        encoded_values = encoder.transform(data).toarray()

        self.data_encoded = pd.DataFrame(encoded_values,
                                         index=data.index,
                                         columns=encoder_col)
        self.encoder = encoder

        util.pickle_dump(encoder, config_data["ohe_path"])

        return self.data_encoded, self.encoder

    def _LE_cat(self, data, encoder=None) -> "tuple[pd.DataFrame, object]":
        """
        Label-encode every column of ``data``.

        The input frame is left untouched; the encoding is applied to a copy.
        The encoder is pickled to ``config_data["le_encoder_path"]`` on every call.

        Parameters
        ----------
        data : pandas.DataFrame
            Categorical data to encode.
        encoder : LabelEncoder, optional
            Encoder object to use; a new ``LabelEncoder`` is created when omitted.

        Returns
        -------
        data_encoded : pandas.DataFrame
            Encoded copy of ``data``.
        encoder : LabelEncoder
            The encoder object that was used.

        Notes
        -----
        NOTE(review): a single encoder is re-fitted (``fit_transform``) on each
        column in turn, so after the loop it only reflects the last column, and a
        supplied encoder is re-fitted on the incoming data rather than reused.
        Codes are therefore only consistent between splits when each split
        contains the same set of categories. This behaviour is kept as-is because
        the pickled artefact is consumed elsewhere; consider one fitted
        ``OrdinalEncoder`` per dataset instead.
        """
        le_encoder = LabelEncoder() if encoder is None else encoder

        # Work on a copy so the caller's frame (e.g. self.X_cat) is not overwritten.
        encoded = data.copy()
        for col in encoded.columns.to_list():
            encoded[col] = le_encoder.fit_transform(encoded[col])

        util.pickle_dump(le_encoder, config_data["le_encoder_path"])

        self.data_encoded = encoded
        self.encoder = le_encoder

        return self.data_encoded, self.encoder

    def _standardize_Data(self, data, scaler=None) -> "tuple[pd.DataFrame, object]":
        """
        Standardize the predictors with ``StandardScaler``.

        Each column is transformed as ``(x - mean) / std`` using the statistics
        held by the scaler. When no scaler is supplied, a new one is fitted on
        ``data``. The scaler is pickled to ``config_data["scaler"]`` on every call.

        Parameters
        ----------
        data : pandas.DataFrame
            Fully numeric predictors.
        scaler : StandardScaler, optional
            Already fitted scaler to reuse.

        Returns
        -------
        data_scaled : pandas.DataFrame
            Standardized data with the same index and columns as ``data``.
        scaler : StandardScaler
            The scaler that produced ``data_scaled``.
        """
        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(data)

        self.data_scaled = pd.DataFrame(scaler.transform(data),
                                        index=data.index,
                                        columns=data.columns)
        self.scaler = scaler

        util.pickle_dump(scaler, config_data["scaler"])

        return self.data_scaled, self.scaler

    def _handling_data(self,
                       data,
                       encoding = 'Label_Encoding',
                       encoder = None,
                       scaler = None,
                       imputer_num = None,
                       imputer_cat = None,
                       config = "None",
                       y = True):
        """
        Run the full preprocessing pipeline: split, impute, encode, standardize.

        Parameters
        ----------
        data : pandas.DataFrame
            Dataset with predictors and, when ``y`` is true, the label column.

        encoding : str (default = 'Label_Encoding')
            ``'Label_Encoding'``, ``'One_Hot_Encoding'`` or ``'Both'``.
            ``'Both'`` always fits fresh encoders and ignores ``encoder``.

        encoder : LabelEncoder or OneHotEncoder (default = None)
            Encoder forwarded to the selected encoding step.

        scaler : StandardScaler (default = None)
            Fitted scaler to reuse; a new one is fitted when omitted.

        imputer_num, imputer_cat : SimpleImputer (default = None)
            Fitted imputers to reuse; new ones are fitted on ``data`` when omitted.

        config : str (default = "None")
            Selects where the encoder and scaler are additionally pickled:

            - 'filter' = filter data feature selection
            - 'lasso' = lasso data feature selection
            - 'random_forest' = random forest data feature selection

            Any other value saves nothing extra.

        y : bool (default = True)
            True, data is split into X and y (label).
            False, data is treated as X (predictors) only.

        Returns
        -------
        X : pandas.DataFrame
            Encoded and scaled predictors, columns sorted by name.

        y : pandas.Series or pandas.DataFrame (only if y is true)
            Label.

        Raises
        ------
        TypeError
            If ``encoding`` is not one of the recognized values.
        """
        has_label = bool(y)

        # --- Split into Numeric and Categoric Data --- #
        if has_label:
            self._split_xy(data)
        else:
            util.print_debug("Split Numeric and Categoric Data...")
            self._split_numcat(data)

        # --- Impute Missing Value --- #
        util.print_debug("Perform Imputer...")

        self.X_num = self._imputer_Num(self.X_num, imputer_num)
        self.X_cat = self._imputer_Cat(self.X_cat, imputer_cat)

        # --- Encode Categoric data --- #
        if encoding == 'Label_Encoding':
            util.print_debug("Perform Label Encoding...")

            self._LE_cat(self.X_cat, encoder)
            X_train_ = pd.concat([self.X_num, self.data_encoded], axis=1)

        elif encoding == 'One_Hot_Encoding':
            util.print_debug("Perform One Hot Encoding...")

            self._OHE_Cat(self.X_cat, encoder = encoder)
            X_train_ = pd.concat([self.X_num, self.data_encoded], axis=1)

        elif encoding == 'Both':
            util.print_debug("Perform Both Label Encoding and One Hot Encoding...")

            # Both helpers return (frame, encoder). One-hot runs first, so
            # self.encoder / self.data_encoded end up holding the label-encoding results.
            X_train_ohe, _ = self._OHE_Cat(self.X_cat)
            X_train_le, _ = self._LE_cat(self.X_cat)

            X_train_concat = pd.concat([X_train_ohe, X_train_le], axis=1)
            X_train_ = pd.concat([self.X_num, X_train_concat], axis=1)

        else:
            raise TypeError("encoding type is not recognized. Should be Label_Encoding, One_Hot_Encoding, or Both.")

        # --- Standardize Data --- #
        util.print_debug("Perform Standardizing....")

        # Sort the columns by name so every split presents them in the same order.
        X_train_ = X_train_.reindex(sorted(X_train_.columns), axis=1)

        self._standardize_Data(X_train_, scaler)

        util.print_debug("Data has been standardized.")

        # --- Dumping/Save data --- #
        # config value -> (encoder path key, scaler path key) in config_data.
        # The encoder key is named "le_..." even when a one-hot encoder was used.
        dump_keys = {
            'filter': ("le_encoder_path_filter", "scaler_filter"),
            'lasso': ("le_encoder_path_lasso", "scaler_lasso"),
            'random_forest': ("le_encoder_path_rf", "scaler_rf"),
        }

        if isinstance(config, str) and config in dump_keys:
            util.print_debug("Dumping encoder and scaler.")

            encoder_key, scaler_key = dump_keys[config]
            util.pickle_dump(self.encoder, config_data[encoder_key])
            util.pickle_dump(self.scaler, config_data[scaler_key])

        util.print_debug("Returned scaled data.")
        util.print_debug("="*40)

        if has_label:
            return self.data_scaled, self.y
        return self.data_scaled

In [7]:
# Single shared instance; each _handling_data call overwrites the intermediate
# results stored on it.
preprocessor = _Preprocessing_Data()

    Training dataset

In [8]:
# Preprocess each training split. `config` selects the config_data paths under
# which the encoder and scaler produced by the call are pickled.
X_train_rf, y_train_rf = preprocessor._handling_data(
    data=train_set_rf,
    encoding='Label_Encoding',
    config='random_forest',
)

X_train_lasso, y_train_lasso = preprocessor._handling_data(
    data=train_set_lasso,
    encoding='Label_Encoding',
    config='lasso',
)

X_train_filter, y_train_filter = preprocessor._handling_data(
    data=train_set_filter,
    encoding='Label_Encoding',
    config='filter',
)
                                                            

2023-07-14 22:43:49.477593 Perform Imputer...
2023-07-14 22:43:49.703544 Perform Label Encoding...
2023-07-14 22:43:49.845884 Perform Standardizing....
2023-07-14 22:43:49.904066 Data has been standardized.
2023-07-14 22:43:49.904066 Dumping encoder and scaler.
2023-07-14 22:43:49.909995 Returned scaled data.
2023-07-14 22:43:49.933643 Perform Imputer...
2023-07-14 22:43:50.143330 Perform Label Encoding...
2023-07-14 22:43:50.288472 Perform Standardizing....
2023-07-14 22:43:50.349368 Data has been standardized.
2023-07-14 22:43:50.349368 Dumping encoder and scaler.
2023-07-14 22:43:50.356440 Returned scaled data.
2023-07-14 22:43:50.377284 Perform Imputer...
2023-07-14 22:43:50.559714 Perform Label Encoding...
2023-07-14 22:43:50.706903 Perform Standardizing....
2023-07-14 22:43:50.753566 Data has been standardized.
2023-07-14 22:43:50.753566 Dumping encoder and scaler.
2023-07-14 22:43:50.760157 Returned scaled data.


In [9]:
# Group the preprocessed training predictors and labels by feature-selection method.
X_train = dict(
    filter=X_train_filter,
    lasso=X_train_lasso,
    rf=X_train_rf,
)

y_train = dict(
    filter=y_train_filter,
    lasso=y_train_lasso,
    rf=y_train_rf,
)

    Create a function to handle the validation and test datasets

In [11]:
def _Concat_Preprocessing_valid(data_filter, data_lasso, data_rf):
    """Preprocess one split (validation or test) for all three feature-selection methods.

    For each method the encoder and scaler are loaded from the paths stored in
    ``config_data`` and passed to ``preprocessor._handling_data`` together with the
    matching dataset. The datasets are processed in the order rf, lasso, filter.

    Parameters
    ----------
    data_filter, data_lasso, data_rf : pandas.DataFrame
        Datasets (label included) for the filter, lasso and random-forest selections.

    Returns
    -------
    X : dict
        Preprocessed predictors keyed by ``"filter"``, ``"lasso"`` and ``"rf"``.
    y : dict
        Labels under the same keys.
    """
    # (result key, dataset, encoder path key, scaler path key) in processing order.
    jobs = (
        ("rf", data_rf, 'le_encoder_path_rf', 'scaler_rf'),
        ("lasso", data_lasso, 'le_encoder_path_lasso', 'scaler_lasso'),
        ("filter", data_filter, 'le_encoder_path_filter', 'scaler_filter'),
    )

    processed = {}
    for name, frame, encoder_key, scaler_key in jobs:
        processed[name] = preprocessor._handling_data(
            data=frame,
            encoding='Label_Encoding',
            encoder=util.pickle_load(config_data[encoder_key]),
            scaler=util.pickle_load(config_data[scaler_key]),
            y=True,
        )

    output_order = ("filter", "lasso", "rf")
    X = {name: processed[name][0] for name in output_order}
    y = {name: processed[name][1] for name in output_order}

    return X, y

    Validation dataset

In [12]:
# Preprocess the validation split for all three feature-selection methods.
X_valid, y_valid = _Concat_Preprocessing_valid(
    data_filter=valid_set_filter,
    data_lasso=valid_set_lasso,
    data_rf=valid_set_rf,
)

2023-07-14 22:43:58.731132 Perform Imputer...
2023-07-14 22:43:58.752466 Perform Label Encoding...
2023-07-14 22:43:58.774089 Perform Standardizing....
2023-07-14 22:43:58.782417 Data has been standardized.
2023-07-14 22:43:58.782417 Returned scaled data.
2023-07-14 22:43:58.791110 Perform Imputer...
2023-07-14 22:43:58.831097 Perform Label Encoding...
2023-07-14 22:43:58.875653 Perform Standardizing....
2023-07-14 22:43:58.887779 Data has been standardized.
2023-07-14 22:43:58.887779 Returned scaled data.
2023-07-14 22:43:58.895777 Perform Imputer...
2023-07-14 22:43:58.933977 Perform Label Encoding...
2023-07-14 22:43:58.970358 Perform Standardizing....
2023-07-14 22:43:58.981496 Data has been standardized.
2023-07-14 22:43:58.981496 Returned scaled data.


In [13]:
# Preprocess the test split for all three feature-selection methods.
X_test, y_test = _Concat_Preprocessing_valid(
    data_filter=test_set_filter,
    data_lasso=test_set_lasso,
    data_rf=test_set_rf,
)

2023-07-14 22:44:00.748262 Perform Imputer...
2023-07-14 22:44:00.814322 Perform Label Encoding...
2023-07-14 22:44:00.865603 Perform Standardizing....
2023-07-14 22:44:00.880577 Data has been standardized.
2023-07-14 22:44:00.880577 Returned scaled data.
2023-07-14 22:44:00.893988 Perform Imputer...
2023-07-14 22:44:00.938420 Perform Label Encoding...
2023-07-14 22:44:00.983307 Perform Standardizing....
2023-07-14 22:44:00.998498 Data has been standardized.
2023-07-14 22:44:00.998498 Returned scaled data.
2023-07-14 22:44:01.007482 Perform Imputer...
2023-07-14 22:44:01.052107 Perform Label Encoding...
2023-07-14 22:44:01.091579 Perform Standardizing....
2023-07-14 22:44:01.102160 Data has been standardized.
2023-07-14 22:44:01.102160 Returned scaled data.


## Dump and Save Datasets

In [14]:
# Persist each cleaned split: predictors go to the first configured path,
# labels to the second.
for _paths_key, _predictors, _labels in (
    ("train_set_clean", X_train, y_train),
    ("valid_set_clean", X_valid, y_valid),
    ("test_set_clean", X_test, y_test),
):
    util.pickle_dump(_predictors, config_data[_paths_key][0])
    util.pickle_dump(_labels, config_data[_paths_key][1])