In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from src import _PreprocessingData

In [2]:
# Load the project configuration; the cells below read dataset paths, the label
# column and pickled-artifact paths from this mapping.
config_data = util.load_config()

## Collect Data Set

In [3]:
def load_dataset(config_data: dict, config: str) -> tuple:
    """Load the EDA train/valid/test splits for one feature-selection key.

    For each split, the feature pickle (index 0 of the configured path pair)
    and the target pickle (index 1) are loaded, both are indexed by ``config``
    and the two pieces are concatenated column-wise.

    Parameters
    ----------
    config_data : dict
        Project configuration; must contain ``"train_set_eda"``,
        ``"valid_set_eda"`` and ``"test_set_eda"``, each a 2-item sequence of
        pickle paths (features, target).
    config : str
        Key used to index every loaded object (the notebook uses ``'rf'``,
        ``'filter'`` and ``'lasso'``).
        NOTE(review): the pickles are presumably dicts keyed by
        feature-selection method -- confirm against the EDA notebook.

    Returns
    -------
    tuple
        ``(dataset_train, dataset_valid, dataset_test)``.
    """

    def _load_split(split_key: str) -> pd.DataFrame:
        # Index 0 holds the feature path, index 1 the target path.
        features = util.pickle_load(config_data[split_key][0])
        target = util.pickle_load(config_data[split_key][1])
        return pd.concat([features[config], target[config]], axis=1)

    dataset_train = _load_split("train_set_eda")
    dataset_valid = _load_split("valid_set_eda")
    dataset_test = _load_split("test_set_eda")

    return dataset_train, dataset_valid, dataset_test

In [4]:
# Build train/valid/test frames (features + label) for each of the three keys
# stored in the EDA pickles: 'rf', 'filter' and 'lasso'.
train_set_rf, valid_set_rf, test_set_rf = load_dataset(config_data, config='rf')
train_set_filter, valid_set_filter, test_set_filter = load_dataset(config_data, config='filter')
train_set_lasso, valid_set_lasso, test_set_lasso = load_dataset(config_data, config='lasso')

## Check Data

## Function Preprocessing data

- Split the dataset into numerical and categorical data, <br>
- Apply **SimpleImputer** to fill in missing data <br>
- Apply **encoding** to categorical data: **LabelEncoder** for ordinal data and **OHE** (one-hot encoding) for non-ordinal data <br>
- Suggestion: use **OrdinalEncoder** and define the ranked values of each feature <br>
- Finally, apply **standardization** using StandardScaler

In [42]:
class _handling_data:
    """Notebook-local preprocessing helper: split, impute, encode, standardize.

    The pipeline run by :meth:`_handling_data` is:

    1. separate the label column (``config_data['label']``) from the features
       and split the features into ``float64`` and ``object`` columns;
    2. impute numeric columns with the median and categorical columns with the
       most frequent value;
    3. encode categorical columns (label encoding, one-hot encoding, or both);
    4. standardize the combined frame with ``StandardScaler``.

    NOTE(review): ``_concat_data`` always fits fresh encoders and discards
    them, so this class cannot re-apply training-time encoders to the
    validation/test splits.
    """

    def __init__(self):
        pass

    def _split_xy(self, data: pd.DataFrame) -> tuple:
        """Split ``data`` into numeric and categorical feature frames.

        The label column(s) named by ``config_data['label']`` are removed from
        the features and stored on ``self.y``; the remaining features are
        stored on ``self.X``.

        Returns
        -------
        tuple
            ``(X_num, X_cat)`` -- the ``float64`` columns and the ``object``
            columns of the features. Columns of any other dtype (e.g.
            ``int64``) end up in neither frame.
        """
        X_data = data.drop(columns=config_data['label'])
        y_data = data[config_data['label']]

        self.X = X_data
        self.y = y_data

        numerical_col = X_data.select_dtypes('float64').columns.to_list()
        categorical_col = X_data.select_dtypes('object').columns.to_list()

        return X_data[numerical_col], X_data[categorical_col]

    def _imputer_Num(self, data, imputer=None) -> pd.DataFrame:
        """Impute missing numeric values (median) and cast the result to int64.

        Parameters
        ----------
        data : pd.DataFrame
            Numeric feature frame.
        imputer : fitted imputer, optional
            Reused as-is when given; otherwise a median ``SimpleImputer`` is
            fitted on ``data``.

        Returns
        -------
        pd.DataFrame
            Imputed frame (also stored on ``self.data_imputed_num``).
        """
        if data.shape[1] == 0:
            # SimpleImputer rejects zero-column input; nothing to impute.
            data_imputed_num = data.copy()
        else:
            if imputer is None:
                imputer = SimpleImputer(missing_values=np.nan,
                                        strategy='median')
                imputer.fit(data)

            data_imputed_num = pd.DataFrame(imputer.transform(data),
                                            index=data.index,
                                            columns=data.columns)

            # NOTE(review): this cast truncates any fractional values --
            # confirm every float64 feature is integral by nature.
            data_imputed_num = data_imputed_num.astype('int64')

        self.data_imputed_num = data_imputed_num

        return data_imputed_num

    def _imputer_Cat(self, data, imputer=None) -> pd.DataFrame:
        """Impute missing categorical values with the most frequent value.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical feature frame.
        imputer : fitted imputer, optional
            Reused as-is when given; otherwise a ``most_frequent``
            ``SimpleImputer`` is fitted on ``data``.

        Returns
        -------
        pd.DataFrame
            Imputed frame (also stored on ``self.data_imputed_cat``).
        """
        if data.shape[1] == 0:
            # SimpleImputer rejects zero-column input; nothing to impute.
            data_imputed_cat = data.copy()
        else:
            if imputer is None:
                imputer = SimpleImputer(missing_values=np.nan,
                                        strategy='most_frequent')
                imputer.fit(data)

            data_imputed_cat = pd.DataFrame(imputer.transform(data),
                                            index=data.index,
                                            columns=data.columns)

        self.data_imputed_cat = data_imputed_cat

        return data_imputed_cat

    def _OHE_Cat(self, data, encoder_col=None, encoder=None) -> tuple:
        """One-hot encode ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical columns to encode.
        encoder_col : sequence of str, optional
            Output column names for a pre-fitted ``encoder``. When omitted
            they are taken from the encoder.
        encoder : fitted OneHotEncoder, optional
            Reused as-is when given; otherwise a new encoder is fitted on
            ``data`` (unknown categories ignored, binary columns collapsed to
            a single indicator).

        Returns
        -------
        tuple
            ``(data_encoded, encoder_col, encoder)``.
        """
        if encoder is None:
            encoder = OneHotEncoder(handle_unknown='ignore',
                                    drop='if_binary')
            encoder.fit(data)
            # Column names must come from the freshly fitted encoder.
            encoder_col = None

        if encoder_col is None:
            # Previously a supplied encoder without names produced integer
            # column labels.
            encoder_col = encoder.get_feature_names_out(data.columns)

        encoded = encoder.transform(data)
        if hasattr(encoded, "toarray"):
            # Sparse output (the encoder default) must be densified first.
            encoded = encoded.toarray()

        data_encoded = pd.DataFrame(encoded,
                                    index=data.index,
                                    columns=encoder_col)

        return data_encoded, encoder_col, encoder

    def _LE_cat(self, data, encoder=None) -> tuple:
        """Label-encode every column of ``data`` without mutating the input.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical columns to encode.
        encoder : dict or fitted LabelEncoder, optional
            ``{column: fitted LabelEncoder}`` to reuse, or one fitted encoder
            applied to every column. When omitted, one ``LabelEncoder`` is
            fitted per column.

        Returns
        -------
        tuple
            ``(data_encoded, encoders)`` where ``encoders`` maps each column
            name to the encoder used for it. (The original returned a single
            encoder that had only seen the last column, and raised
            ``NameError`` whenever ``encoder`` was supplied.)
        """
        data_encoded = data.copy()
        columns = data_encoded.columns.to_list()

        if encoder is None:
            encoders = {col: LabelEncoder().fit(data_encoded[col])
                        for col in columns}
        elif hasattr(encoder, "transform"):
            # A single fitted encoder: apply it to every column.
            encoders = {col: encoder for col in columns}
        else:
            encoders = encoder

        for col in columns:
            data_encoded[col] = encoders[col].transform(data_encoded[col])

        return data_encoded, encoders

    def _concat_data(self, nominal, ordinal_ohe=None, ordinal_le=None):
        """Encode the categorical inputs and append them to ``nominal``.

        Parameters
        ----------
        nominal : pd.DataFrame
            Frame the encoded columns are appended to (the caller passes the
            imputed numeric features).
        ordinal_ohe : pd.DataFrame, optional
            Columns to one-hot encode; skipped when ``None`` or empty.
        ordinal_le : pd.DataFrame, optional
            Columns to label-encode; skipped when ``None`` or empty.

        Returns
        -------
        pd.DataFrame
            ``nominal`` followed by the one-hot columns and then the
            label-encoded columns. With no usable categorical input a copy of
            ``nominal`` is returned (previously an ``UnboundLocalError``).
        """
        parts = [nominal]

        # Each encoder is fitted exactly once; the original encoded everything
        # twice when both inputs were supplied.
        if ordinal_ohe is not None and not ordinal_ohe.empty:
            X_train_ohe, _, _ = self._OHE_Cat(ordinal_ohe)
            parts.append(X_train_ohe)

        if ordinal_le is not None and not ordinal_le.empty:
            X_train_le, _ = self._LE_cat(data=ordinal_le)
            parts.append(X_train_le)

        return pd.concat(parts, axis=1)

    def _standardize_Data(self, data, scaler=None) -> tuple:
        """Standardize ``data`` column-wise.

        Parameters
        ----------
        data : pd.DataFrame
            Fully numeric frame.
        scaler : fitted scaler, optional
            Reused as-is when given; otherwise a ``StandardScaler`` is fitted
            on ``data``.

        Returns
        -------
        tuple
            ``(data_scaled, scaler)``.
        """
        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(data)

        data_scaled = pd.DataFrame(scaler.transform(data),
                                   index=data.index,
                                   columns=data.columns)

        return data_scaled, scaler

    def _handling_data(self, data, encoding='le'):
        """Run the full preprocessing pipeline on ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Features plus the label column(s) named by ``config_data['label']``.
        encoding : str, default ``'le'``
            ``'le'`` label-encodes all categorical columns, ``'ohe'`` one-hot
            encodes all of them; any other value one-hot encodes the columns
            listed under ``config_data['nominal']`` and label-encodes those
            under ``config_data['ordinal']``.

        Returns
        -------
        tuple
            ``(X_clean, y)`` -- standardized features and the untouched label.
        """
        X_num, X_cat = self._split_xy(data)

        X_num = self._imputer_Num(data=X_num)
        X_cat = self._imputer_Cat(data=X_cat)

        if encoding == 'le':
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_le=X_cat)

        elif encoding == 'ohe':
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_ohe=X_cat)

        else:
            # The original indexed config_data with the undefined names
            # `nominal` / `ordinal` (NameError).
            # NOTE(review): string keys are assumed -- confirm both exist in
            # the config and list column names.
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_ohe=X_cat[config_data['nominal']],
                                   ordinal_le=X_cat[config_data['ordinal']])

        X_clean, scaler = self._standardize_Data(data=X_)

        # Keep the fitted scaler reachable instead of silently dropping it.
        self.scaler = scaler

        return X_clean, self.y
                      

## Preprocessing Data

    We generated the dataset with the filter method of feature selection, <br>
    not the other two (Lasso or Random Forest)

In [5]:
# Instantiate the preprocessing class imported from `src`; its `_handling_data`
# method is called for every split in the cells below.
preprocessor_ = _PreprocessingData()

    Training dataset

In [6]:
# Preprocess each training set with label encoding, one call per feature-selection
# method. NOTE(review): `method=` presumably tells the preprocessor which
# encoder/scaler artifacts to fit and persist (they are loaded again by
# `_Concat_Preprocessing` for the valid/test splits) -- confirm in src._PreprocessingData.
X_train_rf, y_train_rf = preprocessor_._handling_data(data=train_set_rf, 
                                             encoding='label_encoder',
                                             method='random_forest')

X_train_lasso, y_train_lasso = preprocessor_._handling_data(data=train_set_lasso,
                                                            encoding='label_encoder',
                                                            method='lasso')

X_train_filter, y_train_filter = preprocessor_._handling_data(data=train_set_filter,
                                                              encoding='label_encoder',
                                                              method='filter')
                                                            

2023-06-15 15:55:59.132839 Split numeric and categoric data
2023-06-15 15:55:59.156806 Perform imputer.
2023-06-15 15:55:59.304382 Perform label encoding.
2023-06-15 15:55:59.397132 Perform Standardizing data.
2023-06-15 15:55:59.460962 Split numeric and categoric data
2023-06-15 15:55:59.480910 Perform imputer.
2023-06-15 15:55:59.596676 Perform label encoding.
2023-06-15 15:55:59.692421 Perform Standardizing data.
2023-06-15 15:55:59.732314 Split numeric and categoric data
2023-06-15 15:55:59.746276 Perform imputer.
2023-06-15 15:55:59.848004 Perform label encoding.
2023-06-15 15:55:59.927795 Perform Standardizing data.


In [14]:
# Group the preprocessed training features and labels by feature-selection key
# (same key order as before: filter, lasso, rf).
X_train = dict(filter=X_train_filter, lasso=X_train_lasso, rf=X_train_rf)

y_train = dict(filter=y_train_filter, lasso=y_train_lasso, rf=y_train_rf)

    Create a function to handle the validation and test datasets

In [13]:
def _Concat_Preprocessing(data_filter, data_lasso, data_rf):
    """Preprocess one split (validation or test) for all three feature sets.

    For each feature-selection key the pickled label encoder
    (``config_data['le_encoder_path_<key>']``) and scaler
    (``config_data['scaler_<key>']``) are loaded once and handed to
    ``preprocessor_._handling_data`` together with the matching dataset.

    Parameters
    ----------
    data_filter, data_lasso, data_rf : pd.DataFrame
        The split's dataset for the 'filter', 'lasso' and 'rf' feature sets.

    Returns
    -------
    tuple
        ``(X, y)`` -- two dicts keyed ``"filter"``, ``"lasso"``, ``"rf"``
        holding the preprocessed features and the labels.
    """
    # Processed in the original order (rf -> lasso -> filter) so the log output
    # keeps its sequence. Each artifact is unpickled once; the original loaded
    # every encoder and scaler twice and never used the first copies.
    datasets = {"rf": data_rf, "lasso": data_lasso, "filter": data_filter}

    processed = {}
    for key, dataset in datasets.items():
        label_encoder = util.pickle_load(config_data[f'le_encoder_path_{key}'])
        scaler = util.pickle_load(config_data[f'scaler_{key}'])

        processed[key] = preprocessor_._handling_data(data=dataset,
                                                      encoding='label_encoder',
                                                      label_encod=label_encoder,
                                                      standard_scaler=scaler)

    # Output dicts keep the original key order: filter, lasso, rf.
    output_order = ("filter", "lasso", "rf")
    X = {key: processed[key][0] for key in output_order}
    y = {key: processed[key][1] for key in output_order}

    return X, y

    Validation dataset

In [11]:
# Preprocess the validation split for all three feature sets using the pickled
# encoders/scalers loaded inside `_Concat_Preprocessing`.
X_valid, y_valid = _Concat_Preprocessing(data_filter=valid_set_filter,
                                         data_lasso=valid_set_lasso,
                                         data_rf=valid_set_rf
                                         )

2023-06-15 15:56:29.411381 Split numeric and categoric data
2023-06-15 15:56:29.420360 Perform imputer.
2023-06-15 15:56:29.460252 Perform label encoding.
2023-06-15 15:56:29.504134 Perform Standardizing data.
2023-06-15 15:56:29.519094 Split numeric and categoric data
2023-06-15 15:56:29.525079 Perform imputer.
2023-06-15 15:56:29.553999 Perform label encoding.
2023-06-15 15:56:29.575942 Perform Standardizing data.
2023-06-15 15:56:29.584918 Split numeric and categoric data
2023-06-15 15:56:29.588908 Perform imputer.
2023-06-15 15:56:29.609849 Perform label encoding.
2023-06-15 15:56:29.630794 Perform Standardizing data.


    Test Dataset

In [12]:
# Preprocess the test split the same way as the validation split.
X_test, y_test = _Concat_Preprocessing(data_filter = test_set_filter,
                                         data_lasso = test_set_lasso,
                                         data_rf = test_set_rf
                                         )

2023-06-15 15:56:30.701568 Split numeric and categoric data
2023-06-15 15:56:30.707553 Perform imputer.
2023-06-15 15:56:30.747447 Perform label encoding.
2023-06-15 15:56:30.789335 Perform Standardizing data.
2023-06-15 15:56:30.833217 Split numeric and categoric data
2023-06-15 15:56:30.839202 Perform imputer.
2023-06-15 15:56:30.875106 Perform label encoding.
2023-06-15 15:56:30.900038 Perform Standardizing data.
2023-06-15 15:56:30.909016 Split numeric and categoric data
2023-06-15 15:56:30.914001 Perform imputer.
2023-06-15 15:56:30.940933 Perform label encoding.
2023-06-15 15:56:30.961874 Perform Standardizing data.


## Dump and Save the Dataset

In [15]:
# Persist every cleaned split: features go to the first configured path, labels
# to the second, in train -> valid -> test order.
for _split_key, _features, _labels in (
    ("train_set_clean", X_train, y_train),
    ("valid_set_clean", X_valid, y_valid),
    ("test_set_clean", X_test, y_test),
):
    util.pickle_dump(_features, config_data[_split_key][0])
    util.pickle_dump(_labels, config_data[_split_key][1])