In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from src import _PreprocessingData

In [2]:
# Load the project configuration; later cells read file paths and the label column name from it.
config_data = util.load_config()

## Collect Data Set

In [3]:
def load_dataset(config_data: dict, config: str) -> tuple:
    """Load the pickled EDA splits and join features with target for one variant.

    Parameters
    ----------
    config_data : dict
        Project configuration. The keys ``"train_set_eda"``, ``"valid_set_eda"``
        and ``"test_set_eda"`` must each hold a two-item sequence:
        ``[path_to_X_pickle, path_to_y_pickle]``.
    config : str
        Key used to index both the loaded X object and the loaded y object
        (e.g. ``'rf'``).
        NOTE(review): presumably the pickles are dicts keyed by
        feature-selection method -- confirm against the EDA notebook.

    Returns
    -------
    tuple
        ``(dataset_train, dataset_valid, dataset_test)``; each item is the
        column-wise concatenation of ``X[config]`` and ``y[config]``.
    """
    # NOTE(review): pickle files execute arbitrary code on load; only read
    # artefacts produced by this project.
    datasets = []
    for split_key in ("train_set_eda", "valid_set_eda", "test_set_eda"):
        features_path, target_path = config_data[split_key][0], config_data[split_key][1]
        X_split = util.pickle_load(features_path)
        y_split = util.pickle_load(target_path)
        # axis=1 places the target column(s) to the right of the features.
        datasets.append(pd.concat([X_split[config], y_split[config]], axis=1))

    dataset_train, dataset_valid, dataset_test = datasets
    return dataset_train, dataset_valid, dataset_test

In [11]:
# Build the train / validation / test frames from the 'rf' entry of the pickled EDA sets.
train_set, valid_set, test_set = load_dataset(config_data, config='rf')

## Check Data

## Function Preprocessing data

- Split the dataset into numerical and categorical columns, <br>
- Impute missing values with **SimpleImputer** (median for numerical columns, most frequent value for categorical columns) <br>
- Encode categorical data: **LabelEncoder** for ordinal features and **OHE** (one-hot encoding) for non-ordinal features <br>
- Suggestion: use **OrdinalEncoder** with an explicitly defined rank order for each ordinal feature <br>
- Finally, standardize the result with **StandardScaler**

In [42]:
class _handling_data:
    """Preprocess one dataset split: split by dtype, impute, encode, standardize.

    Transformers fitted along the way are kept on the instance
    (``imputer_num``, ``imputer_cat``, ``encoder_ohe``, ``encoder_ohe_col``,
    ``encoder_le``, ``scaler``) so they can be passed back in when another
    split has to be transformed with the same fitted objects.

    The class reads the notebook-level ``config_data`` dict for the label
    column name (``config_data['label']``).
    """

    def __init__(self):
        # Filled in by the methods below; None means "not computed yet".
        self.X = None
        self.y = None
        self.data_imputed_num = None
        self.data_imputed_cat = None
        self.imputer_num = None
        self.imputer_cat = None
        self.encoder_ohe = None
        self.encoder_ohe_col = None
        self.encoder_le = None
        self.scaler = None

    def _split_xy(self, data: pd.DataFrame) -> tuple:
        """Separate the label and split the features by dtype.

        Stores the full feature frame in ``self.X`` and the label in ``self.y``.

        Returns
        -------
        tuple
            ``(X_num, X_cat)``: the ``float64`` columns and the ``object``
            columns of the feature frame. Columns of any other dtype (for
            example ``int64``) are in neither frame.
        """
        label = config_data['label']
        X_data = data.drop(columns=label)
        y_data = data[label]

        self.X = X_data
        self.y = y_data

        numerical_col = X_data.select_dtypes('float64').columns.to_list()
        categorical_col = X_data.select_dtypes('object').columns.to_list()

        X_num = X_data[numerical_col]
        X_cat = X_data[categorical_col]

        return X_num, X_cat

    def _imputer_Num(self, data, imputer=None):
        """Fill missing numerical values and cast the result to ``int64``.

        Parameters
        ----------
        data : pd.DataFrame
            Numerical columns.
        imputer : fitted imputer, optional
            Reused as-is when given; otherwise a median ``SimpleImputer`` is
            fitted on ``data``.

        Returns
        -------
        pd.DataFrame
            Imputed frame with the index and columns of ``data``; also stored
            in ``self.data_imputed_num``.
        """
        if data.shape[1] == 0:
            # Nothing to impute; fitting an imputer on zero columns would fail.
            data_imputed_num = data.copy()
            self.data_imputed_num = data_imputed_num
            return data_imputed_num

        if imputer is None:
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='median')
            imputer.fit(data)
        self.imputer_num = imputer

        data_imputed_num = pd.DataFrame(imputer.transform(data),
                                        index=data.index,
                                        columns=data.columns)

        # NOTE(review): this cast drops any fractional part of the values.
        # Presumably the columns are whole numbers stored as float64 only
        # because of NaN -- confirm before relying on it.
        data_imputed_num = data_imputed_num.astype('int64')

        self.data_imputed_num = data_imputed_num

        return data_imputed_num

    def _imputer_Cat(self, data, imputer=None) -> pd.DataFrame:
        """Fill missing categorical values.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical columns.
        imputer : fitted imputer, optional
            Reused as-is when given; otherwise a most-frequent
            ``SimpleImputer`` is fitted on ``data``.

        Returns
        -------
        pd.DataFrame
            Imputed frame with the index and columns of ``data``; also stored
            in ``self.data_imputed_cat``.
        """
        if data.shape[1] == 0:
            # Nothing to impute; fitting an imputer on zero columns would fail.
            data_imputed_cat = data.copy()
            self.data_imputed_cat = data_imputed_cat
            return data_imputed_cat

        if imputer is None:
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='most_frequent')
            imputer.fit(data)
        self.imputer_cat = imputer

        data_imputed_cat = pd.DataFrame(imputer.transform(data),
                                        index=data.index,
                                        columns=data.columns)

        self.data_imputed_cat = data_imputed_cat

        return data_imputed_cat

    def _OHE_Cat(self, data, encoder_col=None, encoder=None) -> tuple:
        """One-hot encode categorical columns.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical columns to encode.
        encoder_col : sequence of str, optional
            Output column names. Ignored (recomputed) when a new encoder is
            fitted; derived from ``encoder`` when an encoder is passed
            without names.
        encoder : fitted one-hot encoder, optional
            Reused as-is when given; otherwise a ``OneHotEncoder`` is fitted
            on ``data``.

        Returns
        -------
        tuple
            ``(data_encoded, encoder_col, encoder)``.
        """
        if encoder is None:
            encoder = OneHotEncoder(handle_unknown='ignore',
                                    drop='if_binary')
            encoder.fit(data)
            encoder_col = None  # always recompute names for a fresh fit

        if encoder_col is None:
            encoder_col = encoder.get_feature_names_out(data.columns)

        data_encoded = encoder.transform(data)
        # Sparse results need densifying; dense results are used directly.
        if hasattr(data_encoded, 'toarray'):
            data_encoded = data_encoded.toarray()

        data_encoded = pd.DataFrame(data_encoded,
                                    index=data.index,
                                    columns=encoder_col)

        return data_encoded, encoder_col, encoder

    def _LE_cat(self, data, encoder=None) -> tuple:
        """Label-encode every column of ``data`` without modifying ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Categorical columns to encode.
        encoder : dict or fitted encoder, optional
            * ``None``: fit one ``LabelEncoder`` per column.
            * ``dict``: maps column name to a fitted encoder; each column is
              transformed with its own encoder.
            * anything else: treated as a single fitted encoder whose
              ``transform`` is applied to every column.

        Returns
        -------
        tuple
            ``(data_encoded, encoders)``. ``encoders`` is a dict
            ``{column: LabelEncoder}`` when fitted here, otherwise the
            ``encoder`` argument unchanged.
        """
        # Work on a copy so the caller's frame keeps its original labels.
        data_encoded = data.copy()
        columns = data_encoded.columns.to_list()

        if encoder is None:
            # One encoder per column: a LabelEncoder only remembers the
            # classes of the last column it was fitted on.
            le_encoder = {}
            for col in columns:
                col_encoder = LabelEncoder()
                data_encoded[col] = col_encoder.fit_transform(data_encoded[col])
                le_encoder[col] = col_encoder
        elif isinstance(encoder, dict):
            le_encoder = encoder
            for col in columns:
                data_encoded[col] = le_encoder[col].transform(data_encoded[col])
        else:
            le_encoder = encoder
            for col in columns:
                data_encoded[col] = le_encoder.transform(data_encoded[col])

        return data_encoded, le_encoder

    def _concat_data(self, nominal, ordinal_ohe=None, ordinal_le=None,
                     encoder_ohe=None, encoder_ohe_col=None, encoder_le=None):
        """Encode the categorical frames and append them to ``nominal``.

        Parameters
        ----------
        nominal : pd.DataFrame
            Frame placed first in the result (the imputed numerical columns
            when called from ``_handling_data``).
        ordinal_ohe : pd.DataFrame, optional
            Columns to one-hot encode; skipped when ``None`` or empty.
        ordinal_le : pd.DataFrame, optional
            Columns to label-encode; skipped when ``None`` or empty.
        encoder_ohe, encoder_ohe_col : optional
            Fitted one-hot encoder and its column names, forwarded to
            ``_OHE_Cat``.
        encoder_le : optional
            Fitted label encoder(s), forwarded to ``_LE_cat``.

        Returns
        -------
        pd.DataFrame
            Columns in the order ``nominal``, one-hot columns, label-encoded
            columns. ``nominal`` itself is returned when there is nothing to
            encode.
        """
        parts = [nominal]

        if ordinal_ohe is not None and not ordinal_ohe.empty:
            X_ohe, self.encoder_ohe_col, self.encoder_ohe = self._OHE_Cat(
                ordinal_ohe, encoder_col=encoder_ohe_col, encoder=encoder_ohe)
            parts.append(X_ohe)

        if ordinal_le is not None and not ordinal_le.empty:
            X_le, self.encoder_le = self._LE_cat(data=ordinal_le,
                                                 encoder=encoder_le)
            parts.append(X_le)

        if len(parts) == 1:
            # No categorical input at all: nothing to concatenate.
            return nominal

        return pd.concat(parts, axis=1)

    def _standardize_Data(self, data, scaler=None) -> tuple:
        """Standardize every column of ``data``.

        Parameters
        ----------
        data : pd.DataFrame
            Fully numeric frame.
        scaler : fitted scaler, optional
            Reused as-is when given; otherwise a ``StandardScaler`` is fitted
            on ``data``.

        Returns
        -------
        tuple
            ``(data_scaled, scaler)``.
        """
        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(data)

        data_scaled = pd.DataFrame(scaler.transform(data),
                                   index=data.index,
                                   columns=data.columns)

        return data_scaled, scaler

    def _handling_data(self, data, encoding='le', label_encod=None,
                       standardscaler=None):
        """Run the whole pipeline on one split.

        Parameters
        ----------
        data : pd.DataFrame
            Features plus the label column named by ``config_data['label']``.
        encoding : str
            * ``'le'`` or ``'label_encoder'``: label-encode every categorical
              column.
            * ``'ohe'``: one-hot encode every categorical column.
            * any other value: one-hot encode ``config_data['nominal']`` and
              label-encode ``config_data['ordinal']``.
        label_encod : optional
            Fitted label encoder(s) to reuse (see ``_LE_cat``).
        standardscaler : optional
            Fitted scaler to reuse (see ``_standardize_Data``).

        Returns
        -------
        tuple
            ``(X_clean, y)``: the standardized feature frame and the label.
        """
        X_num, X_cat = self._split_xy(data)

        # NOTE(review): both imputers are refitted on every call. To
        # transform validation/test data with training statistics, call
        # `_imputer_Num` / `_imputer_Cat` directly with the fitted imputers
        # kept in `self.imputer_num` / `self.imputer_cat`.
        X_num = self._imputer_Num(data=X_num)
        X_cat = self._imputer_Cat(data=X_cat)

        if encoding in ('le', 'label_encoder'):
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_le=X_cat,
                                   encoder_le=label_encod)

        elif encoding == 'ohe':
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_ohe=X_cat)

        else:
            # NOTE(review): the original indexed config_data with the bare
            # names `nominal` / `ordinal`, which are undefined here. String
            # keys are assumed -- confirm against the config file.
            X_ = self._concat_data(nominal=X_num,
                                   ordinal_ohe=X_cat[config_data['nominal']],
                                   ordinal_le=X_cat[config_data['ordinal']],
                                   encoder_le=label_encod)

        X_clean, scaler = self._standardize_Data(data=X_,
                                                 scaler=standardscaler)
        self.scaler = scaler

        return X_clean, self.y
                      

## Preprocessing Data

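    This dataset was generated with the filter method of feature selection, <br>
    not the other two (Lasso or Random Forest). <br>
    Note: the loading cell above passes `config='rf'` and the outputs are named `*_rf`; confirm which feature-selection variant is intended.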

In [5]:
# Preprocessor imported from `src`; this is not the `_handling_data` class defined in the cell above.
preprocessor_ = _PreprocessingData()

    Training dataset

In [6]:
# Preprocess the training split with label encoding.
# NOTE(review): presumably this call fits and saves the encoder and scaler that the next cell loads -- confirm in `src`.
X_train_rf, y_train = preprocessor_._handling_data(data=train_set, 
                                             encoding='label_encoder')

In [7]:
# Load the label encoder and scaler from the paths in the config; they are passed to the validation and test calls below.
# NOTE(review): presumably these files are written by the training call above -- confirm in `src`.
le_encoder = util.pickle_load(config_data['le_encoder_path'])
scaler = util.pickle_load(config_data['scaler'])

    Validation dataset

In [8]:
# Preprocess the validation split, passing in the loaded label encoder and scaler.
X_valid_rf, y_valid = preprocessor_._handling_data(data=valid_set,
                                                    encoding='label_encoder',
                                                    label_encod=le_encoder,
                                                    standardscaler=scaler)

    Test Dataset

In [9]:
# Preprocess the test split, passing in the same loaded label encoder and scaler as for validation.
X_test_rf, y_test = preprocessor_._handling_data(data=test_set,
                                                encoding='label_encoder',
                                                label_encod=le_encoder,
                                                standardscaler=scaler)

## Dump and Save the Dataset

In [10]:
# Persist every cleaned split: features go to the first path of the config entry, the target to the second.
cleaned_splits = (
    ("train_set_clean", X_train_rf, y_train),
    ("valid_set_clean", X_valid_rf, y_valid),
    ("test_set_clean", X_test_rf, y_test),
)

for config_key, features, target in cleaned_splits:
    util.pickle_dump(features, config_data[config_key][0])
    util.pickle_dump(target, config_data[config_key][1])