**Table of contents**<a id='toc0_'></a>    
- [Import Libraries](#toc1_)    
    - [Import Configuration File](#toc1_1_1_)    
    - [Load Dataset](#toc1_1_2_)    
- [<b>Removing Outlier</b>](#toc2_)    
- [<b>Handling Missing Value</b>](#toc3_)    
  - [Splitting data into X_train & y_train](#toc3_1_)    
  - [Splitting data into Numeric & Categoric Features](#toc3_2_)    
  - [Handling Numeric Features](#toc3_3_)    
    - [Handling missing value on numeric features](#toc3_3_1_)    
  - [Handling Categorical Data](#toc3_4_)    
    - [Handling missing value for Categoric Features](#toc3_4_1_)    
- [<b>Label Encoding</b>](#toc4_)    
  - [One Hot Encoder (OHE)](#toc4_1_)    
  - [Ordinal Encoding](#toc4_2_)    
  - [Encoding Categoric Features](#toc4_3_)    
  - [Concatenate Imputed Numeric & Encoded Categoric Features](#toc4_4_)    
- [<b>Scaling Data</b>](#toc5_)    
- [<b>Handling Label Data</b>](#toc6_)    
  - [Label Categories](#toc6_1_)    
  - [Balancing Train Data](#toc6_2_)    
- [<b>Handling Valid & Test set</b>](#toc7_)    
  - [Function](#toc7_1_)    
  - [Handling Non-Balancing Valid & Test set](#toc7_2_)    
  - [Handling Smote Data Set](#toc7_3_)    
  - [Handling Over Data Set](#toc7_4_)    
- [<b> Dump Train, Valid & Test Set </b>](#toc8_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Import Libraries](#toc0_)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util

from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler

### <a id='toc1_1_1_'></a>[Import Configuration File](#toc0_)

In [8]:
config_data = util.load_config()

### <a id='toc1_1_2_'></a>[Load Dataset](#toc0_)

In [9]:
def load_dataset(
    config_data: dict, config: str
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Load the EDA-stage train, valid and test sets for one sampling variant.

    For each split, ``config_data["<split>_set_eda"]`` is read as a pair of
    pickle paths (features first, labels second). Each pickle is indexed with
    ``config`` and the selected features and labels are joined column-wise.

    Parameters
    ----------
    config_data : dict
        Configuration mapping holding the "train_set_eda", "valid_set_eda"
        and "test_set_eda" path pairs.
    config : str
        Key selecting the variant inside each pickle (the notebook uses
        'nonbalance', 'smote' and 'oversampling').

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``, each with features and label
        side by side.
    """

    def _load_split(paths_key: str) -> pd.DataFrame:
        # One helper for all three splits keeps them loaded identically.
        x_all = util.pickle_load(config_data[paths_key][0])
        y_all = util.pickle_load(config_data[paths_key][1])
        return pd.concat([x_all[config], y_all[config]], axis=1)

    train_set = _load_split("train_set_eda")
    valid_set = _load_split("valid_set_eda")
    test_set = _load_split("test_set_eda")

    return train_set, valid_set, test_set

In [10]:
# Load the train/valid/test frames for each of the three sampling variants.
train_set, valid_set, test_set = load_dataset(config_data, 'nonbalance')
train_set_smote, valid_set_smote, test_set_smote = load_dataset(config_data, 'smote')
train_set_over, valid_set_over, test_set_over = load_dataset(config_data, 'oversampling')

In [11]:
# reverse_map = {0: 'N', 1: 'Y'}

# train_set.fraud_reported = train_set.fraud_reported.map(reverse_map)

# <a id='toc2_'></a>[<b>Removing Outlier</b>](#toc0_)

In [12]:
def remove_outlier(set_data, columns=None):
    """Drop rows that are IQR outliers in any of the given numeric columns.

    A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR`` of its column. A row is kept only if none of its values
    in ``columns`` is an outlier. Missing values are never flagged (both
    comparisons are False for NaN), so rows with NaN are kept.

    The surviving rows keep their original order, and genuinely duplicated
    rows are preserved.

    Parameters
    ----------
    set_data : pd.DataFrame
        Data to filter; it is not modified.
    columns : list of str, optional
        Columns to check. Defaults to ``config_data['numeric_eda']``.

    Returns
    -------
    pd.DataFrame
        A copy of the rows that pass the check in every column.
    """
    if columns is None:
        columns = config_data['numeric_eda']

    # One mask over all columns: the number of checks always matches the
    # number of columns inspected, with no index counting required.
    keep = np.ones(len(set_data), dtype=bool)

    for col in columns:
        values = set_data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        is_outlier = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
        keep &= ~is_outlier.to_numpy()

    return set_data.loc[keep].copy()


In [13]:
# NOTE(review): train_set_out is not referenced by any later cell shown here;
# the following steps continue from train_set — confirm whether that is intended.
train_set_out = remove_outlier(train_set)

# <a id='toc3_'></a>[<b>Handling Missing Value</b>](#toc0_)
----

## <a id='toc3_1_'></a>[Splitting data into X_train & y_train](#toc0_)

In [14]:
def split_xy(set_data):
    """Split a data set into its feature columns and its label column.

    The label column name is read from ``config_data['label']``.
    Returns ``(features, label)``.
    """
    label_name = config_data['label']

    features = set_data.drop(columns = label_name)
    target = set_data[label_name]

    return features, target

In [15]:
# Separate the training features from the training label.
x_train, y_train = split_xy(train_set)

## <a id='toc3_2_'></a>[Splitting data into Numeric & Categoric Features](#toc0_)

In [16]:
def split_NumCat(set_data):
    """Select the numeric and the categoric feature columns of a data set.

    Column names come from ``config_data['numeric_eda']`` and
    ``config_data['categoric_eda']``. Returns ``(numeric, categoric)``.
    """
    numeric_cols = config_data['numeric_eda']
    categoric_cols = config_data['categoric_eda']

    return set_data[numeric_cols], set_data[categoric_cols]

In [17]:
# Split the training features into numeric and categoric column groups.
x_train_num, x_train_cat = split_NumCat(x_train)

## <a id='toc3_3_'></a>[Handling Numeric Features](#toc0_)

In [18]:
# Per-column flag: does the numeric training data contain any missing value?
x_train_num.isna().any()

injury_claim      False
property_claim    False
vehicle_claim     False
dtype: bool

### <a id='toc3_3_1_'></a>[Handling missing value on numeric features](#toc0_)

In [19]:
# Perform sanity check for any missing value for future data

def imputer_Num(data, imputer = None):
    """Impute missing numeric values and cast the result to int32.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature columns.
    imputer : fitted imputer, optional
        Object exposing ``transform``. When omitted, a median
        ``SimpleImputer`` is created and fitted on ``data``.

    Returns
    -------
    tuple
        ``(data_imputed, imputer)``: the imputed frame (same index and
        columns as ``data``) and the imputer that was used.
    """
    # Identity check: `== None` would go through the object's __eq__.
    if imputer is None:
        # Create imputer based on median value
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "median")
        imputer.fit(data)

    # Transform the data with the imputer
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    # Convert to int32; a fractional imputed value (e.g. a median of x.5)
    # is truncated by this cast.
    data_imputed = data_imputed.astype('int32')

    return data_imputed, imputer

In [20]:
# Fit the numeric imputer on the training data and keep it for later reuse.
x_train_num_imputed, imputer_num = imputer_Num(data = x_train_num)

## <a id='toc3_4_'></a>[Handling Categorical Data](#toc0_)

### <a id='toc3_4_1_'></a>[Handling missing value for Categoric Features](#toc0_)

In [21]:
def imputer_Cat(data, imputer = None):
    """Impute missing categoric values.

    Parameters
    ----------
    data : pd.DataFrame
        Categoric feature columns.
    imputer : fitted imputer, optional
        Object exposing ``transform``. When omitted, a ``SimpleImputer``
        that fills missing values with the constant ``'UNKNOWN'`` is created
        and fitted on ``data``.

    Returns
    -------
    tuple
        ``(data_imputed, imputer)``: the imputed frame (same index and
        columns as ``data``) and the imputer that was used.
    """
    # Identity check: `== None` would go through the object's __eq__.
    if imputer is None:
        # Create Imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = 'constant',
                                fill_value = 'UNKNOWN')
        imputer.fit(data)

    # Transform data with imputer; the frame is rebuilt so the index and
    # column labels of the input are kept.
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [22]:
# Fit the categoric imputer on the training data and keep it for later reuse.
x_train_cat_imputed, imputer_cat = imputer_Cat(data = x_train_cat)

# <a id='toc4_'></a>[<b>Label Encoding</b>](#toc0_)
----

## <a id='toc4_1_'></a>[One Hot Encoder (OHE)](#toc0_)

In [23]:
# Preview the imputed categoric features.
x_train_cat_imputed.head()

Unnamed: 0,insured_hobbies,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,property_damage
887,exercise,Parked Car,UNKNOWN,Minor Damage,UNKNOWN,VA,NO
317,camping,Multi-vehicle Collision,Side Collision,Total Loss,Ambulance,VA,YES
796,reading,Single Vehicle Collision,Rear Collision,Major Damage,Ambulance,NY,UNKNOWN
425,basketball,Multi-vehicle Collision,Side Collision,Major Damage,Police,WV,UNKNOWN
991,basketball,Single Vehicle Collision,Rear Collision,Total Loss,Other,WV,NO


In [24]:
# Count how often each (column, category value) pair occurs: melt to long
# form (the value column is named 'index' here), count the pairs, then sort
# the result by column name.
pd.DataFrame(x_train_cat_imputed
            .melt(var_name='columns', value_name='index')
            .value_counts()).sort_values(by=['columns'])

Unnamed: 0_level_0,Unnamed: 1_level_0,0
columns,index,Unnamed: 2_level_1
authorities_contacted,Ambulance,154
authorities_contacted,Police,235
authorities_contacted,UNKNOWN,82
authorities_contacted,Other,156
authorities_contacted,Fire,173
collision_type,Side Collision,212
collision_type,UNKNOWN,152
collision_type,Rear Collision,232
collision_type,Front Collision,204
incident_severity,Trivial Damage,75


In [25]:
# Categoric columns that OHE_cat one-hot encodes.
nominal = ['authorities_contacted', 'incident_state', 'insured_hobbies',  'property_damage']
# Categoric columns that OE_cat encodes as ordered integer codes.
ordinal = ['collision_type', 'incident_type', 'incident_severity']

In [26]:
def OHE_cat(data, encoder_col = None, encoder = None) -> tuple:
    """One-hot encode the columns listed in the module-level ``nominal``.

    Parameters
    ----------
    data : pd.DataFrame
        Imputed categoric features; must contain every ``nominal`` column.
    encoder_col : sequence of str, optional
        Output column names. Ignored when a new encoder is fitted here;
        derived from the encoder when omitted.
    encoder : fitted OneHotEncoder, optional
        When omitted, a new encoder is created
        (``handle_unknown='ignore'``, ``drop='if_binary'``) and fitted.

    Returns
    -------
    tuple
        ``(data_encoded, encoder_col, encoder)``.

    Notes
    -----
    The encoder is pickled to ``config_data["ohe_path"]`` on every call,
    including calls that only reuse a supplied encoder.
    """
    data_ohe = data[nominal]

    fitted_here = encoder is None
    if fitted_here:
        # Create Object
        encoder = OneHotEncoder(handle_unknown = 'ignore',
                                drop = 'if_binary')
        encoder.fit(data_ohe)

    # Derive the names when they were not supplied, so a reused encoder
    # without `encoder_col` no longer yields unnamed integer columns.
    if fitted_here or encoder_col is None:
        encoder_col = encoder.get_feature_names_out(data_ohe.columns)

    # Transform the data (the encoder output is sparse, hence .toarray()).
    data_encoded = pd.DataFrame(encoder.transform(data_ohe).toarray(),
                                index = data_ohe.index,
                                columns = encoder_col)

    # Save the object
    util.pickle_dump(encoder, config_data["ohe_path"])

    return data_encoded, encoder_col, encoder


## <a id='toc4_2_'></a>[Ordinal Encoding](#toc0_)

In [27]:
def OE_cat(data, encoder = None) -> tuple:
    """Ordinal-encode the columns listed in the module-level ``ordinal``.

    Parameters
    ----------
    data : pd.DataFrame
        Imputed categoric features; must contain every ``ordinal`` column.
    encoder : fitted OrdinalEncoder, optional
        When omitted, a new encoder is created with the explicit category
        orders below and fitted on ``data``.

    Returns
    -------
    tuple
        ``(data_encoded, encoder)``.

    Notes
    -----
    The encoder is pickled to ``config_data["le_path"]`` on every call,
    including calls that only reuse a supplied encoder.
    """
    data_le = data[ordinal]

    # Category order per column, lowest code first. Keyed by column name so
    # the orders cannot get out of step with the order of `ordinal`.
    category_orders = {
        'collision_type': ['UNKNOWN', 'Side Collision', 'Rear Collision', 'Front Collision'],
        'incident_severity': ['Trivial Damage','Minor Damage','Major Damage','Total Loss'],
        'incident_type': ['Parked Car','Single Vehicle Collision','Multi-vehicle Collision','Vehicle Theft'],
    }

    if encoder is None:
        # Create object
        encoder = OrdinalEncoder(categories=[category_orders[col] for col in ordinal])
        encoder.fit(data_le)

    ## Transform the data
    data_encoded = pd.DataFrame(encoder.transform(data_le),
                                index = data_le.index,
                                columns = data_le.columns)

    # save the object
    util.pickle_dump(encoder, config_data["le_path"])

    return data_encoded, encoder

## <a id='toc4_3_'></a>[Encoding Categoric Features](#toc0_)

In [28]:
# Fit both encoders on the imputed training categoricals. Each call also
# pickles its encoder to the path configured in config_data.
x_train_cat_ohe, encoder_ohe_col, encoder_ohe = OHE_cat(data = x_train_cat_imputed)
x_train_cat_oe, encoder_oe = OE_cat(data = x_train_cat_imputed)

## <a id='toc4_4_'></a>[Concatenate Imputed Numeric & Encoded Categoric Features](#toc0_)

In [29]:
# Join numeric, one-hot and ordinal features column-wise.
# NOTE(review): preprocessing_data.fit below concatenates in the order
# numeric, ordinal, one-hot, so the two pipelines produce different column orders.
x_train_concat = pd.concat([x_train_num_imputed, x_train_cat_ohe, x_train_cat_oe], axis=1)

In [30]:
## Sanity check: preview the combined numeric + encoded feature frame
x_train_concat.head()

Unnamed: 0,injury_claim,property_claim,vehicle_claim,authorities_contacted_Ambulance,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,authorities_contacted_UNKNOWN,incident_state_NC,incident_state_NY,...,insured_hobbies_skydiving,insured_hobbies_sleeping,insured_hobbies_video-games,insured_hobbies_yachting,property_damage_NO,property_damage_UNKNOWN,property_damage_YES,collision_type,incident_type,incident_severity
887,640,640,5120,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
317,5360,10720,48240,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,3.0
796,16860,8430,67440,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,2.0
425,6080,12160,48640,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0
991,0,5220,41760,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,3.0


# <a id='toc5_'></a>[<b>Scaling Data</b>](#toc0_)

In [31]:
def scaling_Data(data, scaler=None):
    """Scale every column of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Feature frame to scale.
    scaler : fitted scaler, optional
        Object exposing ``transform``. When omitted, a ``StandardScaler`` is
        created and fitted on ``data``.

    Returns
    -------
    tuple
        ``(data_scaled, scaler)``: the scaled frame (same index and columns
        as ``data``) and the scaler that was used.
    """
    # Identity check: `== None` would go through the object's __eq__.
    if scaler is None:
        # Create Fit Scaler
        scaler = StandardScaler()
        scaler.fit(data)

    # Transform data and restore the index and column labels of the input.
    data_scaled = pd.DataFrame(scaler.transform(data),
                               index = data.index,
                               columns = data.columns)

    return data_scaled, scaler

In [32]:
# Fit the scaler on all training columns (numeric, one-hot and ordinal alike).
x_train_scaled, scaler = scaling_Data(data = x_train_concat)

In [33]:
# (rows, columns) of the scaled training features.
x_train_scaled.shape

(800, 41)

In [34]:
# Preview the scaled training features.
x_train_scaled.head()

Unnamed: 0,injury_claim,property_claim,vehicle_claim,authorities_contacted_Ambulance,authorities_contacted_Fire,authorities_contacted_Other,authorities_contacted_Police,authorities_contacted_UNKNOWN,incident_state_NC,incident_state_NY,...,insured_hobbies_skydiving,insured_hobbies_sleeping,insured_hobbies_video-games,insured_hobbies_yachting,property_damage_NO,property_damage_UNKNOWN,property_damage_YES,collision_type,incident_type,incident_severity
887,-1.361866,-1.367596,-1.676726,-0.488252,-0.525278,-0.492175,-0.644926,2.95907,-0.36489,-0.586973,...,-0.238334,-0.204124,-0.223313,-0.226381,1.355315,-0.745936,-0.639101,-1.51597,-1.900962,-0.765344
317,-0.407244,0.694777,0.557802,2.048122,-0.525278,-0.492175,-0.644926,-0.337944,-0.36489,-0.586973,...,-0.238334,-0.204124,-0.223313,-0.226381,-0.737836,-0.745936,1.564697,-0.574374,0.618954,1.296189
796,1.918636,0.226242,1.552769,2.048122,-0.525278,-0.492175,-0.644926,-0.337944,-0.36489,1.703655,...,-0.238334,-0.204124,-0.223313,-0.226381,-0.737836,1.340598,-0.639101,0.367223,-0.641004,0.265422
425,-0.261624,0.989402,0.578531,-0.488252,-0.525278,-0.492175,1.550566,-0.337944,-0.36489,-0.586973,...,-0.238334,-0.204124,-0.223313,-0.226381,-0.737836,1.340598,-0.639101,-0.574374,0.618954,0.265422
991,-1.491306,-0.430526,0.222001,-0.488252,-0.525278,2.031798,-0.644926,-0.337944,-0.36489,-0.586973,...,-0.238334,-0.204124,-0.223313,-0.226381,1.355315,-0.745936,-0.639101,0.367223,-0.641004,1.296189


# <a id='toc6_'></a>[<b>Handling Label Data</b>](#toc0_)

In [35]:
# Encode the label as 1 for 'Y' and 0 for 'N'; any other value maps to NaN.
y_train_clean = y_train.map(dict(Y=1, N=0))

In [36]:
# --- Check that X and y have the same number of rows --- #

x_train_scaled.shape[0] == y_train_clean.shape[0]

True

In [37]:
# Re-attach the encoded label to the scaled features (column-wise, aligned on index).
train_set_clean = pd.concat([x_train_scaled, y_train_clean], axis=1)

## <a id='toc6_1_'></a>[Label Categories](#toc0_)

In [38]:
def le_fit(data_tobe_fitted: dict, le_path: str) -> LabelEncoder:
    """Fit a LabelEncoder on the given labels, pickle it, and return it.

    ``data_tobe_fitted`` is handed straight to ``LabelEncoder.fit``.
    NOTE(review): it is annotated as ``dict``; confirm the configured value
    is actually a sequence of label values.
    The fitted encoder is written to ``le_path`` via ``util.pickle_dump``.
    """
    # fit() returns the encoder itself, so creation and fitting can be chained.
    fitted_encoder = LabelEncoder().fit(data_tobe_fitted)

    # Persist the fitted encoder before handing it back.
    util.pickle_dump(fitted_encoder, le_path)

    return fitted_encoder

In [39]:
# Fit and pickle the label encoder; the returned encoder is not kept in a variable.
le_fit(config_data["label_categories"], config_data["le_label_path"])

## <a id='toc6_2_'></a>[Balancing Train Data](#toc0_)

In [40]:
def balancing(data):
    """Build two class-balanced versions of a labelled data set.

    The label column (``config_data['label']``) is separated from the
    features, then the data is resampled once with ``RandomOverSampler`` and
    once with ``SMOTE``, both with ``random_state=42``.

    Parameters
    ----------
    data : pd.DataFrame
        Features plus the label column.

    Returns
    -------
    tuple
        ``(x_smote, y_smote, x_over, y_over)``.
    """
    label_name = config_data['label']
    x_data = data.drop(columns = label_name)
    y_data = data[label_name]

    x_over, y_over = RandomOverSampler(random_state=42).fit_resample(x_data, y_data)
    x_smote, y_smote = SMOTE(random_state=42).fit_resample(x_data, y_data)

    # The combined frames the original built here were never used or
    # returned, so they are no longer constructed.
    return x_smote, y_smote, x_over, y_over

In [41]:
# NOTE(review): x_smote/y_smote and x_over/y_over are reassigned further down
# by preprocessing_smote_.fit and preprocessing_over_.fit, so these resampled
# frames are not the ones that get dumped at the end of the notebook.
x_smote, y_smote, x_over, y_over = balancing(train_set_clean)

# <a id='toc7_'></a>[<b>Handling Valid & Test set</b>](#toc0_)

Here I wrap the steps above in a single class so that the validation and test sets can be processed the same way as the training set; the class can also be moved into a .py file later.

## <a id='toc7_1_'></a>[Function](#toc0_)

In [42]:
class preprocessing_data:
    """Run the notebook's preprocessing steps (impute, encode, scale) on one data set.

    Each step method stores its result and the transformer it used as
    instance attributes and returns ``None``; ``fit`` chains the steps.
    Passing transformers to ``fit`` applies them as given instead of fitting
    new ones, which is how the valid and test sets are processed.

    Parameters
    ----------
    config : dict, optional
        Configuration mapping providing the 'label', 'numeric_eda' and
        'categoric_eda' entries and the pickle paths used by
        ``dumping_variable``. Defaults to the module-level ``config_data``.
    """

    # Categoric columns that are one-hot encoded.
    NOMINAL_COLS = ['authorities_contacted', 'incident_state', 'insured_hobbies',  'property_damage']

    # Ordinally encoded columns mapped to their category order (lowest code
    # first). The key order is the column order handed to the encoder.
    ORDINAL_CATEGORIES = {
        'collision_type': ['UNKNOWN', 'Side Collision', 'Rear Collision', 'Front Collision'],
        'incident_type': ['Parked Car','Single Vehicle Collision','Multi-vehicle Collision','Vehicle Theft'],
        'incident_severity': ['Trivial Damage','Minor Damage','Major Damage','Total Loss'],
    }

    def __init__(self, config=None):
        # Fall back to the notebook-level configuration when none is given.
        self.config = config_data if config is None else config

    def split_xy(self, set_data) -> None:
        """Store the features in ``self.x`` and the 0/1-encoded label in ``self.y``."""
        label_name = self.config['label']

        self.x = set_data.drop(columns = label_name)
        # 'Y' -> 1, 'N' -> 0; any other label value becomes NaN.
        self.y = set_data[label_name].map(dict(Y=1, N=0))

    def split_NumCat(self, set_data) -> None:
        """Store the numeric columns in ``self.num`` and the categoric ones in ``self.cat``."""
        self.num = set_data[self.config['numeric_eda']]
        self.cat = set_data[self.config['categoric_eda']]

    def imputer_Num(self, data, imputer = None) -> None:
        """Impute numeric features; result in ``self.imputed_num`` (int32).

        Without ``imputer`` a median ``SimpleImputer`` is fitted on ``data``.
        The imputer used is kept in ``self.imputer_num``.
        """
        if imputer is None:
            # Create imputer based on median value
            imputer = SimpleImputer(missing_values = np.nan,
                                    strategy = "median")
            imputer.fit(data)

        self.imputer_num = imputer

        # Transform the data with the imputer
        imputed_data = pd.DataFrame(imputer.transform(data),
                                    index = data.index,
                                    columns = data.columns)

        # Convert to int32 (fractional imputed values are truncated)
        self.imputed_num = imputed_data.astype('int32')

    def imputer_Cat(self, data, imputer = None) -> None:
        """Impute categoric features; result in ``self.imputed_cat``.

        Without ``imputer`` a constant-fill (``'UNKNOWN'``) ``SimpleImputer``
        is fitted on ``data``. The imputer used is kept in ``self.imputer_cat``.
        """
        if imputer is None:
            # Create Imputer
            imputer = SimpleImputer(missing_values = np.nan,
                                    strategy = 'constant',
                                    fill_value = 'UNKNOWN')
            imputer.fit(data)

        self.imputer_cat = imputer

        # Transform data with imputer
        self.imputed_cat = pd.DataFrame(imputer.transform(data),
                                        index = data.index,
                                        columns = data.columns)

    def OHE_cat(self, data, encoder = None) -> None:
        """One-hot encode ``NOMINAL_COLS``; result in ``self.encoded_ohe``.

        Without ``encoder`` a ``OneHotEncoder`` is fitted on ``data``.
        The encoder used is kept in ``self.encoder_ohe``.
        """
        data_ohe = data[self.NOMINAL_COLS]

        if encoder is None:
            # Create Object
            encoder = OneHotEncoder(handle_unknown = 'ignore',
                                    drop = 'if_binary')
            encoder.fit(data_ohe)

        self.encoder_ohe = encoder

        # Output column names always come from the encoder actually used.
        encoder_col = encoder.get_feature_names_out(data_ohe.columns)

        # Transform the data (the encoder output is sparse, hence .toarray()).
        self.encoded_ohe = pd.DataFrame(encoder.transform(data_ohe).toarray(),
                                        index = data_ohe.index,
                                        columns = encoder_col)

    def OE_cat(self, data, encoder = None) -> None:
        """Ordinal-encode the ``ORDINAL_CATEGORIES`` columns; result in ``self.encoded_oe``.

        Without ``encoder`` an ``OrdinalEncoder`` with the explicit category
        orders is fitted on ``data``. The encoder used is kept in
        ``self.encoder_oe``.
        """
        ordinal_cols = list(self.ORDINAL_CATEGORIES)
        data_le = data[ordinal_cols]

        if encoder is None:
            # Create object
            encoder = OrdinalEncoder(
                categories=[self.ORDINAL_CATEGORIES[col] for col in ordinal_cols]
            )
            encoder.fit(data_le)

        self.encoder_oe = encoder

        ## Transform the data
        self.encoded_oe = pd.DataFrame(encoder.transform(data_le),
                                       index = data_le.index,
                                       columns = data_le.columns)

    def scaling_Data(self, data, scaler=None) -> None:
        """Scale all columns of ``data``; result in ``self.scaled_data``.

        Without ``scaler`` a ``StandardScaler`` is fitted on ``data``.
        The scaler used is kept in ``self.scaler``.
        """
        if scaler is None:
            # Create Fit Scaler
            scaler = StandardScaler()
            scaler.fit(data)

        self.scaler = scaler

        # Transform data
        self.scaled_data = pd.DataFrame(scaler.transform(data),
                                        index = data.index,
                                        columns = data.columns)

    def fit(
        self,
        data,
        imputer_num=None, imputer_cat=None,
        encoder_ohe=None, encoder_oe=None,
        scaler=None,
        y=True
    ):
        """Run the whole pipeline on ``data`` and return the processed result.

        Despite the name, this both fits (where no transformer is supplied)
        and transforms.

        Parameters
        ----------
        data : pd.DataFrame
            Data set to process. Must include the label column when ``y`` is
            true; is treated as features only when ``y`` is false.
        imputer_num, imputer_cat, encoder_ohe, encoder_oe, scaler : optional
            Transformers to apply as given. Any that is omitted is created
            and fitted on ``data``.
        y : bool, default True
            Whether ``data`` contains the label column.

        Returns
        -------
        tuple or pd.DataFrame
            ``(x_clean, y)`` when ``y`` is true, otherwise ``x_clean`` only.
        """
        # Separate the label first when the data carries one.
        if y:
            self.split_xy(data)
        else:
            # Features only: the input frame itself is the feature matrix.
            self.x = data

        # Split features into numeric and categoric
        self.split_NumCat(self.x)

        # Handling numeric features
        self.imputer_Num(self.num, imputer=imputer_num)

        # Handling categoric features
        self.imputer_Cat(self.cat, imputer=imputer_cat)

        # Label Encoding
        self.OHE_cat(self.imputed_cat, encoder=encoder_ohe)

        self.OE_cat(self.imputed_cat, encoder=encoder_oe)

        # Concatenate imputed numeric and encoded categoric features.
        # The order (numeric, ordinal, one-hot) fixes the column order of
        # the output.
        concatenated_data = pd.concat([self.imputed_num, self.encoded_oe, self.encoded_ohe], axis=1)

        # Scaling data
        self.scaling_Data(concatenated_data, scaler=scaler)

        self.x_clean = self.scaled_data

        if y:
            return self.x_clean, self.y

        return self.x_clean

    def return_data(self):
        """Return the processed features of the most recent ``fit`` call."""
        return self.x_clean

    def return_xy(self):
        """Return the processed features and the encoded label stored by ``fit``."""
        # The encoded label is stored as `self.y` by split_xy.
        return self.x_clean, self.y

    def dumping_variable(self) -> None:
        """Pickle the five stored transformers to the paths given in the config."""
        util.pickle_dump(self.imputer_num, self.config['imputer_num'])
        util.pickle_dump(self.imputer_cat, self.config['imputer_cat'])
        util.pickle_dump(self.encoder_ohe, self.config['ohe_path'])
        util.pickle_dump(self.encoder_oe, self.config['le_path'])
        util.pickle_dump(self.scaler, self.config['scaler_path'])


Apply the class to the training set:

In [43]:
# Fit every transformer on the (non-balanced) training set.
preprocessing_train_ = preprocessing_data()

# Rebinds x_train / y_train to the preprocessed features and encoded label.
x_train, y_train = preprocessing_train_.fit(train_set)

# Pickle the fitted transformers so the cells below can reload them.
preprocessing_train_.dumping_variable()

## <a id='toc7_2_'></a>[Handling Non-Balancing Valid & Test set](#toc0_)

In [44]:
# Process the validation set with the transformers reloaded from the pickles
# written by dumping_variable() above; nothing is refitted here.
x_valid, y_valid = preprocessing_train_.fit(
                        valid_set,
                        imputer_num = util.pickle_load(config_data["imputer_num"]),
                        imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                        encoder_oe = util.pickle_load(config_data["le_path"]),
                        encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                        scaler = util.pickle_load(config_data['scaler_path'])
)

# Same for the test set.
x_test, y_test = preprocessing_train_.fit(
                        test_set,
                        imputer_num = util.pickle_load(config_data["imputer_num"]),
                        imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                        encoder_oe = util.pickle_load(config_data["le_path"]),
                        encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                        scaler = util.pickle_load(config_data['scaler_path'])
)

## <a id='toc7_3_'></a>[Handling Smote Data Set](#toc0_)

In [45]:
# Fit a fresh set of transformers on the SMOTE training set.
preprocessing_smote_ = preprocessing_data()

x_smote, y_smote = preprocessing_smote_.fit(train_set_smote)

# NOTE(review): this writes to the same config paths as the non-balanced
# pipeline above, so those pickles are overwritten — confirm this is intended.
preprocessing_smote_.dumping_variable()

In [46]:
# Process the SMOTE-variant validation set with the transformers reloaded
# from the pickles written in the previous cell.
x_valid_smote, y_valid_smote = preprocessing_smote_.fit(
                                valid_set_smote,
                                imputer_num = util.pickle_load(config_data["imputer_num"]),
                                imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                                encoder_oe = util.pickle_load(config_data["le_path"]),
                                encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                                scaler = util.pickle_load(config_data['scaler_path'])
)

# Same for the SMOTE-variant test set.
x_test_smote, y_test_smote = preprocessing_smote_.fit(
                                test_set_smote,
                                imputer_num = util.pickle_load(config_data["imputer_num"]),
                                imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                                encoder_oe = util.pickle_load(config_data["le_path"]),
                                encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                                scaler = util.pickle_load(config_data['scaler_path'])
)

## <a id='toc7_4_'></a>[Handling Over Data Set](#toc0_)

In [47]:
# Fit a fresh set of transformers on the oversampled training set.
preprocessing_over_ = preprocessing_data()

x_over, y_over = preprocessing_over_.fit(train_set_over)

# NOTE(review): this again overwrites the pickles at the shared config paths,
# so the transformers left on disk after the notebook runs are the ones fitted
# on the oversampled training set — confirm this is intended.
preprocessing_over_.dumping_variable()

In [48]:
# Process the oversampling-variant validation set with the transformers
# reloaded from the pickles written in the previous cell.
x_valid_over, y_valid_over = preprocessing_over_.fit(
                                valid_set_over,
                                imputer_num = util.pickle_load(config_data["imputer_num"]),
                                imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                                encoder_oe = util.pickle_load(config_data["le_path"]),
                                encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                                scaler = util.pickle_load(config_data['scaler_path'])
)

# Same for the oversampling-variant test set.
x_test_over, y_test_over = preprocessing_over_.fit(
                                test_set_over,
                                imputer_num = util.pickle_load(config_data["imputer_num"]),
                                imputer_cat = util.pickle_load(config_data["imputer_cat"]),
                                encoder_oe = util.pickle_load(config_data["le_path"]),
                                encoder_ohe = util.pickle_load(config_data["ohe_path"]),
                                scaler = util.pickle_load(config_data['scaler_path'])
)

In [49]:
# Class proportions of the oversampled training label.
y_over.value_counts(normalize = True)

0    0.5
1    0.5
Name: fraud_reported, dtype: float64

# <a id='toc8_'></a>[<b> Dump Train, Valid & Test Set </b>](#toc0_)

In [50]:
# Group the processed training features by sampling variant.
x_train_final = {
    "nonbalance" : x_train,
    "smote" : x_smote,
    "oversampling" : x_over
}

# Matching training labels, keyed the same way.
y_train_final = {
    "nonbalance" : y_train,
    "smote" : y_smote,
    "oversampling" : y_over
}

In [51]:
# Group the processed validation features by sampling variant.
x_valid_final = {
    "nonbalance" : x_valid,
    "smote" : x_valid_smote,
    "oversampling" : x_valid_over
}

# Matching validation labels, keyed the same way.
y_valid_final = {
    "nonbalance" : y_valid,
    "smote" : y_valid_smote,
    "oversampling" : y_valid_over
}

In [52]:
# Group the processed test features by sampling variant.
x_test_final = {
    "nonbalance" : x_test,
    "smote" : x_test_smote,
    "oversampling" : x_test_over
}

# Matching test labels, keyed the same way.
y_test_final = {
    "nonbalance" : y_test,
    "smote" : y_test_smote,
    "oversampling" : y_test_over
}

In [53]:
# Pickle each split; every config entry is a pair of paths
# (index 0 for the features dict, index 1 for the labels dict).
util.pickle_dump(x_train_final, config_data['train_set_clean'][0])
util.pickle_dump(y_train_final, config_data['train_set_clean'][1])

util.pickle_dump(x_valid_final, config_data['valid_set_clean'][0])
util.pickle_dump(y_valid_final, config_data['valid_set_clean'][1])

util.pickle_dump(x_test_final, config_data['test_set_clean'][0])
util.pickle_dump(y_test_final, config_data['test_set_clean'][1])