### 0. Load Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

### Import Configuration File

In [77]:
config_data = util.load_config()

### Load Dataset

In [38]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    # Load set of data
    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # return 3 set of data
    return train_set, valid_set, test_set

In [39]:
train_set, valid_set, test_set = load_dataset(config_data)

## 2. Removing Outlier

In [6]:
def remove_outlier(set_data):
    set_data = set_data.copy()
    list_of_set_data = list()

    # set_data = set_data.drop(['umbrella_limit'], axis = 1)
    # config_data_num = config_data['int32_col'].copy()
    # config_data_num = [x for x in config_data_num if x != 'umbrella_limit']

    for col in set_data[config_data['int32_col']]:
        q1 = set_data[col].quantile(0.25)
        q3 = set_data[col].quantile(0.75)
        iqr = q3 - q1

        set_data_cleaned = set_data[~((set_data[col] < (q1 - 1.5*iqr)) |
                                    (set_data[col] > (q3 + 1.5*iqr)))].copy()
        list_of_set_data.append(set_data_cleaned.copy())

    set_data_cleaned = pd.concat(list_of_set_data)
    count_duplicated_index = set_data_cleaned.index.value_counts()
    used_index_data = count_duplicated_index[count_duplicated_index == len(config_data['int32_col'])].index
    set_data_cleaned = set_data_cleaned.loc[used_index_data].drop_duplicates()

    return set_data_cleaned


In [114]:
train_set_out = remove_outlier(train_set)

### 2. Handling Missing Value

### 2.1 Splitting data into X_train & y_train

In [40]:
def splitxy(set_data):
    x_data = set_data.drop(columns = config_data['label'], axis = 1)
    y_data = set_data[config_data['label']]

    return x_data, y_data

In [41]:
x_train, y_train = splitxy(train_set)

### 2.2 Splitting data into Numerical & Categorical

In [42]:
def splitNumCat(set_data):
    numerical_col = config_data['int32_col']
    categorical_col = config_data['object_predictor']

    x_train_num = set_data[numerical_col]
    x_train_cat = set_data[categorical_col]

    return  x_train_num, x_train_cat

In [43]:
x_train_num, x_train_cat = splitNumCat(x_train)

### 2.3 Handling numerical Data

In [44]:
x_train_num.isna().any()

months_as_customer          False
age                         False
policy_number               False
policy_annual_premium       False
insured_zip                 False
capital-gains               False
capital-loss                False
incident_hour_of_the_day    False
total_claim_amount          False
injury_claim                False
property_claim              False
vehicle_claim               False
dtype: bool

#### 2.3.1 Handling missing value on numerical data

In [45]:
# Perform sanity check for any missing value for future data

def imputerNum(data, imputer = None):
    if imputer == None:
        # Create imputer based on median value
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "median")
        imputer.fit(data)

    # Transform data dengan imputer
    # else:
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)
    
    # Convert data_imputed to int32
    data_imputed = data_imputed.astype('int32')
    
    return data_imputed, imputer

In [46]:
x_train_num_imputed, imputer_num = imputerNum(data = x_train_num)

### 2.4 Handling Categorical Data

#### 2.4.1 Handling missing value for Categorical Data

In [47]:
def imputerCat(data, imputer = None):
    data.umbrella_limit = data.umbrella_limit.replace('-1000000','1000000')

    for col in ['collision_type','property_damage','police_report_available']:
        data[col] = data[col].replace('?', 'UNKNOWN')
        
    if imputer == None:
        # Create Imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = 'constant',
                                fill_value = 'UNKNOWN')
        imputer.fit(data)

    # Transform data with imputer
    data_imputed = imputer.transform(data)
    data_imputed = pd.DataFrame(data_imputed,
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [48]:
x_train_cat_imputed, imputer_cat = imputerCat(data = x_train_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.umbrella_limit = data.umbrella_limit.replace('-1000000','1000000')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].replace('?', 'UNKNOWN')


### 2.4.2 One Hot Encoder

In [70]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

nominal = ['policy_state','policy_csl','policy_deductable','insured_sex','insured_hobbies','collision_type',
            'authorities_contacted','incident_state','incident_city','property_damage','police_report_available',
            'auto_make','auto_model']
ordinal = ['incident_type','witnesses','incident_severity','auto_year','umbrella_limit','bodily_injuries',
            'number_of_vehicles_involved']

def OHEcat(data, encoder_col = None, encoder = None) -> pd.DataFrame:

    data_ohe = data[nominal]

    if encoder == None:
        # Create Object
        encoder = OneHotEncoder(handle_unknown = 'ignore',
                                drop = 'if_binary')
        encoder.fit(data_ohe)
        encoder_col = encoder.get_feature_names_out(data_ohe.columns)
    
    
    # Transform the data
    data_encoded = encoder.transform(data_ohe).toarray()
    data_encoded = pd.DataFrame(data_encoded,
                                index = data_ohe.index,
                                columns = encoder_col)
    
    # Save the object
    util.pickle_dump(encoder, config_data["ohe_stasiun_path"])

    return data_encoded, encoder_col, encoder


### 2.4.4 Label Encoding

In [74]:
def LEcat(data, encoder = None) -> pd.DataFrame:

    data_le = data[ordinal]

    bodily_injuries = ['0','1','2']
    witnesses = ['0','1','2','3']
    umbrella_limit = ['0', '1000000', '2000000', '3000000', '4000000', '5000000', '6000000',
                      '7000000','8000000','9000000','10000000']
    incident_severity = ['Trivial Damage','Minor Damage','Major Damage','Total Loss']
    incident_type = ['Parked Car','Single Vehicle Collision','Multi-vehicle Collision','Vehicle Theft']
    auto_year = sorted(data_le.auto_year.unique())
    number_of_vehicles_involved = ['1','2','3','4']

    if encoder == None:
        # Create object
        encoder = OrdinalEncoder(categories=[incident_type,witnesses,incident_severity,auto_year,
                                   umbrella_limit,bodily_injuries,number_of_vehicles_involved])
        encoder.fit(data_le)

    ## Transform the data
    data_encoded = encoder.transform(data_le)
    data_encoded = pd.DataFrame(data_encoded,
                                index = data_le.index,
                                columns = data_le.columns)
    
    # save the object
    util.pickle_dump(encoder, config_data["le_encoder_path"])

    return data_encoded, encoder

### 2.4.5 Encoding data categorical

In [51]:
x_train_cat_ohe, encoder_ohe_col, encoder_ohe = OHEcat(data = x_train_cat_imputed)
x_train_cat_le, encoder_le = LEcat(data = x_train_cat_imputed)

In [52]:
x_train_cat_concat = pd.concat([x_train_cat_ohe,x_train_cat_le], axis = 1)

## 2.5 Concatenate Numerical data & Categorical encoded data

In [53]:
x_train_concat = pd.concat([x_train_num_imputed, x_train_cat_concat], axis=1)


In [54]:
def concat_numcat(data_num, data_cat_ohe, data_cat_le):
    data_cat = pd.concat([data_cat_ohe, data_cat_le], axis=1)
    data_concat = pd.concat([data_num, data_cat], axis=1)

    return data_concat

In [55]:
## Sanity Check
x_train_concat.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_annual_premium,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,total_claim_amount,injury_claim,...,auto_model_Wrangler,auto_model_X5,auto_model_X6,incident_type,witnesses,incident_severity,auto_year,umbrella_limit,bodily_injuries,number_of_vehicles_involved
887,441,55,669501,1270,449421,24000,-50500,4,6400,640,...,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0
317,275,45,403737,1447,605756,39400,-63900,8,64320,5360,...,0.0,0.0,0.0,2.0,1.0,3.0,3.0,0.0,1.0,2.0
796,421,56,728025,1935,470826,49500,-81100,7,92730,16860,...,0.0,0.0,0.0,1.0,3.0,2.0,9.0,4.0,2.0,0.0
425,4,34,424358,1282,616126,0,0,0,66880,6080,...,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,0.0,3.0
991,257,44,109392,1280,433981,59400,-32200,21,46980,0,...,0.0,0.0,0.0,1.0,1.0,3.0,7.0,0.0,0.0,0.0


## 2.6 Standardize the value

In [56]:
def standardizeData(data, scaler =None):
    if scaler == None:
        # Create Fit Scaler
        scaler = StandardScaler()
        scaler.fit(data)

    # Transform data
    data_scaled = scaler.transform(data)
    data_scaled = pd.DataFrame(data_scaled,
                                index = data.index,
                                columns = data.columns)
    
    return data_scaled, scaler

In [57]:
x_train_clean, scaler = standardizeData(data = x_train_concat)

In [58]:
x_train_clean.shape

(800, 131)

## 2.7 Change Label into Int format

In [59]:
y_train_clean = y_train.map(dict(Y=1, N=0))

In [60]:
x_train_clean.shape[0] == y_train_clean.shape[0]

True

In [61]:
train_set_clean = pd.concat([x_train_clean, y_train_clean], axis=1)

## 2.8 Label Categories

In [78]:
def le_fit(data_tobe_fitted: dict, le_path: str) -> LabelEncoder:
    # Create le object
    le_encoder = LabelEncoder()

    # Fit le
    le_encoder.fit(data_tobe_fitted)

    # Save le object
    util.pickle_dump(le_encoder, le_path)

    # Return trained le
    return le_encoder

In [79]:
le_fit(config_data["label_categories"], config_data["le_label_path"])

LabelEncoder()

## 2.9 Balancing Train Data

In [62]:
def balancing(data):
    x_data = data.drop(columns = config_data['label'])
    y_data = data[config_data['label']]

    x_over, y_over = RandomOverSampler(random_state=42).fit_resample(x_data, y_data)
    x_smote, y_smote = SMOTE(random_state=42).fit_resample(x_data, y_data)

    train_set_smote = pd.concat([x_smote, y_smote], axis = 1)
    train_set_over = pd.concat([x_over, y_over], axis = 1)

    return x_smote, y_smote, x_over, y_over

In [63]:
x_smote, y_smote, x_over, y_over = balancing(train_set_clean)

## 2.9 Handle Valid and Test data

In [64]:
# Handling valid data using previous function
## Splitting into num & cat, imputer num & cat, ohe & le, standarization

def handlingData(set_data):
    # Split data into x_data, y_data
    x_data, y_data = splitxy(set_data)

    # Split x_data into numerical and categorical
    x_data_num, x_data_cat = splitNumCat(x_data)

    # Encoding categorical data by separating it into OHE and Ordinal..
    nominal = ['policy_state','policy_csl','policy_deductable','insured_sex','insured_hobbies','collision_type',
                'authorities_contacted','incident_state','incident_city','property_damage','police_report_available',
                'auto_make','auto_model']
    ordinal = ['incident_type','witnesses','incident_severity','auto_year','umbrella_limit','bodily_injuries',
            'number_of_vehicles_involved']

    # Impute num data
    x_data_num_imputed, imputer_num_ = imputerNum(data = x_data_num, imputer = imputer_num)

    # Impute cat data
    x_data_cat_imputed, imputer_cat_ = imputerCat(data = x_data_cat, imputer = imputer_cat)

    x_data_cat_ohe, encoder_col_, encoder_ = OHEcat(x_data_cat_imputed, encoder_ohe_col,
                                                            encoder_ohe)
    x_data_cat_le, encoder_ = LEcat(x_data_cat_imputed, encoder_le)

    x_data_cat_concat = pd.concat([x_data_cat_ohe, x_data_cat_le], axis=1)
        
    # Concatenate data numeric and categorical
    x_data_concat = pd.concat([x_data_num_imputed, x_data_cat_concat], axis = 1)

    # Standardize data using standarscaler
    x_data_clean, scaler_ = standardizeData(x_data_concat, scaler)

    y_data_clean = y_data.map(dict(Y=1, N=0))

    train_set_clean = pd.concat([x_data_clean, y_data_clean], axis=1)

    x_smote_set, y_smote_set, x_over_set, y_over_set = balancing(train_set_clean)

    return x_data_clean, y_data_clean, x_smote_set, y_smote_set, x_over_set, y_over_set


## 2.10 Create smote and oversampling data

In [65]:
x_valid_clean, y_valid_clean, \
x_valid_smote_clean, y_valid_smote_clean, \
x_valid_over_clean, y_valid_over_clean  = handlingData(valid_set)

x_test_clean, y_test_clean, \
x_test_smote_clean, y_test_smote_clean, \
x_test_over_clean, y_test_over_clean = handlingData(test_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.umbrella_limit = data.umbrella_limit.replace('-1000000','1000000')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].replace('?', 'UNKNOWN')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.umbrella_limit = data.umbrella_limit.replace('-1000000','1000000')
A value is tryi

In [66]:
y_test_clean.value_counts(normalize = True)

0    0.76
1    0.24
Name: fraud_reported, dtype: float64

## 3. DUMP TRAINSET

In [67]:
x_train_final = {
    "nonbalance" : x_train_clean,
    "smote" : x_smote,
    "oversampling" : x_over
}

y_train_final = {
    "nonbalance" : y_train_clean,
    "smote" : y_smote,
    "oversampling" : y_over
}

In [69]:
util.pickle_dump(x_train_final, config_data['train_set_clean'][0])
util.pickle_dump(y_train_final, config_data['train_set_clean'][1])

util.pickle_dump(x_valid_clean, config_data['valid_set_clean'][0])
util.pickle_dump(y_valid_clean, config_data['valid_set_clean'][1])

util.pickle_dump(x_test_clean, config_data['test_set_clean'][0])
util.pickle_dump(y_test_clean, config_data['test_set_clean'][1])