# Feature Engineering
- Create Aggregate Features
- Extract Features
- Encode Categorical Variables
- Handle Missing Values
- Normalize/Standardize Numerical Features

In [113]:
# Third-party libraries plus the stdlib helper used to reload project modules.
import pandas as pd
import numpy as np
import importlib

import os
import sys

# Put the sibling '../src' directory on the import path. The relative path is
# resolved against the current working directory, so this assumes the notebook
# is run from its own folder. Presumably this is where `eda` and `feature_eng`
# live — TODO confirm.
sys.path.append(os.path.abspath(os.path.join('..','src')))

# Import the project modules and reload them immediately so that edits made to
# their source files are picked up without restarting the kernel.
import eda
importlib.reload(eda)

import feature_eng
importlib.reload(feature_eng)

# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the session (deprecations, pandas warnings, ...).
# Consider a narrower filter so real problems stay visible.
import warnings
warnings.filterwarnings('ignore')

In [114]:
# Initialize the feature engineering class with the 'data.csv' file and apply
# its data type changes (the output below shows category / datetime64 dtypes).
# NOTE(review): 'data.csv' is a relative path; where FeatEng looks for it is
# defined in feature_eng — confirm.
fre_eng = feature_eng.FeatEng('data.csv')
fre_eng.change_datatype()

# Keep a reference to the current dataframe for inspection in later cells.
df = fre_eng.get_dataframe()

New shape of the dataframe: (95662, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   TransactionId         95662 non-null  category      
 1   BatchId               95662 non-null  category      
 2   AccountId             95662 non-null  category      
 3   SubscriptionId        95662 non-null  category      
 4   CustomerId            95662 non-null  category      
 5   ProviderId            95662 non-null  category      
 6   ProductId             95662 non-null  category      
 7   ProductCategory       95662 non-null  category      
 8   ChannelId             95662 non-null  category      
 9   Amount                95662 non-null  float64       
 10  Value                 95662 non-null  int64         
 11  TransactionStartTime  95662 non-null  datetime64[ns]
 12  PricingStrategy       95662 non-nu

In [115]:
# Add the aggregate transaction features. The printed shape grows from 14 to
# 20 columns; presumably these are the Amount_Total, Amount_Average,
# Amount_std, TransactionId_frequency, Recency and FraudResult_severity columns
# visible in a later cell's output — TODO confirm against feature_eng.
fre_eng.aggregate_transaction()     # calculate RFMS values

# Extract each part of the datetime (Month, Year, Day and Hour columns appear
# in later output; the printed shape reaches 24 columns).
fre_eng.extract_datetime()

# Change categorical features to numerical: the former `category` columns are
# reported as int64 in the output below.
fre_eng.label_encoder()

(95662, 20)
Shape of the dataframe (95662, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   TransactionId            95662 non-null  int64         
 1   BatchId                  95662 non-null  int64         
 2   AccountId                95662 non-null  int64         
 3   SubscriptionId           95662 non-null  int64         
 4   CustomerId               95662 non-null  int64         
 5   ProviderId               95662 non-null  int64         
 6   ProductId                95662 non-null  int64         
 7   ProductCategory          95662 non-null  int64         
 8   ChannelId                95662 non-null  int64         
 9   Amount                   95662 non-null  float64       
 10  Value                    95662 non-null  int64         
 11  TransactionStartTime     95662 non-null  datet

In [116]:
# Report which columns contain null values (prints the column name).
fre_eng.null_col()

# Fetch the current dataframe and display the rows where Amount_std is null.
# NOTE(review): the displayed frame is empty even though null_col() reports
# Amount_std — presumably null_col() also fills the nulls it finds; confirm.
df = fre_eng.get_dataframe()
df[df['Amount_std'].isnull()]

Column containing null value: Amount_std


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,...,Amount_Total,Amount_Average,Amount_std,TransactionId_frequency,Recency,FraudResult_severity,Month,Year,Day,Hour


In [117]:
# Standardization of the data.
# NOTE(review): the method is named normalization() while this comment says
# standardization; which scaling is applied, and to which columns, is defined
# in feature_eng — confirm.
fre_eng.normalization()

# Add a risk score column (must run after the scaling step above, in the
# order used here).
fre_eng.risk_score()

- Amount_std is null for customers with only one transaction, because a sample standard deviation cannot be computed from a single value.

In [118]:
# Build the risk space. The call prints a 2 x 4 array, which the markdown
# below reads as two cluster centers; a later cell reads a 'Cluster' column,
# presumably added by this call — TODO confirm.
# NOTE(review): if the clustering is randomly initialized, confirm a fixed
# seed is set in feature_eng so the cluster labels are reproducible.
fre_eng.risk_space()

[[-5.02732029e-03  1.19919614e-03 -2.73165603e-02  6.96326779e-02]
 [ 1.18243814e+00 -2.82053893e-01  6.42492243e+00  2.24716049e+01]]


> Looking at the cluster centers above:
- Compared with Cluster 1, Cluster 0 is more recent, more frequent, and lower in both monetary value and severity. Cluster 0 therefore represents customers with non-defaulting behaviour, while Cluster 1 represents defaulting behaviour.

In [119]:
# Check the class balance of the FraudResult column.
# NOTE(review): `df` was last fetched in an earlier cell, before
# normalization(), risk_score() and risk_space() were called. Confirm that
# get_dataframe() returns a live reference, otherwise this frame is stale.
df['FraudResult'].value_counts()

FraudResult
0    95469
1      193
Name: count, dtype: int64

As can be seen, there is a high imbalance in the dataset in the values of *FraudResult*; therefore the number of records in Cluster 0 is expected to be much higher than in Cluster 1.

In [120]:
# Count how many rows fall in each cluster.
# The two counts sum to the dataframe's row count (95662), so these are rows
# of the dataframe rather than unique customers.
df = fre_eng.get_dataframe()
df['Cluster'].value_counts()

Cluster
0    95257
1      405
Name: count, dtype: int64

In [121]:
# Select features using Weight of Evidence (WoE) / Information Value (IV) and
# add the WoE values of the features. The call prints the selected continuous
# and categorical features.
# The method name is spelled "festures" in this call; keep it in sync with
# the definition in feature_eng.
fre_eng.woe_best_festures()

Selected Continuous Features based on IV: ['Amount_Total', 'TransactionId_frequency', 'Amount', 'Value']
Selected Categorical Features based on IV: ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy', 'Month', 'Day', 'Hour']
All Selected Features: ['Amount_Total', 'TransactionId_frequency', 'Amount', 'Value', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy', 'Month', 'Day', 'Hour']


In [122]:
# Export the dataset for model training.
# Re-fetch the dataframe so the export reflects every step run above, then
# write it to 'model_dataset.csv' in the current working directory without
# the index column.
df = fre_eng.get_dataframe()
df.to_csv('model_dataset.csv',index = False)

In [123]:
# Close logging once all feature engineering steps have finished.
# Presumably this releases the log handlers opened by FeatEng — confirm.
fre_eng.close_log()