# Feature Engineering

In [1]:
import sys
import os
import random
import calendar
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew, ttest_ind
import warnings
warnings.filterwarnings('ignore')
sys.path.append('../scripts')
from data_loader import *
from feature_engineering import *

In [2]:
# Locations of the processed Parquet datasets
filename1 = 'fraud_ip_data.parquet'
filename2 = 'credit_card_data.parquet'

# Pass every directory as its own component so os.path.join inserts the
# platform separator itself (same style as the output paths built later
# in this notebook).
path1 = os.path.join('..', 'data', 'processed', filename1)
path2 = os.path.join('..', 'data', 'processed', filename2)

# Load both datasets
fraud_ip_data = load_data(path1)
credit_card_data = load_data(path2)

In [3]:
# Preview the first rows of the fraud/IP dataset.
fraud_ip_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0,Unknown
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,United States


In [4]:
# Summarize column dtypes, non-null counts and memory usage.
fraud_ip_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149277 entries, 0 to 151111
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         149277 non-null  int64         
 1   signup_time     149277 non-null  datetime64[ns]
 2   purchase_time   149277 non-null  datetime64[ns]
 3   purchase_value  149277 non-null  int64         
 4   device_id       149277 non-null  object        
 5   source          149277 non-null  object        
 6   browser         149277 non-null  object        
 7   sex             149277 non-null  object        
 8   age             149277 non-null  int64         
 9   ip_address      149277 non-null  int64         
 10  class           149277 non-null  int64         
 11  country         149277 non-null  object        
dtypes: datetime64[ns](2), int64(5), object(5)
memory usage: 14.8+ MB


In [5]:
# Preview the first rows of the credit card dataset.
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# Count rows per label in the 'Class' column to check class balance.
credit_card_data['Class'].value_counts()

Class
0    283253
1       473
Name: count, dtype: int64

In [10]:
# Count rows per label; note the label column is lowercase 'class' here,
# unlike 'Class' in the credit card dataset.
fraud_ip_data['class'].value_counts()

class
0    135304
1     13973
Name: count, dtype: int64

## Feature Engineering

In [7]:
from feature_engineering import FeatureEngineering

In [8]:
# Run the feature pipeline with scale=False and normalize=False. The recorded
# head()/info() output below shows the original columns kept alongside the
# added transaction-count, velocity, time-of-day and hash columns.
feature_engineer = FeatureEngineering(fraud_ip_data)
df_processed = feature_engineer.preprocess(scale=False, normalize=False)

In [9]:
# Preview the engineered frame.
df_processed.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,user_transaction_count,device_transaction_count,user_transaction_velocity,device_transaction_velocity,purchase_hour_of_day,purchase_day_of_week,signup_hour_of_day,signup_day_of_week,device_id_hash,ip_device_hash
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,...,1,1,0.019231,0.019231,2,5,22,1,12513400056446199135,7425471509159509586
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,...,1,1,1.0,1.0,1,0,20,6,16127019210742502905,8185797105025392718
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,...,1,12,1.0,12.0,18,3,18,3,4439187569685402083,15847530065043608306
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,...,1,1,0.2,0.2,13,0,21,1,11158880965962902071,10181174757385685349
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,...,1,1,0.02,0.02,18,2,7,1,16277722974267400436,9294410698253790355


In [10]:
# Check dtypes and non-null counts of the original and engineered columns.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149277 entries, 0 to 149276
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   user_id                      149277 non-null  int64         
 1   signup_time                  149277 non-null  datetime64[ns]
 2   purchase_time                149277 non-null  datetime64[ns]
 3   purchase_value               149277 non-null  int64         
 4   device_id                    149277 non-null  object        
 5   source                       149277 non-null  object        
 6   browser                      149277 non-null  object        
 7   sex                          149277 non-null  object        
 8   age                          149277 non-null  int64         
 9   ip_address                   149277 non-null  int64         
 10  class                        149277 non-null  int64         
 11  country                   

In [11]:
# Destination file name and folder for the engineered dataset
filename = 'featured_fraud_ip_data.parquet'
output_folder = os.path.join('..', 'data', 'featured')

# Write the frame out and keep the value save_data returns
output_path = save_data(
    df_processed,
    output_folder,
    filename,
)

Dataset saved to ..\data\featured\featured_fraud_ip_data.parquet


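## Encode Categorical Features
## Normalize and Scale

Both steps are produced by the single default `FeatureEngineering(...).preprocess()` call below: its output contains the `*_encoded` columns and values scaled into the 0–1 range.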

In [3]:
# Re-run the pipeline on a fresh FeatureEngineering instance, this time with
# preprocess() defaults.
# NOTE(review): judging by the recorded output below, the defaults drop the
# raw datetime/string columns, add *_encoded columns, rescale every column
# (including the 'class' label, which becomes float 0.0/1.0) into [0, 1], and
# return 13973 rows per class — confirm against feature_engineering.py.
# NOTE(review): no seed is visible here; if the class balancing samples
# randomly, results may differ between runs — TODO confirm.
feature_engineer = FeatureEngineering(fraud_ip_data)
scaled_df= feature_engineer.preprocess()

In [4]:
# Display the preprocessed frame (the recorded output shows only its first
# and last rows).
scaled_df

Unnamed: 0,user_id,purchase_value,age,ip_address,class,device_id_hash,ip_device_hash,user_transaction_count,user_transaction_velocity,purchase_hour_of_day,purchase_day_of_week,signup_hour_of_day,signup_day_of_week,country_id_hash,source_encoded,browser_encoded,sex_encoded
0,0.100011,0.0250,0.615385,0.117196,0.0,0.793513,0.739952,0.0,0.000530,1.000000,0.000000,0.217391,0.000000,0.728108,1.000000,0.571389,1.0
1,0.010595,0.0500,0.461538,0.316396,0.0,0.882391,0.511297,0.0,0.005340,0.260870,0.000000,0.347826,0.500000,0.757123,1.000000,0.362915,0.0
2,0.714106,0.4625,0.179487,0.594414,0.0,0.883671,0.459606,0.0,0.009218,0.869565,0.833333,0.565217,0.666667,0.728108,0.976232,1.000000,1.0
3,0.303369,0.0250,0.538462,0.742689,0.0,0.463492,0.615528,0.0,0.035372,0.000000,0.333333,0.478261,1.000000,0.888382,0.000000,0.571389,1.0
4,0.928565,0.5375,0.000000,0.269405,0.0,0.134684,0.338486,0.0,0.002608,0.434783,0.833333,0.391304,0.833333,0.728108,0.976232,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27941,0.125231,0.3875,0.282051,0.202498,1.0,0.382222,0.834205,0.0,0.001610,0.391304,1.000000,0.260870,0.666667,0.129641,0.000000,0.571389,0.0
27942,0.288679,0.6500,0.153846,0.699275,1.0,0.976577,0.622117,0.0,1.000000,0.304348,0.500000,0.304348,0.500000,0.879999,0.000000,0.571389,1.0
27943,0.013888,0.2250,0.538462,0.769989,1.0,0.369668,0.809461,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,0.151934,0.000000,1.000000,1.0
27944,0.630898,0.1375,0.538462,0.628392,1.0,0.974848,0.194110,0.0,1.000000,0.391304,1.000000,0.391304,1.000000,0.675589,0.976232,0.571389,0.0


## Undersampling

In [5]:
# Verify the class balance after preprocessing.
scaled_df['class'].value_counts()

class
0.0    13973
1.0    13973
Name: count, dtype: int64

In [6]:
# Confirm dtypes and non-null counts of the frame that will be saved as model input.
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27946 entries, 0 to 27945
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   user_id                    27946 non-null  float64
 1   purchase_value             27946 non-null  float64
 2   age                        27946 non-null  float64
 3   ip_address                 27946 non-null  float64
 4   class                      27946 non-null  float64
 5   device_id_hash             27946 non-null  float64
 6   ip_device_hash             27946 non-null  float64
 7   user_transaction_count     27946 non-null  float64
 8   user_transaction_velocity  27946 non-null  float64
 9   purchase_hour_of_day       27946 non-null  float64
 10  purchase_day_of_week       27946 non-null  float64
 11  signup_hour_of_day         27946 non-null  float64
 12  signup_day_of_week         27946 non-null  float64
 13  country_id_hash            27946 non-null  flo

## Save dataset

In [7]:
# Destination file name and folder for the model-input dataset
filename = 'final_df.parquet'
output_folder = os.path.join('..', 'data', 'model_input')

# Write the preprocessed frame out and keep the value save_data returns
output_path = save_data(
    scaled_df,
    output_folder,
    filename,
)

Dataset saved to ..\data\model_input\final_df.parquet
