# Feature Engineering (Data Preprocessing)

In [1]:
# Load the required libraries 
from pathlib import Path

import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns 

# import warnings 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Read the cleaned training transactions and preview the first rows
clean_csv_path = "../data/fraudTrainClean.csv"
df = pd.read_csv(clean_csv_path)
df.head()

Unnamed: 0,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,trans_month,trans_day,trans_hour,age
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",36.011293,-82.048315,0,1,1,0,37
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,49.159047,-118.186462,0,1,1,0,47
2,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,43.150704,-112.154481,0,1,1,0,63
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,47.034331,-112.561071,0,1,1,0,58
4,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,38.674999,-78.632459,0,1,1,0,39


Feature Engineering Activities 

    - Category, street, city, state and job converted to numerical columns - label encoding
    - Gender converted to a numerical column - label encoding (it has only two values, so one-hot encoding is not needed)
    - Drop the merchant name

**Category, street, city, state and job converted to numerical columns - label encoding**

In [3]:
# Show the column labels currently in the frame
df.columns

Index(['merchant', 'category', 'amt', 'gender', 'street', 'city', 'state',
       'zip', 'lat', 'long', 'city_pop', 'job', 'merch_lat', 'merch_long',
       'is_fraud', 'trans_month', 'trans_day', 'trans_hour', 'age'],
      dtype='object')

In [4]:
label_encode_columns = ["category", "street", "city", "state", "job"]

# Keep every fitted encoder, keyed by column name. The integer codes are only
# meaningful together with the encoder that produced them, so holding on to
# the encoders lets the same mapping be re-applied to other data (e.g. a test
# split) with .transform() or reversed with .inverse_transform().
label_encoders = {}

for col in label_encode_columns: 
    le = LabelEncoder()
    # Replace the string values with integer codes 0..n_classes-1
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Confirm the operation 
df[label_encode_columns]

Unnamed: 0,category,street,city,state,job
0,8,568,526,27,370
1,4,435,612,47,428
2,0,602,468,13,307
3,2,930,84,26,328
4,9,418,216,45,116
...,...,...,...,...,...
1296670,0,154,330,44,215
1296671,1,856,813,20,360
1296672,1,158,346,32,308
1296673,1,433,471,41,485


**Gender converted to a numerical column - label encoding (instead of one-hot encoding)**

In [5]:
# Inspect the distinct values of the gender column before encoding it
df["gender"].unique()

array(['F', 'M'], dtype=object)

In [6]:
# Label encoding is used for gender as well: fit the encoder on the column,
# then overwrite the column with the resulting integer codes
le = LabelEncoder().fit(df["gender"])
df["gender"] = le.transform(df["gender"])

# Display the learned classes; a value's position in this array is its code
le.classes_

array(['F', 'M'], dtype=object)

In [7]:
# Preview the first rows of the frame after the encoding steps
df.head()

Unnamed: 0,merchant,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,trans_month,trans_day,trans_hour,age
0,"fraud_Rippin, Kub and Mann",8,4.97,0,568,526,27,28654,36.0788,-81.1781,3495,370,36.011293,-82.048315,0,1,1,0,37
1,"fraud_Heller, Gutmann and Zieme",4,107.23,0,435,612,47,99160,48.8878,-118.2105,149,428,49.159047,-118.186462,0,1,1,0,47
2,fraud_Lind-Buckridge,0,220.11,1,602,468,13,83252,42.1808,-112.262,4154,307,43.150704,-112.154481,0,1,1,0,63
3,"fraud_Kutch, Hermiston and Farrell",2,45.0,1,930,84,26,59632,46.2306,-112.1138,1939,328,47.034331,-112.561071,0,1,1,0,58
4,fraud_Keeling-Crist,9,41.96,1,418,216,45,24433,38.4207,-79.4629,99,116,38.674999,-78.632459,0,1,1,0,39


In [8]:
# Drop the merchant column.
# Re-assigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError for a column that has already been removed.
df = df.drop(columns=["merchant"], errors="ignore")
df.head()

Unnamed: 0,category,amt,gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,trans_month,trans_day,trans_hour,age
0,8,4.97,0,568,526,27,28654,36.0788,-81.1781,3495,370,36.011293,-82.048315,0,1,1,0,37
1,4,107.23,0,435,612,47,99160,48.8878,-118.2105,149,428,49.159047,-118.186462,0,1,1,0,47
2,0,220.11,1,602,468,13,83252,42.1808,-112.262,4154,307,43.150704,-112.154481,0,1,1,0,63
3,2,45.0,1,930,84,26,59632,46.2306,-112.1138,1939,328,47.034331,-112.561071,0,1,1,0,58
4,9,41.96,1,418,216,45,24433,38.4207,-79.4629,99,116,38.674999,-78.632459,0,1,1,0,39


In [9]:
# Write the preprocessed data locally.
# to_csv does not create missing folders, so make sure the output directory
# exists first (it may be absent on a fresh checkout).
output_path = Path("../data/preprocessed/fraudTrain.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)