In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from datetime import datetime

Use this as a base setup for training a neural network on the data

# Dataset information - 

 ### Transaction Dataset information
- TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
- TransactionAMT: transaction payment amount in USD
- ProductCD: product code, the product for each transaction
- card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
- addr: address
- dist: distance
- P_ and (R__) emaildomain: purchaser and recipient email domain
- C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
- D1-D15: timedelta, such as days between previous transaction, etc.
- M1-M9: match, such as names on card and address, etc.
- Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
### Categorical Features:
- ProductCD
- card1 - card6
- addr1, addr2(billing region, country)
- P_emaildomain
- R_emaildomain
- M1 - M9



In [3]:
# This turns all the axes white in all the matplotlib plots. Comment this out if you dont want that
COLOR = 'white'
matplotlib.rcParams['text.color'] = COLOR
matplotlib.rcParams['axes.labelcolor'] = COLOR
matplotlib.rcParams['xtick.color'] = COLOR
matplotlib.rcParams['ytick.color'] = COLOR

### Train model

In [4]:
df_transaction = pd.read_csv('../datasets/ieee-fraud-detection/train_transaction.csv')
df_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Features that are used, isFraud is the target
features = ['isFraud', 'TransactionDT',
            'TransactionAmt','ProductCD', 'P_emaildomain','R_emaildomain']

In [6]:
df = df_transaction[features]
df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,P_emaildomain,R_emaildomain
0,0,86400,68.5,W,,
1,0,86401,29.0,W,gmail.com,
2,0,86469,59.0,W,outlook.com,
3,0,86499,50.0,W,yahoo.com,
4,0,86506,50.0,H,gmail.com,


In [7]:
num_rows, num_cols = df.shape
print(f"The dataframe has {num_rows} rows and {num_cols} columns.")

The dataframe has 590540 rows and 6 columns.


In [8]:
# RESPONSE VARIABLES:
target = 'isFraud'
# EXPLANATORY VARIABLES:
# Categorical features
cat = ['TransactionDT','ProductCD', 'P_emaildomain','R_emaildomain']
# Numeric features
num = ['TransactionAmt']

In [9]:
#Drop rows with missing features
df = df.dropna()
y = df[target].values
num_rows, num_cols = df.shape
print(f"The dataframe now has {num_rows} rows and {num_cols} columns.")
# A lot of NA values

The dataframe now has 126227 rows and 6 columns.


In [10]:
x_cat = df.filter(items = cat).values # n x 4 matrix
x_num = df.filter(items = num).values # n x 1 column

In [11]:
rows_cat, cols_cat = x_cat.shape
rows_num, cols_num = x_num.shape
print(f"The x_cat frame has {rows_cat} rows and {cols_cat} columns. The x_num frame has {rows_num} rows and {cols_num} columns.")

The x_cat frame has 126227 rows and 4 columns. The x_num frame has 126227 rows and 1 columns.


In [12]:
labelencoder_X = LabelEncoder()
# Label encode every categorical column
# take column provide label and put back in column. Need only for categorical variable
for i in range(len(cat)): 
    x_cat[:, i] = labelencoder_X.fit_transform(x_cat[:, i])

In [13]:
# Build input vector X, the training data
X = np.concatenate((x_cat, x_num), axis=1) # n x 5 matrix

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 
# X is categorical numerized and the numerical explanatory vavriables, y is the the numerical binary response
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)
# split train into 80% train and 20% validation data - final performance in the wild

In [15]:
X_train = np.asarray(X_train).astype('float32') # np array of data type float
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))

In [16]:
X_train.shape, y_train.shape

((80784, 5), (80784, 1))

### Train Model
Model:
Input -> Dense(Linear + activation) -> Dense -> Dense -> Output

In [24]:
# Initialize model
model = tf.keras.Sequential() # initializing the model
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) # first dense layer with 128 neurons with rectified linear unit for a spectrum of values.
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) # second layer
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) # final layer with sigmoid for binary classification
model.compile(optimizer='adam', # optomizing weight with adam using stochastic gradient descent
              loss='binary_crossentropy', # evaluate perfromance of model with binary_crossentropy as output is binary
              metrics=['accuracy']) # gives out accuracy of model
model.fit(X_train, y_train, epochs=3) # pass training data 3 times through model and fit

# loss is on training data, lower loss is good but might overfit
# accuracy is on training data

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f81c89d7820>

In [26]:
val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)
# accuracy is on validation data - performance in wild

0.3715013861656189 0.9202539324760437


Use this as starter code to getting a neural network up

Ideas going forward:
- Add more features to the model
- Use one-hot encoding instead of label-encoding
- Figure out the meanings of hidden features
- Categorize the email features into common vs uncommon emails
- 


In [21]:
# Adding extra dense layer decreases loss and increases accuracy

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train, y_train, epochs=3)

val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
0.2779393196105957 0.9204027056694031


In [27]:
# Adding more features to the model
features = ['isFraud', 'TransactionDT',
            'TransactionAmt','ProductCD', 'P_emaildomain','R_emaildomain', 'card4']

df = df_transaction[features]
df.head()

target = 'isFraud'
cat = ['TransactionDT','ProductCD', 'P_emaildomain','R_emaildomain', 'card4']
num = ['TransactionAmt']

df = df.dropna()
y = df[target].values

x_cat = df.filter(items = cat).values 
x_num = df.filter(items = num).values

labelencoder_X = LabelEncoder()
for i in range(len(cat)): 
    x_cat[:, i] = labelencoder_X.fit_transform(x_cat[:, i])
    
X = np.concatenate((x_cat, x_num), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

X_train = np.asarray(X_train).astype('float32') 
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))

X_train.shape, y_train.shape

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train, y_train, epochs=3)

val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)

# For same number of layers, adding the extra feature of card4 does not really enhance the predictive prowess of our model.

Epoch 1/3
Epoch 2/3
Epoch 3/3
4.164956569671631 0.9204027056694031
