In [4]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from datetime import datetime

2023-04-20 19:51:30.470009: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# This turns all the axes white in all the matplotlib plots. Comment this out if you dont want that
COLOR = 'white'
matplotlib.rcParams['text.color'] = COLOR
matplotlib.rcParams['axes.labelcolor'] = COLOR
matplotlib.rcParams['xtick.color'] = COLOR
matplotlib.rcParams['ytick.color'] = COLOR

In [6]:
df_transaction = pd.read_csv('./datasets/ieee-fraud-detection/train_transaction.csv')

df_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Features that are used, isFraud is the target
features = ['isFraud', 'TransactionDT',
            'TransactionAmt','ProductCD', 'P_emaildomain','R_emaildomain']

In [8]:
df = df_transaction[features]
df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,P_emaildomain,R_emaildomain
0,0,86400,68.5,W,,
1,0,86401,29.0,W,gmail.com,
2,0,86469,59.0,W,outlook.com,
3,0,86499,50.0,W,yahoo.com,
4,0,86506,50.0,H,gmail.com,


In [9]:
target = 'isFraud'
# Categorical features
cat = ['TransactionDT','ProductCD', 'P_emaildomain','R_emaildomain']
# Numeric features
num = ['TransactionAmt']

In [10]:
#Drop rows with missing features
df = df.dropna()
y = df[target].values

In [11]:
x_cat = df.filter(items = cat).values
x_num = df.filter(items = num).values

In [12]:
labelencoder_X = LabelEncoder()
# Label encode every categorical column
for i in range(len(cat)): 
    x_cat[:, i] = labelencoder_X.fit_transform(x_cat[:, i])

In [13]:
# Build input vector X, the training data
X = np.concatenate((x_cat, x_num), axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

In [15]:
X_train = np.asarray(X_train).astype('float32')
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))

In [16]:
X_train.shape, y_train.shape

((80784, 5), (80784, 1))

In [17]:
# Initialize model
model = tf.keras.Sequential() # initializing the model
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) # first dense layer with 128 neurons with rectified linear unit for a spectrum of values.
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) # second layer
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) # final layer with sigmoid for binary classification
model.compile(optimizer='adam', # optomizing weight with adam using stochastic gradient descent
              loss='binary_crossentropy', # evaluate performance of model with binary_crossentropy as output is binary
              metrics=['accuracy']) # gives out accuracy of model
model.fit(X_train, y_train, epochs=3) # pass training data 3 times through model and fit

# loss is on training data, lower loss is good but might overfit
# accuracy is on training data

Epoch 1/3


2023-04-20 19:51:59.789410: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f84e8ea3d30>

In [18]:
val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)
# accuracy is on validation data - performance in wild

7.1495466232299805 0.9173144698143005


In [19]:
# Adding extra dense layer decreases loss and increases accuracy

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train, y_train, epochs=3)

val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
0.2854485511779785 0.9171659350395203


In [20]:
# Adding more features to the model
features = ['isFraud', 'TransactionDT',
            'TransactionAmt','ProductCD', 'P_emaildomain','R_emaildomain', 'card4']

df = df_transaction[features]
df.head()

target = 'isFraud'
cat = ['TransactionDT','ProductCD', 'P_emaildomain','R_emaildomain', 'card4']
num = ['TransactionAmt']

df = df.dropna()
y = df[target].values

x_cat = df.filter(items = cat).values 
x_num = df.filter(items = num).values

labelencoder_X = LabelEncoder()
for i in range(len(cat)): 
    x_cat[:, i] = labelencoder_X.fit_transform(x_cat[:, i])
    
X = np.concatenate((x_cat, x_num), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

X_train = np.asarray(X_train).astype('float32') 
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1,1))

X_train.shape, y_train.shape

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train, y_train, epochs=3)

val_loss, val_acc = model.evaluate(X_val, y_val)
print(val_loss, val_acc)

# For same number of layers, adding the extra feature of card4 does not really enhance the predictive prowess of our model.

Epoch 1/3
Epoch 2/3
Epoch 3/3
6.708495140075684 0.9204027056694031


#Feature Importance
We are going to determine which feature is the most important for predicting the target feature "isFraud"

Methology:
Use the get_weights method of the Dense layer object. This method returns a list of two numpy arrays, the first of which contains the weight values and the second contains the bias values for the layer.

Example code:
Methology below compares each feature against "isFraud" target feature.

In [21]:
# This is for the first hidden layer. 

#Get the weights for the first hidden layer
weights = model.layers[0].get_weights()[0]

# Calculate the feature importance scores as the absolute sum of the weights for each feature
importance_scores = np.abs(weights).sum(axis=0)

# Normalize the scores to sum to 1
importance_scores = importance_scores / importance_scores.sum()

# Print the importance scores for each feature
print('Feature Importance Scores:')
for i, feature in enumerate(features[1:]):
    print(f'{feature}: {importance_scores[i]:.3f}')

Feature Importance Scores:
TransactionDT: 0.007
TransactionAmt: 0.007
ProductCD: 0.005
P_emaildomain: 0.006
R_emaildomain: 0.007
card4: 0.007


In [22]:
#The methology below implements feature importance for each layer

# Define a function to calculate and normalize feature importance scores for a given layer
def get_feature_importance(layer):
    # Get the weights for the layer
    weights = layer.get_weights()[0]

    # Calculate the feature importance scores as the absolute sum of the weights for each feature
    importance_scores = np.abs(weights).sum(axis=0)

    # Normalize the scores to sum to 1
    importance_scores = importance_scores / importance_scores.sum()

    return importance_scores

# Calculate the feature importance scores for each layer
layer_importance = {}
for i, layer in enumerate(model.layers):
    if isinstance(layer, tf.keras.layers.Dense):
        layer_importance[f'layer_{i}'] = get_feature_importance(layer)

# Print the importance scores for each layer and feature
for layer_name, importance_scores in layer_importance.items():
    print(f'{layer_name} Feature Importance Scores:')
    for i, feature in enumerate(features[1:]):
        print(f'{feature}: {importance_scores[i]:.3f}')

layer_0 Feature Importance Scores:
TransactionDT: 0.007
TransactionAmt: 0.007
ProductCD: 0.005
P_emaildomain: 0.006
R_emaildomain: 0.007
card4: 0.007
layer_1 Feature Importance Scores:
TransactionDT: 0.007
TransactionAmt: 0.008
ProductCD: 0.008
P_emaildomain: 0.008
R_emaildomain: 0.008
card4: 0.008
layer_2 Feature Importance Scores:
TransactionDT: 1.000


IndexError: index 1 is out of bounds for axis 0 with size 1

#Undersampling
Implementing NearMiss Undersampling technique

This method selects the majority class examples that are closest to the minority class examples, based on a distance metric. This can help focus on the most informative majority class examples and reduce the imbalance.

In [24]:
# Import required libraries
from imblearn.under_sampling import NearMiss

# Instantiate the NearMiss object
nm = NearMiss(version=2, sampling_strategy='majority', n_neighbors=3)

In [26]:
# Apply undersampling to the original data
X_train_resampled, y_train_resampled = nm.fit_resample(X_train, y_train)


In [29]:
# Convert data to numpy arrays and preprocess
X_train_resampled = np.asarray(X_train_resampled).astype('float32')
y_train_resampled = np.asarray(y_train_resampled).astype('float32').reshape((-1, 1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1, 1))
X_test = np.asarray(X_test).astype('float32')
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Print the shape of the resampled data
print('Original data shape:', X_train.shape, y_train.shape)
print('Resampled data shape:', X_train_resampled.shape, y_train_resampled.shape)

#Save the undersampled data for NearMiss
X_train_nm = X_train_resampled
y_train_nm = y_train_resampled

Original data shape: (80656, 6) (80656, 1)
Resampled data shape: (13500, 6) (13500, 1)


#Implement Tomek Links Undersampling

This method involves removing samples that are classified as borderline cases, where there is a very small distance between samples of the minority class and majority class. This can help in removing noisy or ambiguous data and improve classification performance.

In [31]:
# Import required libraries
from imblearn.under_sampling import TomekLinks

# Instantiate the TomekLinks object
tl = TomekLinks()

# Apply undersampling to the original data
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)

# Convert data to numpy arrays and preprocess
X_train_resampled = np.asarray(X_train_resampled).astype('float32')
y_train_resampled = np.asarray(y_train_resampled).astype('float32').reshape((-1, 1))
X_val = np.asarray(X_val).astype('float32')
y_val = np.asarray(y_val).astype('float32').reshape((-1, 1))
X_test = np.asarray(X_test).astype('float32')
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Print the shape of the resampled data
print('Original data shape:', X_train.shape, y_train.shape)
print('Resampled data shape:', X_train_resampled.shape, y_train_resampled.shape)

#Save the undersampled data for Tomek Links
X_train_tl = X_train_resampled
y_train_tl = y_train_resampled


Original data shape: (80656, 6) (80656, 1)
Resampled data shape: (79423, 6) (79423, 1)


In [37]:
#Passing NearMiss sorted data to the model

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train_nm, y_train_nm, epochs=3, validation_data=(X_val, y_val))

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Resampled data:', test_loss, test_acc)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Resampled data: 1.1114143133163452 0.8828057050704956


In [38]:
#Passing Tomek Links sorted data to the model

model = tf.keras.Sequential() 
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))  
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)) 
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy']) 
model.fit(X_train_tl, y_train_tl, epochs=3)

test_loss, test_acc = model.evaluate(X_test, y_test)
print('Resampled data:', test_loss, test_acc)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Resampled data: 0.29636842012405396 0.9127588868141174
