# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

# Dataset
The link to the dataset can be found [here](https://www.kaggle.com/datasets/kartik2112/fraud-detection?resource=download).
The goal is to predict whether a transaction is fraudulent based on the features given in the dataset. Not all of the features are equally important, so we need to do some preprocessing and standardization before training the neural network.

In [2]:
# Load the train and test CSV files of the fraud-detection dataset.
train_dataset = pd.read_csv('/content/fraudTrain.csv')
test_dataset = pd.read_csv('/content/fraudTest.csv')

# Stack both files into a single frame; the notebook makes its own train/test
# split later on. ignore_index=True gives the combined frame a fresh 0..n-1
# index. Without it the rows from the test file keep their own labels starting
# at 0, so index labels would repeat and label-based lookups become ambiguous.
dataset = pd.concat([train_dataset, test_dataset], axis = 0, ignore_index = True)

In [3]:
# First look at the data: display the first five rows.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
# List the column names of the combined frame to decide which ones to keep.
dataset.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [5]:
# Remove the columns that are not used as model inputs.
dataset = dataset.drop(
    columns=[
        'Unnamed: 0',
        'cc_num',
        'merchant',
        'first',
        'last',
        'street',
        'merch_lat',
        'merch_long',
        'trans_num',
        'job',
        'dob',
        'unix_time',
        'city',
    ]
)

In [23]:
# Display the first five rows again to check which columns remain.
dataset.head(n=5)

Unnamed: 0,trans_date_trans_time,category,amt,gender,state,zip,lat,long,city_pop,is_fraud
0,2019-01-01 00:00:18,misc_net,4.97,F,NC,28654,36.0788,-81.1781,3495,0
1,2019-01-01 00:00:44,grocery_pos,107.23,F,WA,99160,48.8878,-118.2105,149,0
2,2019-01-01 00:00:51,entertainment,220.11,M,ID,83252,42.1808,-112.262,4154,0
3,2019-01-01 00:01:16,gas_transport,45.0,M,MT,59632,46.2306,-112.1138,1939,0
4,2019-01-01 00:03:06,misc_pos,41.96,M,VA,24433,38.4207,-79.4629,99,0


# Data Preprocessing

In [24]:
#nan_values = dataset[dataset.isna().any(axis = 1)]
#nan_values

Unnamed: 0,trans_date_trans_time,category,amt,gender,state,zip,lat,long,city_pop,is_fraud


In [8]:
# Disabled experiment, kept for reference: select rows whose label is missing
# while every other column is populated.
#sample = nan_values[nan_values['is_fraud'].isna() & nan_values.drop(['is_fraud'], axis = 1).notna().all(axis = 1)]

# Discard every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how='any')

In [9]:
# Every column except the last one is a feature; the last column is the label.
X = dataset.iloc[:, 0:-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [10]:
# Convert the transaction timestamp (feature column 0) to Unix time in seconds.
# Subtracting the epoch and dividing by a one-second Timedelta does not depend
# on the resolution pandas chose when parsing the strings, and it avoids
# astype(int), whose integer width follows the platform default.
datetime_col = pd.to_datetime(X[:, 0])
X[:, 0] = (datetime_col - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds = 1)

In [11]:
# Encode the gender column (feature index 3) as integer class codes.
# The fitted encoder is kept in `le` so the same mapping can be reused later.
le = LabelEncoder()
X[:, 3] = le.fit(X[:, 3]).transform(X[:, 3])

In [12]:
# Expand the category (index 1) and state (index 4) columns into one-hot
# indicator columns; all remaining columns are passed through unchanged.
ct = ColumnTransformer(
    [('category', OneHotEncoder(), [1]), ('state', OneHotEncoder(), [4])],
    remainder='passthrough',
)
X = ct.fit_transform(X)

In [13]:
# Turn the encoder output into a dense float array for scaling and training.
# ColumnTransformer only returns a sparse matrix when the stacked result is
# sparse enough, and this cell may be re-run after X is already dense, so
# toarray() is only called when the object actually provides it.
if hasattr(X, 'toarray'):
    X = X.toarray()
X = np.asarray(X, dtype = np.float64)

In [14]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Standardize the features. The scaler is fitted on the training rows only and
# then applied to both splits, so the test rows do not influence the statistics.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [16]:
# Neural Network Model Architecture
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation starts from the same state on every run. The value matches the
# random_state used for the train/test split. Exact reproducibility can still
# depend on the hardware and backend in use.
tf.keras.utils.set_random_seed(0)

# Fully connected binary classifier: three ReLU hidden layers (64, 32 and 16
# units) followed by one sigmoid unit whose output is thresholded at prediction
# time.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units = 64, activation = 'relu'),
    tf.keras.layers.Dense(units = 32, activation = 'relu'),
    tf.keras.layers.Dense(units = 16, activation = 'relu'),
    tf.keras.layers.Dense(units = 1, activation = 'sigmoid')]
)

In [17]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the training metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Train for 5 epochs on the training split using mini-batches of 32 rows.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
)

Epoch 1/5
[1m46310/46310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 2ms/step - accuracy: 0.9957 - loss: 0.0199
Epoch 2/5
[1m46310/46310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2ms/step - accuracy: 0.9967 - loss: 0.0120
Epoch 3/5
[1m46310/46310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 2ms/step - accuracy: 0.9970 - loss: 0.0106
Epoch 4/5
[1m46310/46310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 2ms/step - accuracy: 0.9971 - loss: 0.0103
Epoch 5/5
[1m46310/46310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 2ms/step - accuracy: 0.9971 - loss: 0.0100


<keras.src.callbacks.history.History at 0x7a5765d69de0>

In [19]:
# Threshold the predicted probabilities at 0.5: values above 0.5 are labelled
# fraudulent (True), everything else is labelled not fraudulent (False).
y_pred = np.greater(model.predict(X_test), 0.5)

[1m11578/11578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step


In [20]:
# Evaluate the thresholded predictions on the held-out test split.
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n {cm}')
print(f'Testing Accuracy: {accuracy_score(y_test, y_pred)}')

# Accuracy is dominated by the non-fraud class, which makes up almost all rows,
# so also report precision and recall for the fraud class. Both are derived from
# the confusion matrix (rows = true label, columns = predicted label).
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()
    # Guard against division by zero when no fraud is predicted / present.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    print(f'Fraud Precision: {precision}')
    print(f'Fraud Recall: {recall}')

Confusion Matrix:
 [[368078    417]
 [   685   1299]]
Testing Accuracy: 0.9970254724289366


In [25]:
#import time
#sample.iloc[0] = time.mktime(pd.to_datetime(sample.iloc[0]).timetuple()) / 10**9
#sample['gender'] = le.transform([sample['gender']])
#sample = ct.transform([sample])
#sample = sample.toarray()

In [None]:
#validate_fraud = model.predict(sc.transform([sample])) > 0.5

In [None]:
#result = 'Fraud' if validate_fraud > 0.5 else 'Not Fraud'
#print(f'The sample is {result}')