In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from keras.layers import LeakyReLU

#from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Input 
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
import keras

import time

#add other imports here if any (for example, pytorch)
import torch
import torch.nn as nn
import torch.optim as optim

# --- Load and preprocess the fraud train/test CSV files ----------------------
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

Label_encoder = LabelEncoder()

# Numeric codes for the `gender` column; any value not listed here maps to NaN.
gender_conversion = {'F': 0, 'M': 1}

# Text columns that are replaced by integer codes.
ENCODED_COLUMNS = ['name', 'job', 'merchant', 'category', 'address', 'trans_num']

# Raw columns removed once the derived features have been built from them.
DROPPED_COLUMNS = ['first', 'last', 'street', 'city', 'state',
                   'trans_date_trans_time', 'Unnamed: 0', 'dob']


def _add_derived_columns(frame):
    """Map gender to 0/1 and add the `name`, `address` and `age` columns.

    The frame is modified in place (it is expected to be a freshly loaded
    CSV) and returned for convenience.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `gender`, `first`, `last`, `street`, `city`, `state`
        and `dob` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with the three new columns appended in the order
        `name`, `address`, `age`.
    """
    frame['gender'] = frame['gender'].map(gender_conversion)
    frame['name'] = frame['first'] + " " + frame['last']
    frame['address'] = frame['street'] + ", " + frame['city'] + ", " + frame['state']
    # Age relative to the fixed reference year 2024, using the first four
    # characters of `dob` as the birth year.
    frame['age'] = 2024 - pd.to_numeric(frame['dob'].str[:4])
    return frame


train_data = _add_derived_columns(pd.read_csv('fraudTrain.csv'))
test_data = _add_derived_columns(pd.read_csv('fraudTest.csv'))

# Fit each encoder ONCE on the combined train+test vocabulary and use it for
# both frames. Re-fitting on the test frame would assign different integers to
# the same category, making the test features inconsistent with what the model
# was trained on. Fitting on the union also avoids unseen-label errors.
for column in ENCODED_COLUMNS:
    Label_encoder.fit(pd.concat([train_data[column], test_data[column]], ignore_index=True))
    train_data[column] = Label_encoder.transform(train_data[column])
    test_data[column] = Label_encoder.transform(test_data[column])

train_data = train_data.drop(columns=DROPPED_COLUMNS)
test_data = test_data.drop(columns=DROPPED_COLUMNS)

# Training target and feature matrix.
y_train = train_data["is_fraud"]
x_train = train_data.drop(columns=["is_fraud"])

In [None]:
# Separate the test frame into the feature matrix and the `is_fraud` target.
x_test = test_data.drop(columns=["is_fraud"])
y_test = test_data["is_fraud"]

In [None]:
# Preview the first five rows of the preprocessed test frame.
test_data.head(5)

In [None]:
from imblearn.over_sampling import SMOTE

# Standardize the features: the scaler learns its statistics from the training
# features only and the same transform is then applied to the test features.
sc = StandardScaler().fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

# SMOTE (Synthetic Minority Oversampling Technique) generates additional
# samples of the fraud class; the seed is fixed at 42.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_std, y_train)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LeakyReLU, Dropout
from tensorflow.keras.optimizers import SGD

def Grid_Search_NN_model(hidden_neurons=8, learning_rate=0.1, dropout_rate=0.5, hidden_layers=4):
    """Build and compile a feed-forward binary classifier.

    The network is a stack of identical hidden blocks
    (Dense -> LeakyReLU -> Dropout) followed by a single sigmoid unit.
    The input width is taken from the module-level ``x_train_std``.

    Parameters
    ----------
    hidden_neurons : int
        Number of units in every hidden Dense layer.
    learning_rate : float
        Learning rate passed to the SGD optimizer.
    dropout_rate : float
        Dropout rate applied after every hidden activation.
    hidden_layers : int
        Number of hidden blocks to stack (defaults to 4).

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with binary cross-entropy loss and an accuracy metric.
    """
    input_layer = Input(shape=(x_train_std.shape[1],))

    features = input_layer
    for _ in range(hidden_layers):
        features = Dense(hidden_neurons)(features)
        # Slope for negative inputs is passed positionally because the keyword
        # name differs between Keras versions (`alpha` vs `negative_slope`).
        features = LeakyReLU(0.01)(features)
        features = Dropout(dropout_rate)(features)

    # The output is attached to the LAST hidden block so that every layer
    # built above is actually part of the model.
    output = Dense(1, activation='sigmoid')(features)

    # Create the model object
    myGSModel = Model(inputs=input_layer, outputs=output)

    # Optimizer comes from the same tensorflow.keras namespace as the layers.
    optimizer = SGD(learning_rate=learning_rate)

    myGSModel.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return myGSModel

In [None]:
# Hyperparameter search over the network width and the learning rate.
# Keys prefixed with `model__` are forwarded to Grid_Search_NN_model;
# `batch_size` and `epochs` are held at a single value each.
param_grid = dict(
    model__hidden_neurons=[8, 16, 32],   # units per hidden layer
    model__learning_rate=[0.01, 0.1],    # learning rates to try
    batch_size=[32],                     # fixed batch size
    epochs=[20],                         # epochs per training run
)

# Wrap the Keras build function so scikit-learn can treat it as an estimator.
model = KerasClassifier(model=Grid_Search_NN_model, verbose=0)

# 3-fold cross-validated grid search, ranked by accuracy.
# NOTE(review): the fraud class is oversampled with SMOTE elsewhere in this
# notebook, which suggests the target is imbalanced; accuracy may then be a
# weak selection metric. Confirm whether F1 or recall would be preferable.
grid = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=1)

# Fit on the standardized training data and show the winning combination.
grid_result = grid.fit(x_train_std, y_train)
print(grid_result.best_params_)

In [None]:

# Report the selected hyperparameters and the corresponding cross-validation score.
print(
    f"Best Hyperparameters: {grid_result.best_params_}",
    f"Best Cross-Validation Accuracy: {grid_result.best_score_:.4f}",
    sep="\n",
)


In [None]:
# The final model below is configured based on the grid search results above.



In [None]:
# Train the final network.
# TODO: update the hyperparameters below to match the hyperparameter optimization results.
from sklearn.utils.class_weight import compute_class_weight

losses = []

# Record of the chosen settings. It is not read by the code below; the same
# values are passed explicitly to the model builder and to fit().
param_grid2 = {
    "model_hidden_neurons": [16],
    "model_learning_rate": [0.1],
    "batch_size": [32],
    "epochs": [20]
}

model2 = Grid_Search_NN_model(hidden_neurons=16, learning_rate = 0.1, dropout_rate = 0.5)

# Keras carves the validation split from the LAST rows before any shuffling,
# and the SMOTE output is not randomly ordered (generated samples follow the
# original ones). Shuffle once with a fixed seed so that both the training
# and validation portions are random samples of the resampled data.
shuffle_order = np.random.default_rng(42).permutation(len(y_train_resampled))
x_train_fit = np.asarray(x_train_resampled)[shuffle_order]
y_train_fit = np.asarray(y_train_resampled)[shuffle_order]

# Class weights are derived from the labels the model is actually trained on.
# Using weights from the original, un-resampled labels on top of oversampled
# data would compensate for the class imbalance twice.
class_labels = np.unique(y_train_fit)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train_fit
)
# Key the weights by the class label itself rather than by position.
class_weights_dict = {
    int(label): float(weight) for label, weight in zip(class_labels, class_weights)
}

# NOTE: despite its name, `best_model` holds the History object returned by
# fit(); the trained network itself is `model2`.
best_model = model2.fit(
    x_train_fit,
    y_train_fit,
    epochs=20,
    batch_size=32,
    class_weight=class_weights_dict,
    verbose = 1,
    validation_split=0.2
)

In [None]:
# Per-epoch training loss recorded during fitting.
losses = best_model.history['loss']

# Score the standardized test set and threshold the model output at 0.5.
y_pred = model2.predict(x_test_std)
y_pred_val = (y_pred >= 0.5)

# Classification metrics on the test set.
precision = precision_score(y_test, y_pred_val)
recall = recall_score(y_test, y_pred_val)
f1 = f1_score(y_test, y_pred_val)

# Print the results
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

# Training-loss curve, with epochs numbered from 1.
epoch_numbers = range(1, len(losses) + 1)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(epoch_numbers, losses, marker='o', linestyle='-', color='b')
ax.set_title('Loss vs. Epochs', fontsize=16)
ax.set_xlabel('Epoch', fontsize=14)
ax.set_ylabel('Binary Cross-Entropy Loss', fontsize=14)
ax.grid(True)
plt.show()

# Results noted from an earlier run ("1 hidden layers"):
#   Precision: 0.6412
#   Recall:    0.5748
#   F1-Score:  0.6062