In [297]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [73]:
#Function to calculate Z score
def calculateZScore(row):
    card_number = row['Card Number']
    mean_amount = cardStats[card_number]['Mean Amount']
    std_amount = cardStats[card_number]['Std Amount']
    zScore = (row['Amount'] - mean_amount) / std_amount
    return zScore

In [74]:
#Read from file, change columns
file_path1 = r"C:\Users\youss\Desktop\FraudDetection\fraudTrain.csv"
file_path2 = r"C:\Users\youss\Desktop\FraudDetection\fraudTest.csv"

df = pd.read_csv(file_path1)
#df2 = pd.read_csv(file_path2)
#df = pd.concat([df1, df2], ignore_index=True)

df.set_index('trans_num', inplace = True)
df = df.drop('ID', axis = 1)
df = df.drop('firstName', axis = 1)
df = df.drop('lastName', axis = 1)
df = df.drop('merchant', axis = 1) #Remove merchant, too many different labels

In [75]:
#Change time string to object
df["Time"] = pd.to_datetime(df["Time"], errors='coerce')
df.dropna(inplace=True)

In [76]:
#Split time to Day of week, hour, and month
df['Hour of Day'] = df['Time'].dt.hour
df['Day of Week'] = df['Time'].dt.dayofweek
df['Month'] = df['Time'].dt.month

In [77]:
#Turn category into labels
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['category'])

In [78]:
#Group data by card num
grouped_by_card = df.groupby('Card Number')
cardStats = {}

#calculate standard deviation for each card
for card, data in grouped_by_card:
    mean = data['Amount'].mean()
    std = data['Amount'].std()
    cardStats[card] = {'Mean Amount': mean, 'Std Amount': std}

#save data into a dictionary to potentially be used
cardStatsDf = pd.DataFrame.from_dict(cardStats, orient='index')
cardStatsDf.to_csv('CardStats.csv')

In [79]:
#Apply Z Score Function
df['Z Score'] = df.apply(calculateZScore, axis = 1)

In [80]:
#sort data by time
df_sorted = df.sort_values(by=['Card Number', 'Time'])

#.diff() calculates the difference between consecutive entries
df['Time Difference'] = df_sorted.groupby('Card Number')['Time'].diff().dt.total_seconds() 
median = df['Time Difference'].median()
df['Time Difference'] = df['Time Difference'].fillna(median)

#Log transform and normalize, try different combinations to find best accuracy
scaler = MinMaxScaler()
df['Log Time Difference'] = np.log(df['Time Difference'] + 1) #adding 1 to deal with log(0)
df['Normalized Log Time Difference'] = scaler.fit_transform(df[['Time Difference']])

In [81]:
#Log transform and normalize, try different combinations to find best accuracy
scaler = MinMaxScaler()
df['Log Transformed Amount'] = np.log10(df['Amount'])
df['Normalized Log Amount'] = scaler.fit_transform(df[['Log Transformed Amount']])

In [82]:
#drop unneeded data
df = df.drop("Amount", axis = 1)
df = df.drop("Time", axis = 1)
df = df.drop("Card Number", axis = 1)
df = df.drop("Log Transformed Amount", axis = 1)
df = df.drop("Time Difference", axis = 1)
df = df.drop("Log Time Difference", axis = 1)
df = df.drop("category", axis = 1)

In [83]:
X = df[['Category', 'Hour of Day', 'Day of Week', 'Month', 'Z Score', 'Normalized Log Time Difference', 'Normalized Log Amount']]
y = df['is_fraud']

In [305]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [353]:
xgb_model = XGBClassifier(scale_pos_weight=(len(y_train)-y_train.sum())/y_train.sum(), alpha = 5, learning_rate = 0.3)
xgb_model.fit(X_train, y_train)

In [354]:
custom_threshold = 0.01
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_proba > custom_threshold).astype(int)

In [355]:
conf_matrix = confusion_matrix(y_val, y_val_pred)

TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

accuracy = (TP + TN) / (TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1_score = (precision * recall) / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {F1_score}")
print(f"TP: {TP}")
print(f"TN: {TN}")
print(f"FP: {FP}")
print(f"FN: {FN}")

Accuracy: 0.936663323703604
Precision: 0.07961390521808549
Recall: 0.9891794409377818
F1 Score: 0.07368350349274584
TP: 1097
TN: 186630
FP: 12682
FN: 12


In [356]:
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba > custom_threshold).astype(int)

In [357]:
conf_matrix = confusion_matrix(y_test, y_test_pred)

TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

accuracy = (TP + TN) / (TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1_score = (precision * recall) / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {F1_score}")
print(f"TP: {TP}")
print(f"TN: {TN}")
print(f"FP: {FP}")
print(f"FN: {FN}")

Accuracy: 0.9370475149809651
Precision: 0.08219277810133954
Recall: 0.9912203687445127
F1 Score: 0.07589915966386554
TP: 1129
TN: 186675
FP: 12607
FN: 10


In [None]:
single_data_point = [[category_value, hour_of_day_value, day_of_week_value, month_value, z_score_value, time_between_transactions_value, amount_value]]