In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import ADASYN
from joblib import parallel_backend

In [2]:
#Function to calculate Z score
def calculateZScore(row, stats=None):
    """Return how far a transaction amount is from its card's mean, in standard deviations.

    Parameters
    ----------
    row : mapping-like (for example one DataFrame row)
        Must provide the keys 'Card Number' and 'Amount'.
    stats : dict, optional
        Maps a card number to {'Mean Amount': ..., 'Std Amount': ...}.
        When omitted, the notebook-level ``cardStats`` dictionary is used,
        so ``df.apply(calculateZScore, axis=1)`` keeps working unchanged.

    Returns
    -------
    float
        ``(amount - mean) / std``. When the card's standard deviation is
        missing (NaN) or zero the score is undefined, and 0.0 is returned
        instead of NaN/inf so the value can be fed to an estimator.

    Raises
    ------
    KeyError
        If the card number has no entry in the statistics dictionary.
    """
    if stats is None:
        stats = cardStats  # notebook-level lookup built in the card-statistics cell

    card_stats = stats[row['Card Number']]
    std_amount = card_stats['Std Amount']

    # No usable spread for this card: treat the transaction as "typical".
    if pd.isna(std_amount) or std_amount == 0:
        return 0.0

    return (row['Amount'] - card_stats['Mean Amount']) / std_amount

In [3]:
# Load the training transactions and discard columns that are not used as features
file_path1 = r"C:\Users\youss\Desktop\FraudDetection\fraudTrain.csv"
file_path2 = r"C:\Users\youss\Desktop\FraudDetection\fraudTest.csv"  # defined but not loaded in this cell

# Index by transaction id, then remove the ID/name columns and 'merchant'
# (merchant has too many different labels to be useful).
df = (
    pd.read_csv(file_path1)
    .set_index('trans_num')
    .drop(columns=['ID', 'firstName', 'lastName', 'merchant'])
)

In [4]:
# Parse the 'Time' strings into datetimes (unparseable values become NaT),
# then discard every row that has a missing value in any column.
df = df.assign(Time=pd.to_datetime(df["Time"], errors='coerce')).dropna()

In [5]:
# Derive calendar features (hour, weekday, month) from the parsed timestamp
df = df.assign(**{
    'Hour of Day': df['Time'].dt.hour,
    'Day of Week': df['Time'].dt.dayofweek,
    'Month': df['Time'].dt.month,
})

In [6]:
# Encode the 'category' strings as integer labels in a new 'Category' column
label_encoder = LabelEncoder()
label_encoder.fit(df['category'])
df['Category'] = label_encoder.transform(df['category'])

In [7]:
# Per-card spending profile: mean and sample standard deviation of 'Amount'
grouped_by_card = df.groupby('Card Number')

# One vectorised aggregation instead of a Python-level loop over every group.
# rename_axis(None) leaves the index unnamed so the CSV keeps the same layout
# as before (blank index header followed by the two statistic columns).
# The standard deviation is NaN for a card that has a single transaction.
cardStatsDf = (
    grouped_by_card['Amount']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'Mean Amount', 'std': 'Std Amount'})
    .rename_axis(None)
)

# Lookup table used by calculateZScore:
#   card number -> {'Mean Amount': ..., 'Std Amount': ...}
# Values are read from the NumPy arrays so they remain NumPy floats, which
# keeps arithmetic on them (including division by a zero std) non-raising.
cardStats = {
    card: {'Mean Amount': mean_amount, 'Std Amount': std_amount}
    for card, mean_amount, std_amount in zip(
        cardStatsDf.index,
        cardStatsDf['Mean Amount'].to_numpy(),
        cardStatsDf['Std Amount'].to_numpy(),
    )
}

# Save the per-card statistics so they can potentially be reused elsewhere
cardStatsDf.to_csv('CardStats.csv')

In [8]:
# Score each transaction row against its own card's amount statistics
df['Z Score'] = df.apply(calculateZScore, axis='columns')

In [9]:
# Seconds elapsed since the previous transaction on the same card.
# Sorting by card and time makes diff() compare consecutive transactions;
# the result is aligned back onto df through the index (trans_num).
# NOTE(review): the alignment presumes index values are unique - confirm.
df_sorted = df.sort_values(by=['Card Number', 'Time'])
df['Time Difference'] = df_sorted.groupby('Card Number')['Time'].diff().dt.total_seconds()

# The first transaction of each card has no predecessor (NaN): impute the median gap
median = df['Time Difference'].median()
df['Time Difference'] = df['Time Difference'].fillna(median)

# Log transform, then normalize the LOG column. Previously the raw
# 'Time Difference' was scaled here, so the column named
# 'Normalized Log Time Difference' never contained the log-transformed values.
# NOTE(review): the median and the scaler are fitted on the full dataset,
# before the train/test split further down.
scaler = MinMaxScaler()
df['Log Time Difference'] = np.log1p(df['Time Difference'])  # log(1 + x): handles a gap of 0 seconds
df['Normalized Log Time Difference'] = scaler.fit_transform(df[['Log Time Difference']]).ravel()

In [10]:
# Base-10 log of the transaction amount, then min-max scaling of that log column.
# NOTE(review): no +1 offset is applied here, so an amount of 0 would map to -inf - confirm amounts are > 0.
df['Log Transformed Amount'] = np.log10(df['Amount'])

scaler = MinMaxScaler()
scaler.fit(df[['Log Transformed Amount']])
df['Normalized Log Amount'] = scaler.transform(df[['Log Transformed Amount']])

In [11]:
# Remove the raw inputs and intermediate columns that are no longer needed
df = df.drop(columns=[
    "Amount",
    "Time",
    "Card Number",
    "Log Transformed Amount",
    "Time Difference",
    "Log Time Difference",
    "category",
])

In [12]:
# Feature matrix (engineered columns) and the fraud label
feature_columns = [
    'Category',
    'Hour of Day',
    'Day of Week',
    'Month',
    'Z Score',
    'Normalized Log Time Difference',
    'Normalized Log Amount',
]
X = df[feature_columns]
y = df['is_fraud']

In [13]:
# Hold out a test set (default split size).
# stratify=y keeps the fraud / non-fraud ratio the same in both splits, and
# random_state makes the split - and every metric computed from it - reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [14]:
# Oversample the training split with ADASYN (fixed seed); the test split is left untouched
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X=X_train, y=y_train)

In [15]:
# Elastic-net logistic regression fitted on the resampled training data.
# The 'saga' solver shuffles the data, so random_state is fixed to make the
# fitted coefficients reproducible across runs.
model = LogisticRegression(
    class_weight='balanced',
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    max_iter=2000,
    random_state=42,
)
with parallel_backend('threading', n_jobs=-1):
    model.fit(X_train_resampled, y_train_resampled)

In [16]:
# Predicted probability of the second class (column 1), thresholded at 0.5
y_prob = model.predict_proba(X_test)[:, 1]
y_hat = (y_prob > 0.5).astype(int)

# labels=[0, 1] forces a 2x2 matrix even if one class is absent from the
# predictions. Assumes 'is_fraud' is encoded as 0 / 1 - TODO confirm.
conf_matrix_custom = confusion_matrix(y_test, y_hat, labels=[0, 1])
TP = conf_matrix_custom[1, 1]
FP = conf_matrix_custom[0, 1]
TN = conf_matrix_custom[0, 0]
FN = conf_matrix_custom[1, 0]

# Each ratio falls back to 0.0 when its denominator is zero
# (for example, precision when nothing was predicted as fraud).
total = TP + FP + TN + FN
accuracy = (TP + TN) / total if total else 0.0
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
# F1 is the harmonic mean of precision and recall: 2PR / (P + R).
# The factor 2 was previously missing, which halved the reported score.
F1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {F1_score}")
print(f"TP: {TP}")
print(f"TN: {TN}")
print(f"FP: {FP}")
print(f"FN: {FN}")

NameError: name 'model' is not defined