In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [3]:
train_df = pd.read_csv("/content/fraudTrain.csv")
test_df = pd.read_csv("/content/fraudTest.csv")

df = pd.concat([train_df, test_df], axis=0)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 488808 entries, 0 to 252125
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             488808 non-null  int64  
 1   trans_date_trans_time  488808 non-null  object 
 2   cc_num                 488808 non-null  int64  
 3   merchant               488807 non-null  object 
 4   category               488807 non-null  object 
 5   amt                    488807 non-null  float64
 6   first                  488807 non-null  object 
 7   last                   488807 non-null  object 
 8   gender                 488807 non-null  object 
 9   street                 488807 non-null  object 
 10  city                   488807 non-null  object 
 11  state                  488807 non-null  object 
 12  zip                    488807 non-null  float64
 13  lat                    488807 non-null  float64
 14  long                   488807 non-null  f

In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,488808.0,488808.0,488807.0,488807.0,488807.0,488807.0,488807.0,488806.0,488806.0,488806.0,488806.0
mean,122323.489174,4.173906e+17,70.194375,48795.079678,38.539353,-90.208685,89197.83,1353972000.0,38.53882,-90.208175,0.005994
std,70764.300838,1.309128e+18,154.857996,26871.037302,5.07185,13.737411,301988.4,22515150.0,5.107275,13.749964,0.07719
min,0.0,3040768000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.029798,-166.671575,0.0
25%,61100.75,180040000000000.0,9.65,26237.0,34.6689,-96.7909,743.0,1331323000.0,34.748487,-96.874618,0.0
50%,122201.5,3520550000000000.0,47.47,48174.0,39.3716,-87.4616,2443.0,1371991000.0,39.366157,-87.421019,0.0
75%,183302.25,4635331000000000.0,83.205,72011.0,41.9404,-80.158,20478.0,1375683000.0,41.957998,-80.237229,0.0
max,252125.0,4.992346e+18,17897.24,99783.0,66.6933,-67.9503,2906700.0,1379773000.0,67.510267,-66.952352,1.0


In [7]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
252121,False
252122,False
252123,False
252124,False


In [8]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
trans_date_trans_time,0
cc_num,0
merchant,1
category,1
amt,1
first,1
last,1
gender,1
street,1


In [9]:
df.drop(columns=[
    'Unnamed: 0', 'trans_date_trans_time',
    'cc_num', 'merchant', 'first', 'last',
    'street', 'city', 'state', 'zip',
    'dob', 'trans_num'
], inplace=True)


In [10]:
df = pd.get_dummies(df, drop_first=True)


In [12]:
df.dropna(subset=['is_fraud'], inplace=True)
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [14]:
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)

print("Logistic Regression")
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))
print("ROC-AUC:", roc_auc_score(y_test, lr.predict_proba(X_test)[:,1]))


Logistic Regression
[[85466 11710]
 [   83   503]]
              precision    recall  f1-score   support

         0.0       1.00      0.88      0.94     97176
         1.0       0.04      0.86      0.08       586

    accuracy                           0.88     97762
   macro avg       0.52      0.87      0.51     97762
weighted avg       0.99      0.88      0.93     97762

ROC-AUC: 0.9569049058026659


In [15]:
dt = DecisionTreeClassifier(
    max_depth=10,
    class_weight='balanced',
    random_state=42
)
dt.fit(X_train, y_train)

dt_pred = dt.predict(X_test)

print("Decision Tree")
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))
print("ROC-AUC:", roc_auc_score(y_test, dt.predict_proba(X_test)[:,1]))


Decision Tree
[[91757  5419]
 [   30   556]]
              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97     97176
         1.0       0.09      0.95      0.17       586

    accuracy                           0.94     97762
   macro avg       0.55      0.95      0.57     97762
weighted avg       0.99      0.94      0.97     97762

ROC-AUC: 0.9733417178949226


In [16]:
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

print("Random Forest")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))


Random Forest
[[95387  1789]
 [  131   455]]
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     97176
         1.0       0.20      0.78      0.32       586

    accuracy                           0.98     97762
   macro avg       0.60      0.88      0.66     97762
weighted avg       0.99      0.98      0.99     97762

ROC-AUC: 0.9612436345748652
