### Install necessary libraries

In [81]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from joblib import dump

import warnings
warnings.filterwarnings('ignore')

### Load and preprocess the data

In [82]:
df = pd.read_csv('fraud_data.csv')
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1


In [83]:
df.shape

(14446, 15)

In [84]:
df.describe()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long
count,14446.0,14446.0,14446.0,14446.0,14446.0,14446.0
mean,124.430073,39.787692,-110.874225,106537.0,39.787991,-110.874892
std,231.352587,5.317039,12.985813,290291.6,5.360593,12.995596
min,1.0,20.0271,-165.6723,46.0,19.032689,-166.670685
25%,12.08,36.7154,-120.4158,493.0,36.794655,-120.146253
50%,51.52,39.6662,-111.0985,1645.0,39.620953,-111.192629
75%,101.03,41.9404,-101.136,35439.0,42.27574,-100.446822
max,3261.47,66.6933,-89.6287,2383912.0,67.510267,-88.646366


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14446 entries, 0 to 14445
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trans_date_trans_time  14446 non-null  object 
 1   merchant               14446 non-null  object 
 2   category               14446 non-null  object 
 3   amt                    14446 non-null  float64
 4   city                   14446 non-null  object 
 5   state                  14446 non-null  object 
 6   lat                    14446 non-null  float64
 7   long                   14446 non-null  float64
 8   city_pop               14446 non-null  int64  
 9   job                    14446 non-null  object 
 10  dob                    14446 non-null  object 
 11  trans_num              14446 non-null  object 
 12  merch_lat              14446 non-null  float64
 13  merch_long             14446 non-null  float64
 14  is_fraud               14446 non-null  object 
dtypes:

In [86]:
df.isnull().sum()

trans_date_trans_time    0
merchant                 0
category                 0
amt                      0
city                     0
state                    0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [87]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d-%m-%Y %H:%M')
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_hour
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1,0
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1,15
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1,22
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1,23
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1,23


In [88]:
df['dob'] = pd.to_datetime(df['dob'], dayfirst=True)
current_date = datetime.now()
df['age'] = (current_date - df['dob']).dt.days // 365
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_hour,age
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1,0,85
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1,15,85
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1,22,85
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1,23,85
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1,23,85


In [89]:
df['location'] = df['city'] + ',' + df['state'].astype(str)
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_hour,age,location
0,2019-01-04 00:58:00,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1,0,85,"Wales,AK"
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1,15,85,"Wales,AK"
2,2019-01-04 22:37:00,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1,22,85,"Wales,AK"
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1,23,85,"Wales,AK"
4,2019-01-04 23:59:00,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",1939-11-09,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1,23,85,"Wales,AK"


In [90]:
df.drop(['trans_date_trans_time', 'city', 'state', 'lat', 'long', 'city_pop', 'trans_num',
                       'merch_lat', 'merch_long', 'dob'], axis = 1, inplace = True)

In [91]:
df.head()

Unnamed: 0,merchant,category,amt,job,is_fraud,trans_hour,age,location
0,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,"""Administrator, education""",1,0,85,"Wales,AK"
1,Predovic Inc,shopping_net,966.11,"""Administrator, education""",1,15,85,"Wales,AK"
2,Wisozk and Sons,misc_pos,49.61,"""Administrator, education""",1,22,85,"Wales,AK"
3,Murray-Smitham,grocery_pos,295.26,"""Administrator, education""",1,23,85,"Wales,AK"
4,Friesen Lt,health_fitness,18.17,"""Administrator, education""",1,23,85,"Wales,AK"


In [92]:
new_df = df.to_csv('updated_fraud_data.csv')

In [93]:
df.dtypes

merchant       object
category       object
amt           float64
job            object
is_fraud       object
trans_hour      int32
age             int64
location       object
dtype: object

### Encode Categorical features

In [94]:
# Initialize LabelEncoder & encode categorical features
encoder = LabelEncoder()
columns_to_encode = ['merchant', 'category', 'job', 'location']
df[columns_to_encode] = df[columns_to_encode].apply(encoder.fit_transform)
df.head()

Unnamed: 0,merchant,category,amt,job,is_fraud,trans_hour,age,location
0,179,3,14.37,1,1,0,85,167
1,564,11,966.11,1,1,15,85,167
2,682,9,49.61,1,1,22,85,167
3,537,4,295.26,1,1,23,85,167
4,352,5,18.17,1,1,23,85,167


In [95]:
df['is_fraud'].value_counts()

is_fraud
0                         12600
1                          1844
1"2020-12-24 16:56:24"        1
0"2019-01-01 00:00:44"        1
Name: count, dtype: int64

In [96]:
df = df[df['is_fraud'].isin(['0', '1', 0, 1])]
df['is_fraud'] = df['is_fraud'].astype(int)
print(df['is_fraud'].value_counts())

is_fraud
0    12600
1     1844
Name: count, dtype: int64


In [97]:
x = df.drop(columns = 'is_fraud', axis = 1)
x

Unnamed: 0,merchant,category,amt,job,trans_hour,age,location
0,179,3,14.37,1,0,85,167
1,564,11,966.11,1,15,85,167
2,682,9,49.61,1,22,85,167
3,537,4,295.26,1,23,85,167
4,352,5,18.17,1,23,85,167
...,...,...,...,...,...,...,...
14441,411,12,122.00,78,0,48,8
14442,126,9,9.07,16,0,68,48
14443,549,4,104.84,2,0,51,2
14444,14,12,268.16,1,0,85,167


In [98]:
y = df['is_fraud']
y

0        1
1        1
2        1
3        1
4        1
        ..
14441    0
14442    0
14443    0
14444    0
14445    0
Name: is_fraud, Length: 14444, dtype: int32

### Scale input features

In [99]:
scalar = StandardScaler()
x_scaled = scalar.fit_transform(x)
x_scaled

array([[-0.83643561, -0.87358494, -0.47578177, ..., -1.78784723,
         1.88438912,  1.57561578],
       [ 1.11557346,  1.18264595,  3.6379487 , ...,  0.26978256,
         1.88438912,  1.57561578],
       [ 1.71385157,  0.66858823, -0.323463  , ...,  1.23000979,
         1.88438912,  1.57561578],
       ...,
       [ 1.03952116, -0.61655608, -0.08474094, ..., -1.78784723,
        -0.0921634 , -1.76149334],
       [-1.67301093,  1.43967481,  0.62118133, ..., -1.78784723,
         1.88438912,  1.57561578],
       [-0.16717536, -1.38764266, -0.32138828, ..., -1.78784723,
         1.24491624,  1.45426635]])

### Split the dataset and handle imbalance

In [100]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.20, stratify= y, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((11555, 7), (2889, 7), (11555,), (2889,))

In [101]:
# Balance the dataset using SMOTE
smote = SMOTE(sampling_strategy='auto')
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)
x_train_resampled.shape, y_train_resampled.shape

((20160, 7), (20160,))

### Model Building

In [102]:
# Define the parameter grid for Decision Tree
param_dist = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Split quality measures
    'splitter': ['best', 'random'],  # Splitting strategy
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples in a leaf
    'max_features': [None, 'sqrt', 'log2'],  # Number of features considered for splitting
}

In [103]:
# Initialize RandomizedSearchCV for Decision Tree Classifier
random_search_dt = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  
    scoring='accuracy',
    cv=5,  
    random_state=42,
    n_jobs=-1  
)

# Fit RandomizedSearchCV to the training data
random_search_dt.fit(x_train_resampled, y_train_resampled)
print(f'Best Parameters: {random_search_dt.best_params_}')

Best Parameters: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10, 'criterion': 'log_loss'}


In [104]:
# Evaluate the best model
dt_model = random_search_dt.best_estimator_

y_pred_train = dt_model.predict(x_train_resampled)
print(f"Training Accuracy: {accuracy_score(y_train_resampled, y_pred_train)}")

y_pred_dt = dt_model.predict(x_test)
print(f"Testing Accuracy: {accuracy_score(y_test, y_pred_dt)}")
print(classification_report(y_test, y_pred_dt))

Training Accuracy: 0.9855654761904762
Testing Accuracy: 0.9688473520249221
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2520
           1       0.84      0.93      0.88       369

    accuracy                           0.97      2889
   macro avg       0.92      0.95      0.93      2889
weighted avg       0.97      0.97      0.97      2889



### Save the model

In [105]:
dump(dt_model, 'dt_model.joblib')
dump(encoder, 'label_encoder.joblib')
dump(scalar, 'scalar.joblib')
print("Model, encoder and standard scalar are dumped succesfully")

Model, encoder and standard scalar are dumped succesfully
