**Importing necessary libraries**

In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic
from scipy.stats import entropy
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [12]:
data = pd.read_csv('drive/MyDrive/Colab/Dataset/fraud_data.csv')
data.head(6)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1
5,05-01-2019 03:15,"""Raynor, Reinger and Hagenes""",gas_transport,20.45,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,ef010a5f4f570d306a050a368ee2729d,64.088838,-165.104078,1


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14446 entries, 0 to 14445
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trans_date_trans_time  14446 non-null  object 
 1   merchant               14446 non-null  object 
 2   category               14446 non-null  object 
 3   amt                    14446 non-null  float64
 4   city                   14446 non-null  object 
 5   state                  14446 non-null  object 
 6   lat                    14446 non-null  float64
 7   long                   14446 non-null  float64
 8   city_pop               14446 non-null  int64  
 9   job                    14446 non-null  object 
 10  dob                    14446 non-null  object 
 11  trans_num              14446 non-null  object 
 12  merch_lat              14446 non-null  float64
 13  merch_long             14446 non-null  float64
 14  is_fraud               14446 non-null  object 
dtypes:

**DATA PREPROCESSING**

In [14]:
# checking the no of missing values in the each column
data.isnull().sum()

Unnamed: 0,0
trans_date_trans_time,0
merchant,0
category,0
amt,0
city,0
state,0
lat,0
long,0
city_pop,0
job,0


In [15]:
#Delete duplicate data
data.drop_duplicates(inplace=True)

In [16]:
#Processing time data
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'], format='%d-%m-%Y %H:%M')
data['trans_hour'] = data['trans_date_trans_time'].dt.hour
data['trans_day'] = data['trans_date_trans_time'].dt.day
data['trans_month'] = data['trans_date_trans_time'].dt.month
data['trans_year'] = data['trans_date_trans_time'].dt.year
data['trans_day_of_week'] = data['trans_date_trans_time'].dt.dayofweek

In [17]:
#Converting categorical data to numerical using Label Encoding
label_encoder = LabelEncoder()
data['merchant_encoded'] = label_encoder.fit_transform(data['merchant'])
data['category_encoded'] = label_encoder.fit_transform(data['category'])
data['city_encoded'] = label_encoder.fit_transform(data['city'])
data['state_encoded'] = label_encoder.fit_transform(data['state'])
data['job_encoded'] = label_encoder.fit_transform(data['job'])

In [18]:
#Create a new Feature that calculates the distance between user location (lat, long) and merchant location (merch_lat, merch_long)
def calculate_distance(row):
  user_location = (row['lat'], row['long'])
  merchant_location = (row['merch_lat'], row['merch_long'])
  return geodesic(user_location, merchant_location).km

In [19]:
data['distance'] = data.apply(calculate_distance, axis=1)

In [20]:
#Counting unique cities
unique_cities = data['city_pop'].nunique()
print(f"Number of unique cities: {unique_cities}")

Number of unique cities: 174


In [21]:
#Calculating the frequency of each city
city_counts = data['city_pop'].value_counts()
print(city_counts)

city_pop
1312922    297
241        282
2368       197
149        192
1789       187
          ... 
478404       9
2601         8
5505         8
12866        8
99475        7
Name: count, Length: 174, dtype: int64


In [22]:
#Measuring city diversity
city_probabilities = city_counts / len(data)
city_entropy = entropy(city_probabilities)
print(f"City entropy: {city_entropy}")

City entropy: 4.969442210506932


In [24]:
#Coefficient of Variation
cv = data['city_pop'].std() / data['city_pop'].mean()
print(f"City Population CoV: {cv}")

City Population CoV: 2.726177521024549


In [25]:
#Transformation
data['city_pop_log'] = np.log1p(data['city_pop'])

In [26]:
#Create a new feature from the difference between transaction date and birth date
data['dob'] = pd.to_datetime(data['dob'], format='%d-%m-%Y')
data['age'] = data['trans_date_trans_time'].dt.year - data['dob'].dt.year

In [28]:
#Finding outliers using IQR (Interquartile Range)
Q1 = data['amt'].quantile(0.25)
Q3 = data['amt'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outlier = data[(data['amt'] < lower_bound) | (data['amt'] > upper_bound)]
outlier

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,...,trans_year,trans_day_of_week,merchant_encoded,category_encoded,city_encoded,state_encoded,job_encoded,distance,city_pop_log,age
1,2019-01-04 15:06:00,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",...,2019,4,564,11,166,0,1,80.073191,4.983607,80
3,2019-01-04 23:06:00,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",...,2019,4,537,4,166,0,1,39.752058,4.983607,80
7,2019-01-05 11:31:00,Padberg-Welch,grocery_pos,367.29,Browning,MO,40.0290,-93.1607,602,Cytogeneticist,...,2019,5,553,4,18,6,77,106.515828,6.401917,65
8,2019-01-05 18:03:00,McGlynn-Heathcote,misc_net,768.15,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",...,2019,5,519,8,166,0,1,37.903421,4.983607,80
9,2019-01-05 22:02:00,Dooley-Thompson,misc_net,849.49,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",...,2019,5,324,8,166,0,1,68.461376,4.983607,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14282,2019-01-21 18:05:00,"""Larson, Quitzon and Spence""",travel,455.51,Williamsburg,MO,38.8874,-91.7689,710,Glass blower/designer,...,2019,0,99,13,173,6,92,96.796622,6.566672,49
14313,2019-01-21 19:43:00,Lind-Buckridge,entertainment,278.86,Lakeport,CA,39.0470,-122.9328,11256,Podiatrist,...,2019,0,500,0,82,2,131,61.319473,9.328745,47
14390,2019-01-21 22:39:00,Kris-Kertzmann,travel,517.60,Alva,WY,44.6873,-104.4414,110,"""Administrator, local government""",...,2019,0,466,13,2,12,2,102.635898,4.709530,46
14398,2019-01-21 22:57:00,Schmeler Inc,misc_pos,825.37,Matthews,MO,36.7154,-89.6287,1019,Aeronautical engineer,...,2019,0,602,9,96,6,46,55.998137,6.927558,40


In [29]:
#Convert outlier values to upper or lower bound values (Capping)
data['amt_capped'] = np.where(data['amt'] > upper_bound, upper_bound,
                              np.where(data['amt'] < lower_bound, lower_bound, data['amt']))

In [30]:
data['is_fraud'].unique()

array(['1', '1"2020-12-24 16:56:24"', '0', '0"2019-01-01 00:00:44"'],
      dtype=object)

In [31]:
#Cleans up unneeded values and retrieves numbers before other characters
data['is_fraud'] = data['is_fraud'].str.extract('(\d+)').astype(int)

In [32]:
#Remove unnecessary features
data = data.drop(columns=['merchant', 'category', 'amt', 'city', 'state', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'merch_lat', 'merch_long'], axis = 1)

In [33]:
#Seprating the data for analysis
legit = data[data.is_fraud == 0]
fraud = data[data.is_fraud == 1]

print(legit.shape)
print(fraud.shape)

(12601, 16)
(1782, 16)


In [34]:
#Statistical measure of data
legit.amt_capped.describe()

Unnamed: 0,amt_capped
count,12601.0
mean,58.575463
std,56.96185
min,1.0
25%,9.77
50%,46.14
75%,81.91
max,232.4325


In [35]:
fraud.amt_capped.describe()

Unnamed: 0,amt_capped
count,1782.0
mean,180.948823
std,90.615982
min,1.78
25%,214.6775
50%,232.4325
75%,232.4325
max,232.4325


In [36]:
#Compare the value for both transactions
data.groupby('is_fraud').mean()

Unnamed: 0_level_0,trans_date_trans_time,trans_hour,trans_day,trans_month,trans_year,trans_day_of_week,merchant_encoded,category_encoded,city_encoded,state_encoded,job_encoded,distance,city_pop_log,age,amt_capped
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,2019-12-18 20:13:48.348544000,12.891993,19.033886,6.236807,2019.476073,2.793508,343.601143,6.291088,88.692643,6.058091,82.954051,75.880491,8.252598,47.683755,58.575463
1,2019-12-08 14:22:47.508417536,13.976431,15.985971,6.318182,2019.451178,3.022447,346.52862,7.143659,86.434343,5.999439,81.319865,75.582474,8.25988,50.776655,180.948823


**Undersampling**

**Build the dataset containing similar distribution of normal transactions and fraudulent transacton**

In [40]:
#Number of fraudulent transaction = 1782
legit_sample=legit.sample(n=1782)

In [42]:
#Concatenating two dataframes
df = pd.concat([legit_sample, fraud],axis=0)
df.head(6)

Unnamed: 0,trans_date_trans_time,is_fraud,trans_hour,trans_day,trans_month,trans_year,trans_day_of_week,merchant_encoded,category_encoded,city_encoded,state_encoded,job_encoded,distance,city_pop_log,age,amt_capped
7648,2020-12-31 19:54:00,0,19,31,12,2020,3,216,10,46,12,8,112.199198,6.156979,53,16.57
3148,2020-12-26 18:18:00,0,18,26,12,2020,5,545,1,69,9,80,89.134585,6.464588,27,58.98
11696,2019-01-13 16:45:00,0,16,13,1,2019,6,253,7,112,2,86,87.979228,11.56694,46,96.95
5357,2020-12-28 22:18:00,0,22,28,12,2020,0,373,5,44,2,31,119.455298,5.624018,91,109.86
9385,2019-01-06 08:03:00,0,8,6,1,2019,6,283,3,100,5,104,98.524179,11.339845,48,85.72
10931,2019-01-10 23:32:00,0,23,10,1,2019,3,421,6,24,6,155,100.478631,7.021976,62,14.77


In [44]:
#Split data into features (X) and target (y)
X = df.drop(columns = ['trans_date_trans_time', 'is_fraud'], axis = 1)
y = df['is_fraud']

In [45]:
#Split the dataset into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
#Feature normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**MODEL TRAINING**

**SUPPORT VECTOR MACHINE (SVM)**

In [48]:
#Train Model SVM with training data
model = SVC(C=100, gamma=0.01, kernel='rbf')
model.fit(X_train_scaled, y_train)

In [49]:
#Predict Model with testing data
y_pred = model.predict(X_test_scaled)

**MODEL EVALUATION**

In [50]:
#Model evaluation with Classification Report and Confusion Matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       378
           1       0.99      0.99      0.99       335

    accuracy                           0.99       713
   macro avg       0.99      0.99      0.99       713
weighted avg       0.99      0.99      0.99       713

[[374   4]
 [  2 333]]


In [51]:
#Model evaluation with ROC-AUC
print("ROC-AUC:", roc_auc_score(y_test, y_pred))

ROC-AUC: 0.9917239200821291


Checking for overfitting using Cross Validation

In [59]:
cv_scores_train = cross_val_score(model, X_train_scaled, y_train, cv=5)
cv_scores_test = cross_val_score(model, X_test_scaled, y_test, cv=5)
print("Cross-validation scores on Train data: ", cv_scores_train)
print("Mean cross-validation score on Train data: ", cv_scores_train.mean())
print("Cross-validation scores on Test data: ", cv_scores_test)
print("Mean cross-validation score on Test data: ", cv_scores_test.mean())

Cross-validation scores on Train data:  [0.98598949 0.98070175 0.99122807 0.99122807 0.98421053]
Mean cross-validation score on Train data:  0.9866715826343441
Cross-validation scores on Test data:  [0.95804196 0.95804196 0.95804196 0.95774648 0.96478873]
Mean cross-validation score on Test data:  0.959332217078696


This score indicates that the model performs very well in each data subset.