In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
data_df=pd.read_csv('credit_card_transactions.csv')
data_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


In [3]:
data_df.shape

(1296675, 24)

In [4]:
data_df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1100702.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652,46825.75
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269,25834.0
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0,1001.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0,25114.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0,45860.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0,68319.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0,99403.0


In [5]:
data_df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [17]:
data_df=data_df.drop(columns=['Unnamed: 0','first', 'last', 'street','trans_num','cc_num','dob'])

In [9]:
data_df['amount']=data_df['amt']
data_df.drop(columns=['amt'],inplace=True)

In [11]:
data_df['timestamp']=data_df['trans_date_trans_time']
data_df.drop(columns=['trans_date_trans_time'],inplace=True)

In [15]:

data_df = data_df.sort_values(['cc_num','unix_time'])

# convert to datetime
data_df['unix_time'] = pd.to_datetime(data_df['unix_time'], unit='s')

# time difference
data_df['time_diff'] = (
    data_df.groupby('cc_num')['unix_time']
    .diff()
    .dt.total_seconds()
    .fillna(0)
)

# tx count in last 24h
data_df['tx_count_24h'] = (
    data_df
    .set_index('unix_time')
    .groupby('cc_num')
    .rolling('86400s')
    .count()['amount']
    .reset_index(level=0, drop=True)
    .reset_index(drop=True)
)

# location change
data_df['location_change'] = (
    data_df.groupby('cc_num')['city']
    .transform(lambda x: (x != x.shift()).astype(int))
)

# device change
data_df['device_change'] = (
    data_df.groupby('cc_num')['job']
    .transform(lambda x: (x != x.shift()).astype(int))
)

# biometric confidence
data_df['biometric_confidence'] = np.where(
    data_df['is_fraud'] == 1,
    np.random.uniform(0.65, 0.85, size=len(data_df)),
    np.random.uniform(0.93, 0.99, size=len(data_df))
)


In [18]:
data_df.head()

Unnamed: 0,merchant,category,gender,city,state,zip,lat,long,city_pop,job,...,is_fraud,merch_zipcode,amtount,amount,timestamp,time_diff,tx_count_24h,location_change,device_change,biometric_confidence
1017,"fraud_Jones, Sawayn and Romaguera",misc_net,F,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,...,0,,7.27,7.27,2019-01-01 12:47:15,0.0,5.0,1,1,0.951209
2724,fraud_Berge LLC,gas_transport,F,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,...,0,,52.94,52.94,2019-01-02 08:44:57,71862.0,1.0,0,0,0.965988
2726,fraud_Luettgen PLC,gas_transport,F,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,...,0,,82.08,82.08,2019-01-02 08:47:36,159.0,3.0,0,0,0.933301
2882,fraud_Daugherty LLC,kids_pets,F,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,...,0,,34.79,34.79,2019-01-02 12:38:14,13838.0,3.0,0,0,0.975792
2907,fraud_Beier and Sons,home,F,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Information systems manager,...,0,82649.0,27.18,27.18,2019-01-02 13:10:46,1952.0,3.0,0,0,0.96027
