# Part 1- Credit Card Fraud Predictive Analysis
## By Maduako Akachi

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Load and read the dataset into panda dataframe

train = pd.read_csv('../input/fraud-detection/fraudTrain.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
#Get size of the dataframe
train.shape

(1296675, 23)

In [4]:
#Drop irrelevant columns
train.drop(['Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'street', 'lat', 'long', 'city_pop', 'job', 'trans_num', 'unix_time', 'merch_lat', 'merch_long'], axis=1, inplace=True)

In [5]:
train.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,city,state,zip,dob,is_fraud
0,2019-01-01 00:00:18,misc_net,4.97,F,Moravian Falls,NC,28654,1988-03-09,0
1,2019-01-01 00:00:44,grocery_pos,107.23,F,Orient,WA,99160,1978-06-21,0
2,2019-01-01 00:00:51,entertainment,220.11,M,Malad City,ID,83252,1962-01-19,0
3,2019-01-01 00:01:16,gas_transport,45.0,M,Boulder,MT,59632,1967-01-12,0
4,2019-01-01 00:03:06,misc_pos,41.96,M,Doe Hill,VA,24433,1986-03-28,0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 9 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   category               1296675 non-null  object 
 2   amt                    1296675 non-null  float64
 3   gender                 1296675 non-null  object 
 4   city                   1296675 non-null  object 
 5   state                  1296675 non-null  object 
 6   zip                    1296675 non-null  int64  
 7   dob                    1296675 non-null  object 
 8   is_fraud               1296675 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 89.0+ MB


In [7]:
#Get statistical information about the dataframe
train.describe()

Unnamed: 0,amt,zip,is_fraud
count,1296675.0,1296675.0,1296675.0
mean,70.35104,48800.67,0.005788652
std,160.316,26893.22,0.07586269
min,1.0,1257.0,0.0
25%,9.65,26237.0,0.0
50%,47.52,48174.0,0.0
75%,83.14,72042.0,0.0
max,28948.9,99783.0,1.0


In [8]:
#Check for missing values for train
train.isnull().sum()

trans_date_trans_time    0
category                 0
amt                      0
gender                   0
city                     0
state                    0
zip                      0
dob                      0
is_fraud                 0
dtype: int64

In [9]:
#Change format of trans_date_trans_time for better readability
train['trans_date_trans_time'] = pd.to_datetime(train['trans_date_trans_time'])

In [10]:
train['dob'] = pd.to_datetime(train['dob'], errors='coerce')

In [11]:
#convert values in dob column to a more suitable format in  a new column named age
from datetime import datetime

train['age']=(datetime.today()-train['dob']).dt.days/365.25
train['age']=train['age'].astype(int)

In [12]:
#Drop the dob column
train.drop(['dob'], axis=1, inplace=True)

In [13]:
train.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,city,state,zip,is_fraud,age
0,2019-01-01 00:00:18,misc_net,4.97,F,Moravian Falls,NC,28654,0,34
1,2019-01-01 00:00:44,grocery_pos,107.23,F,Orient,WA,99160,0,44
2,2019-01-01 00:00:51,entertainment,220.11,M,Malad City,ID,83252,0,60
3,2019-01-01 00:01:16,gas_transport,45.0,M,Boulder,MT,59632,0,55
4,2019-01-01 00:03:06,misc_pos,41.96,M,Doe Hill,VA,24433,0,36


In [14]:
train[(train['gender'] == 'M') & (train['is_fraud'] ==1)]

Unnamed: 0,trans_date_trans_time,category,amt,gender,city,state,zip,is_fraud,age
2449,2019-01-02 01:06:37,grocery_pos,281.06,M,Collettsville,NC,28611,1,34
2546,2019-01-02 03:38:03,gas_transport,7.03,M,Collettsville,NC,28611,1,34
2937,2019-01-02 13:38:08,shopping_net,844.80,M,Collettsville,NC,28611,1,34
3527,2019-01-02 23:52:08,misc_net,843.91,M,Collettsville,NC,28611,1,34
4654,2019-01-03 22:21:15,shopping_net,942.62,M,Collettsville,NC,28611,1,34
...,...,...,...,...,...,...,...,...,...
1295103,2020-06-20 22:28:18,misc_net,735.19,M,Benton,WI,53803,1,64
1295219,2020-06-20 23:17:07,grocery_pos,307.71,M,Benton,WI,53803,1,64
1295274,2020-06-20 23:40:26,misc_net,725.60,M,Benton,WI,53803,1,64
1295532,2020-06-21 02:16:56,gas_transport,10.24,M,Denham Springs,LA,70726,1,28


In [15]:
train[(train['gender'] == 'F') & (train['is_fraud'] ==1)]

Unnamed: 0,trans_date_trans_time,category,amt,gender,city,state,zip,is_fraud,age
2472,2019-01-02 01:47:29,gas_transport,11.52,F,San Antonio,TX,78208,1,61
2523,2019-01-02 03:05:23,grocery_pos,276.31,F,San Antonio,TX,78208,1,61
2553,2019-01-02 03:55:47,grocery_pos,275.73,F,San Antonio,TX,78208,1,61
3580,2019-01-03 01:05:27,gas_transport,10.76,F,San Antonio,TX,78208,1,61
3600,2019-01-03 01:35:52,grocery_pos,332.35,F,San Antonio,TX,78208,1,61
...,...,...,...,...,...,...,...,...,...
1295314,2020-06-21 00:05:03,grocery_net,15.87,F,Notrees,TX,79759,1,53
1295315,2020-06-21 00:07:09,misc_pos,9.18,F,Vero Beach,FL,32960,1,36
1295399,2020-06-21 01:00:08,shopping_net,977.01,F,Vero Beach,FL,32960,1,36
1295491,2020-06-21 01:53:35,shopping_net,1210.91,F,Vero Beach,FL,32960,1,36


In [16]:
train['amt'] = train['amt'].astype(np.int64)

In [17]:
train.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,city,state,zip,is_fraud,age
0,2019-01-01 00:00:18,misc_net,4,F,Moravian Falls,NC,28654,0,34
1,2019-01-01 00:00:44,grocery_pos,107,F,Orient,WA,99160,0,44
2,2019-01-01 00:00:51,entertainment,220,M,Malad City,ID,83252,0,60
3,2019-01-01 00:01:16,gas_transport,45,M,Boulder,MT,59632,0,55
4,2019-01-01 00:03:06,misc_pos,41,M,Doe Hill,VA,24433,0,36


In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 9 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   trans_date_trans_time  1296675 non-null  datetime64[ns]
 1   category               1296675 non-null  object        
 2   amt                    1296675 non-null  int64         
 3   gender                 1296675 non-null  object        
 4   city                   1296675 non-null  object        
 5   state                  1296675 non-null  object        
 6   zip                    1296675 non-null  int64         
 7   is_fraud               1296675 non-null  int64         
 8   age                    1296675 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 89.0+ MB


In [19]:
#Label Encoding

featured = [ 'trans_date_trans_time','category','gender','city','state']

for feature in featured:
  train[f"{feature}_cat"] = train[feature].astype('category')
  train[f"{feature}_cat"] = train[f"{feature}_cat"].cat.codes

In [20]:
# Drop the reductant features since Label encoding have been done
train.drop(['trans_date_trans_time','category','gender','city','state'], axis=1, inplace=True)

In [21]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [22]:
#for all independent features
x=train.drop(['is_fraud'], axis=1)

In [23]:
#for dependent feature
y=train['is_fraud']

In [24]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(x,y, test_size= 0.3, random_state= 1)

In [25]:
train_x.shape

(907672, 8)

In [26]:
test_x.shape

(389003, 8)

In [27]:
#create the model
from sklearn.ensemble import RandomForestClassifier

rf1=RandomForestClassifier(n_estimators= 100)

In [28]:
rf1.fit(train_x,train_y)

RandomForestClassifier()

In [29]:
prediction_rf=rf1.predict(test_x)

In [30]:
rf_score=rf1.score(test_x,test_y)*100

In [31]:
rf_score

99.78303509227435

In [32]:
# Importing class and libraries

from sklearn.svm import SVC
model = SVC()

In [33]:
# Training the Model

model.fit(train_x,train_y)

SVC()

In [34]:
# Accuracy of the model in training

model.score(train_x,train_y)

0.9941399536396408

In [35]:
# Accuracy of the model in Testing

model.score(train_x,train_y)

0.9941399536396408

In [36]:
# Importing Libraries and Classes

from sklearn import metrics

In [37]:
# Y contains all the outputs and X contains all the inputs. We will test on the machine if it gives to expected Output for the 
# corresponding Inputs.

expected = test_y
predicted = model.predict(test_x)

In [38]:
# Obtaining Report

print(metrics.classification_report(expected, predicted))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00    386816
           1       0.00      0.00      0.00      2187

    accuracy                           0.99    389003
   macro avg       0.50      0.50      0.50    389003
weighted avg       0.99      0.99      0.99    389003



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Output in the form of count

print(metrics.confusion_matrix(expected, predicted))

[[386816      0]
 [  2187      0]]
