# dataset is from https://www.kaggle.com/mlg-ulb/creditcardfraud

In [1]:
# Import necessary packages 

import pandas as pd 
import numpy as np


In [2]:
# Import the dataset as a dataframe

credit_trans = pd.read_csv('creditcard.csv')
credit_trans.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Drop the unuseful time column

credit_trans.drop('Time', axis=1, inplace=True)
credit_trans.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Check the number of transcations in the two classes

total_credit_trans = len(credit_trans)
non_fraud_trans = len(credit_trans[credit_trans['Class'] == 0])
fraud_trans = len(credit_trans[credit_trans['Class'] == 1])
percentage_non_fraud = round(non_fraud_trans / total_credit_trans * 100, 2)
percentage_fraud = round(fraud_trans / total_credit_trans * 100, 2)

print('the total numebr of credit card transactions is {}'.format(total_credit_trans))
print('the number of non-fraudulent transactions is {}'.format(non_fraud_trans))
print('the number of fraudulent transactions is {}'.format(fraud_trans))
print('the percentage of non-fraudulent transactions out of all transcations is {}'.format(percentage_non_fraud))
print('the percentage of fraudulent transactions is {}'.format(percentage_fraud))

the total numebr of credit card transactions is 284807
the number of non-fraudulent transactions is 284315
the number of fraudulent transactions is 492
the percentage of non-fraudulent transactions out of all transcations is 99.83
the percentage of fraudulent transactions is 0.17


In [5]:
# Splitting the dataset 

from sklearn.model_selection import train_test_split

X = credit_trans.drop('Class', axis = 1)
y = credit_trans['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [6]:
# Try to balance this extremely unbalanced dataset before modeling 

# Method: oversample the fraudulent cases in the train sets, keeping the test sets unchanged

from sklearn.utils import resample 

train_set = pd.concat([X_train, y_train], axis = 1)

train_non_fraud = train_set[train_set['Class'] == 0]
train_fraud = train_set[train_set['Class'] == 1]


In [7]:
# Now the train set is balanced after applying oversampling method on the original train set

oversampled_fraud = resample(train_fraud, replace = True, n_samples = len(train_non_fraud), random_state = 1)

oversampled_train_set = pd.concat([train_non_fraud, oversampled_fraud])

oversampled_train_set['Class'].value_counts()

1    227440
0    227440
Name: Class, dtype: int64

In [8]:
# reassign the training set after oversampling 

X_train = oversampled_train_set.drop('Class', axis = 1)
y_train = oversampled_train_set['Class']

In [9]:
# Modeling for binary classification: 

# since this is a binary classification task, and our objective is to choose the model that has the best performance against our performance metric in classifying fraudulent transactions in unseen dataset. 

# The following models are suitable in this case and I will compare them by different evaluation standards:  


# 1. logistic regression
# 2. knn
# 3. svm
# 4. decision tree
# 5. random forest
# 6. bagging
# 7. boosting 

In [10]:
# logistic regression model

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

print("train set score: " + str(lr.score(X_train, y_train)))
print("test set score: " + str(lr.score(X_test, y_test)))

train set score: 0.9541967112205417
test set score: 0.9807942136863171


In [11]:
# knn model

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier (n_neighbors = 10)
knn.fit(X_train, y_train)

print("train set score: " + str(knn.score(X_train, y_train)))
print("test set score: " + str(knn.score(X_test, y_test)))

train set score: 0.9996152831516004
test set score: 0.9986657771847899


In [15]:
# support vector machine model 

from sklearn.svm import LinearSVC

svm = LinearSVC(C=0.0001)
svm.fit(X_train, y_train)

print("train set score: " + str(svm.score(X_train, y_train)))
print("test set score: " + str(svm.score(X_test, y_test)))

train set score: 0.9484215617305664
test set score: 0.9834451037533795




In [13]:
# decision tree 

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

print("train set score: " + str(clf.score(X_train, y_train)))
print("test set score: " + str(clf.score(X_test, y_test)))

train set score: 1.0
test set score: 0.9992275552122467


In [17]:
# random forest

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 29, max_depth = 10)
rfc.fit(X_train, y_train)

print("train set score: " + str(rfc.score(X_train, y_train)))
print("test set score: " + str(rfc.score(X_test, y_test)))

train set score: 0.9950536405205769
test set score: 0.9993328885923949


In [18]:
# bagging

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bc = BaggingClassifier(DecisionTreeClassifier(), max_samples = 0.7, max_features = 1.0, n_estimators = 29)
bc.fit(X_train, y_train)

print("train set score: " + str(bc.score(X_train, y_train)))
print("test set score: " + str(bc.score(X_test, y_test)))

train set score: 0.9999692226521281
test set score: 0.9991573329588147


In [21]:
# boosting 

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=12, max_depth=8), n_estimators=29, learning_rate = 0.6)
adb.fit(X_train, y_train)

print("train set score: " + str(adb.score(X_train, y_train)))
print("test set score: " + str(adb.score(X_test, y_test)))

train set score: 1.0
test set score: 0.9995084442259752


## Boosting model generates the best accuracy score in predicting credit card fraud