In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries;
# the download loop splits on ',' and ':' and decodes the URL with unquote().
# NOTE(review): the signed URL embeds X-Goog-Date=20240823T080401Z and
# X-Goog-Expires=259200, so it has presumably expired by now; re-export the
# notebook from Kaggle to obtain a fresh link if the download fails.
DATA_SOURCE_MAPPING = 'creditcardfraud:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F310%2F23498%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240823%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240823T080401Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D364f8e0fadce4201df2301edbff1948c1224961c76cfe15c517f052e13f41c209d7ed92a34024edbcd9ad301df0c734818f286020cda3a54a6c43cb4f2a86899f88e297b410f3f8141b542063f1becf7d735176044ec4ceb7a842da8c2d27a20ae63c8842ea605292ad96f6cdd27c664f6a9c785439b64a6f7544f465039ed346d2855658bce3a35af568ccefb18131bf52fc3ba4434812a2c57847b6733f07ae4a0f574c51f5e04d565da548a91cba93eb2c4a0b256ec5c9f31edfe7409c09b27a65f78991fee36649cdcda89db41749c11da8326358f71753f5e36dcf92639df74116b9be03bcb69febec0e8afc3aafc981ab6f6671ece11b17b84af4a89be'

# Directories created by the setup code and targeted by the '../input' and
# '../working' symlinks.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded-url>' entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped; the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is still percent-encoded here.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; the bar then stays empty and only the running
            # byte count is shown.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Flush buffered bytes and rewind so that both the file-object
            # reader (ZipFile) and the by-name reader (tarfile.open) see the
            # complete archive.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would rebind the module and break the next non-zip entry.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


I've referred to the notebook by Victoria Mendoza, [https://www.kaggle.com/mendozav/credit-card-fraud-detection-project](https://www.kaggle.com/mendozav/credit-card-fraud-detection-project), for this project.
 A big thanks to her for creating such a wonderful notebook!

# 1.1 Introduction

I have recently started using Kaggle after completing the course "Python for Machine Learning and Data Science Bootcamp" on Udemy. Right now, I'm trying to do as many projects as I can before starting my Masters in Data Science for Fall 2021.

I frequently refer to other people's submissions while trying to build my own code. I try to give as much credit as possible to the authors of those notebooks; however, if I've forgotten to give credit to someone, please accept my apologies in advance.

So, without any further ado, let's get started!

# 1.2 About the Dataset

* The dataset contains transactions made by credit cards in September 2013 by European cardholders.

* This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions.

* The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

* It contains only numerical input variables which are the result of a PCA transformation.

* Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data.

* Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.

* Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset.

* The feature 'Amount' is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning.

* Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.


# 1.3 Objective

* Only a very small percentage of the transactions in the dataset are fraudulent. We need to identify the transactions that belong to the Fraud class.

* Based on the data, we have to generate a set of insights and recommendations that will help the credit card company prevent its customers from being charged falsely!



In [None]:
#Credit Card Fraud Detection Project
#Date April 21, 2021

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sys
import scipy
import sklearn

import warnings
warnings.filterwarnings('ignore') # Install a global 'ignore' filter to suppress warnings
sns.set(style="whitegrid") # Use the "whitegrid" background style for the graphs


In [None]:
# Load the credit card transactions CSV from the Kaggle input directory
data = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
# Preview the first five rows
data.head(5)

So, due to data confidentiality, we do not know what V1, V2, ..., V28 actually mean, but we do know they're going to help us draw insights from the data.

In [None]:
# List the column labels of the dataframe
print(data.columns)

In [None]:
# Print a summary of the dataframe (columns, dtypes, non-null counts)
data.info()

The dataset does not contain any object data type, so we do not have to spend any time on conversion. Let's see if our data contains any null values!

In [None]:
# Count the missing values in each column
data.isnull().sum()

In [None]:
# Descriptive statistics for the columns of the dataframe
data.describe()

In [None]:
# (rows, columns) of the dataframe as loaded, before any sampling
data.shape

In [None]:
# Replace `data` with a random 20% sample of its rows (presumably to keep the
# later steps fast).
# random_state fixes the seed so the same rows are drawn on every run; this makes
# the results reproducible and the actual number does not matter.
# frac is the fraction of the rows that will be returned.
data = data.sample(frac = 0.2, random_state = 1)
print(data.shape)

Now let us conduct some exploratory data analysis on our data!

In [None]:
# Visualize how many transactions fall into each class (0 = valid, 1 = fraud).
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.countplot(x='Class', data=data)

Wow! The count of fraudulent transactions compared to the non-fraudulent ones is almost nil. This makes it very difficult for us to classify the test data.

Remember, as a rule of thumb, the target variable should be somewhat evenly divided between the two classes!

Anyway, let's see how well we are able to perform!

In [None]:
# Draw pairwise relationship plots for the columns of `data`.
# NOTE(review): every column is passed in, so this presumably produces a very
# large grid and is slow; consider restricting the columns via `vars=`.
sns.pairplot(data)

In [None]:
# Fraud vs. non-fraud counts of the full dataset (492 frauds, 284315 valid),
# expressed as a percentage with three decimals.
fraud_to_nonfraud_pct = 492 / 284315 * 100
print(f"Fraud to NonFraud Ratio of {fraud_to_nonfraud_pct:.3f}%")

In [None]:
# Compare the distribution of transaction amounts for the two classes.
# Class 0 is the valid (non-fraud) class and Class 1 is fraud, so the curves
# are labelled accordingly.
sns.kdeplot(data.Amount[data.Class == 0], label = 'NonFraud', shade=True)
sns.kdeplot(data.Amount[data.Class == 1], label = 'Fraud', shade=True)
plt.xlabel('Amount')
# Draw the legend explicitly so the labels are actually shown on the plot.
plt.legend();

Looks like there are a lot more instances of small fraud amounts than really large ones.

In [None]:
# Compare the distribution of transaction times for the two classes.
# Class 0 is the valid (non-fraud) class and Class 1 is fraud, so the curves
# are labelled accordingly.
sns.kdeplot(data.Time[data.Class == 0], label = 'NonFraud', shade=True)
sns.kdeplot(data.Time[data.Class == 1], label = 'Fraud', shade=True)
plt.xlabel('Time')
# Draw the legend explicitly so the labels are actually shown on the plot.
plt.legend()

We notice that the feature Time doesn't seem to have an impact on the frequency of frauds.



In [None]:
# Pairwise correlations between the columns of the dataframe
correlation_matrix = data.corr()

# Render the correlation matrix as a heatmap on a 15x15 figure
plt.figure(figsize=(15, 15))
sns.heatmap(correlation_matrix)
plt.title('Heatmap correlation')
plt.show()

As we can notice, most of the features are not correlated with each other.

What can generally be done on a massive dataset is dimensionality reduction. By picking the most important dimensions, there is a possibility of explaining most of the problem, thus saving a considerable amount of time while preventing the accuracy from dropping too much.


In [None]:
# name of the column we will be predicting
target = 'Class'

# feature columns: every column of the dataframe except the target
columns = [name for name in data.columns.tolist() if name != target]

# X is the feature matrix; Y is the one-dimensional series of class labels
X = data[columns]
Y = data[target]

# print the shapes of X and Y, one per line
print(X.shape, Y.shape, sep='\n')

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# determine the number of fraud cases
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

outlier_fraction = len(fraud) / float(len(valid))
print(outlier_fraction)

print('Fraud Cases: {}'.format(len(fraud)))
print('Valid Cases: {}'.format(len(valid)))

In [None]:
# seed passed to the estimator that accepts one
state = 1

# contamination is the share of outliers we expect in the data
isolation_forest = IsolationForest(
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
)

# n_neighbors is the number of neighbors to consider; the higher the
# percentage of outliers, the higher you want to make this number
local_outlier_factor = LocalOutlierFactor(
    n_neighbors=20,
    contamination=outlier_fraction,
)

# the outlier detection methods, keyed by display name
classifiers = {
    'Isolation Forest': isolation_forest,
    'Local Outlier Factor': local_outlier_factor,
}

In [None]:
n_outliers = len(fraud)

for clf_name, clf in classifiers.items():

    # fit the detector on X and tag every row (1 or -1)
    if clf_name != 'Local Outlier Factor':
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
    else:
        # Local Outlier Factor fits and predicts in a single call
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_

    # map the detector output onto the dataset labels:
    # 1 becomes 0 (valid) and -1 becomes 1 (fraud)
    y_pred = np.where(y_pred == -1, 1, 0)

    # number of rows whose predicted label differs from the true one
    n_errors = (y_pred != Y).sum()

    # error count, accuracy and the per-class report
    print(f'{clf_name}: {n_errors}')
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))

Looking at precision for fraudulent cases (1) lets us know the percentage of cases flagged as fraud that are correctly labeled. 'Precision' accounts for false positives; 'recall' accounts for false negatives. Low precision could mean that we are constantly calling clients to ask whether they actually made the transaction, which could be annoying.

Goal: To get better percentages.

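Our Isolation Forest method (a tree-ensemble method, like Random Forest) was able to produce a better result. Its f1-score for the fraud class is 26% (or approx. 30%). Strictly speaking, the share of fraudulent transactions we actually detect is given by recall, which the f1-score combines with precision.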