In [2]:
#Package Imports
import pandas as pd
#Reading in the dataset
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
#Viewing the first 5 rows of the dataset
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [3]:
#Dropping the redundant features
df = df.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis = 1)

In [5]:
# Storing the fraudulent data into a dataframe
df_fraud = df[df['isFraud'] == 1]
#Storing the non-fraudulent data into a dataframe
df_nofraud = df[df['isFraud'] == 0]
#Storing 12,000 rows of non-fraudulent data
df_nofraud = df_nofraud.head(12000)
#Joining both datasets together
df = pd.concat([df_fraud, df_nofraud], axis = 0)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1.0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1.0
251,1,TRANSFER,2806.0,2806.0,0.0,0.0,0.0,1.0
252,1,CASH_OUT,2806.0,2806.0,0.0,26202.0,0.0,1.0
680,1,TRANSFER,20128.0,20128.0,0.0,0.0,0.0,1.0


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Converting the type column to categorical
df['type'] = df['type'].astype('category')
#Integer Encoding the 'type' column
type_encode = LabelEncoder()
#Integer encoding the 'type' column
df['type'] = type_encode.fit_transform(df.type)

In [7]:
#One hot encoding the 'type' column
type_one_hot = OneHotEncoder()
type_one_hot_encode = type_one_hot.fit_transform(df.type.values.reshape(-1,1)).toarray()

#Adding the one hot encoded variables to the dataset
ohe_variable = pd.DataFrame(type_one_hot_encode, columns =
["type_"+str(int(i)) for i in range(type_one_hot_encode.shape[1])])

df = pd.concat([df, ohe_variable], axis=1)
#Dropping the original type variable
df = df.drop('type', axis = 1)
#Viewing the new dataframe after one-hot-encoding
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_0,type_1,type_2,type_3,type_4
0,1.0,9839.64,170136.0,160296.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1864.28,21249.0,19384.72,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,181.0,181.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,181.0,181.0,0.0,21182.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,11668.14,41554.0,29885.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
#Checking every column for missing values
df.isnull().any()

step              True
amount            True
oldbalanceOrg     True
newbalanceOrig    True
oldbalanceDest    True
newbalanceDest    True
isFraud           True
type_0            True
type_1            True
type_2            True
type_3            True
type_4            True
dtype: bool

In [9]:
#Imputing the missing values with a 0
df = df.fillna(0)

In [10]:
df.to_csv('fraud_prediction.csv')

## Implementing the k-NN algorithm using scikit-learn

In [11]:
#Creating the features
features = df.drop('isFraud', axis = 1).values
target = df['isFraud'].values

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target,
test_size = 0.3, random_state = 42, stratify = target)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

#Initializing the kNN classifier with 3 neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=3)

#Fitting the classifier on the training data
knn_classifier.fit(X_train, y_train)

#Extracting the accuracy score from the test sets
knn_classifier.score(X_test, y_test)

0.9898879475266467

In [15]:
import numpy as np
from sklearn.model_selection import GridSearchCV

#Initializing a grid with possible number of neighbors from 1 to 24
grid = {'n_neighbors' : np.arange(1, 25)}

#Initializing a k-NN classifier
knn_classifier = KNeighborsClassifier()

#Using cross validation to find optimal number of neighbors
knn = GridSearchCV(knn_classifier, grid, cv = 10)
knn.fit(X_train, y_train)

#Extracting the optimal number of neighbors
knn.best_params_
#Extracting the accuracy score for optimal number of neighbors
knn.best_score_

0.992033215184869

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#Setting up the scaling pipeline
pipeline_order = [('scaler', StandardScaler()), ('knn',
KNeighborsClassifier(n_neighbors = 1))]
pipeline = Pipeline(pipeline_order)

#Fitting the classfier to the scaled dataset
knn_classifier_scaled = pipeline.fit(X_train, y_train)

#Extracting the score
knn_classifier_scaled.score(X_test, y_test)

0.992347635966111

In [38]:
df.columns

Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'type_0', 'type_1', 'type_2', 'type_3',
       'type_4'],
      dtype='object')

In [None]:
%matplotlib inline
import matplotlib

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
import matplotlib.cm as cm


# import some data to play with
# take the first two features
x = df[df.columns[0:3]]
y = df[df.columns[4:7]]
h = .02  # step size in the mesh

# Calculate min, max and limits
#x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
#y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
#xx, yy = np.meshgrid(np.arange(x, h), np.arange(y, h))

# Put the result into a color plot
plt.figure()
colors = cm.rainbow(np.linspace(0, 1, len(y)))
for c in colors:
    plt.scatter(x, y, color=c)
#plt.xlim(xx.min(), xx.max())
#plt.ylim(yy.min(), yy.max())
plt.title("Data points")
plt.show()