In [68]:
#importing the dependencies

from zipfile import ZipFile

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [69]:
#extracting the csv file from the zip archive

# kept under its own name so that `dataset` only ever refers to the DataFrame
zip_path = '/content/onlinefraud.csv.zip'

# `archive` instead of `zip`, so the built-in zip() is not shadowed for the rest of the notebook
with ZipFile(zip_path, 'r') as archive:
  # extract into /content explicitly: the csv is read back from that directory
  # below, so this must not depend on the current working directory
  archive.extractall('/content')
  print('The dataset is extracted')

The dataset is extracted


In [70]:
#reading the extracted csv file into a pandas dataframe
csv_path = '/content/onlinefraud.csv'
dataset = pd.read_csv(csv_path)

In [71]:
# first rows of the raw data, as a quick look at the columns and value formats
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [72]:
# last rows of the raw data
dataset.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [73]:
# (number of rows, number of columns) of the loaded dataframe
dataset.shape

(6362620, 11)

In [74]:
# column names, dtypes and memory usage of the dataframe
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [75]:
# number of missing values in each column
dataset.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [76]:
# number of rows per value of the target column isFraud (shows how imbalanced the classes are)
dataset['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,6354407
1,8213


In [77]:
#converting type column into labels
# Fit a LabelEncoder on the 'type' column so that each distinct transaction
# type gets an integer code; the column itself is only replaced in the next cell.
# NOTE(review): the integer codes impose an ordering on a categorical feature
# that is later fed to a linear model; one-hot encoding may be more suitable.

label_encoder = LabelEncoder()
label_encoder.fit(dataset['type'])

In [78]:
# Replace the 'type' strings with the integer codes of the fitted encoder.
# The cell overwrites its own input, so the encoding is applied only while the
# column is still non-numeric; re-running the cell then leaves the
# already-encoded column untouched instead of passing integer codes back into
# an encoder that was fitted on the original strings.
if not pd.api.types.is_numeric_dtype(dataset['type']):
  dataset['type'] = label_encoder.transform(dataset['type'])

In [79]:
# check that the 'type' column now holds integer codes instead of strings
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,3,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,3,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [80]:
# number of rows per encoded transaction type
dataset['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
1,2237500
3,2151495
0,1399284
4,532909
2,41432


In [81]:
#splitting the rows by target value so both classes can be analysed separately

# isFraud == 0 -> legal transactions, isFraud == 1 -> fraudulent transactions
legal = dataset.loc[dataset['isFraud'] == 0]
fraud = dataset.loc[dataset['isFraud'] == 1]

In [82]:
# (rows, columns) of the legal subset, then of the fraud subset, one per line
print(legal.shape, fraud.shape, sep='\n')

(6354407, 11)
(8213, 11)


In [83]:
# summary statistics of the numeric columns for the legal (isFraud == 0) rows
legal.describe()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0,6354407.0
mean,243.2357,1.713139,178197.0,832828.7,855970.2,1101421.0,1224926.0,0.0,0.0
std,142.1402,1.349619,596237.0,2887144.0,2924987.0,3399202.0,3673816.0,0.0,0.0
min,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,1.0,13368.4,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,1.0,74684.72,14069.0,0.0,133311.8,214881.7,0.0,0.0
75%,334.0,3.0,208364.8,106969.5,144730.7,944144.6,1111975.0,0.0,0.0
max,718.0,4.0,92445520.0,43818860.0,43686620.0,356015900.0,356179300.0,0.0,0.0


In [84]:
# summary statistics of the numeric columns for the fraud (isFraud == 1) rows
fraud.describe()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0,8213.0
mean,368.413856,2.49653,1467967.0,1649668.0,192392.6,544249.6,1279708.0,1.0,0.001948
std,216.38869,1.500087,2404253.0,3547719.0,1965666.0,3336421.0,3908817.0,0.0,0.044097
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,181.0,1.0,127091.3,125822.4,0.0,0.0,0.0,1.0,0.0
50%,367.0,1.0,441423.4,438983.5,0.0,0.0,4676.42,1.0,0.0
75%,558.0,4.0,1517771.0,1517771.0,0.0,147828.7,1058725.0,1.0,0.0
max,743.0,4.0,10000000.0,59585040.0,49585040.0,236230500.0,236726500.0,1.0,1.0


Building a sample dataset with an equal number of legal and fraud transactions by undersampling the legal (majority) class, so that the heavy class imbalance does not bias the model

In [85]:
# Undersample the majority class: draw exactly as many legal rows as there are
# fraud rows (derived from `fraud` instead of a hard-coded count).
# random_state pins the draw, so re-running the notebook reproduces the same
# sample and therefore the same downstream split and scores.
legal_sample = legal.sample(n=len(fraud), random_state=2)

In [86]:
#stacking the sampled legal rows on top of the fraud rows (row-wise concatenation)
new_dataset = pd.concat(objs=[legal_sample, fraud])

In [87]:
# first rows of the combined frame; legal_sample was concatenated first, so these are legal rows
new_dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
568957,23,0,360386.65,C636704368,665917.79,1026304.43,C1807858012,511234.31,150847.66,0,0
2500045,204,3,450.3,C48784316,0.0,0.0,M1079474574,0.0,0.0,0,0
1722564,160,1,179637.75,C1427055894,0.0,0.0,C342231390,192813.77,372451.53,0,0
2789473,215,0,379026.64,C365007393,2617851.42,2996878.06,C635281078,8746741.46,8367714.82,0,0
4350989,309,0,368433.64,C1791320921,5131139.16,5499572.8,C2093967426,5621680.52,5253246.88,0,0


In [88]:
# last rows of the combined frame; fraud was concatenated last, so these are fraud rows
new_dataset.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,1,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,4,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,1,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,4,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,1,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [89]:
#splitting the data into features and target and removing unnecessary columns for prediction
# besides the target, the two name columns (nameOrig, nameDest) and
# isFlaggedFraud are left out of the feature matrix
dropped_columns = ['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud']
X = new_dataset.drop(columns=dropped_columns)
Y = new_dataset['isFraud']

In [90]:
# inspect the feature matrix (plain-text rendering)
print(X)

         step  type      amount  oldbalanceOrg  newbalanceOrig  \
568957     23     0   360386.65      665917.79      1026304.43   
2500045   204     3      450.30           0.00            0.00   
1722564   160     1   179637.75           0.00            0.00   
2789473   215     0   379026.64     2617851.42      2996878.06   
4350989   309     0   368433.64     5131139.16      5499572.80   
...       ...   ...         ...            ...             ...   
6362615   743     1   339682.13      339682.13            0.00   
6362616   743     4  6311409.28     6311409.28            0.00   
6362617   743     1  6311409.28     6311409.28            0.00   
6362618   743     4   850002.52      850002.52            0.00   
6362619   743     1   850002.52      850002.52            0.00   

         oldbalanceDest  newbalanceDest  
568957        511234.31       150847.66  
2500045            0.00            0.00  
1722564       192813.77       372451.53  
2789473      8746741.46      8367714.82

In [91]:
# inspect the target vector (plain-text rendering)
print(Y)

568957     0
2500045    0
1722564    0
2789473    0
4350989    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


In [92]:
#holding out 20% of the rows as test data
# stratify=Y keeps the class proportions of Y in both parts;
# random_state fixes the shuffle so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [93]:
# shapes of the four parts, in the order X_train, X_test, Y_train, Y_test
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((13140, 7), (3286, 7), (13140,), (3286,))

Model Training

In [94]:
# logistic-regression classifier; max_iter allows the solver up to 1000 iterations
classifier = LogisticRegression(max_iter = 1000)

In [95]:
# fit on the training split only; the test split is kept aside for evaluation
# NOTE(review): the features are passed unscaled although their magnitudes
# differ widely (see the describe() outputs) - consider standardizing them
classifier.fit(X_train,Y_train)

Model Evaluation

In [96]:
# Predict on the rows the model was trained on and score against their labels.
# accuracy_score takes (y_true, y_pred): the true labels go first, matching
# the argument order used for confusion_matrix further down.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [97]:
# fraction of training rows whose predicted label equals the true label
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9225266362252663


In [98]:
# Predict on the held-out test rows and score against their labels.
# accuracy_score takes (y_true, y_pred): the true labels go first, matching
# the argument order used for confusion_matrix further down.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [99]:
# fraction of held-out test rows whose predicted label equals the true label
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9257455873402313


Test-set rows that our model predicted wrongly

In [100]:
# Put each true test label next to the model's prediction; the frame takes its
# row index from Y_test, so every row keeps its original row label.
results = pd.DataFrame({'Actual': Y_test, 'Predicted': X_test_prediction})

# keep only the rows where the prediction differs from the true label
mismatch = results['Actual'] != results['Predicted']
misclassified = results.loc[mismatch]
print(misclassified)

         Actual  Predicted
4667522       1          0
5558          1          0
6039712       1          0
3960324       1          0
5880708       0          1
...         ...        ...
6115985       1          0
4388740       1          0
3774434       0          1
1811492       0          1
572855        1          0

[244 rows x 2 columns]


In [102]:
# confusion matrix on the test split: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=Y_test, y_pred=X_test_prediction)
print(cm)

[[1578   65]
 [ 179 1464]]
