<a href="https://colab.research.google.com/github/Ava100rav/Bank-fraud_prediction/blob/main/Fraud_prediction_o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bank Transaction Fraud Detection

In [None]:
# Import library
import numpy as np   
import pandas as pd    
import seaborn as sns

import matplotlib.pyplot as plt   
import matplotlib.style

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df=pd.read_csv('/content/drive/MyDrive/bank_fraud/Fraud.csv')

In [None]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [None]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [None]:
df.shape

(6362620, 11)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


# Domain Analysis

In [None]:
# displays all the columns
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

# Data Preprocessing

1. Data cleaning including missing values

In [None]:
df.isnull().sum()


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [None]:
df['step'].nunique()

743

In [None]:
df['type'].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [None]:
df['type']=df['type'].map({'CASH_OUT':0,'PAYMENT':1,'CASH_IN':2,'TRANSFER':3,'DEBIT':4})

In [None]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,3,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,0,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [None]:
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [None]:
df['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [None]:
df['newbalanceDest'].nunique()

3555499

In [None]:
df['nameOrig'].nunique()

6353307

In [None]:
# correlation between step Col and output Col2
df['step'].corr(df['isFraud'])

0.031577568632692204

In [None]:
df.drop(['nameOrig','nameDest'],axis=1,inplace=True)

In [None]:
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,1,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,3,181.0,181.0,0.0,0.0,0.0,1,0
3,1,0,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,1,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [None]:
df.shape

(6362620, 9)

In [None]:
# pair-wise correlation between columns
print(df.corr())

                    step      type    amount  oldbalanceOrg  newbalanceOrig  \
step            1.000000  0.012627  0.022373      -0.010058       -0.010299   
type            0.012627  1.000000  0.198987       0.260418        0.270669   
amount          0.022373  0.198987  1.000000      -0.002762       -0.007861   
oldbalanceOrg  -0.010058  0.260418 -0.002762       1.000000        0.998803   
newbalanceOrig -0.010299  0.270669 -0.007861       0.998803        1.000000   
oldbalanceDest  0.027665  0.066255  0.294137       0.066243        0.067812   
newbalanceDest  0.025888  0.079111  0.459304       0.042029        0.041837   
isFraud         0.031578  0.016171  0.076688       0.010154       -0.008148   
isFlaggedFraud  0.003277  0.003144  0.012295       0.003835        0.003776   

                oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
step                  0.027665        0.025888  0.031578        0.003277  
type                  0.066255        0.079111  0.016171   

In [None]:
df.shape

(6362620, 9)

In [None]:
df['amount'].nunique()

5316900

# 3. How did you select variables to be included in the model?


checking correlation btw target variable with remaining variable

# There are too much of outliers so we cannot impute them.. we have to take outliers as  our original data

In [None]:
from sklearn.model_selection import train_test_split
X=df.drop('isFraud',axis=1)
y=df['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)


LogisticRegression(random_state=0)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix : \n", cm)


Confusion Matrix : 
 [[2094907    2038]
 [   1565    1155]]


In [None]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096945
           1       0.36      0.42      0.39      2720

    accuracy                           1.00   2099665
   macro avg       0.68      0.71      0.69   2099665
weighted avg       1.00      1.00      1.00   2099665



In [None]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred))


Accuracy :  0.9982840119733386


# Describe your fraud detection model in elaboration. 

From this dataset i have drop 2 name column as they are categorical variable and even if i convert them into numerical by labelencoder they do't give a better output. so i exluded column ['nameOrig','nameDest']

also converted type column into integer by python.map()

there is not any null values and we can see relation of each column wrt to output variable as well as other column by correlation. by implementing one of best classification algorithm logistic regression, i trained model and find accuracy