### Project setup

In [76]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tt_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics

### Data Exploration

In [3]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


Then, check for datatypes of the table columns, the purpose of doing this is to check the presence of numerical data represented as string data or to check for categorical variables

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [10]:
df.isFraud.value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

The code from above shows that there are 3 *object* type variables: `nameDest`, `nameOrig` and `type`. Note that two of them are just strinfs with the codes of the banks, however the `type` is a categorical variables that tells us more infornation about the characteristic of the operation. Therefore let's analyze the values of such variable.

In [5]:
df.type.unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

Now that I have the possible value of the variable `type`, I will encode the varibale. A new column called `isPayment` will be created, it will contain 1 if the value is a payment and 0 otherwise.

In [25]:
df['isPayment'] = [ 1 if p in ['PAYMENT', 'DEBIT'] else 0 for p in df.type ]
df['isMovement'] = [ 1 if p in ['CASH_OUT', 'TRANSFER'] else 0 for p in df.type ]
df['accountDiff'] = np.abs(df.oldbalanceOrg - df.oldbalanceDest)

In [27]:
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isPayment,isMovement,accountDiff
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1,0,170136.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1,0,21249.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,1,181.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,21001.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,1,0,41554.0


### Preparing the logistic regression

In [54]:
features = df[['amount', 'isPayment', 'isMovement', 'accountDiff']]
label = df.isFraud

X_train, X_test, y_train, y_test = tt_split(features, label, test_size = 0.3)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [57]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [59]:
model.score(X_train, y_train)

0.9987022417090534

In [83]:
model.score(X_test, y_test)

0.9986703590659194

In [80]:
predictions = model.predict(X_test)

In [87]:
metrics.recall_score(y_test, predictions)

0.0

In [82]:
print(f'{metrics.accuracy_score(y_test, predictions)}')

0.9986703590659194

In [61]:
model.coef_

array([[ 0.21502907, -0.91304625,  3.6516335 , -0.64345015]])

In [88]:
metrics.confusion_matrix(y_test, predictions)

array([[1906248,      28],
       [   2510,       0]])

In [98]:
sum(y_test == 1)

2510

In [91]:
len(df[df.isFraud == 0])

6354407

In [94]:
df[df.isFraud == 1]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isPayment,isMovement,accountDiff
2,1,TRANSFER,181.00,C1305486145,181.00,0.0,C553264065,0.00,0.00,1,0,0,1,181.00
3,1,CASH_OUT,181.00,C840083671,181.00,0.0,C38997010,21182.00,0.00,1,0,0,1,21001.00
251,1,TRANSFER,2806.00,C1420196421,2806.00,0.0,C972765878,0.00,0.00,1,0,0,1,2806.00
252,1,CASH_OUT,2806.00,C2101527076,2806.00,0.0,C1007251739,26202.00,0.00,1,0,0,1,23396.00
680,1,TRANSFER,20128.00,C137533655,20128.00,0.0,C1848415041,0.00,0.00,1,0,0,1,20128.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.00,339682.13,1,0,0,1,339682.13
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.00,0.00,1,0,0,1,6311409.28
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0,0,1,6242920.44
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.00,0.00,1,0,0,1,850002.52
