In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn import metrics
import seaborn as sns

In [2]:
df = pd.read_csv("../Data/banking_cleaned.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,amount,oldbalanceOrg,oldbalanceDest,isFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0,9839.64,170136.0,0.0,0,0,0,0,1,0
1,1,1864.28,21249.0,0.0,0,0,0,0,1,0
2,2,181.0,181.0,0.0,1,0,0,0,0,1
3,3,181.0,181.0,21182.0,1,0,1,0,0,0
4,4,11668.14,41554.0,0.0,0,0,0,0,1,0


In [4]:
df.shape

(6362620, 10)

In [5]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [6]:
df.shape

(6362620, 9)

#### Splitting Data

In [12]:
y = df["isFraud"]
X = df.drop("isFraud", axis=1)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
len(X_test)

1272524

In [16]:
len(X_train)

5090096

#### Standardizing Data

In [8]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
scaler.fit(X_train) 
scaler.transform(X_train)

array([[ 0.34345292,  1.32880177, -0.17499203, ..., -0.08101666,
        -0.71462102, -0.30238726],
       [ 0.05435533, -0.20740058, -0.0865455 , ..., -0.08101666,
        -0.71462102, -0.30238726],
       [-0.26560052, -0.28741626, -0.32418077, ..., -0.08101666,
         1.3993431 , -0.30238726],
       ...,
       [-0.28120584, -0.26814157, -0.22902672, ..., -0.08101666,
        -0.71462102, -0.30238726],
       [-0.17741639, -0.28164719, -0.24867679, ..., -0.08101666,
        -0.71462102, -0.30238726],
       [-0.14969432, -0.22075864, -0.32418077, ..., -0.08101666,
        -0.71462102, -0.30238726]])

#### Random Forest Classifier 

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from scipy.stats import randint


In [19]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
model.score(X_test, y_test)

0.999598435864471

In [22]:
f1_score(y_test, y_pred)

0.8248200205690779

In [38]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.999598435864471


#### Hyperparameter Tuning
- To fine tune our model, we will randomly search parameters to define the best hyperparameter to improve the accuracy of our model.

In [24]:
param_dist = {'n_estimators': randint(50,500),'max_depth': randint(1,20)}

In [25]:
#randomsearchcv was taking too long with full dataset, took  a sample of the dataset for hyperparameter tuning
X_train_sample = X_train.sample(frac=0.1, random_state=42)
y_train_sample = y_train.sample(frac=0.1, random_state=42)

In [26]:
rand_search = RandomizedSearchCV(model,param_distributions = param_dist, n_iter=5, cv=5)

In [27]:
rand_search.fit(X_train_sample, y_train_sample)

In [30]:
print('Best Hyperparameters:',  rand_search.best_params_)
print('Best Estimator:', rand_search.best_estimator_)
print("Best Score:", rand_search.best_score_)

Best Hyperparameters: {'max_depth': 14, 'n_estimators': 102}
Best Estimator: RandomForestClassifier(max_depth=14, n_estimators=102)
Best Score: 0.9993556118740299


#### Tuning Random Classifier Model

In [31]:
new_model = RandomForestClassifier(n_estimators=102, max_depth=14)
new_model.fit(X_train, y_train)

In [36]:
y_pred_n = new_model.predict(X_test)

In [37]:
f1_score(y_test, y_pred_n)

0.7555391432791729

In [34]:
new_model.score(X_test, y_test)

0.9994797740553419