In [None]:
import numpy as np
import pandas as pd
import seaborn as snb
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier , BaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report , ConfusionMatrixDisplay , \
 confusion_matrix , recall_score , f1_score , precision_score , roc_auc_score , accuracy_score
from imblearn.under_sampling import TomekLinks , OneSidedSelection
snb.set_style('darkgrid')

In [None]:
# Read only the first 2.5 million rows of the CSV, then preview 10 random rows.
df = pd.read_csv('/content/Fraud.csv', nrows=2_500_000)
df.sample(n=10)

# Analysis of the data

## Checking for duplicate records and null values

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
# Drop exact duplicate rows. drop_duplicates() returns a NEW frame, so the
# result has to be assigned back -- a bare call leaves `df` untouched.
df = df.drop_duplicates()

# Report the number of missing values per column, then drop every row that
# contains at least one. Assignment is used instead of inplace=True so the
# cell reads as an explicit transformation of `df`.
print(df.isnull().sum())
df = df.dropna()

print('Shape of the dataframe is {}'.format(df.shape))

## Summary of dataset

In [None]:
# Copy of `df` with the two binary flag columns cast to object so that
# describe() treats them as categorical rather than numeric. Casting by column
# name avoids relying on the flags being the last two columns and avoids an
# index-on-index merge over the whole frame.
summary_df = df.astype({'isFraud': 'object', 'isFlaggedFraud': 'object'})

In [None]:
# 'number' selects every numeric dtype regardless of platform word size;
# the 'int' / 'float' aliases resolve to a single concrete width and can miss
# columns stored with a different one.
summary_df.select_dtypes(include='number').describe().round(2)

In [None]:
summary_df.select_dtypes(['object']).describe()

In [None]:
# CORRELATION MATRIX
# Only numeric columns can be correlated; recent pandas versions raise on
# string columns instead of silently dropping them, so select them explicitly.
f, ax = plt.subplots(figsize=(8,4))
snb.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation matrix of numeric features')
plt.show()

In [None]:
snb.countplot(data = df , x = 'type')

The plot shows that most transactions are of type PAYMENT, while the fewest are of type DEBIT.

In [None]:
# Share of rows whose origin opening balance / destination closing balance is zero.
total_rows = df.shape[0]
zero_initial_rows = len(df.query('oldbalanceOrg == 0'))
zero_final_rows = len(df.query('newbalanceDest == 0'))
print('Percentage of transactions where initial balance is 0 = {}%'.format((zero_initial_rows * 100) / total_rows))
print('Percentage of transactions where final balance is 0 = {}%'.format((zero_final_rows * 100) / total_rows))

In [None]:
df[df.amount == 0]

In [None]:
# Count each class once and reuse the result for both the printout and the pie chart.
fraud_counts = df['isFraud'].value_counts()
print('Number of valid and fraud transactions :')
print('Number of valid transactions = {}'.format(fraud_counts[0]))
print('Number of fraud transactions = {}'.format(fraud_counts[1]))
fraud_counts.plot.pie(autopct='%1.2f%%');

Only 0.09% of the transactions are fraudulent. This makes the dataset heavily imbalanced, which makes predictive modelling difficult.

In [None]:
snb.boxplot(data = df , x = 'isFraud' , y = 'amount')

Fraudulent transactions tend to involve larger amounts than valid transactions.

In [None]:
snb.stripplot(data = df , x = 'type' , y = 'amount' , hue = 'isFraud')

The TRANSFER and CASH_OUT transaction types account for the large-amount transactions, while the other transaction types involve only small amounts.

## Analyzing only fraud transactions

In [None]:
fraud_df = df.query('isFraud == 1')

In [None]:
print('Frequency of the modes of fraudulent transactions :')
fraud_df.type.value_counts()

In [None]:
plt.title('Type of transactions that are fraud');
snb.countplot(data = fraud_df , x = 'type')

Fraudulent transactions are carried out only through the TRANSFER and CASH_OUT transaction types.

In [None]:
snb.boxplot(data = fraud_df , x = 'type' , y = 'amount')

Fraudulent TRANSFER transactions involve larger amounts than fraudulent CASH_OUT transactions.

# Preparing dataset for Modelling

In [None]:
# Keep only the CASH_OUT and TRANSFER transaction types for modelling.
cash_out = 'CASH_OUT'
transfer = 'TRANSFER'
# .copy() makes model_df an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
model_df = df.query('type == @cash_out or type == @transfer').copy()
model_df.sample(5)

In [None]:
model_df.isFraud.value_counts()

In [None]:
model_df.columns

In [None]:
# One-hot encode `type`, dropping the first category so the remaining
# indicator fits in a single column.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# name in 1.4; try the new keyword first and fall back for older installs.
try:
  ohe = OneHotEncoder(drop = 'first' , sparse_output = False)
except TypeError:
  ohe = OneHotEncoder(drop = 'first' , sparse = False)

# fit_transform returns an (n_rows, n_categories - 1) array. ravel() flattens
# the single indicator column and makes the assignment fail loudly if `type`
# ever holds more than two categories. assign() builds a new frame instead of
# writing into a possible slice of `df`.
model_df = model_df.assign(type_encoded = ohe.fit_transform(model_df[['type']]).ravel())

In [None]:
model_df.sample(5)

In [None]:
# Feature matrix (numeric columns plus the encoded transaction type) and target frame.
feature_columns = [
  'step', 'amount',
  'oldbalanceOrg', 'newbalanceOrig',
  'oldbalanceDest', 'newbalanceDest',
  'type_encoded',
]
x = model_df.loc[:, feature_columns]
y = model_df.loc[:, ['isFraud']]

# Handling Class Imbalance

## Undersampling

Undersampling refers to a technique of reducing the number of observations in the majority class to balance the class distribution with the minority class. It involves removing some of the data points in the majority class until the number of observations in both classes becomes comparable.

1. Random Undersampling

In [None]:
# Fixed seed so the randomly selected majority-class rows are the same on every run.
rus = RandomUnderSampler(random_state = 45)
x_rus , y_rus = rus.fit_resample(x,y)

In [None]:
# Class counts before vs. after random under-sampling.
thin_rule = '---' * 10
print('Comparing the classes in the original and modified dataset')
print('------' * 10)
print('Original dataset :', thin_rule, sep='\n')
print(model_df['isFraud'].value_counts(), '\n')
print('modified dataset :', thin_rule, sep='\n')
print(y_rus.value_counts())

2. One-Sided Selection under-sampling

In [None]:
# One-sided selection draws its seed samples at random; fix the seed so the
# resampled set is reproducible.
oss  = OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=45)
x_oss , y_oss = oss.fit_resample(x, y)

In [None]:
# Class counts before vs. after one-sided selection.
thin_rule = '---' * 10
print('Comparing the classes in the original and modified dataset')
print('------' * 10)
print('Original dataset :', thin_rule, sep='\n')
print(model_df['isFraud'].value_counts(), '\n')
print('modified dataset :', thin_rule, sep='\n')
print(y_oss.value_counts())

## Oversampling

Oversampling increases the number of instances in the minority class, either by duplicating existing data points (random oversampling) or by synthesizing new ones from them (SMOTE). It is a suitable approach when there is too little minority-class data to build a model that captures that class accurately.

1. Random Over Sampling

In [None]:
# Fixed seed so the duplicated minority-class rows are the same on every run.
ros = RandomOverSampler(random_state = 45)
x_ros , y_ros = ros.fit_resample(x,y)

In [None]:
# Class counts before vs. after random over-sampling.
thin_rule = '---' * 10
print('Comparing the classes in the original and modified dataset')
print('------' * 10)
print('Original dataset :', thin_rule, sep='\n')
print(model_df['isFraud'].value_counts(), '\n')
print('modified dataset :', thin_rule, sep='\n')
print(y_ros.value_counts())

2. SMOTE

In [None]:
# SMOTE generates synthetic minority samples at random; fix the seed so the
# resampled set is reproducible.
sm = SMOTE(random_state = 45)
x_smote , y_smote = sm.fit_resample(x,y)

In [None]:
# Class counts before vs. after SMOTE.
thin_rule = '---' * 10
print('Comparing the classes in the original and modified dataset')
print('------' * 10)
print('Original dataset :', thin_rule, sep='\n')
print(model_df['isFraud'].value_counts(), '\n')
print('modified dataset :', thin_rule, sep='\n')
print(y_smote.value_counts())

## Combining Oversampling and Undersampling using SMOTE and Tomek links

In [None]:
# SMOTETomek over-samples with SMOTE and then cleans with Tomek links; fix the
# seed so the synthetic samples are reproducible.
st = SMOTETomek(random_state = 45)
x_combine , y_combine = st.fit_resample(x,y)

In [None]:
# Class counts before vs. after SMOTE + Tomek links.
thin_rule = '---' * 10
print('Comparing the classes in the original and modified dataset')
print('------' * 10)
print('Original dataset :', thin_rule, sep='\n')
print(model_df['isFraud'].value_counts(), '\n')
print('modified dataset :', thin_rule, sep='\n')
print(y_combine.value_counts())

# Experimental Setup and Training

In [None]:
def training_model(x, y, model, sampler=None, random_state=45):
  """Split the data, train one classifier and print its evaluation.

  The data is split 80/20 with stratification on ``y``. The chosen classifier
  is scored with 3-fold stratified cross-validation (F1) on the training
  portion, refit on the whole training portion, and evaluated on the held-out
  test portion. Nothing is returned; all results are printed or plotted.

  Parameters
  ----------
  x : pandas.DataFrame
      Feature matrix.
  y : pandas.Series or single-column pandas.DataFrame
      Binary target. A single-column DataFrame is reduced to a Series.
  model : str
      ``'lr'`` for LogisticRegression, ``'rf'`` for RandomForestClassifier;
      any other value selects BaggingClassifier.
  sampler : object with ``fit_resample(X, y)``, optional
      If given, it is applied to the TRAINING portion only, after the split,
      so the test set keeps the original class distribution and contains no
      resampled rows. With the default ``None`` the data is used as passed.
      NOTE: passing data that was resampled before calling this function lets
      duplicated or synthetic rows land in the test set, which inflates the
      reported scores. The cross-validation score is also computed on the
      resampled training data and is therefore optimistic when a sampler is
      used.
  random_state : int, default 45
      Seed for the train/test split and for the stochastic classifiers.
  """

  # scikit-learn estimators expect a 1-D target; a single-column DataFrame
  # works but emits a DataConversionWarning on every fit.
  if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
    y = y.iloc[:, 0]

  # SPLITTING DATA IN TRAINING AND TESTING SET
  xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.2 , random_state = random_state , stratify = y)

  # Resample after the split so no information from the test rows leaks into training.
  if sampler is not None:
    xtrain , ytrain = sampler.fit_resample(xtrain , ytrain)

  print('Classes in training and test set after the split :')
  print('==='*20)
  print('training dataset')
  print('==='*5)
  print(ytrain.value_counts())
  print('testing dataset')
  print('==='*5)
  print(ytest.value_counts() , '\n')

  # CREATING AND TRAINING MODEL
  if model == 'lr':
    classifier = LogisticRegression()
  elif model == 'rf':
    classifier = RandomForestClassifier(n_jobs = -1 , random_state = random_state)
  else:
    classifier = BaggingClassifier(random_state = random_state)

  # 3-fold stratified cross-validation on the training portion only
  skfold = StratifiedKFold(3)
  cv_scores = cross_val_score(classifier , xtrain , ytrain , scoring = 'f1' , cv = skfold , n_jobs = -1 )
  print('Perfomance on the training data = {}'.format(np.mean(cv_scores)))

  classifier.fit(xtrain,ytrain)
  ypred = classifier.predict(xtest)
  # Probability of the positive class (classes_[1]); ROC AUC needs a ranking
  # score -- hard 0/1 labels collapse the curve to a single operating point.
  yscore = classifier.predict_proba(xtest)[:, 1]

  # CLASSIFICATION REPORT
  print('Classification Report for individual classes:')
  print('==='*20)
  print(classification_report(ytest , ypred) , '\n')

  # OVERALL MODEL SCORE
  # All metrics take (y_true, y_pred) in that order; swapping them turns
  # precision into recall.
  print('Scores for the overall model')
  print('==='*20)
  print('Precision = {}'.format(precision_score(ytest , ypred)))
  print('Recall = {}'.format(recall_score(ytest , ypred)))
  print('f1-score = {}'.format(f1_score(ytest , ypred)),'\n')

  # AREA UNDER CURVE
  print('Area under the curve:')
  print('==='*20)
  print('AUC score = {}'.format(roc_auc_score(ytest , yscore)) , '\n')

  # PLOT CONFUSION MATRIX
  print('Confusion matrix')
  print('==='*20)
  cm = confusion_matrix(ytest,ypred)
  disp = ConfusionMatrixDisplay(confusion_matrix = cm , display_labels = classifier.classes_ )
  disp.plot()
  plt.show()

In [None]:
training_model(x,y,'lr')

In [None]:
training_model(x,y,'rf')

In [None]:
training_model(x,y,'bc')

# Under-Sampling

In [None]:
training_model(x_rus,y_rus,'lr')

In [None]:
training_model(x_oss,y_oss,'lr')

In [None]:
training_model(x_rus,y_rus,'rf')

In [None]:
training_model(x_oss,y_oss,'rf')

In [None]:
training_model(x_rus,y_rus,'bc')

In [None]:
training_model(x_oss,y_oss,'bc')

# Over-Sampling

In [None]:
training_model(x_ros,y_ros,'lr')

In [None]:
training_model(x_smote,y_smote,'lr')

In [None]:
training_model(x_ros,y_ros,'rf')

In [None]:
training_model(x_smote,y_smote,'rf')

In [None]:
training_model(x_ros,y_ros,'bc')

In [None]:
training_model(x_smote,y_smote,'bc')

# Combining Oversampling and Undersampling using SMOTE and Tomek links

In [None]:
training_model(x_combine , y_combine , 'lr')

In [None]:
training_model(x_combine , y_combine , 'rf')

In [None]:
training_model(x_combine , y_combine , 'bc')