<h2>Payment Fraud Detection</h2>
<h3>Feature Engineering and Model Training</h3>
<h4>Author: Akshay Pandurang Paunikar</h4

In [71]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [72]:
# set the working directory
import io
%cd "E:\iNeuron\Projects\Payment_Fraud_Detection\notebook\datasets"

E:\iNeuron\Projects\Payment_Fraud_Detection\notebook\datasets


In [73]:
# read the data
data = pd.read_csv("final_data.csv")
data.head()

Unnamed: 0,AccountAgeDays,NumItems,localTime,PaymentMethod,PaymentMethodAgeDays,Label
0,29,1,4.75,paypal,28.2,Good
1,725,1,4.74,storecredit,0.0,Good
2,845,1,4.92,creditcard,0.0,Good
3,503,1,4.89,creditcard,0.0,Good
4,2000,1,5.04,creditcard,0.0,Good


In [74]:
# split data into independent features and dependent target variable
X = data.drop(['Label'], axis=1)
y = data['Label']

In [75]:
# create an instance of Label Encoder, One Hot Encode, Standard Scaler
le = LabelEncoder()
one_hot = OneHotEncoder()
ss = StandardScaler()

In [76]:
# numerical columns and categorical columns
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

print("Numerical Columns:\n", num_features)
print("Categorical Columns:\n", cat_features)

Numerical Columns:
 Index(['AccountAgeDays', 'NumItems', 'localTime', 'PaymentMethodAgeDays'], dtype='object')
Categorical Columns:
 Index(['PaymentMethod'], dtype='object')


In [77]:
# create pipelines for categorical and numerical data
num_pipeline = Pipeline(
    steps=[
        ("Scaler", StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("one hot", OneHotEncoder()),
        ("Scaler", StandardScaler(with_mean=False))
    ]
)

In [78]:
# creating preprocessor object
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_features),
    ("cat_pipeline", cat_pipeline, cat_features)
])

In [79]:
# applying preprocesing object to features
X = preprocessor.fit_transform(X)

In [80]:
# Label encode target variable
y = le.fit_transform(y)

In [81]:
# divide the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=333)

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (21329, 7)
y_train: (21329,)
X_test: (9142, 7)
y_test: (9142,)


In [82]:
# set the working directory
import io
%cd "E:\iNeuron\Projects\Payment_Fraud_Detection\notebook\"

E:\iNeuron\Projects\Payment_Fraud_Detection\notebook


#### Model Training

In [83]:
# Import required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [84]:
# Create an Evaluate Function to give all metrics after model Training
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    confusionmatrix = confusion_matrix(true, predicted)    
    classificationreport = classification_report(true, predicted)
    return accuracy, confusionmatrix, classificationreport

In [85]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Support Vector Classifier': SVC(),
    'Gaussian Naive Bayes': GaussianNB(),
    'K-Neighbors Classifier': KNeighborsClassifier(),
    'CatBoost Classifier': CatBoostClassifier(verbose=False),
    'XGBoost Classifier': XGBClassifier()
}

model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    train_accuracy, train_confusionmatrix, train_classificationreport = evaluate_model(y_train,y_train_pred)

    test_accuracy, test_confusionmatrix, test_classificationreport = evaluate_model(y_test, y_test_pred)
        
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("**Accuracy Score:", train_accuracy)
    print("**Confusion Matrix: \n", train_confusionmatrix)
    print("**Classification Report: \n", train_classificationreport)

    print('-'*35)
    
    print('Model performance for Test set')
    print("**Accuracy Score:", test_accuracy)
    print("**Confusion Matrix: \n", test_confusionmatrix)
    print("**Classification Report: \n", test_classificationreport)
    
    accuracy_list.append(test_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
**Accuracy Score: 0.9950302405176051
**Confusion Matrix: 
 [[    0   105]
 [    1 21223]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       105
           1       1.00      1.00      1.00     21224

    accuracy                           1.00     21329
   macro avg       0.50      0.50      0.50     21329
weighted avg       0.99      1.00      0.99     21329

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.9945307372566178
**Confusion Matrix: 
 [[   0   49]
 [   1 9092]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.99      1.00      1.00      9093

    accuracy                           0.99      9142
   macro avg       0.50      0.50      0.50      9142
weighted avg       0.99      0.99      0.99      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Classifier
Model performance for Training set
**Accuracy Score: 0.995077125041024
**Confusion Matrix: 
 [[    0   105]
 [    0 21224]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       105
           1       1.00      1.00      1.00     21224

    accuracy                           1.00     21329
   macro avg       0.50      0.50      0.50     21329
weighted avg       0.99      1.00      0.99     21329

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.9946401225114855
**Confusion Matrix: 
 [[   0   49]
 [   0 9093]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.99      1.00      1.00      9093

    accuracy                           0.99      9142
   macro avg       0.50      0.50      0.50      9142
weighted avg       0.99      0.99      0.99 

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K-Neighbors Classifier
Model performance for Training set
**Accuracy Score: 0.9958741619391439
**Confusion Matrix: 
 [[   25    80]
 [    8 21216]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.76      0.24      0.36       105
           1       1.00      1.00      1.00     21224

    accuracy                           1.00     21329
   macro avg       0.88      0.62      0.68     21329
weighted avg       1.00      1.00      0.99     21329

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.9946401225114855
**Confusion Matrix: 
 [[   7   42]
 [   7 9086]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.14      0.22        49
           1       1.00      1.00      1.00      9093

    accuracy                           0.99      9142
   macro avg       0.75      0.57      0.61      9142
weighted avg       0.99      0.99      0.99   

In [86]:
# Results
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy Score']).sort_values(by=["Accuracy Score"],
                                                                                                         ascending=False)

Unnamed: 0,Model Name,Accuracy Score
1,Decision Tree Classifier,1.0
2,Random Forest Classifier,1.0
3,Gradient Boosting Classifier,1.0
4,AdaBoost Classifier,1.0
6,Gaussian Naive Bayes,1.0
8,CatBoost Classifier,1.0
9,XGBoost Classifier,1.0
5,Support Vector Classifier,0.99464
7,K-Neighbors Classifier,0.99464
0,Logistic Regression,0.994531


In [87]:
# we wil use XGBoost Classifier
model_xgboost = XGBClassifier()

In [88]:
# fit training data
model_xgboost.fit(X_train, y_train)
model_xgboost.score(X_train, y_train)

1.0

In [89]:
# make predictions on test data
predictions = model_xgboost.predict(X_test)

In [90]:
# performance metrics
print("Accuracy Score:", accuracy_score(y_test, predictions).round(4)*100)
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy Score: 100.0
Confusion Matrix:
 [[  49    0]
 [   0 9093]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        49
           1       1.00      1.00      1.00      9093

    accuracy                           1.00      9142
   macro avg       1.00      1.00      1.00      9142
weighted avg       1.00      1.00      1.00      9142



In [91]:
# Difference between Actual and Predicted Values
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':predictions})
pred_df

Unnamed: 0,Actual Value,Predicted Value
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
9137,1,1
9138,1,1
9139,1,1
9140,1,1
