In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,  RandomizedSearchCV, StratifiedKFold
import pickle

from scipy   import stats
from boruta  import BorutaPy
from category_encoders import OneHotEncoder

from IPython.display      import Image
from IPython.core.display import HTML

import joblib
import warnings


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from xgboost import plot_importance

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.metrics import recall_score, f1_score, make_scorer, cohen_kappa_score
from sklearn.preprocessing import MinMaxScaler
 
warnings.filterwarnings('ignore')


import os
# List every file available under the Kaggle input directory (full paths).
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


# Loading Data

In [None]:
# Read the transaction log CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/payment-systems-transactions-synthetic-dataset/PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Show five randomly chosen rows (no seed is passed, so the rows differ between runs).
df.sample(5)

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Pairwise correlation between the numeric columns only. The frame also contains
# string columns (type, nameOrig, nameDest); without numeric_only=True pandas >= 2.0
# tries to convert them to float and raises.
df.corr(numeric_only=True)

# Data Dictionary

**step** - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

**type** - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

**amount** - amount of the transaction in local currency.

**nameOrig** - customer who started the transaction

**oldbalanceOrg** - initial balance before the transaction

**newbalanceOrig** - new balance after the transaction

**nameDest** - customer who is the recipient of the transaction

**oldbalanceDest** - initial balance recipient before the transaction.

**newbalanceDest** - new balance recipient after the transaction. 

**isFraud** - Marks the transactions made by the fraudulent agents inside the simulation. In this specific dataset, the fraudulent behavior of the agents aims to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

**isFlaggedFraud** - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

**Column Rename**

In [None]:
# Inspect the column names (no column is actually renamed in this cell).
df.columns

**Check NA**

In [None]:
# Fraction of missing values per column (mean of the boolean NA mask).
df.isna().mean()

**Descriptive Statistics**

In [None]:
# Separate the frame into numeric columns and string (object) columns.
num_attributes, cat_attributes = (
    df.select_dtypes(exclude=['object']),
    df.select_dtypes(include=['object']),
)

**Numerical Attributes**

In [None]:
# Standard describe() table (one row per numeric column) extended with range,
# coefficient of variation, skewness and kurtosis. The extra Series are indexed by
# column name, so they align with the transposed describe() rows.
describe = num_attributes.describe().T.assign(**{
    'range': num_attributes.max() - num_attributes.min(),
    'variation coefficient': num_attributes.std() / num_attributes.mean(),
    'skew': num_attributes.skew(),
    'kurtosis': num_attributes.kurtosis(),
})

describe

**Categorical Attributes**

In [None]:
# Summary of the string columns: count, number of unique values, most frequent value and its frequency.
cat_attributes.describe()

* The majority type is cash_out with 2237500.

* There's a lot of variability in name_orig, so it could be hard to use one hot encoding.

* There are more distinct name_orig values than name_dest values, i.e. more users sending than receiving; however, using one-hot encoding will not help here either.

* There are more fraudulent transactions than flagged ones, which shows that the current flagging method can't recognize fraud efficiently.

**Feature Engineering**

Let's look at the `type` column.

In [None]:
# How often each transaction type occurs.
type_counts = df['type'].value_counts()

# Draw the shares as a pie chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%', startangle=140)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title('Proportion of Bank Account Transaction Types')
plt.show()

In [None]:
# Distribution of the transaction amount within each transaction type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='type', y='amount', palette='muted', ax=ax)
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Transaction Amount')
ax.set_title('Box Plot of Transaction Amount for Each Type Category')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of fraudulent transactions per type (isFraud is 0/1, so the sum is a count).
fraud_counts = df.groupby('type')['isFraud'].sum()

# Bar chart of those counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=fraud_counts.index, y=fraud_counts.values, palette='muted', ax=ax)
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Number of Fraudulent Transactions')
ax.set_title('Number of Fraudulent Transactions for Each Type')
plt.xticks(rotation=45)
plt.show()

print(fraud_counts)

Only CASH_OUT and TRANSFER transactions were fraudulent.
**Interesting**

# Feature Engineering

In [None]:
# step is measured in hours (see the data dictionary); also express it in days and weeks.
# Vectorised division replaces the per-row Python lambda, which is slow on a frame this size.
df['step_days'] = df['step'] / 24
df['step_weeks'] = df['step'] / (24 * 7)

# Change in the originator's balance caused by the transaction (new minus old).
df['diff_new_old_balance'] = df['newbalanceOrig'] - df['oldbalanceOrg']

# Change in the recipient's balance caused by the transaction (new minus old).
df['diff_new_old_destiny'] = df['newbalanceDest'] - df['oldbalanceDest']

# Keep only the first character of the account identifiers.
# NOTE: this overwrites the original identifiers in place.
df['nameOrig'] = df['nameOrig'].str[0]
df['nameDest'] = df['nameDest'].str[0]

**Selecting Columns**

**Response Variable**

In [None]:
# Horizontal count plot of the target, with each bar annotated by its share of all rows.
ax = sns.countplot(y='isFraud', data=df)

total = df['isFraud'].size
for bar in ax.patches:
    percentage = ' {:.1f}%'.format(100 * bar.get_width() / total)
    # Label position: just right of the bar end, vertically centred on the bar.
    # Deliberately not named `x` / `y`: those are notebook-level names (the target is
    # stored in `y`), and re-running this cell would otherwise overwrite them.
    label_x = bar.get_x() + bar.get_width() + 0.02
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(percentage, (label_x, label_y))

The dataset is **imbalanced**, as expected.

**Numerical Variables**

In [None]:
# One density histogram (with KDE overlay) per numeric column, laid out four per row.
num_attributes = df.select_dtypes(exclude='object')
columns = num_attributes.columns.tolist()

num_rows = (len(columns) - 1) // 4 + 1
num_cols = min(len(columns), 4)

plt.figure(figsize=(15, 5 * num_rows))
for j, column in enumerate(columns, start=1):
    ax = plt.subplot(num_rows, num_cols, j)
    # sns.distplot is deprecated and scheduled for removal; this is the documented
    # histplot equivalent (density-normalised histogram + KDE). bins=50 matches the
    # upper bound distplot applied to its automatic bin count.
    sns.histplot(x=num_attributes[column], bins=50, stat='density',
                 kde=True, kde_kws=dict(cut=3), ax=ax)
    ax.set_title(column)

plt.tight_layout()
plt.show()

**Categorical Variables**

In [None]:
# One horizontal count plot per string column, each bar annotated with its share of rows.
cat_attributes = df.select_dtypes(include='object')
columns = cat_attributes.columns.tolist()

# The 3x2 grid leaves room for up to six string columns.
for j, column in enumerate(columns, start=1):
    plt.subplot(3, 2, j)
    ax = sns.countplot(y=column, data=cat_attributes)

    total = cat_attributes[column].size
    for bar in ax.patches:
        percentage = ' {:.1f}%'.format(100 * bar.get_width() / total)
        # Not named `x` / `y` on purpose: the target is stored in the notebook-level
        # `y`, and re-running this cell would otherwise overwrite it.
        label_x = bar.get_x() + bar.get_width() + 0.02
        label_y = bar.get_y() + bar.get_height() / 2
        ax.annotate(percentage, (label_x, label_y))

# Data Preparation

**Splitting into Train, Valid and Test**

In [None]:
# Predictors: everything except the target, the existing flag, the account
# identifiers and the derived day/week step columns.
dropped_columns = ['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest',
                   'step_weeks', 'step_days']
X = df.drop(columns=dropped_columns)

# Target: the fraud indicator.
y = df['isFraud']

In [None]:
# Fixed seed so the splits, and therefore every score reported below, are reproducible
# across Restart & Run All.
# Splitting into temp (train + valid) and test: 80% / 20%, stratified on the target.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42)

# Splitting temp into train and valid: again 80% / 20%, stratified on the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=.2, stratify=y_temp, random_state=42)

**One Hot Encoder**

In [None]:
# One-hot encode `type`; use_cat_names=True names the new columns after the category
# values (e.g. 'type_TRANSFER', which is selected further down).
ohe = OneHotEncoder(cols=['type'], use_cat_names=True)

# Fit on the training split only, then apply the same encoding to the validation split.
X_train = ohe.fit_transform(X_train)
X_valid = ohe.transform(X_valid)

# Re-fit the same encoder object on train+valid (X_temp) and apply it to the test split.
# NOTE: from here on `ohe` holds the X_temp fit, not the X_train fit.
X_temp = ohe.fit_transform(X_temp)
X_test = ohe.transform(X_test)

**Rescaling**

In [None]:
# Continuous columns to rescale with MinMaxScaler.
num_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest',
               'diff_new_old_balance', 'diff_new_old_destiny']
mm = MinMaxScaler()
# X_params will hold the scaled version of X_temp; X_temp itself is left unscaled.
X_params = X_temp.copy()

# Fit the scaler on the training split and apply it to train and validation.
X_train[num_columns] = mm.fit_transform(X_train[num_columns])
X_valid[num_columns] = mm.transform(X_valid[num_columns])

# Re-fit the same scaler object on train+valid (X_temp) and apply it to X_params and the test split.
# NOTE: from here on `mm` holds the X_temp fit, not the X_train fit.
X_params[num_columns] = mm.fit_transform(X_temp[num_columns])
X_test[num_columns] = mm.transform(X_test[num_columns])

**Feature Selection**

In [None]:
# Feature subset used by every model below. 'type_TRANSFER' is one of the columns
# produced by one-hot encoding `type`.
final_columns_selected = ['step', 'oldbalanceOrg', 
                          'newbalanceOrig', 'newbalanceDest', 
                          'diff_new_old_balance', 'diff_new_old_destiny', 
                          'type_TRANSFER']

**Machine Learning Modeling**

In [None]:
# Restrict every split to the selected feature columns ("_cs" = columns selected).
X_train_cs, X_valid_cs, X_temp_cs, X_test_cs, X_params_cs = (
    frame[final_columns_selected]
    for frame in (X_train, X_valid, X_temp, X_test, X_params)
)

# Helper functions

In [None]:
def ml_scores(model_name, y_true, y_pred):
    """Return a one-row DataFrame of classification metrics for one set of predictions.

    Parameters
    ----------
    model_name : label used as the index of the returned row.
    y_true : ground-truth labels.
    y_pred : predicted labels.

    Returns
    -------
    pandas.DataFrame with the columns 'Balanced Accuracy', 'Precision', 'Recall',
    'F1' and 'Kappa', each rounded to three decimals.
    """
    metric_fns = {
        'Balanced Accuracy': balanced_accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Kappa': cohen_kappa_score,
    }
    row = {label: np.round(fn(y_true, y_pred), 3) for label, fn in metric_fns.items()}
    return pd.DataFrame(row, index=[model_name])

In [None]:
def _format_mean_std(values):
    """Return 'mean +/- std' of *values*, both rounded to three decimals."""
    return "{} +/- {}".format(np.round(np.mean(values), 3), np.round(np.std(values), 3))


def ml_cv_results(model_name, model, x, y, verbose=1, n_splits=5, random_state=None):
    """Evaluate *model* with shuffled stratified k-fold cross-validation.

    In every fold a MinMaxScaler is fitted on the training part only and applied to
    both parts, the model is fitted on the scaled training part, and the metrics are
    computed on the held-out part.

    Parameters
    ----------
    model_name : label used as the index of the returned row.
    model : estimator exposing fit(X, y) and predict(X); it is re-fitted in every fold.
    x : pandas.DataFrame of features (indexed positionally with .iloc).
    y : pandas.Series of labels (indexed positionally with .iloc).
    verbose : if > 0, print the fold number as each fold starts.
    n_splits : number of folds (default 5, as before).
    random_state : seed for the fold shuffling; None (the default) keeps the
        previous unseeded behaviour, an int makes the folds reproducible.

    Returns
    -------
    pandas.DataFrame with one row and the columns 'Balanced Accuracy', 'Precision',
    'Recall', 'F1' and 'Kappa', each formatted as 'mean +/- std' over the folds.
    """
    metric_fns = {
        'Balanced Accuracy': balanced_accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Kappa': cohen_kappa_score,
    }
    fold_scores = {label: [] for label in metric_fns}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (index_train, index_test) in enumerate(skf.split(x, y), start=1):
        if verbose > 0:
            print('Fold K=%i' % (fold))

        # Select the train and held-out parts of this fold.
        x_train, x_test = x.iloc[index_train], x.iloc[index_test]
        y_train, y_test = y.iloc[index_train], y.iloc[index_test]

        # Scale inside the fold so the held-out part never influences the scaler.
        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # Train and predict.
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        # Record every metric for this fold.
        for label, fn in metric_fns.items():
            fold_scores[label].append(fn(y_test, y_pred))

    # Collapse the per-fold scores into 'mean +/- std' strings.
    return pd.DataFrame(
        {label: _format_mean_std(scores) for label, scores in fold_scores.items()},
        index=[model_name],
    )

# XGBoost

In [None]:
# Baseline XGBoost with default hyper-parameters: fit on the training split,
# predict the validation split.
xgb = XGBClassifier().fit(X_train_cs, y_train)
y_pred = xgb.predict(X_valid_cs)

In [None]:
# Validation-split metrics of the baseline XGBoost model, as a one-row table.
xgb_results = ml_scores('XGBoost', y_valid, y_pred)
xgb_results

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the validation split (y_pred comes from the XGBoost cell above).
print(classification_report(y_valid, y_pred))

**Cross Validation**

In [None]:
# Cross-validate a fresh default XGBoost on the unscaled train+valid data
# (ml_cv_results rescales inside each fold).
xgb_cv = ml_cv_results('XGBoost', XGBClassifier(), X_temp_cs, y_temp)
xgb_cv

# LightGBM

In [None]:
# Baseline LightGBM with default hyper-parameters: fit on the training split,
# predict the validation split.
lightgbm = LGBMClassifier().fit(X_train_cs, y_train)
y_pred = lightgbm.predict(X_valid_cs)

In [None]:
# Validation-split metrics of the baseline LightGBM model, as a one-row table.
lightgbm_results = ml_scores('LightGBM', y_valid, y_pred)
lightgbm_results

**Classification Report**

In [None]:
# Per-class precision, recall and F1 on the validation split (y_pred comes from the LightGBM cell above).
print(classification_report(y_valid, y_pred))

**Cross Validation**

In [None]:
# Cross-validate a fresh default LightGBM on the unscaled train+valid data
# (ml_cv_results rescales inside each fold). The label was misspelled 'LightGDM',
# which ended up as the index of the result table.
lightgbm_cv = ml_cv_results('LightGBM', LGBMClassifier(),
                            X_temp_cs, y_temp)
lightgbm_cv

# Comparing Model's Performance

In [None]:
# Stack the single-split results of both models and order them by F1 (lowest first).
modeling_performance = pd.concat([xgb_results, lightgbm_results])
modeling_performance.sort_values("F1")

# Hyperparameter Fine-Tuning

In [None]:
# Wrap f1_score as a scorer object; it is passed to GridSearchCV as `scoring` below.
f1 = make_scorer(f1_score)

In [None]:
# Hyper-parameter grid handed to GridSearchCV below. Only `eta` has more than one
# value (0.3 and 0.2), so the search evaluates two candidate settings.
params = {
    'booster': ['gbtree'],
    'eta': [0.3,0.2],
    'scale_pos_weight': [1]
}

In [None]:
# Exhaustive search over `params`, scored with F1 on five stratified folds of the
# scaled train+valid data.
cv_splitter = StratifiedKFold(n_splits=5)
gs = GridSearchCV(XGBClassifier(), param_grid=params, scoring=f1, cv=cv_splitter)

gs.fit(X_params_cs, y_temp)

In [None]:
# Parameter combination chosen by the grid search; reused to build the tuned models below.
best_params = gs.best_params_
best_params

In [None]:
#best_params = {'booster': 'gbtree', 'eta': 0.3, 'scale_pos_weight': 1}

In [None]:
# Mean cross-validated score (F1, the scorer passed to the search) of the best parameter combination.
gs.best_score_

# Results

In [None]:
# XGBoost configured with the three tuned parameters found by the grid search.
xgb_gs = XGBClassifier(
    **{name: best_params[name] for name in ('booster', 'eta', 'scale_pos_weight')}
)

In [None]:
# Fit the tuned model on the (scaled) training split.
xgb_gs.fit(X_train_cs, y_train)

In [None]:
# Predict the validation split with the tuned model.
y_pred = xgb_gs.predict(X_valid_cs)

# Single Result

In [None]:
# Validation-split metrics of the tuned XGBoost model, as a one-row table.
xgb_gs_results = ml_scores('XGBoost GS', y_valid, y_pred)
xgb_gs_results

# Conclusions

# Final Model

In [None]:
# Final model: same tuned parameters, trained on all non-test data
# (the scaled train+valid frame).
tuned_kwargs = {name: best_params[name] for name in ('booster', 'eta', 'scale_pos_weight')}
final_model = XGBClassifier(**tuned_kwargs)

final_model.fit(X_params_cs, y_temp)

# Unseen Data Score

In [None]:
# Predict the held-out test split with the final model.
y_pred = final_model.predict(X_test_cs)

In [None]:
# Test-split metrics of the final model, as a one-row table indexed by 'unseen'.
unseen_scores = ml_scores('unseen', y_test, y_pred)
unseen_scores

# Model's Performance

In [None]:
# Pull the two scalars out of the one-row result table before formatting them.
# Formatting the one-element Series directly relied on implicit float(Series)
# conversion, which pandas has deprecated.
print('For unseen data, the values of balanced accuracy is equal %.2f and precision is equal %.2f.' % (
    unseen_scores.loc['unseen', 'Balanced Accuracy'],
    unseen_scores.loc['unseen', 'Precision'],
))