# **<font color='white gray'>panData</font>**
## <font color='white gray'>Deep Learning for Artificial Intelligence Applications with Python and C++</font>

### <font color='white gray'>Deep Learning for Fraud Detection in Financial Transactions with Cryptocurrencies</font>


## **Installing and Loading the Packages**


In [48]:
!pip install -q -U watermark

In [49]:
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [50]:
# Imports
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras import Input
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [51]:
%reload_ext watermark
%watermark -a "panData"

Author: panData



## **Loading and Understanding the Data**


In [52]:
# Load the data
df = pd.read_csv('dataset.csv')

In [53]:
# Shape
df.shape

(9841, 51)

In [54]:
# Load the data
df = pd.read_csv('dataset.csv')

In [55]:
# Target variable
df.FLAG.value_counts()

Unnamed: 0_level_0,count
FLAG,Unnamed: 1_level_1
0,7662
1,2179


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

## **Cleaning Data**

In [57]:
# View the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [58]:
# Adjust column names to lowercase
df.columns = [x.lower() for x in df.columns]

In [59]:
# These columns will not be relevant for analysis
cols_to_drop = ['erc20 most sent token type',
                'erc20_most_rec_token_type',
                'address',
                'index',
                'unnamed: 0']

In [60]:
# Select attributes by filtering out columns to be dropped and the target variable
attributes = [x for x in df.columns if (x != 'flag' and x not in cols_to_drop)]

In [61]:
attributes

['avg min between sent tnx',
 'avg min between received tnx',
 'time diff between first and last (mins)',
 'sent tnx',
 'received tnx',
 'number of created contracts',
 'unique received from addresses',
 'unique sent to addresses',
 'min value received',
 'max value received ',
 'avg val received',
 'min val sent',
 'max val sent',
 'avg val sent',
 'min value sent to contract',
 'max val sent to contract',
 'avg value sent to contract',
 'total transactions (including tnx to create contract',
 'total ether sent',
 'total ether received',
 'total ether sent contracts',
 'total ether balance',
 ' total erc20 tnxs',
 ' erc20 total ether received',
 ' erc20 total ether sent',
 ' erc20 total ether sent contract',
 ' erc20 uniq sent addr',
 ' erc20 uniq rec addr',
 ' erc20 uniq sent addr.1',
 ' erc20 uniq rec contract addr',
 ' erc20 avg time between sent tnx',
 ' erc20 avg time between rec tnx',
 ' erc20 avg time between rec 2 tnx',
 ' erc20 avg time between contract tnx',
 ' erc20 min val

In [62]:
# Extract unique values
unique_values = df.nunique()

In [63]:
unique_values

Unnamed: 0,0
unnamed: 0,9841
index,4729
address,9816
flag,2
avg min between sent tnx,5013
avg min between received tnx,6223
time diff between first and last (mins),7810
sent tnx,641
received tnx,727
number of created contracts,20


In [64]:
# Keep only attributes with more than one unique value (attributes that are not constant)
attributes = [x for x in attributes if x in unique_values.loc[(unique_values > 1)]]

In [65]:
# Display information about the DataFrame with selected attributes
df[attributes].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 40 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   avg min between sent tnx                              9841 non-null   float64
 1   avg min between received tnx                          9841 non-null   float64
 2   time diff between first and last (mins)               9841 non-null   float64
 3   sent tnx                                              9841 non-null   int64  
 4   received tnx                                          9841 non-null   int64  
 5   number of created contracts                           9841 non-null   int64  
 6   unique received from addresses                        9841 non-null   int64  
 7   unique sent to addresses                              9841 non-null   int64  
 8   min value received                                    9841

## **Creation of the Pre-Processing Pipeline**
- **BaseEstimator:** This is the base class for all estimators in scikit-learn. An estimator is any object that can estimate some parameters based on a dataset. For example, a machine learning algorithm like linear regression or a decision tree is an estimator. The BaseEstimator class provides basic functionalities, such as methods to set an estimator's parameters and to get these parameters with get_params() and set_params().

- **TransformerMixin:** This is a mixin class used to add transformation functionality to an estimator. In scikit-learn, a "transformer" is a type of estimator that can transform a dataset. For instance, it might be used to normalize or standardize data, select or extract features, etc. The TransformerMixin class adds the fit_transform() method, which is a convenient method that first fits the transformer to the dataset (using fit()) and then applies the transformation to the same dataset (using transform()). This is useful to avoid having to call fit() and transform() separately during data preprocessing.

The combination of these two classes is often used when creating a custom estimator or transformer with scikit-learn. By inheriting from these classes, you ensure that your custom estimator integrates well with other scikit-learn functionalities, such as pipelines and other modeling tools.


In [66]:
# Definition of a custom class that inherits from BaseEstimator and TransformerMixin
class PipeSteps(BaseEstimator, TransformerMixin):

    # Constructor method to initialize the class with a list of columns
    def __init__(self, columns=[]):
        # Assign the columns argument to the instance attribute self.columns
        self.columns = columns

    # Fit method used to fit (train) the transformation on the training data
    def fit(self, X, y=None):
        # Returns the instance itself, indicating the method does not make modifications
        return self

    # Transform method to transform the input data
    def transform(self, X):
        # Makes a copy of the input data to avoid altering the original data
        X = X.copy()
        # Returns the data without modifications
        return X

In [67]:
# Definition of a class that inherits from DSAPipeSteps
class SelectColumns(PipeSteps):

    # Transform method to transform the input data
    def transform(self, X):

        # Makes a copy of the input data to avoid altering the original data
        X = X.copy()

        # Selects and returns only the columns specified in self.columns
        return X[self.columns]

In [68]:
# Definition of a class that inherits from PipeSteps
class FillData(PipeSteps):

    # Fit method to fit the transformation on the training data
    def fit(self, X, y=None):

        # Calculates the mean of each column specified in self.columns and stores it in the self.means dictionary
        self.means = {col: X[col].mean() for col in self.columns}

        # Returns the instance itself, indicating the method does not make modifications
        return self

    # Transform method to transform the input data
    def transform(self, X):

        # Makes a copy of the input data to avoid altering the original data
        X = X.copy()

        # Iterates over each column specified in self.columns
        for col in self.columns:

            # Fills missing values in the column with the mean calculated during the fit phase
            X[col] = X[col].fillna(self.means[col])

        # Returns the transformed data
        return X

In [69]:
# Definition of a class that inherits from PipeSteps
class NormalizeData(PipeSteps):

    # Fit method to fit the scaler on the training data
    def fit(self, X, y=None):

        # Initializes an instance of StandardScaler to normalize the data
        self.scaler = StandardScaler()

        # Fits the scaler on the columns specified in self.columns
        self.scaler.fit(X[self.columns])

        # Returns the instance itself, indicating the method does not make modifications
        return self

    # Transform method to transform the input data
    def transform(self, X):

        # Makes a copy of the input data to avoid altering the original data
        X = X.copy()

        # Applies the normalization transformation to the specified columns
        X[self.columns] = self.scaler.transform(X[self.columns])

        # Returns the transformed data
        return X

In [70]:
# Definition of a class that inherits from PipeSteps
class RetrieveData(PipeSteps):

    # Transform method to transform the input data
    def transform(self, X):

        # Makes a copy of the input data to avoid altering the original data
        X = X.copy()

        # Check if X is a pandas DataFrame, and if so, convert it to a NumPy array
        if isinstance(X, pd.DataFrame):
            return X.values
        else:
            # If X is already a NumPy array, return it as-is
            return X

In scikit-learn, a Pipeline is a tool that chains multiple steps of transformation and a final estimator into a single object. This allows you to set up a machine learning process that includes preprocessing steps (such as normalization, standardization, and feature extraction) followed by the application of a machine learning model. Each step in the pipeline is represented by a tuple containing a name and a transformation or model object.

In [71]:
# Ensure correct data types in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='ignore')

# Remove categorical columns (strings) if they are not relevant for the model
categorical_columns = df.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
    print(f"Categorical columns detected and removed: {categorical_columns}")
    df.drop(columns=categorical_columns, inplace=True)

# Update the attributes list to reflect only the remaining columns
attributes = [col for col in df.columns if col != 'flag' and col not in categorical_columns]


Categorical columns detected and removed: Index(['address', ' erc20 most sent token type', ' erc20_most_rec_token_type'], dtype='object')


In [72]:
# Create the pipeline
preprocessing_pipeline = Pipeline([
    ('feature_selection', SelectColumns(attributes)),
    ('fill_missing', FillData(attributes)),
    ('standard_scaling', StandardScaler()),
    ('return_values', RetrieveData())
])

In [73]:
# Input variables
X = df[attributes]

In [74]:
# Output variable
y = df['flag']

In [75]:
# Adjusts the type of the output variable
y = to_categorical(y)

In [76]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [77]:
# Standardize the data
X_train = preprocessing_pipeline.fit_transform(X_train)
X_test = preprocessing_pipeline.transform(X_test)

In [78]:
X_train

array([[ 1.24220965, -0.83984471, -0.23259911, ...,  0.        ,
        -0.21172682, -0.2313969 ],
       [ 0.4022775 ,  1.07325309, -0.23259911, ...,  0.        ,
        -0.21172682, -0.17365057],
       [ 1.42085852, -0.42791812, -0.23259911, ...,  0.        ,
        -0.21172682, -0.2313969 ],
       ...,
       [ 0.15831234,  0.51072093, -0.23259911, ...,  0.        ,
        -0.21172682, -0.28914324],
       [-1.44105404, -0.78937149, -0.22473065, ...,  0.        ,
        -0.21172682, -0.28914324],
       [ 0.82206705,  2.04119917, -0.23250192, ...,  0.        ,
        -0.21172682, -0.28914324]])

In [79]:
X_test

array([[ 9.96126126e-01, -1.40726138e+00, -1.54379550e-01, ...,
         0.00000000e+00, -2.11726819e-01, -4.11559255e-04],
       [ 1.31105655e+00, -6.81098299e-01, -9.86466733e-02, ...,
         0.00000000e+00, -2.11726819e-01, -2.31396905e-01],
       [-1.70726205e+00, -1.40319096e+00, -2.25103708e-01, ...,
         0.00000000e+00, -2.11726819e-01, -2.89143241e-01],
       ...,
       [-1.33619492e+00, -5.47588495e-01, -2.32514779e-01, ...,
         0.00000000e+00, -2.11726819e-01, -2.89143241e-01],
       [ 1.00424653e+00, -1.38853744e+00, -2.32599109e-01, ...,
         0.00000000e+00, -2.11726819e-01, -2.31396905e-01],
       [-1.20379704e+00, -2.42306930e-01, -2.31476133e-01, ...,
         0.00000000e+00, -2.11726819e-01, -2.89143241e-01]])

## **Construction of the Deep Learning Model**

In [80]:
# Create sequence of layers
model = Sequential()

In [81]:
# Add an input layer to the model with the shape specified by the length of 'attributes'
model.add(Input(shape=(len(attributes),)))

# Add a dense layer to the model with 'len(attributes)' units and the 'relu' activation function
model.add(Dense(len(attributes), activation='relu'))

# Add a dense layer to the model with 20 units and the 'relu' activation function
model.add(Dense(20, activation='relu'))

# Add a dense layer to the model with 5 units and the 'relu' activation function
model.add(Dense(5, activation='relu'))

# Add a dense layer to the model with 2 units and the 'softmax' activation function (output layer)
model.add(Dense(2, activation='softmax'))

In [82]:
# Model compilation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [83]:
# Summary
model.summary()

## **Training the Model**

In [84]:
%%time
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

Epoch 1/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8236 - loss: 0.3527 - val_accuracy: 0.9936 - val_loss: 0.1390
Epoch 2/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9954 - loss: 0.1377 - val_accuracy: 0.9970 - val_loss: 0.0976
Epoch 3/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9988 - loss: 0.0940 - val_accuracy: 0.9997 - val_loss: 0.0719
Epoch 4/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0682 - val_accuracy: 0.9997 - val_loss: 0.0561
Epoch 5/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9998 - loss: 0.0511 - val_accuracy: 0.9993 - val_loss: 0.0446
Epoch 6/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9999 - loss: 0.0394 - val_accuracy: 0.9993 - val_loss: 0.0365
Epoch 7/10
[1m216/216[0m 

<keras.src.callbacks.history.History at 0x7b142f1caa70>

In [85]:
# Predictions on test data
test_predictions = [np.argmax(x) for x in model.predict(X_test)]

[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [86]:
# Calculate Accuracy
acc = metrics.accuracy_score(test_predictions, [np.argmax(y) for y in y_test])

Accuracy is an evaluation metric for classification models. It represents the proportion of correct predictions made by the model relative to the total number of samples evaluated. In other words, it's the number of correct predictions divided by the total number of predictions made, indicating how well the model is classifying the samples correctly.


In [87]:
print(f'Accuracy on Test Data: {acc:.2%}')

Accuracy on Test Data: 99.90%


In [88]:
# Calculate AUC
auc = metrics.roc_auc_score([np.argmax(y) for y in y_test], model.predict(X_test)[:, 1])

[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


The Area Under the Curve (AUC) is a metric used to evaluate the performance of binary classification models. It is based on the Receiver Operating Characteristic (ROC) curve, which is a plot that shows the relationship between the true positive rate (sensitivity) and the false positive rate (1 - specificity) for different decision thresholds.

The AUC represents the total area under the ROC curve and can range from 0 to 1. An AUC of 0.5 indicates random performance, while an AUC of 1.0 indicates perfect classification. Therefore, the closer the AUC value is to 1, the better the model's ability to distinguish between positive and negative classes.

In [89]:
print(f'AUC on Test Data - {auc:.2%}')

AUC on Test Data - 99.90%


## **Deploy of the Model and Fraud Detection in New Cryptocurrency Transactions**

In [90]:
import pandas as pd

# Load the CSV files
file_1 = pd.read_csv('dataset.csv')
file_2 = pd.read_csv('new_data.csv')

# Identify the columns that are common to both files
common_columns = file_1.columns.intersection(file_2.columns)

# Keep only the common columns in both DataFrames
file_1_filtered = file_1[common_columns]
file_2_filtered = file_2[common_columns]

# Save the filtered DataFrames (optional)
file_1_filtered.to_csv('dataset_filtered.csv', index=False)
file_2_filtered.to_csv('new_data_filtered.csv', index=False)

print("Different columns have been removed. Filtered files have been saved.")

Different columns have been removed. Filtered files have been saved.


In [91]:
# Load new transaction data
new_data = pd.read_csv('new_data.csv')

In [92]:
# Data in its original format
new_data

Unnamed: 0,avg min between sent tnx,avg min between received tnx,time diff between first and last (mins),sent tnx,received tnx,number of created contracts,unique received from addresses,unique sent to addresses,min value received,max value received,...,erc20 uniq sent addr.1,erc20 uniq rec contract addr,erc20 min val rec,erc20 max val rec,erc20 avg val rec,erc20 min val sent,erc20 max val sent,erc20 avg val sent,erc20 uniq sent token name,erc20 uniq rec token name
0,2570.59,3336.01,30572.7,8,3,0,2,4,0.1,40.0,...,0.0,1.0,600.0,600.0,600.0,0.0,0.0,0.0,0.0,1.0


In [93]:
# Align the new dataset columns with the columns used in training
new_data = new_data.reindex(columns=attributes, fill_value=0)

# Apply the same pipeline used on the training data
transformed_new_data = preprocessing_pipeline.transform(new_data)

In [94]:
# Data in the format the model expects to receive
transformed_new_data

array([[-1.74468652, -1.49029797, -0.11012514, -0.20890417, -0.5852175 ,
        -0.14846488, -0.17435227, -0.02949086, -0.10071957, -0.08219271,
        -0.12770293, -0.03945017, -0.0281488 , -0.04686515, -0.05420181,
        -0.110975  ,  0.        , -0.01204994, -0.01204994, -0.20568049,
        -0.02975031, -0.03630726, -0.01204994, -0.0197842 , -0.08228942,
        -0.06411176, -0.01425745, -0.02258837, -0.05367989, -0.09036273,
        -0.05319145, -0.22783703,  0.        ,  0.        ,  0.        ,
         0.        ,  0.00627018, -0.05510854, -0.02401607, -0.01364824,
        -0.0135741 , -0.01331307,  0.        ,  0.        ,  0.        ,
        -0.21172682, -0.2313969 ]])

In [95]:
# Extract the prediction with the highest probability
prediction = [np.argmax(x) for x in model.predict(transformed_new_data)]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


In [96]:
type(prediction)

list

In [97]:
# Result
if prediction[0] == 0:
    print("According to the model, this transaction does not represent a Fraud.")
else:
    print("According to the model, this transaction might represent a Fraud. Trigger human verification!")

According to the model, this transaction does not represent a Fraud.


In [98]:
%watermark -a "panData"

Author: panData



In [99]:
#%watermark -v -m

In [100]:
#%watermark --iversions

# The End