## Installing and Loading Packages

In [None]:
#!pip install -q -U watermark

In [2]:
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [6]:
# Imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
from sklearn.pipeline import Pipeline
from keras.models import Sequential
import matplotlib.pyplot as plt
from keras.layers import Dense
from sklearn import metrics
from keras import Input
import tensorflow as tf
import pandas as pd 
import numpy as np 
import warnings
import sklearn


warnings.filterwarnings('ignore')

In [5]:
%reload_ext watermark
%watermark -a "Pack Version"

Author: Pack Version



## Loading and Understanding Data

In [7]:
# Load the data
df = pd.read_csv('dataset.csv')

In [8]:
# Shape
df.shape

(9841, 51)

In [9]:
# View the first lines
df.head()

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [10]:
# Target variable
df.FLAG.value_counts()

FLAG
0    7662
1    2179
Name: count, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9841 entries, 0 to 9840
Data columns (total 51 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            9841 non-null   int64  
 1   Index                                                 9841 non-null   int64  
 2   Address                                               9841 non-null   object 
 3   FLAG                                                  9841 non-null   int64  
 4   Avg min between sent tnx                              9841 non-null   float64
 5   Avg min between received tnx                          9841 non-null   float64
 6   Time Diff between first and last (Mins)               9841 non-null   float64
 7   Sent tnx                                              9841 non-null   int64  
 8   Received Tnx                                          9841

## Data Cleansing

In [11]:
# Set the name to lower case
df.columns = [x.lower() for x in df.columns]

In [12]:
# Removing irrelevant columns for analysis
cols_to_drop = [' erc20 most sent token type',
                ' erc20_most_rec_token_type',
                'address',
                'index',
                'unnamed: 0']

In [13]:
# Selects the attributes by filtering the columns that will be removed and the target variable
attributes = [x for x in df.columns if (x != 'flag' and x not in cols_to_drop)]

In [14]:
# Extract unique values
unique_values = df.nunique()

In [16]:
# Keeps only attributes with more than one unique value (attributes that are not constant)
attributes = [x for x in attributes if x in unique_values.loc[(unique_values > 1)]]

## Pre-Processing Pipeline

In [17]:
# Defining a custom class that inherits from BaseEstimator and TransformerMixin
class PipeSteps(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns=[]):        
        
        self.columns = columns
    
    def fit(self, X, y = None):        
        
        return self
    
    def transform(self, X):        
       
        X = X.copy()        
    
        return X

In [18]:
# Defining a class that inherits from PipeSteps
class SelectColumns(PipeSteps):
    
    def transform(self, X):        
        
        X = X.copy()        
       
        return X[self.columns]

In [19]:
# Defining a class that inherits from PipeSteps
class FillData(PipeSteps):
    
    def fit(self, X, y = None):        
        
        self.means = { col: X[col].mean() for col in self.columns }        
        
        return self
    
    def transform(self, X):        
        
        X = X.copy()        
        
        for col in self.columns:            
            
            X[col] = X[col].fillna(self.means[col])        
        
        return X

In [None]:
# Defining a class that inherits from PipeSteps
class StandardizeData(PipeSteps):
    
    def fit(self, X, y = None):        
       
        self.scaler = StandardScaler()        
        
        self.scaler.fit(X[self.columns])        
        
        return self
    
    def transform(self, X):        
        
        X = X.copy()        
        
        X[self.columns] = self.scaler.transform(X[self.columns])        
        
        return X

In [22]:
# Defining a class that inherits from PipeSteps
class GetData(PipeSteps):
    
    def transform(self, X):        
        
        X = X.copy()        
        
        return X.values

In [23]:
# Create the pipeline
pipe_pre_processing = Pipeline([('feature_selection', SelectColumns(attributes)),
                                       ('fill_missing', FillData(attributes)),
                                       ('standard_scaling', StandardizeData(attributes)),
                                       ('returnValues', GetData())])

In [24]:
# Input variables
X = df[attributes]

# Output variable
y = df['flag']

# Adjusts the type of the output variable
y = to_categorical(y)

In [25]:
# Split data into training and testing
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [26]:
# Standardizes data
X_treino = pipe_pre_processing.fit_transform(X_treino)
X_teste = pipe_pre_processing.transform(X_teste)

## Deep Learning Model Construction

In [28]:
# Create layer sequence
model = Sequential()

In [29]:
# Adds an input layer to the model with the shape specified by the length of 'attributes'
model.add(Input(shape = (len(attributes),)))

# Adds a dense layer to the model with 'len(attributes)' units and the 'relu' activation function
model.add(Dense(len(attributes), activation = 'relu'))

# Adds a dense layer to the model with 20 units and the 'relu' activation function
model.add(Dense(20, activation = 'relu'))

# Adds a dense layer to the model with 5 units and the 'relu' activation function
model.add(Dense(5, activation = 'relu'))

# Adds a dense layer to the model with 2 units and the 'softmax' activation function (output layer)
model.add(Dense(2, activation = 'softmax'))

In [30]:
# Model compilation
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

## Model Training and Assessment

In [35]:

model.fit(X_treino, y_treino, validation_data = (X_teste, y_teste), epochs = 10)

Epoch 1/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7078 - loss: 0.6306 - val_accuracy: 0.8527 - val_loss: 0.4131
Epoch 2/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step - accuracy: 0.8470 - loss: 0.3738 - val_accuracy: 0.8584 - val_loss: 0.3641
Epoch 3/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step - accuracy: 0.8664 - loss: 0.3048 - val_accuracy: 0.8767 - val_loss: 0.2543
Epoch 4/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 789us/step - accuracy: 0.9092 - loss: 0.2534 - val_accuracy: 0.9343 - val_loss: 0.2284
Epoch 5/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step - accuracy: 0.9263 - loss: 0.2342 - val_accuracy: 0.9363 - val_loss: 0.2128
Epoch 6/10
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step - accuracy: 0.9428 - loss: 0.1966 - val_accuracy: 0.9404 - val_loss: 0.1949
Epoch 7/10
[1m216

<keras.src.callbacks.history.History at 0x1c5e5c4d640>

In [36]:
# Predictions with test data
predictions_test = [np.argmax(x) for x in model.predict(X_teste)]

[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step


In [47]:
# Calculates Accuracy
acc = metrics.accuracy_score(predictions_test, [np.argmax(y) for y in y_teste])

print(f'Test Data Accuracy: {acc:,.2%}')

# Calculates AUC
auc = (metrics.roc_auc_score([np.argmax(y) for y in y_teste], model.predict(X_teste)[:,1])).round(2)
print(f'AUC nos Dados de Teste - {auc:,.2%}')

Test Data Accuracy: 94.72%
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 616us/step
AUC nos Dados de Teste - 97.00%


## Model Deployment and Fraud Detection in New Cryptocurrency Transactions

In [54]:
# Load new data from a transaction
new_data = pd.read_csv('new_data.csv')

# Applies the same pipeline applied to training data
new_data_transformed = pipe_pre_processing.transform(new_data)

# Extracts the highest probability prediction
forecast = [np.argmax(x) for x in model.predict(new_data_transformed)]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [55]:
# Result
if forecast[0] == 0:
    print("According to the model, this transaction does not represent Fraud.")
else:
    print("According to the model, this transaction may represent Fraud. Trigger human verification!")

According to the model, this transaction does not represent Fraud.


In [56]:
%watermark -a "Pack Version"

Author: Pack Version



In [59]:
%watermark -v -m

Python implementation: CPython
Python version       : 3.12.7
IPython version      : 8.27.0

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 11
Machine     : AMD64
Processor   : Intel64 Family 6 Model 151 Stepping 2, GenuineIntel
CPU cores   : 16
Architecture: 64bit



In [58]:
%watermark --iversions

pandas    : 2.2.2
keras     : 3.6.0
tensorflow: 2.16.2
matplotlib: 3.9.2
sklearn   : 1.5.1
numpy     : 1.26.4

