# Project 4 - Fraud Detection Etherium - Predictor

- Predict Fraud Detection Etherium with the loaded dataset, preprocessor and models

- Course Name :         Applied Machine Learning
- Course instructor :   Sohail Tehranipour
- Student Name :        Afshin Masoudi Ashtiani
- Project 4 -           Fraud Detection Etherium
- Date :                September 2024

## Install Required Libraries

In [14]:
%pip install pandas numpy joblib
%pip install scikit-learn imbalanced-learn
%pip install lightgbm xgboost catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


## Step 1: Load the data and models

In [3]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [16]:
# Constants for directories and file names
MODEL_DIR = '/content/drive/My Drive/Applied Machine Learning/Project 4 : Fraud Detection Etherium/models'
DATA_DIR = '/content/drive/My Drive/Applied Machine Learning/Project 4 : Fraud Detection Etherium/datasets'
DATA_FILE = 'cleaned_transaction_dataset.csv'
MODEL_NAMES = [
    'LGBM Classifier',
    'CatBoost Classifier',
    'XGBoost Classifier',
]

In [6]:
import os

# Load the dataset
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)
df.sample(5)

Unnamed: 0,FLAG,Avg min between sent tnx,Avg min between received tnx,Time difference between first and last (mins),Sent tnx,Received tnx,Number of created contracts,Max value received,Avg value received,Avg value sent,Total either sent,Total either balance,ERC20 total either received,ERC20 total either sent,ERC20 total either sent contract,ERC20 unique sent address,ERC20 unique received token name
1799,0,0.0,6888.99,44532.2,1,6,0,250.0,53.297085,319.780809,319.780809,0.0017,601.681614,0.0,0.0,0.0,2.0
4702,0,0.97,691.83,58195.13,84,84,0,0.721084,0.215719,0.215299,18.085089,0.03528,0.0,0.0,0.0,0.0,0.0
9081,1,0.0,0.0,312.18,1,1,0,1.63,1.63,1.629559,1.629559,0.000441,15.0,0.0,0.0,0.0,1.0
992,0,0.0,0.0,2382.9,1,1,0,0.70305,0.70305,0.70243,0.70243,0.00062,0.0,0.0,0.0,0.0,0.0
4814,0,17.42,0.0,69.68,4,1,0,101.0,101.0,25.249475,100.9979,0.0021,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.model_selection import train_test_split

# Prepare features and target
X = df.drop(columns=['FLAG'])
y = df['FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
scaler

In [17]:
# Load models safely
import joblib

model_paths = {name: os.path.join(MODEL_DIR, f"{name.replace(' ', '')}.joblib") for name in MODEL_NAMES}
models = {}
for name, path in model_paths.items():
    try:
        models[name] = joblib.load(path)
    except Exception as e:
        print(f"Error loading model {name} from {path}: {str(e)}")
models

{'LGBM Classifier': LGBMClassifier(learning_rate=0.2, n_estimators=200, num_leaves=50,
                random_state=123, subsample=0.5),
 'CatBoost Classifier': <catboost.core.CatBoostClassifier at 0x7b497745ad10>,
 'XGBoost Classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.7, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.2, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=10, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=300, n_jobs=None,
               num_parallel_tree=None, random_state=123, ...)}

## Step 2: Make Predictions

- Calculate metrics

In [18]:
# Function for calculating metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def calculate_metrics(y_true, y_pred, average_type='binary'):
    """Calculate and return recall, F1, and precision scores."""
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred, average=average_type)
    f1 = f1_score(y_true, y_pred, average=average_type)
    prec = precision_score(y_true, y_pred, average=average_type)
    return acc, rec, f1, prec

- Predict the sample

In [19]:
from imblearn.over_sampling import SMOTE

# Prediction and metrics evaluation function
def predict(X_sample):
    try:
        X_train_trans = scaler.transform(X_train)
        # X_sample_trans = loaded_preprocessor.transform(X_sample)
        X_sample_trans = pd.DataFrame(scaler.transform(X_sample), columns=X_sample.columns.tolist())

        # Using SMOTE to handle class imbalance
        X_resampled, y_resampled = SMOTE(random_state=123).fit_resample(X_train_trans, y_train)

        results = []
        for name, model in models.items():
            y_resampled_pred = model.predict(X_resampled)
            y_sample_pred = model.predict(X_sample_trans)

            acc, rec, f1, prec = calculate_metrics(y_resampled, y_resampled_pred)

            results.append({
                'Model': name,
                'Predicted Fraud': 'Yes' if y_sample_pred[0] == 1 else 'No',
                'Accuracy': acc,
                'Recall': rec,
                'F1': f1,
                'Precision': prec,
            })

        return pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)

    except Exception as e:
        print(f"An error occurred during model loading or prediction: {str(e)}")
        return pd.DataFrame()

- Random Sample

In [20]:
sample = df.sample(1)
X_sample = sample.drop('FLAG', axis= 1)
y_sample = sample['FLAG'].values[0]
X_sample

Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time difference between first and last (mins),Sent tnx,Received tnx,Number of created contracts,Max value received,Avg value received,Avg value sent,Total either sent,Total either balance,ERC20 total either received,ERC20 total either sent,ERC20 total either sent contract,ERC20 unique sent address,ERC20 unique received token name
369,22.79,0.0,68.37,3,1,0,101.0,101.0,33.666149,100.998446,0.001554,0.0,0.0,0.0,0.0,0.0


In [29]:
sample_pred = predict(X_sample)

print(f'>>>> The result of prediction :')
print(f'> Fraud: {"Yes" if y_sample == 1 else "No"}')
print(f'> Predicted Fraud: {sample_pred["Predicted Fraud"].values[0]}')
sample_pred

>>>> The result of prediction :
> Fraud: No
> Predicted Fraud: No


Unnamed: 0,Model,Predicted Fraud,Accuracy,Recall,F1,Precision
0,LGBM Classifier,No,0.912026,0.957779,0.915875,0.877483
2,XGBoost Classifier,No,0.910136,0.960572,0.914451,0.872556
1,CatBoost Classifier,No,0.880894,0.948579,0.888444,0.83548


- Predict the new data

In [23]:
import random

dict = {}
cols = X.columns.tolist()
for index, col in enumerate(cols):
    dict[col] = random.choice(df[col].unique().tolist())
    print(f"> {index + 1} : {col} -> {dict[col]}")

# Make predictions on new data
new_data = pd.DataFrame([dict])
new_data

> 1 : Avg min between sent tnx -> 7.69
> 2 : Avg min between received tnx -> 7285.41
> 3 : Time difference between first and last (mins) -> 392817.07
> 4 : Sent tnx -> 174
> 5 : Received tnx -> 243
> 6 : Number of created contracts -> 6453
> 7 : Max value received -> 0.5151479999999999
> 8 : Avg value received -> 1.163539
> 9 : Avg value sent -> 20.049428
> 10 : Total either sent -> 1.844063983
> 11 : Total either balance -> 0.000819
> 12 : ERC20 total either received -> 602.5745901
> 13 : ERC20 total either sent -> 0.0084
> 14 : ERC20 total either sent contract -> 3.15e-08
> 15 : ERC20 unique sent address -> 34.0
> 16 : ERC20 unique received token name -> 204.0


Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time difference between first and last (mins),Sent tnx,Received tnx,Number of created contracts,Max value received,Avg value received,Avg value sent,Total either sent,Total either balance,ERC20 total either received,ERC20 total either sent,ERC20 total either sent contract,ERC20 unique sent address,ERC20 unique received token name
0,7.69,7285.41,392817.07,174,243,6453,0.515148,1.163539,20.049428,1.844064,0.000819,602.57459,0.0084,3.15e-08,34.0,204.0


In [24]:
new_data_pred = predict(new_data)

print(f'>>>> The result of prediction :')
new_data_pred

>>>> The result of prediction :


Unnamed: 0,Model,Predicted Fraud,Accuracy,Recall,F1,Precision
0,LGBM Classifier,No,0.912026,0.957779,0.915875,0.877483
2,XGBoost Classifier,No,0.910136,0.960572,0.914451,0.872556
1,CatBoost Classifier,No,0.880894,0.948579,0.888444,0.83548
