In [1]:

import pyodbc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


# For modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, confusion_matrix, 
                             classification_report, roc_curve, auc)
from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder

# Notebook-wide plotting defaults: seaborn "whitegrid" theme, 12x6 figures.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully!\n")

✓ All libraries imported successfully!



In [2]:
# Load the raw transaction dataset from a CSV file in the working directory.
DATA_PATH = 'transaction_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Section banner: blank line, 80-character rule, title, closing rule.
section_rule = "=" * 80
for banner_line in ("\n" + section_rule, " EXPLORATORY DATA ANALYSIS", section_rule):
    print(banner_line)


 EXPLORATORY DATA ANALYSIS


In [4]:
# Plain-text preview of the leading rows of the loaded frame.
leading_rows = df.head()
print("\nFirst few rows:", leading_rows, sep="\n")


First few rows:
   Unnamed: 0  Index                                     Address  FLAG  \
0           0      1  0x00009277775ac7d0d59eaad8fee3d10ac6c805e8     0   
1           1      2  0x0002b44ddb1476db43c868bd494422ee4c136fed     0   
2           2      3  0x0002bda54cb772d040f779e88eb453cac0daa244     0   
3           3      4  0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e     0   
4           4      5  0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89     0   

   Avg min between sent tnx  Avg min between received tnx  \
0                    844.26                       1093.71   
1                  12709.07                       2958.44   
2                 246194.54                       2434.02   
3                  10219.60                      15785.09   
4                     36.61                      10707.77   

   Time Diff between first and last (Mins)  Sent tnx  Received Tnx  \
0                                704785.63       721            89   
1                              

In [5]:
# Standardize string *values* only: every object-dtype column is lower-cased
# and has its spaces replaced by underscores. Column labels are NOT renamed
# here, so other cells keep addressing columns by their original names.
string_cols = df.select_dtypes(include='object').columns
for col in string_cols:
    # regex=False: ' ' is a plain literal; being explicit keeps the result
    # independent of the pandas-version-specific default of `regex`.
    df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

# The message reports what was actually done (values only, not column names).
print("✓ String values standardized\n")

✓ Column names and values standardized



In [6]:
# List the dtype pandas assigned to each column.
column_dtypes = df.dtypes
print("\nData Types:", column_dtypes, sep="\n")


Data Types:
Unnamed: 0                                                int64
Index                                                     int64
Address                                                  object
FLAG                                                      int64
Avg min between sent tnx                                float64
Avg min between received tnx                            float64
Time Diff between first and last (Mins)                 float64
Sent tnx                                                  int64
Received Tnx                                              int64
Number of Created Contracts                               int64
Unique Received From Addresses                            int64
Unique Sent To Addresses                                  int64
min value received                                      float64
max value received                                      float64
avg val received                                        float64
min val sent               

In [7]:
# Count nulls per column and show the ten columns with the most missing entries.
null_counts = df.isnull().sum()
most_missing = null_counts.sort_values(ascending=False).head(10)
print("\nMissing Values:", most_missing, sep="\n")


Missing Values:
ERC20 most sent token type             2697
ERC20_most_rec_token_type               871
ERC20 avg val sent                      829
ERC20 total Ether received              829
ERC20 total ether sent                  829
ERC20 avg time between rec tnx          829
ERC20 avg time between rec 2 tnx        829
ERC20 max val rec                       829
ERC20 avg val rec                       829
ERC20 avg time between contract tnx     829
dtype: int64


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) from describe().
summary_stats = df.describe()
print("\nBasic Statistics:", summary_stats, sep="\n")


Basic Statistics:
        Unnamed: 0        Index         FLAG  Avg min between sent tnx  \
count  9841.000000  9841.000000  9841.000000               9841.000000   
mean   4920.000000  1815.049893     0.221421               5086.878721   
std    2840.996333  1222.621830     0.415224              21486.549974   
min       0.000000     1.000000     0.000000                  0.000000   
25%    2460.000000   821.000000     0.000000                  0.000000   
50%    4920.000000  1641.000000     0.000000                 17.340000   
75%    7380.000000  2601.000000     0.000000                565.470000   
max    9840.000000  4729.000000     1.000000             430287.670000   

       Avg min between received tnx  Time Diff between first and last (Mins)  \
count                   9841.000000                             9.841000e+03   
mean                    8004.851184                             2.183333e+05   
std                    23081.714801                             3.229379e+

In [9]:
# Tally how many columns share each dtype; the bare name displays the result.
dtype_counts = df.dtypes.value_counts()
dtype_counts

float64    39
int64       9
object      3
Name: count, dtype: int64

In [10]:
# Preview only the object-dtype (text) columns.
object_columns = df.select_dtypes(include=['object'])
object_columns.head()

Unnamed: 0,Address,ERC20 most sent token type,ERC20_most_rec_token_type
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,cofoundit,numeraire
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,livepeer_token,livepeer_token
2,0x0002bda54cb772d040f779e88eb453cac0daa244,,xenon
3,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,raiden,xenon
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,statusnetwork,eos


In [11]:
# Preview only the float64 columns.
float_columns = df.select_dtypes(include='float64')
float_columns.head()

Unnamed: 0,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),min value received,max value received,avg val received,min val sent,max val sent,avg val sent,min value sent to contract,...,ERC20 max val rec,ERC20 avg val rec,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name
0,844.26,1093.71,704785.63,0.0,45.806785,6.589513,0.0,31.22,1.200681,0.0,...,15000000.0,265586.1476,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0
1,12709.07,2958.44,1218216.73,0.0,2.613269,0.385685,0.0,1.8,0.032844,0.0,...,365.0,57.632615,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0
2,246194.54,2434.02,516729.3,0.113119,1.165453,0.358906,0.05,3.538616,1.794308,0.0,...,442.8198,65.189009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
3,10219.6,15785.09,397555.9,0.0,500.0,99.48884,0.0,450.0,70.001834,0.0,...,11412.23,1555.550174,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0
4,36.61,10707.77,382472.42,0.0,12.802411,2.671095,0.0,9.0,0.022688,0.0,...,90000.0,4934.232147,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0


In [12]:
# Preview only the int64 columns.
int_columns = df.select_dtypes(include='int64')
int_columns.head()

Unnamed: 0.1,Unnamed: 0,Index,FLAG,Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,total transactions (including tnx to create contract
0,0,1,0,721,89,0,40,118,810
1,1,2,0,94,8,0,5,14,102
2,2,3,0,2,10,0,10,2,12
3,3,4,0,25,9,0,7,13,34
4,4,5,0,4598,20,1,7,19,4619


In [13]:
# Log transformation: np.log1p computes log(1 + x), so a count of 0 maps to 0
# rather than -inf. The result is stored as a new column next to the original.
df["Sent tnx_log"] = df['Sent tnx'].pipe(np.log1p)